Commit 79442ed189acb8b949662676e750eda173c06f9b

Authored by Tang Chen
Committed by Linus Torvalds
1 parent 1402899e43

mm/memblock.c: introduce bottom-up allocation mode

The Linux kernel cannot migrate pages that are in use by the kernel itself.
As a result, such pages cannot be hot-removed, so hotpluggable memory must
not be used for kernel allocations.

The ACPI SRAT (System Resource Affinity Table) describes which memory is
hotpluggable, but memblock has already started allocating memory for the
kernel by the time the SRAT is parsed.  So memblock must be kept from
placing those early allocations in hotpluggable memory.

In a memory hotplug system, any NUMA node the kernel image resides in has
to be non-hotpluggable anyway, and on a modern server each node typically
has at least 16GB of memory.  Memory near the kernel image is therefore
very likely to be non-hotpluggable.

The basic idea is therefore to allocate memory upwards, starting from the
end of the kernel image.  Since little memory is allocated before the SRAT
is parsed, those allocations will very likely land in the same node as the
kernel image.

memblock can currently only allocate memory top-down, so this patch
introduces a bottom-up allocation mode.  Later, when this direction is
actually used for allocations, the start address will be limited to lie
above the kernel image.
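
As an illustration only (not part of this patch), a later caller is
expected to flip the allocation direction around early SRAT parsing
roughly as follows, using the helpers added below; the call site and the
SRAT-parsing function named here are assumptions:

	memblock_set_bottom_up(true);	/* early boot: allocate just above the kernel */

	/* ... early memblock allocations land near the kernel image ... */

	acpi_numa_init();		/* illustrative: SRAT parsed, hotplug info known */
	memblock_set_bottom_up(false);	/* restore the default top-down behaviour */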

Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Acked-by: Toshi Kani <toshi.kani@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 108 additions and 3 deletions

include/linux/memblock.h
1 #ifndef _LINUX_MEMBLOCK_H 1 #ifndef _LINUX_MEMBLOCK_H
2 #define _LINUX_MEMBLOCK_H 2 #define _LINUX_MEMBLOCK_H
3 #ifdef __KERNEL__ 3 #ifdef __KERNEL__
4 4
5 #ifdef CONFIG_HAVE_MEMBLOCK 5 #ifdef CONFIG_HAVE_MEMBLOCK
6 /* 6 /*
7 * Logical memory blocks. 7 * Logical memory blocks.
8 * 8 *
9 * Copyright (C) 2001 Peter Bergner, IBM Corp. 9 * Copyright (C) 2001 Peter Bergner, IBM Corp.
10 * 10 *
11 * This program is free software; you can redistribute it and/or 11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License 12 * modify it under the terms of the GNU General Public License
13 * as published by the Free Software Foundation; either version 13 * as published by the Free Software Foundation; either version
14 * 2 of the License, or (at your option) any later version. 14 * 2 of the License, or (at your option) any later version.
15 */ 15 */
16 16
17 #include <linux/init.h> 17 #include <linux/init.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 19
20 #define INIT_MEMBLOCK_REGIONS 128 20 #define INIT_MEMBLOCK_REGIONS 128
21 21
22 struct memblock_region { 22 struct memblock_region {
23 phys_addr_t base; 23 phys_addr_t base;
24 phys_addr_t size; 24 phys_addr_t size;
25 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 25 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
26 int nid; 26 int nid;
27 #endif 27 #endif
28 }; 28 };
29 29
30 struct memblock_type { 30 struct memblock_type {
31 unsigned long cnt; /* number of regions */ 31 unsigned long cnt; /* number of regions */
32 unsigned long max; /* size of the allocated array */ 32 unsigned long max; /* size of the allocated array */
33 phys_addr_t total_size; /* size of all regions */ 33 phys_addr_t total_size; /* size of all regions */
34 struct memblock_region *regions; 34 struct memblock_region *regions;
35 }; 35 };
36 36
37 struct memblock { 37 struct memblock {
38 bool bottom_up; /* is bottom up direction? */
38 phys_addr_t current_limit; 39 phys_addr_t current_limit;
39 struct memblock_type memory; 40 struct memblock_type memory;
40 struct memblock_type reserved; 41 struct memblock_type reserved;
41 }; 42 };
42 43
43 extern struct memblock memblock; 44 extern struct memblock memblock;
44 extern int memblock_debug; 45 extern int memblock_debug;
45 46
46 #define memblock_dbg(fmt, ...) \ 47 #define memblock_dbg(fmt, ...) \
47 if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) 48 if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
48 49
49 phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end, 50 phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end,
50 phys_addr_t size, phys_addr_t align, int nid); 51 phys_addr_t size, phys_addr_t align, int nid);
51 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, 52 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
52 phys_addr_t size, phys_addr_t align); 53 phys_addr_t size, phys_addr_t align);
53 phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); 54 phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
54 void memblock_allow_resize(void); 55 void memblock_allow_resize(void);
55 int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); 56 int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
56 int memblock_add(phys_addr_t base, phys_addr_t size); 57 int memblock_add(phys_addr_t base, phys_addr_t size);
57 int memblock_remove(phys_addr_t base, phys_addr_t size); 58 int memblock_remove(phys_addr_t base, phys_addr_t size);
58 int memblock_free(phys_addr_t base, phys_addr_t size); 59 int memblock_free(phys_addr_t base, phys_addr_t size);
59 int memblock_reserve(phys_addr_t base, phys_addr_t size); 60 int memblock_reserve(phys_addr_t base, phys_addr_t size);
60 void memblock_trim_memory(phys_addr_t align); 61 void memblock_trim_memory(phys_addr_t align);
61 62
62 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 63 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
63 int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, 64 int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
64 unsigned long *end_pfn); 65 unsigned long *end_pfn);
65 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, 66 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
66 unsigned long *out_end_pfn, int *out_nid); 67 unsigned long *out_end_pfn, int *out_nid);
67 68
68 /** 69 /**
69 * for_each_mem_pfn_range - early memory pfn range iterator 70 * for_each_mem_pfn_range - early memory pfn range iterator
70 * @i: an integer used as loop variable 71 * @i: an integer used as loop variable
71 * @nid: node selector, %MAX_NUMNODES for all nodes 72 * @nid: node selector, %MAX_NUMNODES for all nodes
72 * @p_start: ptr to ulong for start pfn of the range, can be %NULL 73 * @p_start: ptr to ulong for start pfn of the range, can be %NULL
73 * @p_end: ptr to ulong for end pfn of the range, can be %NULL 74 * @p_end: ptr to ulong for end pfn of the range, can be %NULL
74 * @p_nid: ptr to int for nid of the range, can be %NULL 75 * @p_nid: ptr to int for nid of the range, can be %NULL
75 * 76 *
76 * Walks over configured memory ranges. 77 * Walks over configured memory ranges.
77 */ 78 */
78 #define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \ 79 #define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \
79 for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \ 80 for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \
80 i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid)) 81 i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
81 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 82 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
82 83
83 void __next_free_mem_range(u64 *idx, int nid, phys_addr_t *out_start, 84 void __next_free_mem_range(u64 *idx, int nid, phys_addr_t *out_start,
84 phys_addr_t *out_end, int *out_nid); 85 phys_addr_t *out_end, int *out_nid);
85 86
86 /** 87 /**
87 * for_each_free_mem_range - iterate through free memblock areas 88 * for_each_free_mem_range - iterate through free memblock areas
88 * @i: u64 used as loop variable 89 * @i: u64 used as loop variable
89 * @nid: node selector, %MAX_NUMNODES for all nodes 90 * @nid: node selector, %MAX_NUMNODES for all nodes
90 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 91 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
91 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 92 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
92 * @p_nid: ptr to int for nid of the range, can be %NULL 93 * @p_nid: ptr to int for nid of the range, can be %NULL
93 * 94 *
94 * Walks over free (memory && !reserved) areas of memblock. Available as 95 * Walks over free (memory && !reserved) areas of memblock. Available as
95 * soon as memblock is initialized. 96 * soon as memblock is initialized.
96 */ 97 */
97 #define for_each_free_mem_range(i, nid, p_start, p_end, p_nid) \ 98 #define for_each_free_mem_range(i, nid, p_start, p_end, p_nid) \
98 for (i = 0, \ 99 for (i = 0, \
99 __next_free_mem_range(&i, nid, p_start, p_end, p_nid); \ 100 __next_free_mem_range(&i, nid, p_start, p_end, p_nid); \
100 i != (u64)ULLONG_MAX; \ 101 i != (u64)ULLONG_MAX; \
101 __next_free_mem_range(&i, nid, p_start, p_end, p_nid)) 102 __next_free_mem_range(&i, nid, p_start, p_end, p_nid))
102 103
103 void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start, 104 void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start,
104 phys_addr_t *out_end, int *out_nid); 105 phys_addr_t *out_end, int *out_nid);
105 106
106 /** 107 /**
107 * for_each_free_mem_range_reverse - rev-iterate through free memblock areas 108 * for_each_free_mem_range_reverse - rev-iterate through free memblock areas
108 * @i: u64 used as loop variable 109 * @i: u64 used as loop variable
109 * @nid: node selector, %MAX_NUMNODES for all nodes 110 * @nid: node selector, %MAX_NUMNODES for all nodes
110 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 111 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
111 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 112 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
112 * @p_nid: ptr to int for nid of the range, can be %NULL 113 * @p_nid: ptr to int for nid of the range, can be %NULL
113 * 114 *
114 * Walks over free (memory && !reserved) areas of memblock in reverse 115 * Walks over free (memory && !reserved) areas of memblock in reverse
115 * order. Available as soon as memblock is initialized. 116 * order. Available as soon as memblock is initialized.
116 */ 117 */
117 #define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \ 118 #define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \
118 for (i = (u64)ULLONG_MAX, \ 119 for (i = (u64)ULLONG_MAX, \
119 __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid); \ 120 __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid); \
120 i != (u64)ULLONG_MAX; \ 121 i != (u64)ULLONG_MAX; \
121 __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid)) 122 __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid))
122 123
123 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 124 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
124 int memblock_set_node(phys_addr_t base, phys_addr_t size, int nid); 125 int memblock_set_node(phys_addr_t base, phys_addr_t size, int nid);
125 126
126 static inline void memblock_set_region_node(struct memblock_region *r, int nid) 127 static inline void memblock_set_region_node(struct memblock_region *r, int nid)
127 { 128 {
128 r->nid = nid; 129 r->nid = nid;
129 } 130 }
130 131
131 static inline int memblock_get_region_node(const struct memblock_region *r) 132 static inline int memblock_get_region_node(const struct memblock_region *r)
132 { 133 {
133 return r->nid; 134 return r->nid;
134 } 135 }
135 #else 136 #else
136 static inline void memblock_set_region_node(struct memblock_region *r, int nid) 137 static inline void memblock_set_region_node(struct memblock_region *r, int nid)
137 { 138 {
138 } 139 }
139 140
140 static inline int memblock_get_region_node(const struct memblock_region *r) 141 static inline int memblock_get_region_node(const struct memblock_region *r)
141 { 142 {
142 return 0; 143 return 0;
143 } 144 }
144 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 145 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
145 146
146 phys_addr_t memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid); 147 phys_addr_t memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid);
147 phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid); 148 phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid);
148 149
149 phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align); 150 phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align);
151
152 #ifdef CONFIG_MOVABLE_NODE
153 /*
154 * Set the allocation direction to bottom-up or top-down.
155 */
156 static inline void memblock_set_bottom_up(bool enable)
157 {
158 memblock.bottom_up = enable;
159 }
160
161 /*
162 * Check if the allocation direction is bottom-up or not.
163 * if this is true, that said, memblock will allocate memory
164 * in bottom-up direction.
165 */
166 static inline bool memblock_bottom_up(void)
167 {
168 return memblock.bottom_up;
169 }
170 #else
171 static inline void memblock_set_bottom_up(bool enable) {}
172 static inline bool memblock_bottom_up(void) { return false; }
173 #endif
150 174
151 /* Flags for memblock_alloc_base() amd __memblock_alloc_base() */ 175 /* Flags for memblock_alloc_base() amd __memblock_alloc_base() */
152 #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) 176 #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0)
153 #define MEMBLOCK_ALLOC_ACCESSIBLE 0 177 #define MEMBLOCK_ALLOC_ACCESSIBLE 0
154 178
155 phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align, 179 phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
156 phys_addr_t max_addr); 180 phys_addr_t max_addr);
157 phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align, 181 phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
158 phys_addr_t max_addr); 182 phys_addr_t max_addr);
159 phys_addr_t memblock_phys_mem_size(void); 183 phys_addr_t memblock_phys_mem_size(void);
160 phys_addr_t memblock_mem_size(unsigned long limit_pfn); 184 phys_addr_t memblock_mem_size(unsigned long limit_pfn);
161 phys_addr_t memblock_start_of_DRAM(void); 185 phys_addr_t memblock_start_of_DRAM(void);
162 phys_addr_t memblock_end_of_DRAM(void); 186 phys_addr_t memblock_end_of_DRAM(void);
163 void memblock_enforce_memory_limit(phys_addr_t memory_limit); 187 void memblock_enforce_memory_limit(phys_addr_t memory_limit);
164 int memblock_is_memory(phys_addr_t addr); 188 int memblock_is_memory(phys_addr_t addr);
165 int memblock_is_region_memory(phys_addr_t base, phys_addr_t size); 189 int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
166 int memblock_is_reserved(phys_addr_t addr); 190 int memblock_is_reserved(phys_addr_t addr);
167 int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size); 191 int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
168 192
169 extern void __memblock_dump_all(void); 193 extern void __memblock_dump_all(void);
170 194
171 static inline void memblock_dump_all(void) 195 static inline void memblock_dump_all(void)
172 { 196 {
173 if (memblock_debug) 197 if (memblock_debug)
174 __memblock_dump_all(); 198 __memblock_dump_all();
175 } 199 }
176 200
177 /** 201 /**
178 * memblock_set_current_limit - Set the current allocation limit to allow 202 * memblock_set_current_limit - Set the current allocation limit to allow
179 * limiting allocations to what is currently 203 * limiting allocations to what is currently
180 * accessible during boot 204 * accessible during boot
181 * @limit: New limit value (physical address) 205 * @limit: New limit value (physical address)
182 */ 206 */
183 void memblock_set_current_limit(phys_addr_t limit); 207 void memblock_set_current_limit(phys_addr_t limit);
184 208
185 209
186 /* 210 /*
187 * pfn conversion functions 211 * pfn conversion functions
188 * 212 *
189 * While the memory MEMBLOCKs should always be page aligned, the reserved 213 * While the memory MEMBLOCKs should always be page aligned, the reserved
190 * MEMBLOCKs may not be. This accessor attempt to provide a very clear 214 * MEMBLOCKs may not be. This accessor attempt to provide a very clear
191 * idea of what they return for such non aligned MEMBLOCKs. 215 * idea of what they return for such non aligned MEMBLOCKs.
192 */ 216 */
193 217
194 /** 218 /**
195 * memblock_region_memory_base_pfn - Return the lowest pfn intersecting with the memory region 219 * memblock_region_memory_base_pfn - Return the lowest pfn intersecting with the memory region
196 * @reg: memblock_region structure 220 * @reg: memblock_region structure
197 */ 221 */
198 static inline unsigned long memblock_region_memory_base_pfn(const struct memblock_region *reg) 222 static inline unsigned long memblock_region_memory_base_pfn(const struct memblock_region *reg)
199 { 223 {
200 return PFN_UP(reg->base); 224 return PFN_UP(reg->base);
201 } 225 }
202 226
203 /** 227 /**
204 * memblock_region_memory_end_pfn - Return the end_pfn this region 228 * memblock_region_memory_end_pfn - Return the end_pfn this region
205 * @reg: memblock_region structure 229 * @reg: memblock_region structure
206 */ 230 */
207 static inline unsigned long memblock_region_memory_end_pfn(const struct memblock_region *reg) 231 static inline unsigned long memblock_region_memory_end_pfn(const struct memblock_region *reg)
208 { 232 {
209 return PFN_DOWN(reg->base + reg->size); 233 return PFN_DOWN(reg->base + reg->size);
210 } 234 }
211 235
212 /** 236 /**
213 * memblock_region_reserved_base_pfn - Return the lowest pfn intersecting with the reserved region 237 * memblock_region_reserved_base_pfn - Return the lowest pfn intersecting with the reserved region
214 * @reg: memblock_region structure 238 * @reg: memblock_region structure
215 */ 239 */
216 static inline unsigned long memblock_region_reserved_base_pfn(const struct memblock_region *reg) 240 static inline unsigned long memblock_region_reserved_base_pfn(const struct memblock_region *reg)
217 { 241 {
218 return PFN_DOWN(reg->base); 242 return PFN_DOWN(reg->base);
219 } 243 }
220 244
221 /** 245 /**
222 * memblock_region_reserved_end_pfn - Return the end_pfn this region 246 * memblock_region_reserved_end_pfn - Return the end_pfn this region
223 * @reg: memblock_region structure 247 * @reg: memblock_region structure
224 */ 248 */
225 static inline unsigned long memblock_region_reserved_end_pfn(const struct memblock_region *reg) 249 static inline unsigned long memblock_region_reserved_end_pfn(const struct memblock_region *reg)
226 { 250 {
227 return PFN_UP(reg->base + reg->size); 251 return PFN_UP(reg->base + reg->size);
228 } 252 }
229 253
230 #define for_each_memblock(memblock_type, region) \ 254 #define for_each_memblock(memblock_type, region) \
231 for (region = memblock.memblock_type.regions; \ 255 for (region = memblock.memblock_type.regions; \
232 region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \ 256 region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \
233 region++) 257 region++)
234 258
235 259
236 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK 260 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
237 #define __init_memblock __meminit 261 #define __init_memblock __meminit
238 #define __initdata_memblock __meminitdata 262 #define __initdata_memblock __meminitdata
239 #else 263 #else
240 #define __init_memblock 264 #define __init_memblock
241 #define __initdata_memblock 265 #define __initdata_memblock
242 #endif 266 #endif
243 267
244 #else 268 #else
245 static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align) 269 static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
246 { 270 {
247 return 0; 271 return 0;
248 } 272 }
249 273
250 #endif /* CONFIG_HAVE_MEMBLOCK */ 274 #endif /* CONFIG_HAVE_MEMBLOCK */
251 275
252 #endif /* __KERNEL__ */ 276 #endif /* __KERNEL__ */
253 277
254 #endif /* _LINUX_MEMBLOCK_H */ 278 #endif /* _LINUX_MEMBLOCK_H */
255 279
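
The mm/memblock.c side of the change (the third changed file, not
reproduced in this excerpt) adds the actual bottom-up search.  A minimal
sketch of the idea, built on the for_each_free_mem_range() iterator
declared above; the function name and exact form here are assumptions,
not a verbatim copy of the committed code:

	static phys_addr_t __init_memblock
	__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
					phys_addr_t size, phys_addr_t align,
					int nid)
	{
		phys_addr_t this_start, this_end, cand;
		u64 i;

		/* walk free ranges from low to high physical addresses */
		for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
			this_start = clamp(this_start, start, end);
			this_end = clamp(this_end, start, end);

			/* lowest suitably aligned candidate that still fits */
			cand = round_up(this_start, align);
			if (cand < this_end && this_end - cand >= size)
				return cand;
		}

		return 0;	/* nothing found */
	}

memblock_find_in_range_node() would then choose between this helper and
the existing top-down search based on memblock_bottom_up().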

include/linux/mm.h
1 #ifndef _LINUX_MM_H 1 #ifndef _LINUX_MM_H
2 #define _LINUX_MM_H 2 #define _LINUX_MM_H
3 3
4 #include <linux/errno.h> 4 #include <linux/errno.h>
5 5
6 #ifdef __KERNEL__ 6 #ifdef __KERNEL__
7 7
8 #include <linux/gfp.h> 8 #include <linux/gfp.h>
9 #include <linux/bug.h> 9 #include <linux/bug.h>
10 #include <linux/list.h> 10 #include <linux/list.h>
11 #include <linux/mmzone.h> 11 #include <linux/mmzone.h>
12 #include <linux/rbtree.h> 12 #include <linux/rbtree.h>
13 #include <linux/atomic.h> 13 #include <linux/atomic.h>
14 #include <linux/debug_locks.h> 14 #include <linux/debug_locks.h>
15 #include <linux/mm_types.h> 15 #include <linux/mm_types.h>
16 #include <linux/range.h> 16 #include <linux/range.h>
17 #include <linux/pfn.h> 17 #include <linux/pfn.h>
18 #include <linux/bit_spinlock.h> 18 #include <linux/bit_spinlock.h>
19 #include <linux/shrinker.h> 19 #include <linux/shrinker.h>
20 20
21 struct mempolicy; 21 struct mempolicy;
22 struct anon_vma; 22 struct anon_vma;
23 struct anon_vma_chain; 23 struct anon_vma_chain;
24 struct file_ra_state; 24 struct file_ra_state;
25 struct user_struct; 25 struct user_struct;
26 struct writeback_control; 26 struct writeback_control;
27 27
28 #ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ 28 #ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */
29 extern unsigned long max_mapnr; 29 extern unsigned long max_mapnr;
30 30
31 static inline void set_max_mapnr(unsigned long limit) 31 static inline void set_max_mapnr(unsigned long limit)
32 { 32 {
33 max_mapnr = limit; 33 max_mapnr = limit;
34 } 34 }
35 #else 35 #else
36 static inline void set_max_mapnr(unsigned long limit) { } 36 static inline void set_max_mapnr(unsigned long limit) { }
37 #endif 37 #endif
38 38
39 extern unsigned long totalram_pages; 39 extern unsigned long totalram_pages;
40 extern void * high_memory; 40 extern void * high_memory;
41 extern int page_cluster; 41 extern int page_cluster;
42 42
43 #ifdef CONFIG_SYSCTL 43 #ifdef CONFIG_SYSCTL
44 extern int sysctl_legacy_va_layout; 44 extern int sysctl_legacy_va_layout;
45 #else 45 #else
46 #define sysctl_legacy_va_layout 0 46 #define sysctl_legacy_va_layout 0
47 #endif 47 #endif
48 48
49 #include <asm/page.h> 49 #include <asm/page.h>
50 #include <asm/pgtable.h> 50 #include <asm/pgtable.h>
51 #include <asm/processor.h> 51 #include <asm/processor.h>
52 52
53 #ifndef __pa_symbol
54 #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0))
55 #endif
56
53 extern unsigned long sysctl_user_reserve_kbytes; 57 extern unsigned long sysctl_user_reserve_kbytes;
54 extern unsigned long sysctl_admin_reserve_kbytes; 58 extern unsigned long sysctl_admin_reserve_kbytes;
55 59
56 #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) 60 #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
57 61
58 /* to align the pointer to the (next) page boundary */ 62 /* to align the pointer to the (next) page boundary */
59 #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) 63 #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)
60 64
61 /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ 65 /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
62 #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)addr, PAGE_SIZE) 66 #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)addr, PAGE_SIZE)
63 67
64 /* 68 /*
65 * Linux kernel virtual memory manager primitives. 69 * Linux kernel virtual memory manager primitives.
66 * The idea being to have a "virtual" mm in the same way 70 * The idea being to have a "virtual" mm in the same way
67 * we have a virtual fs - giving a cleaner interface to the 71 * we have a virtual fs - giving a cleaner interface to the
68 * mm details, and allowing different kinds of memory mappings 72 * mm details, and allowing different kinds of memory mappings
69 * (from shared memory to executable loading to arbitrary 73 * (from shared memory to executable loading to arbitrary
70 * mmap() functions). 74 * mmap() functions).
71 */ 75 */
72 76
73 extern struct kmem_cache *vm_area_cachep; 77 extern struct kmem_cache *vm_area_cachep;
74 78
75 #ifndef CONFIG_MMU 79 #ifndef CONFIG_MMU
76 extern struct rb_root nommu_region_tree; 80 extern struct rb_root nommu_region_tree;
77 extern struct rw_semaphore nommu_region_sem; 81 extern struct rw_semaphore nommu_region_sem;
78 82
79 extern unsigned int kobjsize(const void *objp); 83 extern unsigned int kobjsize(const void *objp);
80 #endif 84 #endif
81 85
82 /* 86 /*
83 * vm_flags in vm_area_struct, see mm_types.h. 87 * vm_flags in vm_area_struct, see mm_types.h.
84 */ 88 */
85 #define VM_NONE 0x00000000 89 #define VM_NONE 0x00000000
86 90
87 #define VM_READ 0x00000001 /* currently active flags */ 91 #define VM_READ 0x00000001 /* currently active flags */
88 #define VM_WRITE 0x00000002 92 #define VM_WRITE 0x00000002
89 #define VM_EXEC 0x00000004 93 #define VM_EXEC 0x00000004
90 #define VM_SHARED 0x00000008 94 #define VM_SHARED 0x00000008
91 95
92 /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ 96 /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
93 #define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ 97 #define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */
94 #define VM_MAYWRITE 0x00000020 98 #define VM_MAYWRITE 0x00000020
95 #define VM_MAYEXEC 0x00000040 99 #define VM_MAYEXEC 0x00000040
96 #define VM_MAYSHARE 0x00000080 100 #define VM_MAYSHARE 0x00000080
97 101
98 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ 102 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */
99 #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ 103 #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
100 #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ 104 #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
101 105
102 #define VM_LOCKED 0x00002000 106 #define VM_LOCKED 0x00002000
103 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ 107 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */
104 108
105 /* Used by sys_madvise() */ 109 /* Used by sys_madvise() */
106 #define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ 110 #define VM_SEQ_READ 0x00008000 /* App will access data sequentially */
107 #define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ 111 #define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */
108 112
109 #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ 113 #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
110 #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ 114 #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
111 #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ 115 #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
112 #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ 116 #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
113 #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ 117 #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
114 #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ 118 #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */
115 #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ 119 #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
116 #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ 120 #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
117 121
118 #ifdef CONFIG_MEM_SOFT_DIRTY 122 #ifdef CONFIG_MEM_SOFT_DIRTY
119 # define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ 123 # define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */
120 #else 124 #else
121 # define VM_SOFTDIRTY 0 125 # define VM_SOFTDIRTY 0
122 #endif 126 #endif
123 127
124 #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ 128 #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */
125 #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ 129 #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */
126 #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ 130 #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */
127 #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ 131 #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
128 132
129 #if defined(CONFIG_X86) 133 #if defined(CONFIG_X86)
130 # define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ 134 # define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */
131 #elif defined(CONFIG_PPC) 135 #elif defined(CONFIG_PPC)
132 # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ 136 # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */
133 #elif defined(CONFIG_PARISC) 137 #elif defined(CONFIG_PARISC)
134 # define VM_GROWSUP VM_ARCH_1 138 # define VM_GROWSUP VM_ARCH_1
135 #elif defined(CONFIG_METAG) 139 #elif defined(CONFIG_METAG)
136 # define VM_GROWSUP VM_ARCH_1 140 # define VM_GROWSUP VM_ARCH_1
137 #elif defined(CONFIG_IA64) 141 #elif defined(CONFIG_IA64)
138 # define VM_GROWSUP VM_ARCH_1 142 # define VM_GROWSUP VM_ARCH_1
139 #elif !defined(CONFIG_MMU) 143 #elif !defined(CONFIG_MMU)
140 # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ 144 # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */
141 #endif 145 #endif
142 146
143 #ifndef VM_GROWSUP 147 #ifndef VM_GROWSUP
144 # define VM_GROWSUP VM_NONE 148 # define VM_GROWSUP VM_NONE
145 #endif 149 #endif
146 150
147 /* Bits set in the VMA until the stack is in its final location */ 151 /* Bits set in the VMA until the stack is in its final location */
148 #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) 152 #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ)
149 153
150 #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ 154 #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
151 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS 155 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
152 #endif 156 #endif
153 157
154 #ifdef CONFIG_STACK_GROWSUP 158 #ifdef CONFIG_STACK_GROWSUP
155 #define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) 159 #define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
156 #else 160 #else
157 #define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) 161 #define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
158 #endif 162 #endif
159 163
160 /* 164 /*
161 * Special vmas that are non-mergable, non-mlock()able. 165 * Special vmas that are non-mergable, non-mlock()able.
162 * Note: mm/huge_memory.c VM_NO_THP depends on this definition. 166 * Note: mm/huge_memory.c VM_NO_THP depends on this definition.
163 */ 167 */
164 #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP) 168 #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP)
165 169
166 /* 170 /*
167 * mapping from the currently active vm_flags protection bits (the 171 * mapping from the currently active vm_flags protection bits (the
168 * low four bits) to a page protection mask.. 172 * low four bits) to a page protection mask..
169 */ 173 */
170 extern pgprot_t protection_map[16]; 174 extern pgprot_t protection_map[16];
171 175
172 #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ 176 #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
173 #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ 177 #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
174 #define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ 178 #define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */
175 #define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ 179 #define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */
176 #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ 180 #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */
177 #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ 181 #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */
178 #define FAULT_FLAG_TRIED 0x40 /* second try */ 182 #define FAULT_FLAG_TRIED 0x40 /* second try */
179 #define FAULT_FLAG_USER 0x80 /* The fault originated in userspace */ 183 #define FAULT_FLAG_USER 0x80 /* The fault originated in userspace */
180 184
181 /* 185 /*
182 * vm_fault is filled by the the pagefault handler and passed to the vma's 186 * vm_fault is filled by the the pagefault handler and passed to the vma's
183 * ->fault function. The vma's ->fault is responsible for returning a bitmask 187 * ->fault function. The vma's ->fault is responsible for returning a bitmask
184 * of VM_FAULT_xxx flags that give details about how the fault was handled. 188 * of VM_FAULT_xxx flags that give details about how the fault was handled.
185 * 189 *
186 * pgoff should be used in favour of virtual_address, if possible. If pgoff 190 * pgoff should be used in favour of virtual_address, if possible. If pgoff
187 * is used, one may implement ->remap_pages to get nonlinear mapping support. 191 * is used, one may implement ->remap_pages to get nonlinear mapping support.
188 */ 192 */
189 struct vm_fault { 193 struct vm_fault {
190 unsigned int flags; /* FAULT_FLAG_xxx flags */ 194 unsigned int flags; /* FAULT_FLAG_xxx flags */
191 pgoff_t pgoff; /* Logical page offset based on vma */ 195 pgoff_t pgoff; /* Logical page offset based on vma */
192 void __user *virtual_address; /* Faulting virtual address */ 196 void __user *virtual_address; /* Faulting virtual address */
193 197
194 struct page *page; /* ->fault handlers should return a 198 struct page *page; /* ->fault handlers should return a
195 * page here, unless VM_FAULT_NOPAGE 199 * page here, unless VM_FAULT_NOPAGE
196 * is set (which is also implied by 200 * is set (which is also implied by
197 * VM_FAULT_ERROR). 201 * VM_FAULT_ERROR).
198 */ 202 */
199 }; 203 };
200 204
201 /* 205 /*
202 * These are the virtual MM functions - opening of an area, closing and 206 * These are the virtual MM functions - opening of an area, closing and
203 * unmapping it (needed to keep files on disk up-to-date etc), pointer 207 * unmapping it (needed to keep files on disk up-to-date etc), pointer
204 * to the functions called when a no-page or a wp-page exception occurs. 208 * to the functions called when a no-page or a wp-page exception occurs.
205 */ 209 */
206 struct vm_operations_struct { 210 struct vm_operations_struct {
207 void (*open)(struct vm_area_struct * area); 211 void (*open)(struct vm_area_struct * area);
208 void (*close)(struct vm_area_struct * area); 212 void (*close)(struct vm_area_struct * area);
209 int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); 213 int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
210 214
211 /* notification that a previously read-only page is about to become 215 /* notification that a previously read-only page is about to become
212 * writable, if an error is returned it will cause a SIGBUS */ 216 * writable, if an error is returned it will cause a SIGBUS */
213 int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); 217 int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);
214 218
215 /* called by access_process_vm when get_user_pages() fails, typically 219 /* called by access_process_vm when get_user_pages() fails, typically
216 * for use by special VMAs that can switch between memory and hardware 220 * for use by special VMAs that can switch between memory and hardware
217 */ 221 */
218 int (*access)(struct vm_area_struct *vma, unsigned long addr, 222 int (*access)(struct vm_area_struct *vma, unsigned long addr,
219 void *buf, int len, int write); 223 void *buf, int len, int write);
220 #ifdef CONFIG_NUMA 224 #ifdef CONFIG_NUMA
221 /* 225 /*
222 * set_policy() op must add a reference to any non-NULL @new mempolicy 226 * set_policy() op must add a reference to any non-NULL @new mempolicy
223 * to hold the policy upon return. Caller should pass NULL @new to 227 * to hold the policy upon return. Caller should pass NULL @new to
224 * remove a policy and fall back to surrounding context--i.e. do not 228 * remove a policy and fall back to surrounding context--i.e. do not
225 * install a MPOL_DEFAULT policy, nor the task or system default 229 * install a MPOL_DEFAULT policy, nor the task or system default
226 * mempolicy. 230 * mempolicy.
227 */ 231 */
228 int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); 232 int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
229 233
230 /* 234 /*
231 * get_policy() op must add reference [mpol_get()] to any policy at 235 * get_policy() op must add reference [mpol_get()] to any policy at
232 * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure 236 * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure
233 * in mm/mempolicy.c will do this automatically. 237 * in mm/mempolicy.c will do this automatically.
234 * get_policy() must NOT add a ref if the policy at (vma,addr) is not 238 * get_policy() must NOT add a ref if the policy at (vma,addr) is not
235 * marked as MPOL_SHARED. vma policies are protected by the mmap_sem. 239 * marked as MPOL_SHARED. vma policies are protected by the mmap_sem.
236 * If no [shared/vma] mempolicy exists at the addr, get_policy() op 240 * If no [shared/vma] mempolicy exists at the addr, get_policy() op
237 * must return NULL--i.e., do not "fallback" to task or system default 241 * must return NULL--i.e., do not "fallback" to task or system default
238 * policy. 242 * policy.
239 */ 243 */
240 struct mempolicy *(*get_policy)(struct vm_area_struct *vma, 244 struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
241 unsigned long addr); 245 unsigned long addr);
242 int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from, 246 int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from,
243 const nodemask_t *to, unsigned long flags); 247 const nodemask_t *to, unsigned long flags);
244 #endif 248 #endif
245 /* called by sys_remap_file_pages() to populate non-linear mapping */ 249 /* called by sys_remap_file_pages() to populate non-linear mapping */
246 int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr, 250 int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr,
247 unsigned long size, pgoff_t pgoff); 251 unsigned long size, pgoff_t pgoff);
248 }; 252 };
249 253
250 struct mmu_gather; 254 struct mmu_gather;
251 struct inode; 255 struct inode;
252 256
253 #define page_private(page) ((page)->private) 257 #define page_private(page) ((page)->private)
254 #define set_page_private(page, v) ((page)->private = (v)) 258 #define set_page_private(page, v) ((page)->private = (v))
255 259
256 /* It's valid only if the page is free path or free_list */ 260 /* It's valid only if the page is free path or free_list */
257 static inline void set_freepage_migratetype(struct page *page, int migratetype) 261 static inline void set_freepage_migratetype(struct page *page, int migratetype)
258 { 262 {
259 page->index = migratetype; 263 page->index = migratetype;
260 } 264 }
261 265
262 /* It's valid only if the page is free path or free_list */ 266 /* It's valid only if the page is free path or free_list */
263 static inline int get_freepage_migratetype(struct page *page) 267 static inline int get_freepage_migratetype(struct page *page)
264 { 268 {
265 return page->index; 269 return page->index;
266 } 270 }
267 271
268 /* 272 /*
269 * FIXME: take this include out, include page-flags.h in 273 * FIXME: take this include out, include page-flags.h in
270 * files which need it (119 of them) 274 * files which need it (119 of them)
271 */ 275 */
272 #include <linux/page-flags.h> 276 #include <linux/page-flags.h>
273 #include <linux/huge_mm.h> 277 #include <linux/huge_mm.h>
274 278
275 /* 279 /*
276 * Methods to modify the page usage count. 280 * Methods to modify the page usage count.
277 * 281 *
278 * What counts for a page usage: 282 * What counts for a page usage:
279 * - cache mapping (page->mapping) 283 * - cache mapping (page->mapping)
280 * - private data (page->private) 284 * - private data (page->private)
281 * - page mapped in a task's page tables, each mapping 285 * - page mapped in a task's page tables, each mapping
282 * is counted separately 286 * is counted separately
283 * 287 *
284 * Also, many kernel routines increase the page count before a critical 288 * Also, many kernel routines increase the page count before a critical
285 * routine so they can be sure the page doesn't go away from under them. 289 * routine so they can be sure the page doesn't go away from under them.
286 */ 290 */
287 291
288 /* 292 /*
289 * Drop a ref, return true if the refcount fell to zero (the page has no users) 293 * Drop a ref, return true if the refcount fell to zero (the page has no users)
290 */ 294 */
291 static inline int put_page_testzero(struct page *page) 295 static inline int put_page_testzero(struct page *page)
292 { 296 {
293 VM_BUG_ON(atomic_read(&page->_count) == 0); 297 VM_BUG_ON(atomic_read(&page->_count) == 0);
294 return atomic_dec_and_test(&page->_count); 298 return atomic_dec_and_test(&page->_count);
295 } 299 }
296 300
297 /* 301 /*
298 * Try to grab a ref unless the page has a refcount of zero, return false if 302 * Try to grab a ref unless the page has a refcount of zero, return false if
299 * that is the case. 303 * that is the case.
300 * This can be called when MMU is off so it must not access 304 * This can be called when MMU is off so it must not access
301 * any of the virtual mappings. 305 * any of the virtual mappings.
302 */ 306 */
303 static inline int get_page_unless_zero(struct page *page) 307 static inline int get_page_unless_zero(struct page *page)
304 { 308 {
305 return atomic_inc_not_zero(&page->_count); 309 return atomic_inc_not_zero(&page->_count);
306 } 310 }
307 311
308 /* 312 /*
309 * Try to drop a ref unless the page has a refcount of one, return false if 313 * Try to drop a ref unless the page has a refcount of one, return false if
310 * that is the case. 314 * that is the case.
311 * This is to make sure that the refcount won't become zero after this drop. 315 * This is to make sure that the refcount won't become zero after this drop.
312 * This can be called when MMU is off so it must not access 316 * This can be called when MMU is off so it must not access
313 * any of the virtual mappings. 317 * any of the virtual mappings.
314 */ 318 */
315 static inline int put_page_unless_one(struct page *page) 319 static inline int put_page_unless_one(struct page *page)
316 { 320 {
317 return atomic_add_unless(&page->_count, -1, 1); 321 return atomic_add_unless(&page->_count, -1, 1);
318 } 322 }
319 323
320 extern int page_is_ram(unsigned long pfn); 324 extern int page_is_ram(unsigned long pfn);
321 325
322 /* Support for virtually mapped pages */ 326 /* Support for virtually mapped pages */
323 struct page *vmalloc_to_page(const void *addr); 327 struct page *vmalloc_to_page(const void *addr);
324 unsigned long vmalloc_to_pfn(const void *addr); 328 unsigned long vmalloc_to_pfn(const void *addr);
325 329
326 /* 330 /*
327 * Determine if an address is within the vmalloc range 331 * Determine if an address is within the vmalloc range
328 * 332 *
329 * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there 333 * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
330 * is no special casing required. 334 * is no special casing required.
331 */ 335 */
332 static inline int is_vmalloc_addr(const void *x) 336 static inline int is_vmalloc_addr(const void *x)
333 { 337 {
334 #ifdef CONFIG_MMU 338 #ifdef CONFIG_MMU
335 unsigned long addr = (unsigned long)x; 339 unsigned long addr = (unsigned long)x;
336 340
337 return addr >= VMALLOC_START && addr < VMALLOC_END; 341 return addr >= VMALLOC_START && addr < VMALLOC_END;
338 #else 342 #else
339 return 0; 343 return 0;
340 #endif 344 #endif
341 } 345 }
342 #ifdef CONFIG_MMU 346 #ifdef CONFIG_MMU
343 extern int is_vmalloc_or_module_addr(const void *x); 347 extern int is_vmalloc_or_module_addr(const void *x);
344 #else 348 #else
345 static inline int is_vmalloc_or_module_addr(const void *x) 349 static inline int is_vmalloc_or_module_addr(const void *x)
346 { 350 {
347 return 0; 351 return 0;
348 } 352 }
349 #endif 353 #endif
350 354
351 static inline void compound_lock(struct page *page) 355 static inline void compound_lock(struct page *page)
352 { 356 {
353 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 357 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
354 VM_BUG_ON(PageSlab(page)); 358 VM_BUG_ON(PageSlab(page));
355 bit_spin_lock(PG_compound_lock, &page->flags); 359 bit_spin_lock(PG_compound_lock, &page->flags);
356 #endif 360 #endif
357 } 361 }
358 362
359 static inline void compound_unlock(struct page *page) 363 static inline void compound_unlock(struct page *page)
360 { 364 {
361 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 365 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
362 VM_BUG_ON(PageSlab(page)); 366 VM_BUG_ON(PageSlab(page));
363 bit_spin_unlock(PG_compound_lock, &page->flags); 367 bit_spin_unlock(PG_compound_lock, &page->flags);
364 #endif 368 #endif
365 } 369 }
366 370
367 static inline unsigned long compound_lock_irqsave(struct page *page) 371 static inline unsigned long compound_lock_irqsave(struct page *page)
368 { 372 {
369 unsigned long uninitialized_var(flags); 373 unsigned long uninitialized_var(flags);
370 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 374 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
371 local_irq_save(flags); 375 local_irq_save(flags);
372 compound_lock(page); 376 compound_lock(page);
373 #endif 377 #endif
374 return flags; 378 return flags;
375 } 379 }
376 380
377 static inline void compound_unlock_irqrestore(struct page *page, 381 static inline void compound_unlock_irqrestore(struct page *page,
378 unsigned long flags) 382 unsigned long flags)
379 { 383 {
380 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 384 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
381 compound_unlock(page); 385 compound_unlock(page);
382 local_irq_restore(flags); 386 local_irq_restore(flags);
383 #endif 387 #endif
384 } 388 }
385 389
386 static inline struct page *compound_head(struct page *page) 390 static inline struct page *compound_head(struct page *page)
387 { 391 {
388 if (unlikely(PageTail(page))) 392 if (unlikely(PageTail(page)))
389 return page->first_page; 393 return page->first_page;
390 return page; 394 return page;
391 } 395 }
392 396
393 /* 397 /*
394 * The atomic page->_mapcount, starts from -1: so that transitions 398 * The atomic page->_mapcount, starts from -1: so that transitions
395 * both from it and to it can be tracked, using atomic_inc_and_test 399 * both from it and to it can be tracked, using atomic_inc_and_test
396 * and atomic_add_negative(-1). 400 * and atomic_add_negative(-1).
397 */ 401 */
398 static inline void page_mapcount_reset(struct page *page) 402 static inline void page_mapcount_reset(struct page *page)
399 { 403 {
400 atomic_set(&(page)->_mapcount, -1); 404 atomic_set(&(page)->_mapcount, -1);
401 } 405 }
402 406
403 static inline int page_mapcount(struct page *page) 407 static inline int page_mapcount(struct page *page)
404 { 408 {
405 return atomic_read(&(page)->_mapcount) + 1; 409 return atomic_read(&(page)->_mapcount) + 1;
406 } 410 }
407 411
408 static inline int page_count(struct page *page) 412 static inline int page_count(struct page *page)
409 { 413 {
410 return atomic_read(&compound_head(page)->_count); 414 return atomic_read(&compound_head(page)->_count);
411 } 415 }
412 416
413 static inline void get_huge_page_tail(struct page *page) 417 static inline void get_huge_page_tail(struct page *page)
414 { 418 {
415 /* 419 /*
416 * __split_huge_page_refcount() cannot run 420 * __split_huge_page_refcount() cannot run
417 * from under us. 421 * from under us.
418 */ 422 */
419 VM_BUG_ON(page_mapcount(page) < 0); 423 VM_BUG_ON(page_mapcount(page) < 0);
420 VM_BUG_ON(atomic_read(&page->_count) != 0); 424 VM_BUG_ON(atomic_read(&page->_count) != 0);
421 atomic_inc(&page->_mapcount); 425 atomic_inc(&page->_mapcount);
422 } 426 }
423 427
424 extern bool __get_page_tail(struct page *page); 428 extern bool __get_page_tail(struct page *page);
425 429
426 static inline void get_page(struct page *page) 430 static inline void get_page(struct page *page)
427 { 431 {
428 if (unlikely(PageTail(page))) 432 if (unlikely(PageTail(page)))
429 if (likely(__get_page_tail(page))) 433 if (likely(__get_page_tail(page)))
430 return; 434 return;
431 /* 435 /*
432 * Getting a normal page or the head of a compound page 436 * Getting a normal page or the head of a compound page
433 * requires to already have an elevated page->_count. 437 * requires to already have an elevated page->_count.
434 */ 438 */
435 VM_BUG_ON(atomic_read(&page->_count) <= 0); 439 VM_BUG_ON(atomic_read(&page->_count) <= 0);
436 atomic_inc(&page->_count); 440 atomic_inc(&page->_count);
437 } 441 }
438 442
439 static inline struct page *virt_to_head_page(const void *x) 443 static inline struct page *virt_to_head_page(const void *x)
440 { 444 {
441 struct page *page = virt_to_page(x); 445 struct page *page = virt_to_page(x);
442 return compound_head(page); 446 return compound_head(page);
443 } 447 }
444 448
445 /* 449 /*
446 * Setup the page count before being freed into the page allocator for 450 * Setup the page count before being freed into the page allocator for
447 * the first time (boot or memory hotplug) 451 * the first time (boot or memory hotplug)
448 */ 452 */
449 static inline void init_page_count(struct page *page) 453 static inline void init_page_count(struct page *page)
450 { 454 {
451 atomic_set(&page->_count, 1); 455 atomic_set(&page->_count, 1);
452 } 456 }
453 457
454 /* 458 /*
455 * PageBuddy() indicate that the page is free and in the buddy system 459 * PageBuddy() indicate that the page is free and in the buddy system
456 * (see mm/page_alloc.c). 460 * (see mm/page_alloc.c).
457 * 461 *
458 * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to 462 * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to
459 * -2 so that an underflow of the page_mapcount() won't be mistaken 463 * -2 so that an underflow of the page_mapcount() won't be mistaken
460 * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very 464 * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very
461 * efficiently by most CPU architectures. 465 * efficiently by most CPU architectures.
462 */ 466 */
463 #define PAGE_BUDDY_MAPCOUNT_VALUE (-128) 467 #define PAGE_BUDDY_MAPCOUNT_VALUE (-128)
464 468
465 static inline int PageBuddy(struct page *page) 469 static inline int PageBuddy(struct page *page)
466 { 470 {
467 return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE; 471 return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE;
468 } 472 }
469 473
470 static inline void __SetPageBuddy(struct page *page) 474 static inline void __SetPageBuddy(struct page *page)
471 { 475 {
472 VM_BUG_ON(atomic_read(&page->_mapcount) != -1); 476 VM_BUG_ON(atomic_read(&page->_mapcount) != -1);
473 atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE); 477 atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE);
474 } 478 }
475 479
476 static inline void __ClearPageBuddy(struct page *page) 480 static inline void __ClearPageBuddy(struct page *page)
477 { 481 {
478 VM_BUG_ON(!PageBuddy(page)); 482 VM_BUG_ON(!PageBuddy(page));
479 atomic_set(&page->_mapcount, -1); 483 atomic_set(&page->_mapcount, -1);
480 } 484 }
481 485
482 void put_page(struct page *page); 486 void put_page(struct page *page);
483 void put_pages_list(struct list_head *pages); 487 void put_pages_list(struct list_head *pages);
484 488
485 void split_page(struct page *page, unsigned int order); 489 void split_page(struct page *page, unsigned int order);
486 int split_free_page(struct page *page); 490 int split_free_page(struct page *page);
487 491
488 /* 492 /*
489 * Compound pages have a destructor function. Provide a 493 * Compound pages have a destructor function. Provide a
490 * prototype for that function and accessor functions. 494 * prototype for that function and accessor functions.
491 * These are _only_ valid on the head of a PG_compound page. 495 * These are _only_ valid on the head of a PG_compound page.
492 */ 496 */
493 typedef void compound_page_dtor(struct page *); 497 typedef void compound_page_dtor(struct page *);
494 498
495 static inline void set_compound_page_dtor(struct page *page, 499 static inline void set_compound_page_dtor(struct page *page,
496 compound_page_dtor *dtor) 500 compound_page_dtor *dtor)
497 { 501 {
498 page[1].lru.next = (void *)dtor; 502 page[1].lru.next = (void *)dtor;
499 } 503 }
500 504
501 static inline compound_page_dtor *get_compound_page_dtor(struct page *page) 505 static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
502 { 506 {
503 return (compound_page_dtor *)page[1].lru.next; 507 return (compound_page_dtor *)page[1].lru.next;
504 } 508 }
505 509
506 static inline int compound_order(struct page *page) 510 static inline int compound_order(struct page *page)
507 { 511 {
508 if (!PageHead(page)) 512 if (!PageHead(page))
509 return 0; 513 return 0;
510 return (unsigned long)page[1].lru.prev; 514 return (unsigned long)page[1].lru.prev;
511 } 515 }
512 516
513 static inline void set_compound_order(struct page *page, unsigned long order) 517 static inline void set_compound_order(struct page *page, unsigned long order)
514 { 518 {
515 page[1].lru.prev = (void *)order; 519 page[1].lru.prev = (void *)order;
516 } 520 }
517 521
518 #ifdef CONFIG_MMU 522 #ifdef CONFIG_MMU
519 /* 523 /*
520 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when 524 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
521 * servicing faults for write access. In the normal case, do always want 525 * servicing faults for write access. In the normal case, do always want
522 * pte_mkwrite. But get_user_pages can cause write faults for mappings 526 * pte_mkwrite. But get_user_pages can cause write faults for mappings
523 * that do not have writing enabled, when used by access_process_vm. 527 * that do not have writing enabled, when used by access_process_vm.
524 */ 528 */
525 static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) 529 static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
526 { 530 {
527 if (likely(vma->vm_flags & VM_WRITE)) 531 if (likely(vma->vm_flags & VM_WRITE))
528 pte = pte_mkwrite(pte); 532 pte = pte_mkwrite(pte);
529 return pte; 533 return pte;
530 } 534 }
531 #endif 535 #endif
532 536
533 /* 537 /*
534 * Multiple processes may "see" the same page. E.g. for untouched 538 * Multiple processes may "see" the same page. E.g. for untouched
535 * mappings of /dev/null, all processes see the same page full of 539 * mappings of /dev/null, all processes see the same page full of
536 * zeroes, and text pages of executables and shared libraries have 540 * zeroes, and text pages of executables and shared libraries have
537 * only one copy in memory, at most, normally. 541 * only one copy in memory, at most, normally.
538 * 542 *
539 * For the non-reserved pages, page_count(page) denotes a reference count. 543 * For the non-reserved pages, page_count(page) denotes a reference count.
540 * page_count() == 0 means the page is free. page->lru is then used for 544 * page_count() == 0 means the page is free. page->lru is then used for
541 * freelist management in the buddy allocator. 545 * freelist management in the buddy allocator.
542 * page_count() > 0 means the page has been allocated. 546 * page_count() > 0 means the page has been allocated.
543 * 547 *
544 * Pages are allocated by the slab allocator in order to provide memory 548 * Pages are allocated by the slab allocator in order to provide memory
545 * to kmalloc and kmem_cache_alloc. In this case, the management of the 549 * to kmalloc and kmem_cache_alloc. In this case, the management of the
546 * page, and the fields in 'struct page' are the responsibility of mm/slab.c 550 * page, and the fields in 'struct page' are the responsibility of mm/slab.c
547 * unless a particular usage is carefully commented. (the responsibility of 551 * unless a particular usage is carefully commented. (the responsibility of
548 * freeing the kmalloc memory is the caller's, of course). 552 * freeing the kmalloc memory is the caller's, of course).
549 * 553 *
550 * A page may be used by anyone else who does a __get_free_page(). 554 * A page may be used by anyone else who does a __get_free_page().
551 * In this case, page_count still tracks the references, and should only 555 * In this case, page_count still tracks the references, and should only
552 * be used through the normal accessor functions. The top bits of page->flags 556 * be used through the normal accessor functions. The top bits of page->flags
553 * and page->virtual store page management information, but all other fields 557 * and page->virtual store page management information, but all other fields
554 * are unused and could be used privately, carefully. The management of this 558 * are unused and could be used privately, carefully. The management of this
555 * page is the responsibility of the one who allocated it, and those who have 559 * page is the responsibility of the one who allocated it, and those who have
556 * subsequently been given references to it. 560 * subsequently been given references to it.
557 * 561 *
558 * The other pages (we may call them "pagecache pages") are completely 562 * The other pages (we may call them "pagecache pages") are completely
559 * managed by the Linux memory manager: I/O, buffers, swapping etc. 563 * managed by the Linux memory manager: I/O, buffers, swapping etc.
560 * The following discussion applies only to them. 564 * The following discussion applies only to them.
561 * 565 *
562 * A pagecache page contains an opaque `private' member, which belongs to the 566 * A pagecache page contains an opaque `private' member, which belongs to the
563 * page's address_space. Usually, this is the address of a circular list of 567 * page's address_space. Usually, this is the address of a circular list of
564 * the page's disk buffers. PG_private must be set to tell the VM to call 568 * the page's disk buffers. PG_private must be set to tell the VM to call
565 * into the filesystem to release these pages. 569 * into the filesystem to release these pages.
566 * 570 *
567 * A page may belong to an inode's memory mapping. In this case, page->mapping 571 * A page may belong to an inode's memory mapping. In this case, page->mapping
568 * is the pointer to the inode, and page->index is the file offset of the page, 572 * is the pointer to the inode, and page->index is the file offset of the page,
569 * in units of PAGE_CACHE_SIZE. 573 * in units of PAGE_CACHE_SIZE.
570 * 574 *
571 * If pagecache pages are not associated with an inode, they are said to be 575 * If pagecache pages are not associated with an inode, they are said to be
572 * anonymous pages. These may become associated with the swapcache, and in that 576 * anonymous pages. These may become associated with the swapcache, and in that
573 * case PG_swapcache is set, and page->private is an offset into the swapcache. 577 * case PG_swapcache is set, and page->private is an offset into the swapcache.
574 * 578 *
575 * In either case (swapcache or inode backed), the pagecache itself holds one 579 * In either case (swapcache or inode backed), the pagecache itself holds one
576 * reference to the page. Setting PG_private should also increment the 580 * reference to the page. Setting PG_private should also increment the
577 * refcount. Each user mapping also has a reference to the page. 581 * refcount. Each user mapping also has a reference to the page.
578 * 582 *
579 * The pagecache pages are stored in a per-mapping radix tree, which is 583 * The pagecache pages are stored in a per-mapping radix tree, which is
580 * rooted at mapping->page_tree, and indexed by offset. 584 * rooted at mapping->page_tree, and indexed by offset.
581 * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space 585 * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
582 * lists, we instead now tag pages as dirty/writeback in the radix tree. 586 * lists, we instead now tag pages as dirty/writeback in the radix tree.
583 * 587 *
584 * All pagecache pages may be subject to I/O: 588 * All pagecache pages may be subject to I/O:
585 * - inode pages may need to be read from disk, 589 * - inode pages may need to be read from disk,
586 * - inode pages which have been modified and are MAP_SHARED may need 590 * - inode pages which have been modified and are MAP_SHARED may need
587 * to be written back to the inode on disk, 591 * to be written back to the inode on disk,
588 * - anonymous pages (including MAP_PRIVATE file mappings) which have been 592 * - anonymous pages (including MAP_PRIVATE file mappings) which have been
589 * modified may need to be swapped out to swap space and (later) to be read 593 * modified may need to be swapped out to swap space and (later) to be read
590 * back into memory. 594 * back into memory.
591 */ 595 */
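The reference-counting rules described above are what make the usual get_page()/put_page() pairing safe; a minimal, hedged sketch (kernel context):

/* Sketch: take and drop a reference on an already-allocated page.
 * While the extra reference is held, page_count(page) > 0 and the page
 * cannot return to the buddy freelists. */
get_page(page);
/* ... use the page ... */
put_page(page);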
592 596
593 /* 597 /*
594 * The zone field is never updated after free_area_init_core() 598 * The zone field is never updated after free_area_init_core()
595 * sets it, so none of the operations on it need to be atomic. 599 * sets it, so none of the operations on it need to be atomic.
596 */ 600 */
597 601
598 /* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ 602 /* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */
599 #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) 603 #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
600 #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) 604 #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH)
601 #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) 605 #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
602 #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) 606 #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
603 607
604 /* 608 /*
605 * Define the bit shifts to access each section. For non-existent 609 * Define the bit shifts to access each section. For non-existent
606 * sections we define the shift as 0; that plus a 0 mask ensures 610 * sections we define the shift as 0; that plus a 0 mask ensures
607 * the compiler will optimise away references to them. 611 * the compiler will optimise away references to them.
608 */ 612 */
609 #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) 613 #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
610 #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) 614 #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0))
611 #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) 615 #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0))
612 #define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) 616 #define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
613 617
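To see how these offsets carve fields out of page->flags, here is a small standalone mock-up of the same packing scheme. The widths are illustrative only (the real ones come from the kernel configuration) and none of the DEMO_* names are kernel symbols:

#include <assert.h>
#include <stdio.h>

/* Illustrative widths, NOT the kernel's: 2 section, 6 node, 2 zone bits. */
#define DEMO_BITS_PER_LONG	(sizeof(unsigned long) * 8)
#define DEMO_SECTIONS_WIDTH	2
#define DEMO_NODES_WIDTH	6
#define DEMO_ZONES_WIDTH	2

#define DEMO_SECTIONS_PGOFF	(DEMO_BITS_PER_LONG - DEMO_SECTIONS_WIDTH)
#define DEMO_NODES_PGOFF	(DEMO_SECTIONS_PGOFF - DEMO_NODES_WIDTH)
#define DEMO_ZONES_PGOFF	(DEMO_NODES_PGOFF - DEMO_ZONES_WIDTH)
#define DEMO_NODES_MASK		((1UL << DEMO_NODES_WIDTH) - 1)
#define DEMO_ZONES_MASK		((1UL << DEMO_ZONES_WIDTH) - 1)

int main(void)
{
	unsigned long flags = 0;	/* stands in for page->flags */

	/* pack node 3 and zone 2, mirroring set_page_node()/set_page_zone() */
	flags |= (3UL & DEMO_NODES_MASK) << DEMO_NODES_PGOFF;
	flags |= (2UL & DEMO_ZONES_MASK) << DEMO_ZONES_PGOFF;

	/* unpack, mirroring page_to_nid()/page_zonenum() */
	assert(((flags >> DEMO_NODES_PGOFF) & DEMO_NODES_MASK) == 3);
	assert(((flags >> DEMO_ZONES_PGOFF) & DEMO_ZONES_MASK) == 2);
	printf("flags = %#lx\n", flags);
	return 0;
}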
614 /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ 618 /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
615 #ifdef NODE_NOT_IN_PAGE_FLAGS 619 #ifdef NODE_NOT_IN_PAGE_FLAGS
616 #define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) 620 #define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT)
617 #define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF)? \ 621 #define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF)? \
618 SECTIONS_PGOFF : ZONES_PGOFF) 622 SECTIONS_PGOFF : ZONES_PGOFF)
619 #else 623 #else
620 #define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) 624 #define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT)
621 #define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF)? \ 625 #define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF)? \
622 NODES_PGOFF : ZONES_PGOFF) 626 NODES_PGOFF : ZONES_PGOFF)
623 #endif 627 #endif
624 628
625 #define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) 629 #define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0))
626 630
627 #if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS 631 #if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
628 #error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS 632 #error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
629 #endif 633 #endif
630 634
631 #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) 635 #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1)
632 #define NODES_MASK ((1UL << NODES_WIDTH) - 1) 636 #define NODES_MASK ((1UL << NODES_WIDTH) - 1)
633 #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) 637 #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1)
634 #define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_WIDTH) - 1) 638 #define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_WIDTH) - 1)
635 #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) 639 #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1)
636 640
637 static inline enum zone_type page_zonenum(const struct page *page) 641 static inline enum zone_type page_zonenum(const struct page *page)
638 { 642 {
639 return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; 643 return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
640 } 644 }
641 645
642 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) 646 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
643 #define SECTION_IN_PAGE_FLAGS 647 #define SECTION_IN_PAGE_FLAGS
644 #endif 648 #endif
645 649
646 /* 650 /*
647 * The identification function is mainly used by the buddy allocator for 651 * The identification function is mainly used by the buddy allocator for
648 * determining if two pages could be buddies. We are not really identifying 652 * determining if two pages could be buddies. We are not really identifying
649 * the zone since we could be using the section number id if we do not have 653 * the zone since we could be using the section number id if we do not have
650 * node id available in page flags. 654 * node id available in page flags.
651 * We only guarantee that it will return the same value for two combinable 655 * We only guarantee that it will return the same value for two combinable
652 * pages in a zone. 656 * pages in a zone.
653 */ 657 */
654 static inline int page_zone_id(struct page *page) 658 static inline int page_zone_id(struct page *page)
655 { 659 {
656 return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; 660 return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
657 } 661 }
658 662
659 static inline int zone_to_nid(struct zone *zone) 663 static inline int zone_to_nid(struct zone *zone)
660 { 664 {
661 #ifdef CONFIG_NUMA 665 #ifdef CONFIG_NUMA
662 return zone->node; 666 return zone->node;
663 #else 667 #else
664 return 0; 668 return 0;
665 #endif 669 #endif
666 } 670 }
667 671
668 #ifdef NODE_NOT_IN_PAGE_FLAGS 672 #ifdef NODE_NOT_IN_PAGE_FLAGS
669 extern int page_to_nid(const struct page *page); 673 extern int page_to_nid(const struct page *page);
670 #else 674 #else
671 static inline int page_to_nid(const struct page *page) 675 static inline int page_to_nid(const struct page *page)
672 { 676 {
673 return (page->flags >> NODES_PGSHIFT) & NODES_MASK; 677 return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
674 } 678 }
675 #endif 679 #endif
676 680
677 #ifdef CONFIG_NUMA_BALANCING 681 #ifdef CONFIG_NUMA_BALANCING
678 static inline int cpu_pid_to_cpupid(int cpu, int pid) 682 static inline int cpu_pid_to_cpupid(int cpu, int pid)
679 { 683 {
680 return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); 684 return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK);
681 } 685 }
682 686
683 static inline int cpupid_to_pid(int cpupid) 687 static inline int cpupid_to_pid(int cpupid)
684 { 688 {
685 return cpupid & LAST__PID_MASK; 689 return cpupid & LAST__PID_MASK;
686 } 690 }
687 691
688 static inline int cpupid_to_cpu(int cpupid) 692 static inline int cpupid_to_cpu(int cpupid)
689 { 693 {
690 return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK; 694 return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK;
691 } 695 }
692 696
693 static inline int cpupid_to_nid(int cpupid) 697 static inline int cpupid_to_nid(int cpupid)
694 { 698 {
695 return cpu_to_node(cpupid_to_cpu(cpupid)); 699 return cpu_to_node(cpupid_to_cpu(cpupid));
696 } 700 }
697 701
698 static inline bool cpupid_pid_unset(int cpupid) 702 static inline bool cpupid_pid_unset(int cpupid)
699 { 703 {
700 return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK); 704 return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK);
701 } 705 }
702 706
703 static inline bool cpupid_cpu_unset(int cpupid) 707 static inline bool cpupid_cpu_unset(int cpupid)
704 { 708 {
705 return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); 709 return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK);
706 } 710 }
707 711
708 static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) 712 static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
709 { 713 {
710 return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid); 714 return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid);
711 } 715 }
712 716
713 #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) 717 #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
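The NUMA-balancing code packs the last cpu and pid that touched a page into a single int. A standalone mock-up of that encoding follows; the bit widths are illustrative (the kernel derives its own from the configuration) and every demo_* name is invented for this sketch:

#include <assert.h>

/* Illustrative widths only. */
#define DEMO_PID_BITS	8
#define DEMO_CPU_BITS	8
#define DEMO_PID_MASK	((1 << DEMO_PID_BITS) - 1)
#define DEMO_CPU_MASK	((1 << DEMO_CPU_BITS) - 1)

static int demo_cpu_pid_to_cpupid(int cpu, int pid)
{
	return ((cpu & DEMO_CPU_MASK) << DEMO_PID_BITS) | (pid & DEMO_PID_MASK);
}

static int demo_cpupid_to_pid(int cpupid)
{
	return cpupid & DEMO_PID_MASK;
}

static int demo_cpupid_to_cpu(int cpupid)
{
	return (cpupid >> DEMO_PID_BITS) & DEMO_CPU_MASK;
}

int main(void)
{
	int cpupid = demo_cpu_pid_to_cpupid(5, 210);

	assert(demo_cpupid_to_cpu(cpupid) == 5);
	assert(demo_cpupid_to_pid(cpupid) == 210);
	/* an all-ones pid field is the "unset" sentinel, as in cpupid_pid_unset() */
	assert(demo_cpupid_to_pid(-1) == DEMO_PID_MASK);
	return 0;
}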
714 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS 718 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
715 static inline int page_cpupid_xchg_last(struct page *page, int cpupid) 719 static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
716 { 720 {
717 return xchg(&page->_last_cpupid, cpupid); 721 return xchg(&page->_last_cpupid, cpupid);
718 } 722 }
719 723
720 static inline int page_cpupid_last(struct page *page) 724 static inline int page_cpupid_last(struct page *page)
721 { 725 {
722 return page->_last_cpupid; 726 return page->_last_cpupid;
723 } 727 }
724 static inline void page_cpupid_reset_last(struct page *page) 728 static inline void page_cpupid_reset_last(struct page *page)
725 { 729 {
726 page->_last_cpupid = -1; 730 page->_last_cpupid = -1;
727 } 731 }
728 #else 732 #else
729 static inline int page_cpupid_last(struct page *page) 733 static inline int page_cpupid_last(struct page *page)
730 { 734 {
731 return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; 735 return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
732 } 736 }
733 737
734 extern int page_cpupid_xchg_last(struct page *page, int cpupid); 738 extern int page_cpupid_xchg_last(struct page *page, int cpupid);
735 739
736 static inline void page_cpupid_reset_last(struct page *page) 740 static inline void page_cpupid_reset_last(struct page *page)
737 { 741 {
738 int cpupid = (1 << LAST_CPUPID_SHIFT) - 1; 742 int cpupid = (1 << LAST_CPUPID_SHIFT) - 1;
739 743
740 page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); 744 page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
741 page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; 745 page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
742 } 746 }
743 #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ 747 #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
744 #else /* !CONFIG_NUMA_BALANCING */ 748 #else /* !CONFIG_NUMA_BALANCING */
745 static inline int page_cpupid_xchg_last(struct page *page, int cpupid) 749 static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
746 { 750 {
747 return page_to_nid(page); /* XXX */ 751 return page_to_nid(page); /* XXX */
748 } 752 }
749 753
750 static inline int page_cpupid_last(struct page *page) 754 static inline int page_cpupid_last(struct page *page)
751 { 755 {
752 return page_to_nid(page); /* XXX */ 756 return page_to_nid(page); /* XXX */
753 } 757 }
754 758
755 static inline int cpupid_to_nid(int cpupid) 759 static inline int cpupid_to_nid(int cpupid)
756 { 760 {
757 return -1; 761 return -1;
758 } 762 }
759 763
760 static inline int cpupid_to_pid(int cpupid) 764 static inline int cpupid_to_pid(int cpupid)
761 { 765 {
762 return -1; 766 return -1;
763 } 767 }
764 768
765 static inline int cpupid_to_cpu(int cpupid) 769 static inline int cpupid_to_cpu(int cpupid)
766 { 770 {
767 return -1; 771 return -1;
768 } 772 }
769 773
770 static inline int cpu_pid_to_cpupid(int nid, int pid) 774 static inline int cpu_pid_to_cpupid(int nid, int pid)
771 { 775 {
772 return -1; 776 return -1;
773 } 777 }
774 778
775 static inline bool cpupid_pid_unset(int cpupid) 779 static inline bool cpupid_pid_unset(int cpupid)
776 { 780 {
777 return 1; 781 return 1;
778 } 782 }
779 783
780 static inline void page_cpupid_reset_last(struct page *page) 784 static inline void page_cpupid_reset_last(struct page *page)
781 { 785 {
782 } 786 }
783 787
784 static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) 788 static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
785 { 789 {
786 return false; 790 return false;
787 } 791 }
788 #endif /* CONFIG_NUMA_BALANCING */ 792 #endif /* CONFIG_NUMA_BALANCING */
789 793
790 static inline struct zone *page_zone(const struct page *page) 794 static inline struct zone *page_zone(const struct page *page)
791 { 795 {
792 return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; 796 return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
793 } 797 }
794 798
795 #ifdef SECTION_IN_PAGE_FLAGS 799 #ifdef SECTION_IN_PAGE_FLAGS
796 static inline void set_page_section(struct page *page, unsigned long section) 800 static inline void set_page_section(struct page *page, unsigned long section)
797 { 801 {
798 page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); 802 page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
799 page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; 803 page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
800 } 804 }
801 805
802 static inline unsigned long page_to_section(const struct page *page) 806 static inline unsigned long page_to_section(const struct page *page)
803 { 807 {
804 return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; 808 return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
805 } 809 }
806 #endif 810 #endif
807 811
808 static inline void set_page_zone(struct page *page, enum zone_type zone) 812 static inline void set_page_zone(struct page *page, enum zone_type zone)
809 { 813 {
810 page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); 814 page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
811 page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; 815 page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
812 } 816 }
813 817
814 static inline void set_page_node(struct page *page, unsigned long node) 818 static inline void set_page_node(struct page *page, unsigned long node)
815 { 819 {
816 page->flags &= ~(NODES_MASK << NODES_PGSHIFT); 820 page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
817 page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; 821 page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
818 } 822 }
819 823
820 static inline void set_page_links(struct page *page, enum zone_type zone, 824 static inline void set_page_links(struct page *page, enum zone_type zone,
821 unsigned long node, unsigned long pfn) 825 unsigned long node, unsigned long pfn)
822 { 826 {
823 set_page_zone(page, zone); 827 set_page_zone(page, zone);
824 set_page_node(page, node); 828 set_page_node(page, node);
825 #ifdef SECTION_IN_PAGE_FLAGS 829 #ifdef SECTION_IN_PAGE_FLAGS
826 set_page_section(page, pfn_to_section_nr(pfn)); 830 set_page_section(page, pfn_to_section_nr(pfn));
827 #endif 831 #endif
828 } 832 }
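set_page_links() is what the memmap initialisation path uses to stamp zone, node and section into a fresh struct page; a hedged, kernel-context sketch of that call (zone_idx, nid and pfn are assumed to come from the caller):

/* Sketch: initialising page->flags links while building the memmap. */
struct page *page = pfn_to_page(pfn);

set_page_links(page, zone_idx, nid, pfn);
init_page_count(page);		/* page starts with a reference count of one */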
829 833
830 /* 834 /*
831 * Some inline functions in vmstat.h depend on page_zone() 835 * Some inline functions in vmstat.h depend on page_zone()
832 */ 836 */
833 #include <linux/vmstat.h> 837 #include <linux/vmstat.h>
834 838
835 static __always_inline void *lowmem_page_address(const struct page *page) 839 static __always_inline void *lowmem_page_address(const struct page *page)
836 { 840 {
837 return __va(PFN_PHYS(page_to_pfn(page))); 841 return __va(PFN_PHYS(page_to_pfn(page)));
838 } 842 }
839 843
840 #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) 844 #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
841 #define HASHED_PAGE_VIRTUAL 845 #define HASHED_PAGE_VIRTUAL
842 #endif 846 #endif
843 847
844 #if defined(WANT_PAGE_VIRTUAL) 848 #if defined(WANT_PAGE_VIRTUAL)
845 #define page_address(page) ((page)->virtual) 849 #define page_address(page) ((page)->virtual)
846 #define set_page_address(page, address) \ 850 #define set_page_address(page, address) \
847 do { \ 851 do { \
848 (page)->virtual = (address); \ 852 (page)->virtual = (address); \
849 } while(0) 853 } while(0)
850 #define page_address_init() do { } while(0) 854 #define page_address_init() do { } while(0)
851 #endif 855 #endif
852 856
853 #if defined(HASHED_PAGE_VIRTUAL) 857 #if defined(HASHED_PAGE_VIRTUAL)
854 void *page_address(const struct page *page); 858 void *page_address(const struct page *page);
855 void set_page_address(struct page *page, void *virtual); 859 void set_page_address(struct page *page, void *virtual);
856 void page_address_init(void); 860 void page_address_init(void);
857 #endif 861 #endif
858 862
859 #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) 863 #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
860 #define page_address(page) lowmem_page_address(page) 864 #define page_address(page) lowmem_page_address(page)
861 #define set_page_address(page, address) do { } while(0) 865 #define set_page_address(page, address) do { } while(0)
862 #define page_address_init() do { } while(0) 866 #define page_address_init() do { } while(0)
863 #endif 867 #endif
864 868
865 /* 869 /*
866 * On an anonymous page mapped into a user virtual memory area, 870 * On an anonymous page mapped into a user virtual memory area,
867 * page->mapping points to its anon_vma, not to a struct address_space; 871 * page->mapping points to its anon_vma, not to a struct address_space;
868 * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. 872 * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h.
869 * 873 *
870 * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, 874 * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
871 * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit; 875 * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit;
872 * and then page->mapping points, not to an anon_vma, but to a private 876 * and then page->mapping points, not to an anon_vma, but to a private
873 * structure which KSM associates with that merged page. See ksm.h. 877 * structure which KSM associates with that merged page. See ksm.h.
874 * 878 *
875 * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used. 879 * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used.
876 * 880 *
877 * Please note that, confusingly, "page_mapping" refers to the inode 881 * Please note that, confusingly, "page_mapping" refers to the inode
878 * address_space which maps the page from disk; whereas "page_mapped" 882 * address_space which maps the page from disk; whereas "page_mapped"
879 * refers to user virtual address space into which the page is mapped. 883 * refers to user virtual address space into which the page is mapped.
880 */ 884 */
881 #define PAGE_MAPPING_ANON 1 885 #define PAGE_MAPPING_ANON 1
882 #define PAGE_MAPPING_KSM 2 886 #define PAGE_MAPPING_KSM 2
883 #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) 887 #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
884 888
885 extern struct address_space *page_mapping(struct page *page); 889 extern struct address_space *page_mapping(struct page *page);
886 890
887 /* Neutral page->mapping pointer to address_space or anon_vma or other */ 891 /* Neutral page->mapping pointer to address_space or anon_vma or other */
888 static inline void *page_rmapping(struct page *page) 892 static inline void *page_rmapping(struct page *page)
889 { 893 {
890 return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS); 894 return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS);
891 } 895 }
892 896
893 extern struct address_space *__page_file_mapping(struct page *); 897 extern struct address_space *__page_file_mapping(struct page *);
894 898
895 static inline 899 static inline
896 struct address_space *page_file_mapping(struct page *page) 900 struct address_space *page_file_mapping(struct page *page)
897 { 901 {
898 if (unlikely(PageSwapCache(page))) 902 if (unlikely(PageSwapCache(page)))
899 return __page_file_mapping(page); 903 return __page_file_mapping(page);
900 904
901 return page->mapping; 905 return page->mapping;
902 } 906 }
903 907
904 static inline int PageAnon(struct page *page) 908 static inline int PageAnon(struct page *page)
905 { 909 {
906 return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; 910 return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
907 } 911 }
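Code that needs to know what page->mapping really points at usually branches on PageAnon() first; a hedged, rmap-style sketch (kernel context, KSM ignored):

/* Sketch: interpreting page->mapping for a mapped page. */
if (PageAnon(page)) {
	/* mapping minus the flag bits is an anon_vma, see page_rmapping() */
	struct anon_vma *anon_vma = page_rmapping(page);
	/* ... walk the anon_vma's vmas ... */
} else {
	/* file-backed or swapcache: page_mapping() hides the details */
	struct address_space *mapping = page_mapping(page);
	/* ... walk mapping->i_mmap ... */
}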
908 912
909 /* 913 /*
910 * Return the pagecache index of the passed page. Regular pagecache pages 914 * Return the pagecache index of the passed page. Regular pagecache pages
911 * use ->index whereas swapcache pages use ->private 915 * use ->index whereas swapcache pages use ->private
912 */ 916 */
913 static inline pgoff_t page_index(struct page *page) 917 static inline pgoff_t page_index(struct page *page)
914 { 918 {
915 if (unlikely(PageSwapCache(page))) 919 if (unlikely(PageSwapCache(page)))
916 return page_private(page); 920 return page_private(page);
917 return page->index; 921 return page->index;
918 } 922 }
919 923
920 extern pgoff_t __page_file_index(struct page *page); 924 extern pgoff_t __page_file_index(struct page *page);
921 925
922 /* 926 /*
923 * Return the file index of the page. Regular pagecache pages use ->index 927 * Return the file index of the page. Regular pagecache pages use ->index
924 * whereas swapcache pages use swp_offset(->private) 928 * whereas swapcache pages use swp_offset(->private)
925 */ 929 */
926 static inline pgoff_t page_file_index(struct page *page) 930 static inline pgoff_t page_file_index(struct page *page)
927 { 931 {
928 if (unlikely(PageSwapCache(page))) 932 if (unlikely(PageSwapCache(page)))
929 return __page_file_index(page); 933 return __page_file_index(page);
930 934
931 return page->index; 935 return page->index;
932 } 936 }
933 937
934 /* 938 /*
935 * Return true if this page is mapped into pagetables. 939 * Return true if this page is mapped into pagetables.
936 */ 940 */
937 static inline int page_mapped(struct page *page) 941 static inline int page_mapped(struct page *page)
938 { 942 {
939 return atomic_read(&(page)->_mapcount) >= 0; 943 return atomic_read(&(page)->_mapcount) >= 0;
940 } 944 }
941 945
942 /* 946 /*
943 * Different kinds of faults, as returned by handle_mm_fault(). 947 * Different kinds of faults, as returned by handle_mm_fault().
944 * Used to decide whether a process gets delivered SIGBUS or 948 * Used to decide whether a process gets delivered SIGBUS or
945 * just gets major/minor fault counters bumped up. 949 * just gets major/minor fault counters bumped up.
946 */ 950 */
947 951
948 #define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */ 952 #define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */
949 953
950 #define VM_FAULT_OOM 0x0001 954 #define VM_FAULT_OOM 0x0001
951 #define VM_FAULT_SIGBUS 0x0002 955 #define VM_FAULT_SIGBUS 0x0002
952 #define VM_FAULT_MAJOR 0x0004 956 #define VM_FAULT_MAJOR 0x0004
953 #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ 957 #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
954 #define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */ 958 #define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */
955 #define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */ 959 #define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */
956 960
957 #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ 961 #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
958 #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ 962 #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
959 #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ 963 #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
960 #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ 964 #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */
961 965
962 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ 966 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
963 967
964 #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \ 968 #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
965 VM_FAULT_FALLBACK | VM_FAULT_HWPOISON_LARGE) 969 VM_FAULT_FALLBACK | VM_FAULT_HWPOISON_LARGE)
966 970
967 /* Encode hstate index for a hwpoisoned large page */ 971 /* Encode hstate index for a hwpoisoned large page */
968 #define VM_FAULT_SET_HINDEX(x) ((x) << 12) 972 #define VM_FAULT_SET_HINDEX(x) ((x) << 12)
969 #define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf) 973 #define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)
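The hwpoison return value packs the hstate index into bits 12-15 of the fault code. A standalone mock-up using the same shift and mask, with the relevant VM_FAULT_* values restated locally so the snippet compiles on its own:

#include <assert.h>

#define VM_FAULT_HWPOISON_LARGE	0x0020
#define VM_FAULT_SET_HINDEX(x)	((x) << 12)
#define VM_FAULT_GET_HINDEX(x)	(((x) >> 12) & 0xf)

int main(void)
{
	/* report a poisoned huge page belonging to hstate index 3 */
	int ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(3);

	assert(ret & VM_FAULT_HWPOISON_LARGE);
	assert(VM_FAULT_GET_HINDEX(ret) == 3);
	return 0;
}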
970 974
971 /* 975 /*
972 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. 976 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
973 */ 977 */
974 extern void pagefault_out_of_memory(void); 978 extern void pagefault_out_of_memory(void);
975 979
976 #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) 980 #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
977 981
978 /* 982 /*
979 * Flags passed to show_mem() and show_free_areas() to suppress output in 983 * Flags passed to show_mem() and show_free_areas() to suppress output in
980 * various contexts. 984 * various contexts.
981 */ 985 */
982 #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ 986 #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */
983 #define SHOW_MEM_FILTER_PAGE_COUNT (0x0002u) /* page type count */ 987 #define SHOW_MEM_FILTER_PAGE_COUNT (0x0002u) /* page type count */
984 988
985 extern void show_free_areas(unsigned int flags); 989 extern void show_free_areas(unsigned int flags);
986 extern bool skip_free_areas_node(unsigned int flags, int nid); 990 extern bool skip_free_areas_node(unsigned int flags, int nid);
987 991
988 int shmem_zero_setup(struct vm_area_struct *); 992 int shmem_zero_setup(struct vm_area_struct *);
989 993
990 extern int can_do_mlock(void); 994 extern int can_do_mlock(void);
991 extern int user_shm_lock(size_t, struct user_struct *); 995 extern int user_shm_lock(size_t, struct user_struct *);
992 extern void user_shm_unlock(size_t, struct user_struct *); 996 extern void user_shm_unlock(size_t, struct user_struct *);
993 997
994 /* 998 /*
995 * Parameter block passed down to zap_pte_range in exceptional cases. 999 * Parameter block passed down to zap_pte_range in exceptional cases.
996 */ 1000 */
997 struct zap_details { 1001 struct zap_details {
998 struct vm_area_struct *nonlinear_vma; /* Check page->index if set */ 1002 struct vm_area_struct *nonlinear_vma; /* Check page->index if set */
999 struct address_space *check_mapping; /* Check page->mapping if set */ 1003 struct address_space *check_mapping; /* Check page->mapping if set */
1000 pgoff_t first_index; /* Lowest page->index to unmap */ 1004 pgoff_t first_index; /* Lowest page->index to unmap */
1001 pgoff_t last_index; /* Highest page->index to unmap */ 1005 pgoff_t last_index; /* Highest page->index to unmap */
1002 }; 1006 };
1003 1007
1004 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, 1008 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
1005 pte_t pte); 1009 pte_t pte);
1006 1010
1007 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, 1011 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1008 unsigned long size); 1012 unsigned long size);
1009 void zap_page_range(struct vm_area_struct *vma, unsigned long address, 1013 void zap_page_range(struct vm_area_struct *vma, unsigned long address,
1010 unsigned long size, struct zap_details *); 1014 unsigned long size, struct zap_details *);
1011 void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 1015 void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
1012 unsigned long start, unsigned long end); 1016 unsigned long start, unsigned long end);
1013 1017
1014 /** 1018 /**
1015 * mm_walk - callbacks for walk_page_range 1019 * mm_walk - callbacks for walk_page_range
1016 * @pgd_entry: if set, called for each non-empty PGD (top-level) entry 1020 * @pgd_entry: if set, called for each non-empty PGD (top-level) entry
1017 * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry 1021 * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
1018 * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry 1022 * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
1019 * this handler is required to be able to handle 1023 * this handler is required to be able to handle
1020 * pmd_trans_huge() pmds. It may simply choose to 1024 * pmd_trans_huge() pmds. It may simply choose to
1021 * split_huge_page() instead of handling them explicitly. 1025 * split_huge_page() instead of handling them explicitly.
1022 * @pte_entry: if set, called for each non-empty PTE (4th-level) entry 1026 * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
1023 * @pte_hole: if set, called for each hole at all levels 1027 * @pte_hole: if set, called for each hole at all levels
1024 * @hugetlb_entry: if set, called for each hugetlb entry 1028 * @hugetlb_entry: if set, called for each hugetlb entry
1025 * *Caution*: The caller must hold mmap_sem if @hugetlb_entry 1029 * *Caution*: The caller must hold mmap_sem if @hugetlb_entry
1026 * is used. 1030 * is used.
1027 * 1031 *
1028 * (see walk_page_range for more details) 1032 * (see walk_page_range for more details)
1029 */ 1033 */
1030 struct mm_walk { 1034 struct mm_walk {
1031 int (*pgd_entry)(pgd_t *pgd, unsigned long addr, 1035 int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
1032 unsigned long next, struct mm_walk *walk); 1036 unsigned long next, struct mm_walk *walk);
1033 int (*pud_entry)(pud_t *pud, unsigned long addr, 1037 int (*pud_entry)(pud_t *pud, unsigned long addr,
1034 unsigned long next, struct mm_walk *walk); 1038 unsigned long next, struct mm_walk *walk);
1035 int (*pmd_entry)(pmd_t *pmd, unsigned long addr, 1039 int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
1036 unsigned long next, struct mm_walk *walk); 1040 unsigned long next, struct mm_walk *walk);
1037 int (*pte_entry)(pte_t *pte, unsigned long addr, 1041 int (*pte_entry)(pte_t *pte, unsigned long addr,
1038 unsigned long next, struct mm_walk *walk); 1042 unsigned long next, struct mm_walk *walk);
1039 int (*pte_hole)(unsigned long addr, unsigned long next, 1043 int (*pte_hole)(unsigned long addr, unsigned long next,
1040 struct mm_walk *walk); 1044 struct mm_walk *walk);
1041 int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, 1045 int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
1042 unsigned long addr, unsigned long next, 1046 unsigned long addr, unsigned long next,
1043 struct mm_walk *walk); 1047 struct mm_walk *walk);
1044 struct mm_struct *mm; 1048 struct mm_struct *mm;
1045 void *private; 1049 void *private;
1046 }; 1050 };
1047 1051
1048 int walk_page_range(unsigned long addr, unsigned long end, 1052 int walk_page_range(unsigned long addr, unsigned long end,
1049 struct mm_walk *walk); 1053 struct mm_walk *walk);
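Putting the mm_walk callbacks together, here is a hedged kernel-context sketch of a walker that visits present ptes in a range. The my_pte_entry and count_present_ptes names are invented for this sketch, and the caller is assumed to be allowed to take mmap_sem:

/* Sketch: count present ptes between start and end. */
static int my_pte_entry(pte_t *pte, unsigned long addr,
			unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(*pte))
		(*count)++;
	return 0;
}

static unsigned long count_present_ptes(struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	unsigned long count = 0;
	struct mm_walk walk = {
		.pte_entry	= my_pte_entry,
		.mm		= mm,
		.private	= &count,
	};

	down_read(&mm->mmap_sem);
	walk_page_range(start, end, &walk);
	up_read(&mm->mmap_sem);
	return count;
}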
1050 void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, 1054 void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
1051 unsigned long end, unsigned long floor, unsigned long ceiling); 1055 unsigned long end, unsigned long floor, unsigned long ceiling);
1052 int copy_page_range(struct mm_struct *dst, struct mm_struct *src, 1056 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
1053 struct vm_area_struct *vma); 1057 struct vm_area_struct *vma);
1054 void unmap_mapping_range(struct address_space *mapping, 1058 void unmap_mapping_range(struct address_space *mapping,
1055 loff_t const holebegin, loff_t const holelen, int even_cows); 1059 loff_t const holebegin, loff_t const holelen, int even_cows);
1056 int follow_pfn(struct vm_area_struct *vma, unsigned long address, 1060 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
1057 unsigned long *pfn); 1061 unsigned long *pfn);
1058 int follow_phys(struct vm_area_struct *vma, unsigned long address, 1062 int follow_phys(struct vm_area_struct *vma, unsigned long address,
1059 unsigned int flags, unsigned long *prot, resource_size_t *phys); 1063 unsigned int flags, unsigned long *prot, resource_size_t *phys);
1060 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, 1064 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
1061 void *buf, int len, int write); 1065 void *buf, int len, int write);
1062 1066
1063 static inline void unmap_shared_mapping_range(struct address_space *mapping, 1067 static inline void unmap_shared_mapping_range(struct address_space *mapping,
1064 loff_t const holebegin, loff_t const holelen) 1068 loff_t const holebegin, loff_t const holelen)
1065 { 1069 {
1066 unmap_mapping_range(mapping, holebegin, holelen, 0); 1070 unmap_mapping_range(mapping, holebegin, holelen, 0);
1067 } 1071 }
1068 1072
1069 extern void truncate_pagecache(struct inode *inode, loff_t new); 1073 extern void truncate_pagecache(struct inode *inode, loff_t new);
1070 extern void truncate_setsize(struct inode *inode, loff_t newsize); 1074 extern void truncate_setsize(struct inode *inode, loff_t newsize);
1071 void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); 1075 void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
1072 int truncate_inode_page(struct address_space *mapping, struct page *page); 1076 int truncate_inode_page(struct address_space *mapping, struct page *page);
1073 int generic_error_remove_page(struct address_space *mapping, struct page *page); 1077 int generic_error_remove_page(struct address_space *mapping, struct page *page);
1074 int invalidate_inode_page(struct page *page); 1078 int invalidate_inode_page(struct page *page);
1075 1079
1076 #ifdef CONFIG_MMU 1080 #ifdef CONFIG_MMU
1077 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 1081 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
1078 unsigned long address, unsigned int flags); 1082 unsigned long address, unsigned int flags);
1079 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, 1083 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1080 unsigned long address, unsigned int fault_flags); 1084 unsigned long address, unsigned int fault_flags);
1081 #else 1085 #else
1082 static inline int handle_mm_fault(struct mm_struct *mm, 1086 static inline int handle_mm_fault(struct mm_struct *mm,
1083 struct vm_area_struct *vma, unsigned long address, 1087 struct vm_area_struct *vma, unsigned long address,
1084 unsigned int flags) 1088 unsigned int flags)
1085 { 1089 {
1086 /* should never happen if there's no MMU */ 1090 /* should never happen if there's no MMU */
1087 BUG(); 1091 BUG();
1088 return VM_FAULT_SIGBUS; 1092 return VM_FAULT_SIGBUS;
1089 } 1093 }
1090 static inline int fixup_user_fault(struct task_struct *tsk, 1094 static inline int fixup_user_fault(struct task_struct *tsk,
1091 struct mm_struct *mm, unsigned long address, 1095 struct mm_struct *mm, unsigned long address,
1092 unsigned int fault_flags) 1096 unsigned int fault_flags)
1093 { 1097 {
1094 /* should never happen if there's no MMU */ 1098 /* should never happen if there's no MMU */
1095 BUG(); 1099 BUG();
1096 return -EFAULT; 1100 return -EFAULT;
1097 } 1101 }
1098 #endif 1102 #endif
1099 1103
1100 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); 1104 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
1101 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, 1105 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
1102 void *buf, int len, int write); 1106 void *buf, int len, int write);
1103 1107
1104 long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1108 long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1105 unsigned long start, unsigned long nr_pages, 1109 unsigned long start, unsigned long nr_pages,
1106 unsigned int foll_flags, struct page **pages, 1110 unsigned int foll_flags, struct page **pages,
1107 struct vm_area_struct **vmas, int *nonblocking); 1111 struct vm_area_struct **vmas, int *nonblocking);
1108 long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1112 long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1109 unsigned long start, unsigned long nr_pages, 1113 unsigned long start, unsigned long nr_pages,
1110 int write, int force, struct page **pages, 1114 int write, int force, struct page **pages,
1111 struct vm_area_struct **vmas); 1115 struct vm_area_struct **vmas);
1112 int get_user_pages_fast(unsigned long start, int nr_pages, int write, 1116 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
1113 struct page **pages); 1117 struct page **pages);
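A hedged sketch of the classic pinning pattern built on the get_user_pages() prototype above (kernel context; my_pin_one_page is a hypothetical name and error handling is kept minimal):

/* Sketch: pin one user page at uaddr for writing, use it, release it. */
static int my_pin_one_page(unsigned long uaddr)
{
	struct page *page;
	long ret;

	down_read(&current->mm->mmap_sem);
	ret = get_user_pages(current, current->mm, uaddr, 1,
			     1 /* write */, 0 /* force */, &page, NULL);
	up_read(&current->mm->mmap_sem);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... access the page contents, e.g. via kmap_atomic() ... */
	set_page_dirty_lock(page);
	put_page(page);
	return 0;
}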
1114 struct kvec; 1118 struct kvec;
1115 int get_kernel_pages(const struct kvec *iov, int nr_pages, int write, 1119 int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
1116 struct page **pages); 1120 struct page **pages);
1117 int get_kernel_page(unsigned long start, int write, struct page **pages); 1121 int get_kernel_page(unsigned long start, int write, struct page **pages);
1118 struct page *get_dump_page(unsigned long addr); 1122 struct page *get_dump_page(unsigned long addr);
1119 1123
1120 extern int try_to_release_page(struct page * page, gfp_t gfp_mask); 1124 extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
1121 extern void do_invalidatepage(struct page *page, unsigned int offset, 1125 extern void do_invalidatepage(struct page *page, unsigned int offset,
1122 unsigned int length); 1126 unsigned int length);
1123 1127
1124 int __set_page_dirty_nobuffers(struct page *page); 1128 int __set_page_dirty_nobuffers(struct page *page);
1125 int __set_page_dirty_no_writeback(struct page *page); 1129 int __set_page_dirty_no_writeback(struct page *page);
1126 int redirty_page_for_writepage(struct writeback_control *wbc, 1130 int redirty_page_for_writepage(struct writeback_control *wbc,
1127 struct page *page); 1131 struct page *page);
1128 void account_page_dirtied(struct page *page, struct address_space *mapping); 1132 void account_page_dirtied(struct page *page, struct address_space *mapping);
1129 void account_page_writeback(struct page *page); 1133 void account_page_writeback(struct page *page);
1130 int set_page_dirty(struct page *page); 1134 int set_page_dirty(struct page *page);
1131 int set_page_dirty_lock(struct page *page); 1135 int set_page_dirty_lock(struct page *page);
1132 int clear_page_dirty_for_io(struct page *page); 1136 int clear_page_dirty_for_io(struct page *page);
1133 1137
1134 /* Is the vma a continuation of the stack vma above it? */ 1138 /* Is the vma a continuation of the stack vma above it? */
1135 static inline int vma_growsdown(struct vm_area_struct *vma, unsigned long addr) 1139 static inline int vma_growsdown(struct vm_area_struct *vma, unsigned long addr)
1136 { 1140 {
1137 return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN); 1141 return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
1138 } 1142 }
1139 1143
1140 static inline int stack_guard_page_start(struct vm_area_struct *vma, 1144 static inline int stack_guard_page_start(struct vm_area_struct *vma,
1141 unsigned long addr) 1145 unsigned long addr)
1142 { 1146 {
1143 return (vma->vm_flags & VM_GROWSDOWN) && 1147 return (vma->vm_flags & VM_GROWSDOWN) &&
1144 (vma->vm_start == addr) && 1148 (vma->vm_start == addr) &&
1145 !vma_growsdown(vma->vm_prev, addr); 1149 !vma_growsdown(vma->vm_prev, addr);
1146 } 1150 }
1147 1151
1148 /* Is the vma a continuation of the stack vma below it? */ 1152 /* Is the vma a continuation of the stack vma below it? */
1149 static inline int vma_growsup(struct vm_area_struct *vma, unsigned long addr) 1153 static inline int vma_growsup(struct vm_area_struct *vma, unsigned long addr)
1150 { 1154 {
1151 return vma && (vma->vm_start == addr) && (vma->vm_flags & VM_GROWSUP); 1155 return vma && (vma->vm_start == addr) && (vma->vm_flags & VM_GROWSUP);
1152 } 1156 }
1153 1157
1154 static inline int stack_guard_page_end(struct vm_area_struct *vma, 1158 static inline int stack_guard_page_end(struct vm_area_struct *vma,
1155 unsigned long addr) 1159 unsigned long addr)
1156 { 1160 {
1157 return (vma->vm_flags & VM_GROWSUP) && 1161 return (vma->vm_flags & VM_GROWSUP) &&
1158 (vma->vm_end == addr) && 1162 (vma->vm_end == addr) &&
1159 !vma_growsup(vma->vm_next, addr); 1163 !vma_growsup(vma->vm_next, addr);
1160 } 1164 }
1161 1165
1162 extern pid_t 1166 extern pid_t
1163 vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group); 1167 vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group);
1164 1168
1165 extern unsigned long move_page_tables(struct vm_area_struct *vma, 1169 extern unsigned long move_page_tables(struct vm_area_struct *vma,
1166 unsigned long old_addr, struct vm_area_struct *new_vma, 1170 unsigned long old_addr, struct vm_area_struct *new_vma,
1167 unsigned long new_addr, unsigned long len, 1171 unsigned long new_addr, unsigned long len,
1168 bool need_rmap_locks); 1172 bool need_rmap_locks);
1169 extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, 1173 extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
1170 unsigned long end, pgprot_t newprot, 1174 unsigned long end, pgprot_t newprot,
1171 int dirty_accountable, int prot_numa); 1175 int dirty_accountable, int prot_numa);
1172 extern int mprotect_fixup(struct vm_area_struct *vma, 1176 extern int mprotect_fixup(struct vm_area_struct *vma,
1173 struct vm_area_struct **pprev, unsigned long start, 1177 struct vm_area_struct **pprev, unsigned long start,
1174 unsigned long end, unsigned long newflags); 1178 unsigned long end, unsigned long newflags);
1175 1179
1176 /* 1180 /*
1177 * __get_user_pages_fast() doesn't attempt to fault and may return fewer pages than requested. 1181 * __get_user_pages_fast() doesn't attempt to fault and may return fewer pages than requested.
1178 */ 1182 */
1179 int __get_user_pages_fast(unsigned long start, int nr_pages, int write, 1183 int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
1180 struct page **pages); 1184 struct page **pages);
1181 /* 1185 /*
1182 * per-process (per-mm_struct) statistics. 1186 * per-process (per-mm_struct) statistics.
1183 */ 1187 */
1184 static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) 1188 static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
1185 { 1189 {
1186 long val = atomic_long_read(&mm->rss_stat.count[member]); 1190 long val = atomic_long_read(&mm->rss_stat.count[member]);
1187 1191
1188 #ifdef SPLIT_RSS_COUNTING 1192 #ifdef SPLIT_RSS_COUNTING
1189 /* 1193 /*
1190 * The counter is updated asynchronously and may temporarily go negative, 1194 * The counter is updated asynchronously and may temporarily go negative,
1191 * which is never a value users expect to see, so clamp it to zero. 1195 * which is never a value users expect to see, so clamp it to zero.
1192 */ 1196 */
1193 if (val < 0) 1197 if (val < 0)
1194 val = 0; 1198 val = 0;
1195 #endif 1199 #endif
1196 return (unsigned long)val; 1200 return (unsigned long)val;
1197 } 1201 }
1198 1202
1199 static inline void add_mm_counter(struct mm_struct *mm, int member, long value) 1203 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
1200 { 1204 {
1201 atomic_long_add(value, &mm->rss_stat.count[member]); 1205 atomic_long_add(value, &mm->rss_stat.count[member]);
1202 } 1206 }
1203 1207
1204 static inline void inc_mm_counter(struct mm_struct *mm, int member) 1208 static inline void inc_mm_counter(struct mm_struct *mm, int member)
1205 { 1209 {
1206 atomic_long_inc(&mm->rss_stat.count[member]); 1210 atomic_long_inc(&mm->rss_stat.count[member]);
1207 } 1211 }
1208 1212
1209 static inline void dec_mm_counter(struct mm_struct *mm, int member) 1213 static inline void dec_mm_counter(struct mm_struct *mm, int member)
1210 { 1214 {
1211 atomic_long_dec(&mm->rss_stat.count[member]); 1215 atomic_long_dec(&mm->rss_stat.count[member]);
1212 } 1216 }
1213 1217
1214 static inline unsigned long get_mm_rss(struct mm_struct *mm) 1218 static inline unsigned long get_mm_rss(struct mm_struct *mm)
1215 { 1219 {
1216 return get_mm_counter(mm, MM_FILEPAGES) + 1220 return get_mm_counter(mm, MM_FILEPAGES) +
1217 get_mm_counter(mm, MM_ANONPAGES); 1221 get_mm_counter(mm, MM_ANONPAGES);
1218 } 1222 }
1219 1223
1220 static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) 1224 static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
1221 { 1225 {
1222 return max(mm->hiwater_rss, get_mm_rss(mm)); 1226 return max(mm->hiwater_rss, get_mm_rss(mm));
1223 } 1227 }
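These counters are kept in pages; consumers such as the proc interfaces scale them before reporting, roughly as in this hedged fragment:

/* Sketch: report a task's current and peak RSS in kilobytes. */
unsigned long rss_kb  = get_mm_rss(mm) << (PAGE_SHIFT - 10);
unsigned long peak_kb = get_mm_hiwater_rss(mm) << (PAGE_SHIFT - 10);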
1224 1228
1225 static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) 1229 static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
1226 { 1230 {
1227 return max(mm->hiwater_vm, mm->total_vm); 1231 return max(mm->hiwater_vm, mm->total_vm);
1228 } 1232 }
1229 1233
1230 static inline void update_hiwater_rss(struct mm_struct *mm) 1234 static inline void update_hiwater_rss(struct mm_struct *mm)
1231 { 1235 {
1232 unsigned long _rss = get_mm_rss(mm); 1236 unsigned long _rss = get_mm_rss(mm);
1233 1237
1234 if ((mm)->hiwater_rss < _rss) 1238 if ((mm)->hiwater_rss < _rss)
1235 (mm)->hiwater_rss = _rss; 1239 (mm)->hiwater_rss = _rss;
1236 } 1240 }
1237 1241
1238 static inline void update_hiwater_vm(struct mm_struct *mm) 1242 static inline void update_hiwater_vm(struct mm_struct *mm)
1239 { 1243 {
1240 if (mm->hiwater_vm < mm->total_vm) 1244 if (mm->hiwater_vm < mm->total_vm)
1241 mm->hiwater_vm = mm->total_vm; 1245 mm->hiwater_vm = mm->total_vm;
1242 } 1246 }
1243 1247
1244 static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, 1248 static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
1245 struct mm_struct *mm) 1249 struct mm_struct *mm)
1246 { 1250 {
1247 unsigned long hiwater_rss = get_mm_hiwater_rss(mm); 1251 unsigned long hiwater_rss = get_mm_hiwater_rss(mm);
1248 1252
1249 if (*maxrss < hiwater_rss) 1253 if (*maxrss < hiwater_rss)
1250 *maxrss = hiwater_rss; 1254 *maxrss = hiwater_rss;
1251 } 1255 }
1252 1256
1253 #if defined(SPLIT_RSS_COUNTING) 1257 #if defined(SPLIT_RSS_COUNTING)
1254 void sync_mm_rss(struct mm_struct *mm); 1258 void sync_mm_rss(struct mm_struct *mm);
1255 #else 1259 #else
1256 static inline void sync_mm_rss(struct mm_struct *mm) 1260 static inline void sync_mm_rss(struct mm_struct *mm)
1257 { 1261 {
1258 } 1262 }
1259 #endif 1263 #endif
1260 1264
1261 int vma_wants_writenotify(struct vm_area_struct *vma); 1265 int vma_wants_writenotify(struct vm_area_struct *vma);
1262 1266
1263 extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, 1267 extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1264 spinlock_t **ptl); 1268 spinlock_t **ptl);
1265 static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1269 static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
1266 spinlock_t **ptl) 1270 spinlock_t **ptl)
1267 { 1271 {
1268 pte_t *ptep; 1272 pte_t *ptep;
1269 __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl)); 1273 __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl));
1270 return ptep; 1274 return ptep;
1271 } 1275 }
1272 1276
1273 #ifdef __PAGETABLE_PUD_FOLDED 1277 #ifdef __PAGETABLE_PUD_FOLDED
1274 static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, 1278 static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd,
1275 unsigned long address) 1279 unsigned long address)
1276 { 1280 {
1277 return 0; 1281 return 0;
1278 } 1282 }
1279 #else 1283 #else
1280 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); 1284 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
1281 #endif 1285 #endif
1282 1286
1283 #ifdef __PAGETABLE_PMD_FOLDED 1287 #ifdef __PAGETABLE_PMD_FOLDED
1284 static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, 1288 static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
1285 unsigned long address) 1289 unsigned long address)
1286 { 1290 {
1287 return 0; 1291 return 0;
1288 } 1292 }
1289 #else 1293 #else
1290 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); 1294 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
1291 #endif 1295 #endif
1292 1296
1293 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, 1297 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
1294 pmd_t *pmd, unsigned long address); 1298 pmd_t *pmd, unsigned long address);
1295 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); 1299 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
1296 1300
1297 /* 1301 /*
1298 * The following ifdef needed to get the 4level-fixup.h header to work. 1302 * The following ifdef needed to get the 4level-fixup.h header to work.
1299 * Remove it when 4level-fixup.h has been removed. 1303 * Remove it when 4level-fixup.h has been removed.
1300 */ 1304 */
1301 #if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK) 1305 #if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK)
1302 static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) 1306 static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
1303 { 1307 {
1304 return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))? 1308 return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))?
1305 NULL: pud_offset(pgd, address); 1309 NULL: pud_offset(pgd, address);
1306 } 1310 }
1307 1311
1308 static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 1312 static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
1309 { 1313 {
1310 return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? 1314 return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
1311 NULL: pmd_offset(pud, address); 1315 NULL: pmd_offset(pud, address);
1312 } 1316 }
1313 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ 1317 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
1314 1318
1315 #if USE_SPLIT_PTLOCKS 1319 #if USE_SPLIT_PTLOCKS
1316 /* 1320 /*
1317 * We tuck a spinlock to guard each pagetable page into its struct page, 1321 * We tuck a spinlock to guard each pagetable page into its struct page,
1318 * at page->private, with BUILD_BUG_ON to make sure that this will not 1322 * at page->private, with BUILD_BUG_ON to make sure that this will not
1319 * overflow into the next struct page (as it might with DEBUG_SPINLOCK). 1323 * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
1320 * When freeing, reset page->mapping so free_pages_check won't complain. 1324 * When freeing, reset page->mapping so free_pages_check won't complain.
1321 */ 1325 */
1322 #define __pte_lockptr(page) &((page)->ptl) 1326 #define __pte_lockptr(page) &((page)->ptl)
1323 #define pte_lock_init(_page) do { \ 1327 #define pte_lock_init(_page) do { \
1324 spin_lock_init(__pte_lockptr(_page)); \ 1328 spin_lock_init(__pte_lockptr(_page)); \
1325 } while (0) 1329 } while (0)
1326 #define pte_lock_deinit(page) ((page)->mapping = NULL) 1330 #define pte_lock_deinit(page) ((page)->mapping = NULL)
1327 #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) 1331 #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
1328 #else /* !USE_SPLIT_PTLOCKS */ 1332 #else /* !USE_SPLIT_PTLOCKS */
1329 /* 1333 /*
1330 * We use mm->page_table_lock to guard all pagetable pages of the mm. 1334 * We use mm->page_table_lock to guard all pagetable pages of the mm.
1331 */ 1335 */
1332 #define pte_lock_init(page) do {} while (0) 1336 #define pte_lock_init(page) do {} while (0)
1333 #define pte_lock_deinit(page) do {} while (0) 1337 #define pte_lock_deinit(page) do {} while (0)
1334 #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) 1338 #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;})
1335 #endif /* USE_SPLIT_PTLOCKS */ 1339 #endif /* USE_SPLIT_PTLOCKS */
1336 1340
1337 static inline void pgtable_page_ctor(struct page *page) 1341 static inline void pgtable_page_ctor(struct page *page)
1338 { 1342 {
1339 pte_lock_init(page); 1343 pte_lock_init(page);
1340 inc_zone_page_state(page, NR_PAGETABLE); 1344 inc_zone_page_state(page, NR_PAGETABLE);
1341 } 1345 }
1342 1346
1343 static inline void pgtable_page_dtor(struct page *page) 1347 static inline void pgtable_page_dtor(struct page *page)
1344 { 1348 {
1345 pte_lock_deinit(page); 1349 pte_lock_deinit(page);
1346 dec_zone_page_state(page, NR_PAGETABLE); 1350 dec_zone_page_state(page, NR_PAGETABLE);
1347 } 1351 }
1348 1352
1349 #define pte_offset_map_lock(mm, pmd, address, ptlp) \ 1353 #define pte_offset_map_lock(mm, pmd, address, ptlp) \
1350 ({ \ 1354 ({ \
1351 spinlock_t *__ptl = pte_lockptr(mm, pmd); \ 1355 spinlock_t *__ptl = pte_lockptr(mm, pmd); \
1352 pte_t *__pte = pte_offset_map(pmd, address); \ 1356 pte_t *__pte = pte_offset_map(pmd, address); \
1353 *(ptlp) = __ptl; \ 1357 *(ptlp) = __ptl; \
1354 spin_lock(__ptl); \ 1358 spin_lock(__ptl); \
1355 __pte; \ 1359 __pte; \
1356 }) 1360 })
1357 1361
1358 #define pte_unmap_unlock(pte, ptl) do { \ 1362 #define pte_unmap_unlock(pte, ptl) do { \
1359 spin_unlock(ptl); \ 1363 spin_unlock(ptl); \
1360 pte_unmap(pte); \ 1364 pte_unmap(pte); \
1361 } while (0) 1365 } while (0)
1362 1366
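As a quick reference for how the two helpers above are meant to be paired, here is a minimal sketch of the usual locked PTE walk. The function name and body are illustrative only (they assume the normal <linux/mm.h> context) and are not part of this file or this patch:

static int walk_pte_range_example(struct mm_struct *mm, pmd_t *pmd,
				  unsigned long addr, unsigned long end)
{
	pte_t *start_pte, *pte;
	spinlock_t *ptl;

	/* map the PTE page and take the lock chosen by pte_lockptr() */
	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	do {
		if (pte_present(*pte)) {
			/* ... inspect or modify the entry here ... */
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	/* drop the lock and unmap the PTE page in one go */
	pte_unmap_unlock(start_pte, ptl);
	return 0;
}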
1363 #define pte_alloc_map(mm, vma, pmd, address) \ 1367 #define pte_alloc_map(mm, vma, pmd, address) \
1364 ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \ 1368 ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \
1365 pmd, address))? \ 1369 pmd, address))? \
1366 NULL: pte_offset_map(pmd, address)) 1370 NULL: pte_offset_map(pmd, address))
1367 1371
1368 #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ 1372 #define pte_alloc_map_lock(mm, pmd, address, ptlp) \
1369 ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \ 1373 ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \
1370 pmd, address))? \ 1374 pmd, address))? \
1371 NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) 1375 NULL: pte_offset_map_lock(mm, pmd, address, ptlp))
1372 1376
1373 #define pte_alloc_kernel(pmd, address) \ 1377 #define pte_alloc_kernel(pmd, address) \
1374 ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ 1378 ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
1375 NULL: pte_offset_kernel(pmd, address)) 1379 NULL: pte_offset_kernel(pmd, address))
1376 1380
1377 extern void free_area_init(unsigned long * zones_size); 1381 extern void free_area_init(unsigned long * zones_size);
1378 extern void free_area_init_node(int nid, unsigned long * zones_size, 1382 extern void free_area_init_node(int nid, unsigned long * zones_size,
1379 unsigned long zone_start_pfn, unsigned long *zholes_size); 1383 unsigned long zone_start_pfn, unsigned long *zholes_size);
1380 extern void free_initmem(void); 1384 extern void free_initmem(void);
1381 1385
1382 /* 1386 /*
1383 * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK) 1387 * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
1384 * into the buddy system. The freed pages will be poisoned with pattern 1388 * into the buddy system. The freed pages will be poisoned with pattern
1385 * "poison" if it's within range [0, UCHAR_MAX]. 1389 * "poison" if it's within range [0, UCHAR_MAX].
1386 * Return pages freed into the buddy system. 1390 * Return pages freed into the buddy system.
1387 */ 1391 */
1388 extern unsigned long free_reserved_area(void *start, void *end, 1392 extern unsigned long free_reserved_area(void *start, void *end,
1389 int poison, char *s); 1393 int poison, char *s);
1390 1394
1391 #ifdef CONFIG_HIGHMEM 1395 #ifdef CONFIG_HIGHMEM
1392 /* 1396 /*
1393 * Free a highmem page into the buddy system, adjusting totalhigh_pages 1397 * Free a highmem page into the buddy system, adjusting totalhigh_pages
1394 * and totalram_pages. 1398 * and totalram_pages.
1395 */ 1399 */
1396 extern void free_highmem_page(struct page *page); 1400 extern void free_highmem_page(struct page *page);
1397 #endif 1401 #endif
1398 1402
1399 extern void adjust_managed_page_count(struct page *page, long count); 1403 extern void adjust_managed_page_count(struct page *page, long count);
1400 extern void mem_init_print_info(const char *str); 1404 extern void mem_init_print_info(const char *str);
1401 1405
1402 /* Free the reserved page into the buddy system, so it gets managed. */ 1406 /* Free the reserved page into the buddy system, so it gets managed. */
1403 static inline void __free_reserved_page(struct page *page) 1407 static inline void __free_reserved_page(struct page *page)
1404 { 1408 {
1405 ClearPageReserved(page); 1409 ClearPageReserved(page);
1406 init_page_count(page); 1410 init_page_count(page);
1407 __free_page(page); 1411 __free_page(page);
1408 } 1412 }
1409 1413
1410 static inline void free_reserved_page(struct page *page) 1414 static inline void free_reserved_page(struct page *page)
1411 { 1415 {
1412 __free_reserved_page(page); 1416 __free_reserved_page(page);
1413 adjust_managed_page_count(page, 1); 1417 adjust_managed_page_count(page, 1);
1414 } 1418 }
1415 1419
1416 static inline void mark_page_reserved(struct page *page) 1420 static inline void mark_page_reserved(struct page *page)
1417 { 1421 {
1418 SetPageReserved(page); 1422 SetPageReserved(page);
1419 adjust_managed_page_count(page, -1); 1423 adjust_managed_page_count(page, -1);
1420 } 1424 }
1421 1425
1422 /* 1426 /*
1423 * Default method to free all the __init memory into the buddy system. 1427 * Default method to free all the __init memory into the buddy system.
1424 * The freed pages will be poisoned with pattern "poison" if it's within 1428 * The freed pages will be poisoned with pattern "poison" if it's within
1425 * range [0, UCHAR_MAX]. 1429 * range [0, UCHAR_MAX].
1426 * Return pages freed into the buddy system. 1430 * Return pages freed into the buddy system.
1427 */ 1431 */
1428 static inline unsigned long free_initmem_default(int poison) 1432 static inline unsigned long free_initmem_default(int poison)
1429 { 1433 {
1430 extern char __init_begin[], __init_end[]; 1434 extern char __init_begin[], __init_end[];
1431 1435
1432 return free_reserved_area(&__init_begin, &__init_end, 1436 return free_reserved_area(&__init_begin, &__init_end,
1433 poison, "unused kernel"); 1437 poison, "unused kernel");
1434 } 1438 }
1435 1439
1436 static inline unsigned long get_num_physpages(void) 1440 static inline unsigned long get_num_physpages(void)
1437 { 1441 {
1438 int nid; 1442 int nid;
1439 unsigned long phys_pages = 0; 1443 unsigned long phys_pages = 0;
1440 1444
1441 for_each_online_node(nid) 1445 for_each_online_node(nid)
1442 phys_pages += node_present_pages(nid); 1446 phys_pages += node_present_pages(nid);
1443 1447
1444 return phys_pages; 1448 return phys_pages;
1445 } 1449 }
1446 1450
1447 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 1451 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
1448 /* 1452 /*
1449 * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its 1453 * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its
1450 * zones, allocate the backing mem_map and account for memory holes in a more 1454 * zones, allocate the backing mem_map and account for memory holes in a more
1451 * architecture independent manner. This is a substitute for creating the 1455 * architecture independent manner. This is a substitute for creating the
1452 * zone_sizes[] and zholes_size[] arrays and passing them to 1456 * zone_sizes[] and zholes_size[] arrays and passing them to
1453 * free_area_init_node() 1457 * free_area_init_node()
1454 * 1458 *
1455 * An architecture is expected to register ranges of page frames backed by 1459 * An architecture is expected to register ranges of page frames backed by
1456 * physical memory with memblock_add[_node]() before calling 1460 * physical memory with memblock_add[_node]() before calling
1457 * free_area_init_nodes(), passing in the PFN each zone ends at. For basic 1461 * free_area_init_nodes(), passing in the PFN each zone ends at. For basic
1458 * usage, an architecture is expected to do something like 1462 * usage, an architecture is expected to do something like
1459 * 1463 *
1460 * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, 1464 * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
1461 * max_highmem_pfn}; 1465 * max_highmem_pfn};
1462 * for_each_valid_physical_page_range() 1466 * for_each_valid_physical_page_range()
1463 * memblock_add_node(base, size, nid) 1467 * memblock_add_node(base, size, nid)
1464 * free_area_init_nodes(max_zone_pfns); 1468 * free_area_init_nodes(max_zone_pfns);
1465 * 1469 *
1466 * free_bootmem_with_active_regions() calls free_bootmem_node() for each 1470 * free_bootmem_with_active_regions() calls free_bootmem_node() for each
1467 * registered physical page range. Similarly 1471 * registered physical page range. Similarly
1468 * sparse_memory_present_with_active_regions() calls memory_present() for 1472 * sparse_memory_present_with_active_regions() calls memory_present() for
1469 * each range when SPARSEMEM is enabled. 1473 * each range when SPARSEMEM is enabled.
1470 * 1474 *
1471 * See mm/page_alloc.c for more information on each function exposed by 1475 * See mm/page_alloc.c for more information on each function exposed by
1472 * CONFIG_HAVE_MEMBLOCK_NODE_MAP. 1476 * CONFIG_HAVE_MEMBLOCK_NODE_MAP.
1473 */ 1477 */
1474 extern void free_area_init_nodes(unsigned long *max_zone_pfn); 1478 extern void free_area_init_nodes(unsigned long *max_zone_pfn);
1475 unsigned long node_map_pfn_alignment(void); 1479 unsigned long node_map_pfn_alignment(void);
1476 unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, 1480 unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
1477 unsigned long end_pfn); 1481 unsigned long end_pfn);
1478 extern unsigned long absent_pages_in_range(unsigned long start_pfn, 1482 extern unsigned long absent_pages_in_range(unsigned long start_pfn,
1479 unsigned long end_pfn); 1483 unsigned long end_pfn);
1480 extern void get_pfn_range_for_nid(unsigned int nid, 1484 extern void get_pfn_range_for_nid(unsigned int nid,
1481 unsigned long *start_pfn, unsigned long *end_pfn); 1485 unsigned long *start_pfn, unsigned long *end_pfn);
1482 extern unsigned long find_min_pfn_with_active_regions(void); 1486 extern unsigned long find_min_pfn_with_active_regions(void);
1483 extern void free_bootmem_with_active_regions(int nid, 1487 extern void free_bootmem_with_active_regions(int nid,
1484 unsigned long max_low_pfn); 1488 unsigned long max_low_pfn);
1485 extern void sparse_memory_present_with_active_regions(int nid); 1489 extern void sparse_memory_present_with_active_regions(int nid);
1486 1490
1487 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 1491 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
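To make the recipe in the comment above concrete, here is a minimal, hypothetical sketch of an architecture following it; the function name, the hard-coded ranges and the PFN values are illustrative assumptions, not kernel code:

void __init example_arch_zone_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

	/* register the detected physical ranges with their node ids */
	memblock_add_node(0x00000000, 0x40000000, 0);	/* 1G on node 0 */
	memblock_add_node(0x40000000, 0x40000000, 1);	/* 1G on node 1 */

	/* tell the core where each zone ends (PFNs chosen by the arch) */
	max_zone_pfns[ZONE_NORMAL] = 0x80000;		/* 2G with 4K pages */
	free_area_init_nodes(max_zone_pfns);
}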
1488 1492
1489 #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ 1493 #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
1490 !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) 1494 !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID)
1491 static inline int __early_pfn_to_nid(unsigned long pfn) 1495 static inline int __early_pfn_to_nid(unsigned long pfn)
1492 { 1496 {
1493 return 0; 1497 return 0;
1494 } 1498 }
1495 #else 1499 #else
1496 /* please see mm/page_alloc.c */ 1500 /* please see mm/page_alloc.c */
1497 extern int __meminit early_pfn_to_nid(unsigned long pfn); 1501 extern int __meminit early_pfn_to_nid(unsigned long pfn);
1498 #ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 1502 #ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
1499 /* there is a per-arch backend function. */ 1503 /* there is a per-arch backend function. */
1500 extern int __meminit __early_pfn_to_nid(unsigned long pfn); 1504 extern int __meminit __early_pfn_to_nid(unsigned long pfn);
1501 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 1505 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
1502 #endif 1506 #endif
1503 1507
1504 extern void set_dma_reserve(unsigned long new_dma_reserve); 1508 extern void set_dma_reserve(unsigned long new_dma_reserve);
1505 extern void memmap_init_zone(unsigned long, int, unsigned long, 1509 extern void memmap_init_zone(unsigned long, int, unsigned long,
1506 unsigned long, enum memmap_context); 1510 unsigned long, enum memmap_context);
1507 extern void setup_per_zone_wmarks(void); 1511 extern void setup_per_zone_wmarks(void);
1508 extern int __meminit init_per_zone_wmark_min(void); 1512 extern int __meminit init_per_zone_wmark_min(void);
1509 extern void mem_init(void); 1513 extern void mem_init(void);
1510 extern void __init mmap_init(void); 1514 extern void __init mmap_init(void);
1511 extern void show_mem(unsigned int flags); 1515 extern void show_mem(unsigned int flags);
1512 extern void si_meminfo(struct sysinfo * val); 1516 extern void si_meminfo(struct sysinfo * val);
1513 extern void si_meminfo_node(struct sysinfo *val, int nid); 1517 extern void si_meminfo_node(struct sysinfo *val, int nid);
1514 1518
1515 extern __printf(3, 4) 1519 extern __printf(3, 4)
1516 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...); 1520 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...);
1517 1521
1518 extern void setup_per_cpu_pageset(void); 1522 extern void setup_per_cpu_pageset(void);
1519 1523
1520 extern void zone_pcp_update(struct zone *zone); 1524 extern void zone_pcp_update(struct zone *zone);
1521 extern void zone_pcp_reset(struct zone *zone); 1525 extern void zone_pcp_reset(struct zone *zone);
1522 1526
1523 /* page_alloc.c */ 1527 /* page_alloc.c */
1524 extern int min_free_kbytes; 1528 extern int min_free_kbytes;
1525 1529
1526 /* nommu.c */ 1530 /* nommu.c */
1527 extern atomic_long_t mmap_pages_allocated; 1531 extern atomic_long_t mmap_pages_allocated;
1528 extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); 1532 extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
1529 1533
1530 /* interval_tree.c */ 1534 /* interval_tree.c */
1531 void vma_interval_tree_insert(struct vm_area_struct *node, 1535 void vma_interval_tree_insert(struct vm_area_struct *node,
1532 struct rb_root *root); 1536 struct rb_root *root);
1533 void vma_interval_tree_insert_after(struct vm_area_struct *node, 1537 void vma_interval_tree_insert_after(struct vm_area_struct *node,
1534 struct vm_area_struct *prev, 1538 struct vm_area_struct *prev,
1535 struct rb_root *root); 1539 struct rb_root *root);
1536 void vma_interval_tree_remove(struct vm_area_struct *node, 1540 void vma_interval_tree_remove(struct vm_area_struct *node,
1537 struct rb_root *root); 1541 struct rb_root *root);
1538 struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root *root, 1542 struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root *root,
1539 unsigned long start, unsigned long last); 1543 unsigned long start, unsigned long last);
1540 struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, 1544 struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
1541 unsigned long start, unsigned long last); 1545 unsigned long start, unsigned long last);
1542 1546
1543 #define vma_interval_tree_foreach(vma, root, start, last) \ 1547 #define vma_interval_tree_foreach(vma, root, start, last) \
1544 for (vma = vma_interval_tree_iter_first(root, start, last); \ 1548 for (vma = vma_interval_tree_iter_first(root, start, last); \
1545 vma; vma = vma_interval_tree_iter_next(vma, start, last)) 1549 vma; vma = vma_interval_tree_iter_next(vma, start, last))
1546 1550
1547 static inline void vma_nonlinear_insert(struct vm_area_struct *vma, 1551 static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
1548 struct list_head *list) 1552 struct list_head *list)
1549 { 1553 {
1550 list_add_tail(&vma->shared.nonlinear, list); 1554 list_add_tail(&vma->shared.nonlinear, list);
1551 } 1555 }
1552 1556
1553 void anon_vma_interval_tree_insert(struct anon_vma_chain *node, 1557 void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
1554 struct rb_root *root); 1558 struct rb_root *root);
1555 void anon_vma_interval_tree_remove(struct anon_vma_chain *node, 1559 void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
1556 struct rb_root *root); 1560 struct rb_root *root);
1557 struct anon_vma_chain *anon_vma_interval_tree_iter_first( 1561 struct anon_vma_chain *anon_vma_interval_tree_iter_first(
1558 struct rb_root *root, unsigned long start, unsigned long last); 1562 struct rb_root *root, unsigned long start, unsigned long last);
1559 struct anon_vma_chain *anon_vma_interval_tree_iter_next( 1563 struct anon_vma_chain *anon_vma_interval_tree_iter_next(
1560 struct anon_vma_chain *node, unsigned long start, unsigned long last); 1564 struct anon_vma_chain *node, unsigned long start, unsigned long last);
1561 #ifdef CONFIG_DEBUG_VM_RB 1565 #ifdef CONFIG_DEBUG_VM_RB
1562 void anon_vma_interval_tree_verify(struct anon_vma_chain *node); 1566 void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
1563 #endif 1567 #endif
1564 1568
1565 #define anon_vma_interval_tree_foreach(avc, root, start, last) \ 1569 #define anon_vma_interval_tree_foreach(avc, root, start, last) \
1566 for (avc = anon_vma_interval_tree_iter_first(root, start, last); \ 1570 for (avc = anon_vma_interval_tree_iter_first(root, start, last); \
1567 avc; avc = anon_vma_interval_tree_iter_next(avc, start, last)) 1571 avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))
1568 1572
1569 /* mmap.c */ 1573 /* mmap.c */
1570 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); 1574 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
1571 extern int vma_adjust(struct vm_area_struct *vma, unsigned long start, 1575 extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
1572 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert); 1576 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
1573 extern struct vm_area_struct *vma_merge(struct mm_struct *, 1577 extern struct vm_area_struct *vma_merge(struct mm_struct *,
1574 struct vm_area_struct *prev, unsigned long addr, unsigned long end, 1578 struct vm_area_struct *prev, unsigned long addr, unsigned long end,
1575 unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, 1579 unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
1576 struct mempolicy *); 1580 struct mempolicy *);
1577 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); 1581 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
1578 extern int split_vma(struct mm_struct *, 1582 extern int split_vma(struct mm_struct *,
1579 struct vm_area_struct *, unsigned long addr, int new_below); 1583 struct vm_area_struct *, unsigned long addr, int new_below);
1580 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); 1584 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
1581 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, 1585 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
1582 struct rb_node **, struct rb_node *); 1586 struct rb_node **, struct rb_node *);
1583 extern void unlink_file_vma(struct vm_area_struct *); 1587 extern void unlink_file_vma(struct vm_area_struct *);
1584 extern struct vm_area_struct *copy_vma(struct vm_area_struct **, 1588 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
1585 unsigned long addr, unsigned long len, pgoff_t pgoff, 1589 unsigned long addr, unsigned long len, pgoff_t pgoff,
1586 bool *need_rmap_locks); 1590 bool *need_rmap_locks);
1587 extern void exit_mmap(struct mm_struct *); 1591 extern void exit_mmap(struct mm_struct *);
1588 1592
1589 extern int mm_take_all_locks(struct mm_struct *mm); 1593 extern int mm_take_all_locks(struct mm_struct *mm);
1590 extern void mm_drop_all_locks(struct mm_struct *mm); 1594 extern void mm_drop_all_locks(struct mm_struct *mm);
1591 1595
1592 extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); 1596 extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
1593 extern struct file *get_mm_exe_file(struct mm_struct *mm); 1597 extern struct file *get_mm_exe_file(struct mm_struct *mm);
1594 1598
1595 extern int may_expand_vm(struct mm_struct *mm, unsigned long npages); 1599 extern int may_expand_vm(struct mm_struct *mm, unsigned long npages);
1596 extern int install_special_mapping(struct mm_struct *mm, 1600 extern int install_special_mapping(struct mm_struct *mm,
1597 unsigned long addr, unsigned long len, 1601 unsigned long addr, unsigned long len,
1598 unsigned long flags, struct page **pages); 1602 unsigned long flags, struct page **pages);
1599 1603
1600 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); 1604 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
1601 1605
1602 extern unsigned long mmap_region(struct file *file, unsigned long addr, 1606 extern unsigned long mmap_region(struct file *file, unsigned long addr,
1603 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff); 1607 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff);
1604 extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 1608 extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1605 unsigned long len, unsigned long prot, unsigned long flags, 1609 unsigned long len, unsigned long prot, unsigned long flags,
1606 unsigned long pgoff, unsigned long *populate); 1610 unsigned long pgoff, unsigned long *populate);
1607 extern int do_munmap(struct mm_struct *, unsigned long, size_t); 1611 extern int do_munmap(struct mm_struct *, unsigned long, size_t);
1608 1612
1609 #ifdef CONFIG_MMU 1613 #ifdef CONFIG_MMU
1610 extern int __mm_populate(unsigned long addr, unsigned long len, 1614 extern int __mm_populate(unsigned long addr, unsigned long len,
1611 int ignore_errors); 1615 int ignore_errors);
1612 static inline void mm_populate(unsigned long addr, unsigned long len) 1616 static inline void mm_populate(unsigned long addr, unsigned long len)
1613 { 1617 {
1614 /* Ignore errors */ 1618 /* Ignore errors */
1615 (void) __mm_populate(addr, len, 1); 1619 (void) __mm_populate(addr, len, 1);
1616 } 1620 }
1617 #else 1621 #else
1618 static inline void mm_populate(unsigned long addr, unsigned long len) {} 1622 static inline void mm_populate(unsigned long addr, unsigned long len) {}
1619 #endif 1623 #endif
1620 1624
1621 /* These take the mm semaphore themselves */ 1625 /* These take the mm semaphore themselves */
1622 extern unsigned long vm_brk(unsigned long, unsigned long); 1626 extern unsigned long vm_brk(unsigned long, unsigned long);
1623 extern int vm_munmap(unsigned long, size_t); 1627 extern int vm_munmap(unsigned long, size_t);
1624 extern unsigned long vm_mmap(struct file *, unsigned long, 1628 extern unsigned long vm_mmap(struct file *, unsigned long,
1625 unsigned long, unsigned long, 1629 unsigned long, unsigned long,
1626 unsigned long, unsigned long); 1630 unsigned long, unsigned long);
1627 1631
1628 struct vm_unmapped_area_info { 1632 struct vm_unmapped_area_info {
1629 #define VM_UNMAPPED_AREA_TOPDOWN 1 1633 #define VM_UNMAPPED_AREA_TOPDOWN 1
1630 unsigned long flags; 1634 unsigned long flags;
1631 unsigned long length; 1635 unsigned long length;
1632 unsigned long low_limit; 1636 unsigned long low_limit;
1633 unsigned long high_limit; 1637 unsigned long high_limit;
1634 unsigned long align_mask; 1638 unsigned long align_mask;
1635 unsigned long align_offset; 1639 unsigned long align_offset;
1636 }; 1640 };
1637 1641
1638 extern unsigned long unmapped_area(struct vm_unmapped_area_info *info); 1642 extern unsigned long unmapped_area(struct vm_unmapped_area_info *info);
1639 extern unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); 1643 extern unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info);
1640 1644
1641 /* 1645 /*
1642 * Search for an unmapped address range. 1646 * Search for an unmapped address range.
1643 * 1647 *
1644 * We are looking for a range that: 1648 * We are looking for a range that:
1645 * - does not intersect with any VMA; 1649 * - does not intersect with any VMA;
1646 * - is contained within the [low_limit, high_limit) interval; 1650 * - is contained within the [low_limit, high_limit) interval;
1647 * - is at least the desired size. 1651 * - is at least the desired size.
1648 * - satisfies (begin_addr & align_mask) == (align_offset & align_mask) 1652 * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
1649 */ 1653 */
1650 static inline unsigned long 1654 static inline unsigned long
1651 vm_unmapped_area(struct vm_unmapped_area_info *info) 1655 vm_unmapped_area(struct vm_unmapped_area_info *info)
1652 { 1656 {
1653 if (!(info->flags & VM_UNMAPPED_AREA_TOPDOWN)) 1657 if (!(info->flags & VM_UNMAPPED_AREA_TOPDOWN))
1654 return unmapped_area(info); 1658 return unmapped_area(info);
1655 else 1659 else
1656 return unmapped_area_topdown(info); 1660 return unmapped_area_topdown(info);
1657 } 1661 }
1658 1662
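For context, this is roughly how an architecture's get_unmapped_area hook drives the helper above; the sketch is simplified (no address hint, no MAP_FIXED handling) and the function name is hypothetical:

unsigned long
example_arch_get_unmapped_area(struct file *filp, unsigned long addr,
			       unsigned long len, unsigned long pgoff,
			       unsigned long flags)
{
	struct vm_unmapped_area_info info;

	info.flags = 0;			/* 0 selects the bottom-up search */
	info.length = len;
	info.low_limit = current->mm->mmap_base;
	info.high_limit = TASK_SIZE;
	info.align_mask = 0;		/* no special alignment needed */
	info.align_offset = 0;
	return vm_unmapped_area(&info);
}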
1659 /* truncate.c */ 1663 /* truncate.c */
1660 extern void truncate_inode_pages(struct address_space *, loff_t); 1664 extern void truncate_inode_pages(struct address_space *, loff_t);
1661 extern void truncate_inode_pages_range(struct address_space *, 1665 extern void truncate_inode_pages_range(struct address_space *,
1662 loff_t lstart, loff_t lend); 1666 loff_t lstart, loff_t lend);
1663 1667
1664 /* generic vm_area_ops exported for stackable file systems */ 1668 /* generic vm_area_ops exported for stackable file systems */
1665 extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); 1669 extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
1666 extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1670 extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1667 1671
1668 /* mm/page-writeback.c */ 1672 /* mm/page-writeback.c */
1669 int write_one_page(struct page *page, int wait); 1673 int write_one_page(struct page *page, int wait);
1670 void task_dirty_inc(struct task_struct *tsk); 1674 void task_dirty_inc(struct task_struct *tsk);
1671 1675
1672 /* readahead.c */ 1676 /* readahead.c */
1673 #define VM_MAX_READAHEAD 128 /* kbytes */ 1677 #define VM_MAX_READAHEAD 128 /* kbytes */
1674 #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ 1678 #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */
1675 1679
1676 int force_page_cache_readahead(struct address_space *mapping, struct file *filp, 1680 int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
1677 pgoff_t offset, unsigned long nr_to_read); 1681 pgoff_t offset, unsigned long nr_to_read);
1678 1682
1679 void page_cache_sync_readahead(struct address_space *mapping, 1683 void page_cache_sync_readahead(struct address_space *mapping,
1680 struct file_ra_state *ra, 1684 struct file_ra_state *ra,
1681 struct file *filp, 1685 struct file *filp,
1682 pgoff_t offset, 1686 pgoff_t offset,
1683 unsigned long size); 1687 unsigned long size);
1684 1688
1685 void page_cache_async_readahead(struct address_space *mapping, 1689 void page_cache_async_readahead(struct address_space *mapping,
1686 struct file_ra_state *ra, 1690 struct file_ra_state *ra,
1687 struct file *filp, 1691 struct file *filp,
1688 struct page *pg, 1692 struct page *pg,
1689 pgoff_t offset, 1693 pgoff_t offset,
1690 unsigned long size); 1694 unsigned long size);
1691 1695
1692 unsigned long max_sane_readahead(unsigned long nr); 1696 unsigned long max_sane_readahead(unsigned long nr);
1693 unsigned long ra_submit(struct file_ra_state *ra, 1697 unsigned long ra_submit(struct file_ra_state *ra,
1694 struct address_space *mapping, 1698 struct address_space *mapping,
1695 struct file *filp); 1699 struct file *filp);
1696 1700
1697 /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ 1701 /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
1698 extern int expand_stack(struct vm_area_struct *vma, unsigned long address); 1702 extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
1699 1703
1700 /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ 1704 /* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
1701 extern int expand_downwards(struct vm_area_struct *vma, 1705 extern int expand_downwards(struct vm_area_struct *vma,
1702 unsigned long address); 1706 unsigned long address);
1703 #if VM_GROWSUP 1707 #if VM_GROWSUP
1704 extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); 1708 extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
1705 #else 1709 #else
1706 #define expand_upwards(vma, address) do { } while (0) 1710 #define expand_upwards(vma, address) do { } while (0)
1707 #endif 1711 #endif
1708 1712
1709 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ 1713 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
1710 extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); 1714 extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
1711 extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, 1715 extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
1712 struct vm_area_struct **pprev); 1716 struct vm_area_struct **pprev);
1713 1717
1714 /* Look up the first VMA which intersects the interval start_addr..end_addr-1, 1718 /* Look up the first VMA which intersects the interval start_addr..end_addr-1,
1715 NULL if none. Assume start_addr < end_addr. */ 1719 NULL if none. Assume start_addr < end_addr. */
1716 static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) 1720 static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
1717 { 1721 {
1718 struct vm_area_struct * vma = find_vma(mm,start_addr); 1722 struct vm_area_struct * vma = find_vma(mm,start_addr);
1719 1723
1720 if (vma && end_addr <= vma->vm_start) 1724 if (vma && end_addr <= vma->vm_start)
1721 vma = NULL; 1725 vma = NULL;
1722 return vma; 1726 return vma;
1723 } 1727 }
1724 1728
1725 static inline unsigned long vma_pages(struct vm_area_struct *vma) 1729 static inline unsigned long vma_pages(struct vm_area_struct *vma)
1726 { 1730 {
1727 return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 1731 return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
1728 } 1732 }
1729 1733
1730 /* Look up the first VMA which exactly matches the interval vm_start ... vm_end */ 1734 /* Look up the first VMA which exactly matches the interval vm_start ... vm_end */
1731 static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, 1735 static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
1732 unsigned long vm_start, unsigned long vm_end) 1736 unsigned long vm_start, unsigned long vm_end)
1733 { 1737 {
1734 struct vm_area_struct *vma = find_vma(mm, vm_start); 1738 struct vm_area_struct *vma = find_vma(mm, vm_start);
1735 1739
1736 if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) 1740 if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
1737 vma = NULL; 1741 vma = NULL;
1738 1742
1739 return vma; 1743 return vma;
1740 } 1744 }
1741 1745
1742 #ifdef CONFIG_MMU 1746 #ifdef CONFIG_MMU
1743 pgprot_t vm_get_page_prot(unsigned long vm_flags); 1747 pgprot_t vm_get_page_prot(unsigned long vm_flags);
1744 #else 1748 #else
1745 static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) 1749 static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
1746 { 1750 {
1747 return __pgprot(0); 1751 return __pgprot(0);
1748 } 1752 }
1749 #endif 1753 #endif
1750 1754
1751 #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE 1755 #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
1752 unsigned long change_prot_numa(struct vm_area_struct *vma, 1756 unsigned long change_prot_numa(struct vm_area_struct *vma,
1753 unsigned long start, unsigned long end); 1757 unsigned long start, unsigned long end);
1754 #endif 1758 #endif
1755 1759
1756 struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); 1760 struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
1757 int remap_pfn_range(struct vm_area_struct *, unsigned long addr, 1761 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
1758 unsigned long pfn, unsigned long size, pgprot_t); 1762 unsigned long pfn, unsigned long size, pgprot_t);
1759 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); 1763 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
1760 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1764 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1761 unsigned long pfn); 1765 unsigned long pfn);
1762 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 1766 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1763 unsigned long pfn); 1767 unsigned long pfn);
1764 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); 1768 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
1765 1769
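The remap/insert helpers above are most often used from driver mmap handlers. A minimal sketch, assuming a physically contiguous buffer whose base address the (hypothetical) driver keeps in buf_phys:

static phys_addr_t buf_phys;	/* set elsewhere by the hypothetical driver */

static int example_drv_mmap(struct file *filp, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	/* map the whole buffer into the caller's requested range */
	return remap_pfn_range(vma, vma->vm_start,
			       buf_phys >> PAGE_SHIFT,
			       size, vma->vm_page_prot);
}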
1766 1770
1767 struct page *follow_page_mask(struct vm_area_struct *vma, 1771 struct page *follow_page_mask(struct vm_area_struct *vma,
1768 unsigned long address, unsigned int foll_flags, 1772 unsigned long address, unsigned int foll_flags,
1769 unsigned int *page_mask); 1773 unsigned int *page_mask);
1770 1774
1771 static inline struct page *follow_page(struct vm_area_struct *vma, 1775 static inline struct page *follow_page(struct vm_area_struct *vma,
1772 unsigned long address, unsigned int foll_flags) 1776 unsigned long address, unsigned int foll_flags)
1773 { 1777 {
1774 unsigned int unused_page_mask; 1778 unsigned int unused_page_mask;
1775 return follow_page_mask(vma, address, foll_flags, &unused_page_mask); 1779 return follow_page_mask(vma, address, foll_flags, &unused_page_mask);
1776 } 1780 }
1777 1781
1778 #define FOLL_WRITE 0x01 /* check pte is writable */ 1782 #define FOLL_WRITE 0x01 /* check pte is writable */
1779 #define FOLL_TOUCH 0x02 /* mark page accessed */ 1783 #define FOLL_TOUCH 0x02 /* mark page accessed */
1780 #define FOLL_GET 0x04 /* do get_page on page */ 1784 #define FOLL_GET 0x04 /* do get_page on page */
1781 #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ 1785 #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */
1782 #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ 1786 #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
1783 #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO 1787 #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO
1784 * and return without waiting upon it */ 1788 * and return without waiting upon it */
1785 #define FOLL_MLOCK 0x40 /* mark page as mlocked */ 1789 #define FOLL_MLOCK 0x40 /* mark page as mlocked */
1786 #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ 1790 #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */
1787 #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ 1791 #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
1788 #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ 1792 #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
1789 #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ 1793 #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
1790 1794
1791 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, 1795 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
1792 void *data); 1796 void *data);
1793 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, 1797 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
1794 unsigned long size, pte_fn_t fn, void *data); 1798 unsigned long size, pte_fn_t fn, void *data);
1795 1799
1796 #ifdef CONFIG_PROC_FS 1800 #ifdef CONFIG_PROC_FS
1797 void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); 1801 void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
1798 #else 1802 #else
1799 static inline void vm_stat_account(struct mm_struct *mm, 1803 static inline void vm_stat_account(struct mm_struct *mm,
1800 unsigned long flags, struct file *file, long pages) 1804 unsigned long flags, struct file *file, long pages)
1801 { 1805 {
1802 mm->total_vm += pages; 1806 mm->total_vm += pages;
1803 } 1807 }
1804 #endif /* CONFIG_PROC_FS */ 1808 #endif /* CONFIG_PROC_FS */
1805 1809
1806 #ifdef CONFIG_DEBUG_PAGEALLOC 1810 #ifdef CONFIG_DEBUG_PAGEALLOC
1807 extern void kernel_map_pages(struct page *page, int numpages, int enable); 1811 extern void kernel_map_pages(struct page *page, int numpages, int enable);
1808 #ifdef CONFIG_HIBERNATION 1812 #ifdef CONFIG_HIBERNATION
1809 extern bool kernel_page_present(struct page *page); 1813 extern bool kernel_page_present(struct page *page);
1810 #endif /* CONFIG_HIBERNATION */ 1814 #endif /* CONFIG_HIBERNATION */
1811 #else 1815 #else
1812 static inline void 1816 static inline void
1813 kernel_map_pages(struct page *page, int numpages, int enable) {} 1817 kernel_map_pages(struct page *page, int numpages, int enable) {}
1814 #ifdef CONFIG_HIBERNATION 1818 #ifdef CONFIG_HIBERNATION
1815 static inline bool kernel_page_present(struct page *page) { return true; } 1819 static inline bool kernel_page_present(struct page *page) { return true; }
1816 #endif /* CONFIG_HIBERNATION */ 1820 #endif /* CONFIG_HIBERNATION */
1817 #endif 1821 #endif
1818 1822
1819 extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); 1823 extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
1820 #ifdef __HAVE_ARCH_GATE_AREA 1824 #ifdef __HAVE_ARCH_GATE_AREA
1821 int in_gate_area_no_mm(unsigned long addr); 1825 int in_gate_area_no_mm(unsigned long addr);
1822 int in_gate_area(struct mm_struct *mm, unsigned long addr); 1826 int in_gate_area(struct mm_struct *mm, unsigned long addr);
1823 #else 1827 #else
1824 int in_gate_area_no_mm(unsigned long addr); 1828 int in_gate_area_no_mm(unsigned long addr);
1825 #define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_mm(addr);}) 1829 #define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_mm(addr);})
1826 #endif /* __HAVE_ARCH_GATE_AREA */ 1830 #endif /* __HAVE_ARCH_GATE_AREA */
1827 1831
1828 #ifdef CONFIG_SYSCTL 1832 #ifdef CONFIG_SYSCTL
1829 extern int sysctl_drop_caches; 1833 extern int sysctl_drop_caches;
1830 int drop_caches_sysctl_handler(struct ctl_table *, int, 1834 int drop_caches_sysctl_handler(struct ctl_table *, int,
1831 void __user *, size_t *, loff_t *); 1835 void __user *, size_t *, loff_t *);
1832 #endif 1836 #endif
1833 1837
1834 unsigned long shrink_slab(struct shrink_control *shrink, 1838 unsigned long shrink_slab(struct shrink_control *shrink,
1835 unsigned long nr_pages_scanned, 1839 unsigned long nr_pages_scanned,
1836 unsigned long lru_pages); 1840 unsigned long lru_pages);
1837 1841
1838 #ifndef CONFIG_MMU 1842 #ifndef CONFIG_MMU
1839 #define randomize_va_space 0 1843 #define randomize_va_space 0
1840 #else 1844 #else
1841 extern int randomize_va_space; 1845 extern int randomize_va_space;
1842 #endif 1846 #endif
1843 1847
1844 const char * arch_vma_name(struct vm_area_struct *vma); 1848 const char * arch_vma_name(struct vm_area_struct *vma);
1845 void print_vma_addr(char *prefix, unsigned long rip); 1849 void print_vma_addr(char *prefix, unsigned long rip);
1846 1850
1847 void sparse_mem_maps_populate_node(struct page **map_map, 1851 void sparse_mem_maps_populate_node(struct page **map_map,
1848 unsigned long pnum_begin, 1852 unsigned long pnum_begin,
1849 unsigned long pnum_end, 1853 unsigned long pnum_end,
1850 unsigned long map_count, 1854 unsigned long map_count,
1851 int nodeid); 1855 int nodeid);
1852 1856
1853 struct page *sparse_mem_map_populate(unsigned long pnum, int nid); 1857 struct page *sparse_mem_map_populate(unsigned long pnum, int nid);
1854 pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); 1858 pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
1855 pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node); 1859 pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node);
1856 pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); 1860 pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
1857 pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); 1861 pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node);
1858 void *vmemmap_alloc_block(unsigned long size, int node); 1862 void *vmemmap_alloc_block(unsigned long size, int node);
1859 void *vmemmap_alloc_block_buf(unsigned long size, int node); 1863 void *vmemmap_alloc_block_buf(unsigned long size, int node);
1860 void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); 1864 void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
1861 int vmemmap_populate_basepages(unsigned long start, unsigned long end, 1865 int vmemmap_populate_basepages(unsigned long start, unsigned long end,
1862 int node); 1866 int node);
1863 int vmemmap_populate(unsigned long start, unsigned long end, int node); 1867 int vmemmap_populate(unsigned long start, unsigned long end, int node);
1864 void vmemmap_populate_print_last(void); 1868 void vmemmap_populate_print_last(void);
1865 #ifdef CONFIG_MEMORY_HOTPLUG 1869 #ifdef CONFIG_MEMORY_HOTPLUG
1866 void vmemmap_free(unsigned long start, unsigned long end); 1870 void vmemmap_free(unsigned long start, unsigned long end);
1867 #endif 1871 #endif
1868 void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, 1872 void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
1869 unsigned long size); 1873 unsigned long size);
1870 1874
1871 enum mf_flags { 1875 enum mf_flags {
1872 MF_COUNT_INCREASED = 1 << 0, 1876 MF_COUNT_INCREASED = 1 << 0,
1873 MF_ACTION_REQUIRED = 1 << 1, 1877 MF_ACTION_REQUIRED = 1 << 1,
1874 MF_MUST_KILL = 1 << 2, 1878 MF_MUST_KILL = 1 << 2,
1875 MF_SOFT_OFFLINE = 1 << 3, 1879 MF_SOFT_OFFLINE = 1 << 3,
1876 }; 1880 };
1877 extern int memory_failure(unsigned long pfn, int trapno, int flags); 1881 extern int memory_failure(unsigned long pfn, int trapno, int flags);
1878 extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); 1882 extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
1879 extern int unpoison_memory(unsigned long pfn); 1883 extern int unpoison_memory(unsigned long pfn);
1880 extern int sysctl_memory_failure_early_kill; 1884 extern int sysctl_memory_failure_early_kill;
1881 extern int sysctl_memory_failure_recovery; 1885 extern int sysctl_memory_failure_recovery;
1882 extern void shake_page(struct page *p, int access); 1886 extern void shake_page(struct page *p, int access);
1883 extern atomic_long_t num_poisoned_pages; 1887 extern atomic_long_t num_poisoned_pages;
1884 extern int soft_offline_page(struct page *page, int flags); 1888 extern int soft_offline_page(struct page *page, int flags);
1885 1889
1886 extern void dump_page(struct page *page); 1890 extern void dump_page(struct page *page);
1887 1891
1888 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) 1892 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
1889 extern void clear_huge_page(struct page *page, 1893 extern void clear_huge_page(struct page *page,
1890 unsigned long addr, 1894 unsigned long addr,
1891 unsigned int pages_per_huge_page); 1895 unsigned int pages_per_huge_page);
1892 extern void copy_user_huge_page(struct page *dst, struct page *src, 1896 extern void copy_user_huge_page(struct page *dst, struct page *src,
1893 unsigned long addr, struct vm_area_struct *vma, 1897 unsigned long addr, struct vm_area_struct *vma,
1894 unsigned int pages_per_huge_page); 1898 unsigned int pages_per_huge_page);
1895 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ 1899 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
1896 1900
1897 #ifdef CONFIG_DEBUG_PAGEALLOC 1901 #ifdef CONFIG_DEBUG_PAGEALLOC
1898 extern unsigned int _debug_guardpage_minorder; 1902 extern unsigned int _debug_guardpage_minorder;
1899 1903
1900 static inline unsigned int debug_guardpage_minorder(void) 1904 static inline unsigned int debug_guardpage_minorder(void)
1901 { 1905 {
1902 return _debug_guardpage_minorder; 1906 return _debug_guardpage_minorder;
1903 } 1907 }
1904 1908
1905 static inline bool page_is_guard(struct page *page) 1909 static inline bool page_is_guard(struct page *page)
1906 { 1910 {
1907 return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 1911 return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
1908 } 1912 }
1909 #else 1913 #else
1910 static inline unsigned int debug_guardpage_minorder(void) { return 0; } 1914 static inline unsigned int debug_guardpage_minorder(void) { return 0; }
1911 static inline bool page_is_guard(struct page *page) { return false; } 1915 static inline bool page_is_guard(struct page *page) { return false; }
1912 #endif /* CONFIG_DEBUG_PAGEALLOC */ 1916 #endif /* CONFIG_DEBUG_PAGEALLOC */
1913 1917
1914 #if MAX_NUMNODES > 1 1918 #if MAX_NUMNODES > 1
1915 void __init setup_nr_node_ids(void); 1919 void __init setup_nr_node_ids(void);
1916 #else 1920 #else
1917 static inline void setup_nr_node_ids(void) {} 1921 static inline void setup_nr_node_ids(void) {}
1918 #endif 1922 #endif
1919 1923
1920 #endif /* __KERNEL__ */ 1924 #endif /* __KERNEL__ */
1921 #endif /* _LINUX_MM_H */ 1925 #endif /* _LINUX_MM_H */
1922 1926
mm/memblock.c
1 /* 1 /*
2 * Procedures for maintaining information about logical memory blocks. 2 * Procedures for maintaining information about logical memory blocks.
3 * 3 *
4 * Peter Bergner, IBM Corp. June 2001. 4 * Peter Bergner, IBM Corp. June 2001.
5 * Copyright (C) 2001 Peter Bergner. 5 * Copyright (C) 2001 Peter Bergner.
6 * 6 *
7 * This program is free software; you can redistribute it and/or 7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License 8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version. 10 * 2 of the License, or (at your option) any later version.
11 */ 11 */
12 12
13 #include <linux/kernel.h> 13 #include <linux/kernel.h>
14 #include <linux/slab.h> 14 #include <linux/slab.h>
15 #include <linux/init.h> 15 #include <linux/init.h>
16 #include <linux/bitops.h> 16 #include <linux/bitops.h>
17 #include <linux/poison.h> 17 #include <linux/poison.h>
18 #include <linux/pfn.h> 18 #include <linux/pfn.h>
19 #include <linux/debugfs.h> 19 #include <linux/debugfs.h>
20 #include <linux/seq_file.h> 20 #include <linux/seq_file.h>
21 #include <linux/memblock.h> 21 #include <linux/memblock.h>
22 22
23 #include <asm-generic/sections.h>
24
23 static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; 25 static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
24 static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; 26 static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
25 27
26 struct memblock memblock __initdata_memblock = { 28 struct memblock memblock __initdata_memblock = {
27 .memory.regions = memblock_memory_init_regions, 29 .memory.regions = memblock_memory_init_regions,
28 .memory.cnt = 1, /* empty dummy entry */ 30 .memory.cnt = 1, /* empty dummy entry */
29 .memory.max = INIT_MEMBLOCK_REGIONS, 31 .memory.max = INIT_MEMBLOCK_REGIONS,
30 32
31 .reserved.regions = memblock_reserved_init_regions, 33 .reserved.regions = memblock_reserved_init_regions,
32 .reserved.cnt = 1, /* empty dummy entry */ 34 .reserved.cnt = 1, /* empty dummy entry */
33 .reserved.max = INIT_MEMBLOCK_REGIONS, 35 .reserved.max = INIT_MEMBLOCK_REGIONS,
34 36
37 .bottom_up = false,
35 .current_limit = MEMBLOCK_ALLOC_ANYWHERE, 38 .current_limit = MEMBLOCK_ALLOC_ANYWHERE,
36 }; 39 };
37 40
38 int memblock_debug __initdata_memblock; 41 int memblock_debug __initdata_memblock;
39 static int memblock_can_resize __initdata_memblock; 42 static int memblock_can_resize __initdata_memblock;
40 static int memblock_memory_in_slab __initdata_memblock = 0; 43 static int memblock_memory_in_slab __initdata_memblock = 0;
41 static int memblock_reserved_in_slab __initdata_memblock = 0; 44 static int memblock_reserved_in_slab __initdata_memblock = 0;
42 45
43 /* inline so we don't get a warning when pr_debug is compiled out */ 46 /* inline so we don't get a warning when pr_debug is compiled out */
44 static __init_memblock const char * 47 static __init_memblock const char *
45 memblock_type_name(struct memblock_type *type) 48 memblock_type_name(struct memblock_type *type)
46 { 49 {
47 if (type == &memblock.memory) 50 if (type == &memblock.memory)
48 return "memory"; 51 return "memory";
49 else if (type == &memblock.reserved) 52 else if (type == &memblock.reserved)
50 return "reserved"; 53 return "reserved";
51 else 54 else
52 return "unknown"; 55 return "unknown";
53 } 56 }
54 57
55 /* adjust *@size so that (@base + *@size) doesn't overflow, return new size */ 58 /* adjust *@size so that (@base + *@size) doesn't overflow, return new size */
56 static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) 59 static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
57 { 60 {
58 return *size = min(*size, (phys_addr_t)ULLONG_MAX - base); 61 return *size = min(*size, (phys_addr_t)ULLONG_MAX - base);
59 } 62 }
60 63
61 /* 64 /*
62 * Address comparison utilities 65 * Address comparison utilities
63 */ 66 */
64 static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, 67 static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
65 phys_addr_t base2, phys_addr_t size2) 68 phys_addr_t base2, phys_addr_t size2)
66 { 69 {
67 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); 70 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
68 } 71 }
69 72
70 static long __init_memblock memblock_overlaps_region(struct memblock_type *type, 73 static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
71 phys_addr_t base, phys_addr_t size) 74 phys_addr_t base, phys_addr_t size)
72 { 75 {
73 unsigned long i; 76 unsigned long i;
74 77
75 for (i = 0; i < type->cnt; i++) { 78 for (i = 0; i < type->cnt; i++) {
76 phys_addr_t rgnbase = type->regions[i].base; 79 phys_addr_t rgnbase = type->regions[i].base;
77 phys_addr_t rgnsize = type->regions[i].size; 80 phys_addr_t rgnsize = type->regions[i].size;
78 if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) 81 if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
79 break; 82 break;
80 } 83 }
81 84
82 return (i < type->cnt) ? i : -1; 85 return (i < type->cnt) ? i : -1;
83 } 86 }
84 87
88 /*
89 * __memblock_find_range_bottom_up - find free area utility in bottom-up
90 * @start: start of candidate range
91 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
92 * @size: size of free area to find
93 * @align: alignment of free area to find
94 * @nid: nid of the free area to find, %MAX_NUMNODES for any node
95 *
96 * Utility called from memblock_find_in_range_node(), find free area bottom-up.
97 *
98 * RETURNS:
99 * Found address on success, 0 on failure.
100 */
101 static phys_addr_t __init_memblock
102 __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
103 phys_addr_t size, phys_addr_t align, int nid)
104 {
105 phys_addr_t this_start, this_end, cand;
106 u64 i;
107
108 for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
109 this_start = clamp(this_start, start, end);
110 this_end = clamp(this_end, start, end);
111
112 cand = round_up(this_start, align);
113 if (cand < this_end && this_end - cand >= size)
114 return cand;
115 }
116
117 return 0;
118 }
119
85 /** 120 /**
86 * __memblock_find_range_top_down - find free area utility, in top-down 121 * __memblock_find_range_top_down - find free area utility, in top-down
87 * @start: start of candidate range 122 * @start: start of candidate range
88 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 123 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
89 * @size: size of free area to find 124 * @size: size of free area to find
90 * @align: alignment of free area to find 125 * @align: alignment of free area to find
91 * @nid: nid of the free area to find, %MAX_NUMNODES for any node 126 * @nid: nid of the free area to find, %MAX_NUMNODES for any node
92 * 127 *
93 * Utility called from memblock_find_in_range_node(), find free area top-down. 128 * Utility called from memblock_find_in_range_node(), find free area top-down.
94 * 129 *
95 * RETURNS: 130 * RETURNS:
96 * Found address on success, %0 on failure. 131 * Found address on success, 0 on failure.
97 */ 132 */
98 static phys_addr_t __init_memblock 133 static phys_addr_t __init_memblock
99 __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, 134 __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
100 phys_addr_t size, phys_addr_t align, int nid) 135 phys_addr_t size, phys_addr_t align, int nid)
101 { 136 {
102 phys_addr_t this_start, this_end, cand; 137 phys_addr_t this_start, this_end, cand;
103 u64 i; 138 u64 i;
104 139
105 for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { 140 for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
106 this_start = clamp(this_start, start, end); 141 this_start = clamp(this_start, start, end);
107 this_end = clamp(this_end, start, end); 142 this_end = clamp(this_end, start, end);
108 143
109 if (this_end < size) 144 if (this_end < size)
110 continue; 145 continue;
111 146
112 cand = round_down(this_end - size, align); 147 cand = round_down(this_end - size, align);
113 if (cand >= this_start) 148 if (cand >= this_start)
114 return cand; 149 return cand;
115 } 150 }
116 151
117 return 0; 152 return 0;
118 } 153 }
119 154
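A worked example may help contrast the two search helpers; the numbers are made up for illustration and do not appear in the patch:

/*
 * Suppose the only free range is [0x1000000, 0x4000000) (16MB..64MB) and the
 * caller asks for size = 0x100000 (1MB) aligned to 0x200000 (2MB):
 *
 *   __memblock_find_range_bottom_up() returns round_up(0x1000000, 0x200000)
 *   = 0x1000000, the lowest fit, while
 *   __memblock_find_range_top_down() returns round_down(0x4000000 - 0x100000,
 *   0x200000) = 0x3e00000, the highest fit.
 *
 * Handing out the lowest suitable address is what lets early allocations sit
 * just above the kernel image and, most likely, in the same node as the
 * kernel.
 */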
120 /** 155 /**
121 * memblock_find_in_range_node - find free area in given range and node 156 * memblock_find_in_range_node - find free area in given range and node
122 * @start: start of candidate range 157 * @start: start of candidate range
123 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 158 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
124 * @size: size of free area to find 159 * @size: size of free area to find
125 * @align: alignment of free area to find 160 * @align: alignment of free area to find
126 * @nid: nid of the free area to find, %MAX_NUMNODES for any node 161 * @nid: nid of the free area to find, %MAX_NUMNODES for any node
127 * 162 *
128 * Find @size free area aligned to @align in the specified range and node. 163 * Find @size free area aligned to @align in the specified range and node.
129 * 164 *
165 * When the allocation direction is bottom-up, @start should be greater
166 * than the end of the kernel image. Otherwise, it will be trimmed. The
167 * reason is that we want bottom-up allocations to land just above the
168 * kernel image, so the allocated memory is highly likely to reside in
169 * the same node as the kernel.
170 *
171 * If the bottom-up allocation fails, we fall back to top-down allocation.
172 *
130 * RETURNS: 173 * RETURNS:
131 * Found address on success, %0 on failure. 174 * Found address on success, 0 on failure.
132 */ 175 */
133 phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, 176 phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
134 phys_addr_t end, phys_addr_t size, 177 phys_addr_t end, phys_addr_t size,
135 phys_addr_t align, int nid) 178 phys_addr_t align, int nid)
136 { 179 {
180 int ret;
181 phys_addr_t kernel_end;
182
137 /* pump up @end */ 183 /* pump up @end */
138 if (end == MEMBLOCK_ALLOC_ACCESSIBLE) 184 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
139 end = memblock.current_limit; 185 end = memblock.current_limit;
140 186
141 /* avoid allocating the first page */ 187 /* avoid allocating the first page */
142 start = max_t(phys_addr_t, start, PAGE_SIZE); 188 start = max_t(phys_addr_t, start, PAGE_SIZE);
143 end = max(start, end); 189 end = max(start, end);
190 kernel_end = __pa_symbol(_end);
144 191
192 /*
193 * try bottom-up allocation only when bottom-up mode
194 * is set and @end is above the kernel image.
195 */
196 if (memblock_bottom_up() && end > kernel_end) {
197 phys_addr_t bottom_up_start;
198
199 /* make sure we will allocate above the kernel */
200 bottom_up_start = max(start, kernel_end);
201
202 /* ok, try bottom-up allocation first */
203 ret = __memblock_find_range_bottom_up(bottom_up_start, end,
204 size, align, nid);
205 if (ret)
206 return ret;
207
208 /*
209 * we always limit bottom-up allocation above the kernel,
210 * but top-down allocation doesn't have the limit, so
211 * retrying top-down allocation may succeed when bottom-up
212 * allocation fails.
213 *
214 * bottom-up allocation is expected to fail very rarely,
215 * so we use WARN_ONCE() here to get the stack trace when
216 * such a failure happens.
217 */
218 WARN_ONCE(1, "memblock: bottom-up allocation failed, "
219 "memory hotunplug may be affected\n");
220 }
221
145 return __memblock_find_range_top_down(start, end, size, align, nid); 222 return __memblock_find_range_top_down(start, end, size, align, nid);
146 } 223 }
147 224
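An illustrative sketch of how early boot code could drive the new mode (not part of the patch): example_early_setup() is a made-up caller, and memblock_set_bottom_up() is assumed to be the setter this patch adds to memblock.h next to the memblock_bottom_up() test used above.

    #include <linux/memblock.h>

    void __init example_early_setup(void)
    {
            phys_addr_t addr;

            /* before SRAT is parsed: keep allocations just above the kernel */
            memblock_set_bottom_up(true);

            /* tries bottom-up above _end first, then falls back to top-down */
            addr = memblock_find_in_range_node(0, MEMBLOCK_ALLOC_ACCESSIBLE,
                                               PAGE_SIZE, PAGE_SIZE, MAX_NUMNODES);
            if (addr)
                    memblock_reserve(addr, PAGE_SIZE);

            /* once the hotplug info is known, restore the default direction */
            memblock_set_bottom_up(false);
    }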
148 /** 225 /**
149 * memblock_find_in_range - find free area in given range 226 * memblock_find_in_range - find free area in given range
150 * @start: start of candidate range 227 * @start: start of candidate range
151 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} 228 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
152 * @size: size of free area to find 229 * @size: size of free area to find
153 * @align: alignment of free area to find 230 * @align: alignment of free area to find
154 * 231 *
155 * Find @size free area aligned to @align in the specified range. 232 * Find @size free area aligned to @align in the specified range.
156 * 233 *
157 * RETURNS: 234 * RETURNS:
158 * Found address on success, %0 on failure. 235 * Found address on success, 0 on failure.
159 */ 236 */
160 phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, 237 phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
161 phys_addr_t end, phys_addr_t size, 238 phys_addr_t end, phys_addr_t size,
162 phys_addr_t align) 239 phys_addr_t align)
163 { 240 {
164 return memblock_find_in_range_node(start, end, size, align, 241 return memblock_find_in_range_node(start, end, size, align,
165 MAX_NUMNODES); 242 MAX_NUMNODES);
166 } 243 }
167 244
168 static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) 245 static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
169 { 246 {
170 type->total_size -= type->regions[r].size; 247 type->total_size -= type->regions[r].size;
171 memmove(&type->regions[r], &type->regions[r + 1], 248 memmove(&type->regions[r], &type->regions[r + 1],
172 (type->cnt - (r + 1)) * sizeof(type->regions[r])); 249 (type->cnt - (r + 1)) * sizeof(type->regions[r]));
173 type->cnt--; 250 type->cnt--;
174 251
175 /* Special case for empty arrays */ 252 /* Special case for empty arrays */
176 if (type->cnt == 0) { 253 if (type->cnt == 0) {
177 WARN_ON(type->total_size != 0); 254 WARN_ON(type->total_size != 0);
178 type->cnt = 1; 255 type->cnt = 1;
179 type->regions[0].base = 0; 256 type->regions[0].base = 0;
180 type->regions[0].size = 0; 257 type->regions[0].size = 0;
181 memblock_set_region_node(&type->regions[0], MAX_NUMNODES); 258 memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
182 } 259 }
183 } 260 }
184 261
185 phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( 262 phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
186 phys_addr_t *addr) 263 phys_addr_t *addr)
187 { 264 {
188 if (memblock.reserved.regions == memblock_reserved_init_regions) 265 if (memblock.reserved.regions == memblock_reserved_init_regions)
189 return 0; 266 return 0;
190 267
191 *addr = __pa(memblock.reserved.regions); 268 *addr = __pa(memblock.reserved.regions);
192 269
193 return PAGE_ALIGN(sizeof(struct memblock_region) * 270 return PAGE_ALIGN(sizeof(struct memblock_region) *
194 memblock.reserved.max); 271 memblock.reserved.max);
195 } 272 }
196 273
197 /** 274 /**
198 * memblock_double_array - double the size of the memblock regions array 275 * memblock_double_array - double the size of the memblock regions array
199 * @type: memblock type of the regions array being doubled 276 * @type: memblock type of the regions array being doubled
200 * @new_area_start: starting address of memory range to avoid overlap with 277 * @new_area_start: starting address of memory range to avoid overlap with
201 * @new_area_size: size of memory range to avoid overlap with 278 * @new_area_size: size of memory range to avoid overlap with
202 * 279 *
203 * Double the size of the @type regions array. If memblock is being used to 280 * Double the size of the @type regions array. If memblock is being used to
204 * allocate memory for a new reserved regions array and there is a previously 281 * allocate memory for a new reserved regions array and there is a previously
205 * allocated memory range [@new_area_start,@new_area_start+@new_area_size] 282 * allocated memory range [@new_area_start,@new_area_start+@new_area_size]
206 * waiting to be reserved, ensure the memory used by the new array does 283 * waiting to be reserved, ensure the memory used by the new array does
207 * not overlap. 284 * not overlap.
208 * 285 *
209 * RETURNS: 286 * RETURNS:
210 * 0 on success, -1 on failure. 287 * 0 on success, -1 on failure.
211 */ 288 */
212 static int __init_memblock memblock_double_array(struct memblock_type *type, 289 static int __init_memblock memblock_double_array(struct memblock_type *type,
213 phys_addr_t new_area_start, 290 phys_addr_t new_area_start,
214 phys_addr_t new_area_size) 291 phys_addr_t new_area_size)
215 { 292 {
216 struct memblock_region *new_array, *old_array; 293 struct memblock_region *new_array, *old_array;
217 phys_addr_t old_alloc_size, new_alloc_size; 294 phys_addr_t old_alloc_size, new_alloc_size;
218 phys_addr_t old_size, new_size, addr; 295 phys_addr_t old_size, new_size, addr;
219 int use_slab = slab_is_available(); 296 int use_slab = slab_is_available();
220 int *in_slab; 297 int *in_slab;
221 298
222 /* We don't allow resizing until we know about the reserved regions 299 /* We don't allow resizing until we know about the reserved regions
223 * of memory that aren't suitable for allocation 300 * of memory that aren't suitable for allocation
224 */ 301 */
225 if (!memblock_can_resize) 302 if (!memblock_can_resize)
226 return -1; 303 return -1;
227 304
228 /* Calculate new doubled size */ 305 /* Calculate new doubled size */
229 old_size = type->max * sizeof(struct memblock_region); 306 old_size = type->max * sizeof(struct memblock_region);
230 new_size = old_size << 1; 307 new_size = old_size << 1;
231 /* 308 /*
232 * We need to allocate the new array aligned to PAGE_SIZE, 309 * We need to allocate the new array aligned to PAGE_SIZE,
233 * so we can free it completely later. 310 * so we can free it completely later.
234 */ 311 */
235 old_alloc_size = PAGE_ALIGN(old_size); 312 old_alloc_size = PAGE_ALIGN(old_size);
236 new_alloc_size = PAGE_ALIGN(new_size); 313 new_alloc_size = PAGE_ALIGN(new_size);
237 314
238 /* Retrieve the slab flag */ 315 /* Retrieve the slab flag */
239 if (type == &memblock.memory) 316 if (type == &memblock.memory)
240 in_slab = &memblock_memory_in_slab; 317 in_slab = &memblock_memory_in_slab;
241 else 318 else
242 in_slab = &memblock_reserved_in_slab; 319 in_slab = &memblock_reserved_in_slab;
243 320
244 /* Try to find some space for it. 321 /* Try to find some space for it.
245 * 322 *
246 * WARNING: We assume that either slab_is_available() and we use it or 323 * WARNING: We assume that either slab_is_available() and we use it or
247 * we use MEMBLOCK for allocations. That means that this is unsafe to 324 * we use MEMBLOCK for allocations. That means that this is unsafe to
248 * use when bootmem is currently active (unless bootmem itself is 325 * use when bootmem is currently active (unless bootmem itself is
249 * implemented on top of MEMBLOCK which isn't the case yet) 326 * implemented on top of MEMBLOCK which isn't the case yet)
250 * 327 *
251 * This should however not be an issue for now, as we currently only 328 * This should however not be an issue for now, as we currently only
252 * call into MEMBLOCK while it's still active, or much later when slab 329 * call into MEMBLOCK while it's still active, or much later when slab
253 * is active for memory hotplug operations 330 * is active for memory hotplug operations
254 */ 331 */
255 if (use_slab) { 332 if (use_slab) {
256 new_array = kmalloc(new_size, GFP_KERNEL); 333 new_array = kmalloc(new_size, GFP_KERNEL);
257 addr = new_array ? __pa(new_array) : 0; 334 addr = new_array ? __pa(new_array) : 0;
258 } else { 335 } else {
259 /* only exclude range when trying to double reserved.regions */ 336 /* only exclude range when trying to double reserved.regions */
260 if (type != &memblock.reserved) 337 if (type != &memblock.reserved)
261 new_area_start = new_area_size = 0; 338 new_area_start = new_area_size = 0;
262 339
263 addr = memblock_find_in_range(new_area_start + new_area_size, 340 addr = memblock_find_in_range(new_area_start + new_area_size,
264 memblock.current_limit, 341 memblock.current_limit,
265 new_alloc_size, PAGE_SIZE); 342 new_alloc_size, PAGE_SIZE);
266 if (!addr && new_area_size) 343 if (!addr && new_area_size)
267 addr = memblock_find_in_range(0, 344 addr = memblock_find_in_range(0,
268 min(new_area_start, memblock.current_limit), 345 min(new_area_start, memblock.current_limit),
269 new_alloc_size, PAGE_SIZE); 346 new_alloc_size, PAGE_SIZE);
270 347
271 new_array = addr ? __va(addr) : NULL; 348 new_array = addr ? __va(addr) : NULL;
272 } 349 }
273 if (!addr) { 350 if (!addr) {
274 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", 351 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
275 memblock_type_name(type), type->max, type->max * 2); 352 memblock_type_name(type), type->max, type->max * 2);
276 return -1; 353 return -1;
277 } 354 }
278 355
279 memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]", 356 memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
280 memblock_type_name(type), type->max * 2, (u64)addr, 357 memblock_type_name(type), type->max * 2, (u64)addr,
281 (u64)addr + new_size - 1); 358 (u64)addr + new_size - 1);
282 359
283 /* 360 /*
284 * Found space, we now need to move the array over before we add the 361 * Found space, we now need to move the array over before we add the
285 * reserved region since it may be our reserved array itself that is 362 * reserved region since it may be our reserved array itself that is
286 * full. 363 * full.
287 */ 364 */
288 memcpy(new_array, type->regions, old_size); 365 memcpy(new_array, type->regions, old_size);
289 memset(new_array + type->max, 0, old_size); 366 memset(new_array + type->max, 0, old_size);
290 old_array = type->regions; 367 old_array = type->regions;
291 type->regions = new_array; 368 type->regions = new_array;
292 type->max <<= 1; 369 type->max <<= 1;
293 370
294 /* Free old array. We needn't free it if the array is the static one */ 371 /* Free old array. We needn't free it if the array is the static one */
295 if (*in_slab) 372 if (*in_slab)
296 kfree(old_array); 373 kfree(old_array);
297 else if (old_array != memblock_memory_init_regions && 374 else if (old_array != memblock_memory_init_regions &&
298 old_array != memblock_reserved_init_regions) 375 old_array != memblock_reserved_init_regions)
299 memblock_free(__pa(old_array), old_alloc_size); 376 memblock_free(__pa(old_array), old_alloc_size);
300 377
301 /* 378 /*
302 * Reserve the new array if that comes from the memblock. Otherwise, we 379 * Reserve the new array if that comes from the memblock. Otherwise, we
303 * needn't do it 380 * needn't do it
304 */ 381 */
305 if (!use_slab) 382 if (!use_slab)
306 BUG_ON(memblock_reserve(addr, new_alloc_size)); 383 BUG_ON(memblock_reserve(addr, new_alloc_size));
307 384
308 /* Update slab flag */ 385 /* Update slab flag */
309 *in_slab = use_slab; 386 *in_slab = use_slab;
310 387
311 return 0; 388 return 0;
312 } 389 }
313 390
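A quick stand-alone model of the sizing above (not part of the patch). The 32-byte region size is an assumption, since it depends on phys_addr_t and the node map; what matters is that both the old and the doubled array are page-aligned so the old one can later be freed whole.

    #include <stdio.h>
    #include <stddef.h>

    #define PAGE_SIZE     4096UL
    #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

    int main(void)
    {
            size_t region_sz = 32;              /* assumed sizeof(struct memblock_region) */
            size_t max = 128;                   /* INIT_MEMBLOCK_REGIONS */
            size_t old_size = max * region_sz;  /* 4096 */
            size_t new_size = old_size << 1;    /* 8192 */

            printf("old_alloc_size=%zu new_alloc_size=%zu\n",
                   PAGE_ALIGN(old_size), PAGE_ALIGN(new_size));
            return 0;
    }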
314 /** 391 /**
315 * memblock_merge_regions - merge neighboring compatible regions 392 * memblock_merge_regions - merge neighboring compatible regions
316 * @type: memblock type to scan 393 * @type: memblock type to scan
317 * 394 *
318 * Scan @type and merge neighboring compatible regions. 395 * Scan @type and merge neighboring compatible regions.
319 */ 396 */
320 static void __init_memblock memblock_merge_regions(struct memblock_type *type) 397 static void __init_memblock memblock_merge_regions(struct memblock_type *type)
321 { 398 {
322 int i = 0; 399 int i = 0;
323 400
324 /* cnt never goes below 1 */ 401 /* cnt never goes below 1 */
325 while (i < type->cnt - 1) { 402 while (i < type->cnt - 1) {
326 struct memblock_region *this = &type->regions[i]; 403 struct memblock_region *this = &type->regions[i];
327 struct memblock_region *next = &type->regions[i + 1]; 404 struct memblock_region *next = &type->regions[i + 1];
328 405
329 if (this->base + this->size != next->base || 406 if (this->base + this->size != next->base ||
330 memblock_get_region_node(this) != 407 memblock_get_region_node(this) !=
331 memblock_get_region_node(next)) { 408 memblock_get_region_node(next)) {
332 BUG_ON(this->base + this->size > next->base); 409 BUG_ON(this->base + this->size > next->base);
333 i++; 410 i++;
334 continue; 411 continue;
335 } 412 }
336 413
337 this->size += next->size; 414 this->size += next->size;
338 /* move forward from next + 1, index of which is i + 2 */ 415 /* move forward from next + 1, index of which is i + 2 */
339 memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next)); 416 memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next));
340 type->cnt--; 417 type->cnt--;
341 } 418 }
342 } 419 }
343 420
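A minimal user-space model of the merge loop (not part of the patch): node ids are dropped so only the adjacency check remains, and the sample regions are made up.

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    struct region { uint64_t base, size; };

    static void merge(struct region *r, int *cnt)
    {
            int i = 0;

            while (i < *cnt - 1) {
                    if (r[i].base + r[i].size != r[i + 1].base) {
                            i++;            /* not adjacent, keep both */
                            continue;
                    }
                    r[i].size += r[i + 1].size;
                    memmove(&r[i + 1], &r[i + 2],
                            (*cnt - (i + 2)) * sizeof(*r));
                    (*cnt)--;
            }
    }

    int main(void)
    {
            struct region r[] = { { 0, 16 }, { 16, 16 }, { 48, 8 } };
            int cnt = 3, i;

            merge(r, &cnt);         /* leaves [0-32) and [48-56) */
            for (i = 0; i < cnt; i++)
                    printf("[%llu-%llu)\n", (unsigned long long)r[i].base,
                           (unsigned long long)(r[i].base + r[i].size));
            return 0;
    }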
344 /** 421 /**
345 * memblock_insert_region - insert new memblock region 422 * memblock_insert_region - insert new memblock region
346 * @type: memblock type to insert into 423 * @type: memblock type to insert into
347 * @idx: index for the insertion point 424 * @idx: index for the insertion point
348 * @base: base address of the new region 425 * @base: base address of the new region
349 * @size: size of the new region 426 * @size: size of the new region
350 * @nid: node id of the new region 427 * @nid: node id of the new region
351 * 428 *
352 * Insert new memblock region [@base,@base+@size) into @type at @idx. 429 * Insert new memblock region [@base,@base+@size) into @type at @idx.
353 * @type must already have extra room to accommodate the new region. 430 * @type must already have extra room to accommodate the new region.
354 */ 431 */
355 static void __init_memblock memblock_insert_region(struct memblock_type *type, 432 static void __init_memblock memblock_insert_region(struct memblock_type *type,
356 int idx, phys_addr_t base, 433 int idx, phys_addr_t base,
357 phys_addr_t size, int nid) 434 phys_addr_t size, int nid)
358 { 435 {
359 struct memblock_region *rgn = &type->regions[idx]; 436 struct memblock_region *rgn = &type->regions[idx];
360 437
361 BUG_ON(type->cnt >= type->max); 438 BUG_ON(type->cnt >= type->max);
362 memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); 439 memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
363 rgn->base = base; 440 rgn->base = base;
364 rgn->size = size; 441 rgn->size = size;
365 memblock_set_region_node(rgn, nid); 442 memblock_set_region_node(rgn, nid);
366 type->cnt++; 443 type->cnt++;
367 type->total_size += size; 444 type->total_size += size;
368 } 445 }
369 446
370 /** 447 /**
371 * memblock_add_region - add new memblock region 448 * memblock_add_region - add new memblock region
372 * @type: memblock type to add new region into 449 * @type: memblock type to add new region into
373 * @base: base address of the new region 450 * @base: base address of the new region
374 * @size: size of the new region 451 * @size: size of the new region
375 * @nid: nid of the new region 452 * @nid: nid of the new region
376 * 453 *
377 * Add new memblock region [@base,@base+@size) into @type. The new region 454 * Add new memblock region [@base,@base+@size) into @type. The new region
378 * is allowed to overlap with existing ones - overlaps don't affect already 455 * is allowed to overlap with existing ones - overlaps don't affect already
379 * existing regions. @type is guaranteed to be minimal (all neighbouring 456 * existing regions. @type is guaranteed to be minimal (all neighbouring
380 * compatible regions are merged) after the addition. 457 * compatible regions are merged) after the addition.
381 * 458 *
382 * RETURNS: 459 * RETURNS:
383 * 0 on success, -errno on failure. 460 * 0 on success, -errno on failure.
384 */ 461 */
385 static int __init_memblock memblock_add_region(struct memblock_type *type, 462 static int __init_memblock memblock_add_region(struct memblock_type *type,
386 phys_addr_t base, phys_addr_t size, int nid) 463 phys_addr_t base, phys_addr_t size, int nid)
387 { 464 {
388 bool insert = false; 465 bool insert = false;
389 phys_addr_t obase = base; 466 phys_addr_t obase = base;
390 phys_addr_t end = base + memblock_cap_size(base, &size); 467 phys_addr_t end = base + memblock_cap_size(base, &size);
391 int i, nr_new; 468 int i, nr_new;
392 469
393 if (!size) 470 if (!size)
394 return 0; 471 return 0;
395 472
396 /* special case for empty array */ 473 /* special case for empty array */
397 if (type->regions[0].size == 0) { 474 if (type->regions[0].size == 0) {
398 WARN_ON(type->cnt != 1 || type->total_size); 475 WARN_ON(type->cnt != 1 || type->total_size);
399 type->regions[0].base = base; 476 type->regions[0].base = base;
400 type->regions[0].size = size; 477 type->regions[0].size = size;
401 memblock_set_region_node(&type->regions[0], nid); 478 memblock_set_region_node(&type->regions[0], nid);
402 type->total_size = size; 479 type->total_size = size;
403 return 0; 480 return 0;
404 } 481 }
405 repeat: 482 repeat:
406 /* 483 /*
407 * The following is executed twice. Once with %false @insert and 484 * The following is executed twice. Once with %false @insert and
408 * then with %true. The first counts the number of regions needed 485 * then with %true. The first counts the number of regions needed
409 * to accommodate the new area. The second actually inserts them. 486 * to accommodate the new area. The second actually inserts them.
410 */ 487 */
411 base = obase; 488 base = obase;
412 nr_new = 0; 489 nr_new = 0;
413 490
414 for (i = 0; i < type->cnt; i++) { 491 for (i = 0; i < type->cnt; i++) {
415 struct memblock_region *rgn = &type->regions[i]; 492 struct memblock_region *rgn = &type->regions[i];
416 phys_addr_t rbase = rgn->base; 493 phys_addr_t rbase = rgn->base;
417 phys_addr_t rend = rbase + rgn->size; 494 phys_addr_t rend = rbase + rgn->size;
418 495
419 if (rbase >= end) 496 if (rbase >= end)
420 break; 497 break;
421 if (rend <= base) 498 if (rend <= base)
422 continue; 499 continue;
423 /* 500 /*
424 * @rgn overlaps. If it separates the lower part of new 501 * @rgn overlaps. If it separates the lower part of new
425 * area, insert that portion. 502 * area, insert that portion.
426 */ 503 */
427 if (rbase > base) { 504 if (rbase > base) {
428 nr_new++; 505 nr_new++;
429 if (insert) 506 if (insert)
430 memblock_insert_region(type, i++, base, 507 memblock_insert_region(type, i++, base,
431 rbase - base, nid); 508 rbase - base, nid);
432 } 509 }
433 /* area below @rend is dealt with, forget about it */ 510 /* area below @rend is dealt with, forget about it */
434 base = min(rend, end); 511 base = min(rend, end);
435 } 512 }
436 513
437 /* insert the remaining portion */ 514 /* insert the remaining portion */
438 if (base < end) { 515 if (base < end) {
439 nr_new++; 516 nr_new++;
440 if (insert) 517 if (insert)
441 memblock_insert_region(type, i, base, end - base, nid); 518 memblock_insert_region(type, i, base, end - base, nid);
442 } 519 }
443 520
444 /* 521 /*
445 * If this was the first round, resize array and repeat for actual 522 * If this was the first round, resize array and repeat for actual
446 * insertions; otherwise, merge and return. 523 * insertions; otherwise, merge and return.
447 */ 524 */
448 if (!insert) { 525 if (!insert) {
449 while (type->cnt + nr_new > type->max) 526 while (type->cnt + nr_new > type->max)
450 if (memblock_double_array(type, obase, size) < 0) 527 if (memblock_double_array(type, obase, size) < 0)
451 return -ENOMEM; 528 return -ENOMEM;
452 insert = true; 529 insert = true;
453 goto repeat; 530 goto repeat;
454 } else { 531 } else {
455 memblock_merge_regions(type); 532 memblock_merge_regions(type);
456 return 0; 533 return 0;
457 } 534 }
458 } 535 }
459 536
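A small usage sketch (not part of the patch; the caller name and the addresses are made up): overlapping adds are legal, and after merging the map ends up minimal, e.g. when registering firmware-reported ranges.

    #include <linux/memblock.h>

    void __init example_register_memory(void)
    {
            memblock_add(0x0000000, 0x8000000);     /* [0, 128M)             */
            memblock_add(0x4000000, 0x8000000);     /* [64M, 192M), overlaps */
            /* memblock.memory now holds a single region [0, 192M) */
    }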
460 int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, 537 int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
461 int nid) 538 int nid)
462 { 539 {
463 return memblock_add_region(&memblock.memory, base, size, nid); 540 return memblock_add_region(&memblock.memory, base, size, nid);
464 } 541 }
465 542
466 int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) 543 int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
467 { 544 {
468 return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); 545 return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES);
469 } 546 }
470 547
471 /** 548 /**
472 * memblock_isolate_range - isolate given range into disjoint memblocks 549 * memblock_isolate_range - isolate given range into disjoint memblocks
473 * @type: memblock type to isolate range for 550 * @type: memblock type to isolate range for
474 * @base: base of range to isolate 551 * @base: base of range to isolate
475 * @size: size of range to isolate 552 * @size: size of range to isolate
476 * @start_rgn: out parameter for the start of isolated region 553 * @start_rgn: out parameter for the start of isolated region
477 * @end_rgn: out parameter for the end of isolated region 554 * @end_rgn: out parameter for the end of isolated region
478 * 555 *
479 * Walk @type and ensure that regions don't cross the boundaries defined by 556 * Walk @type and ensure that regions don't cross the boundaries defined by
480 * [@base,@base+@size). Crossing regions are split at the boundaries, 557 * [@base,@base+@size). Crossing regions are split at the boundaries,
481 * which may create at most two more regions. The index of the first 558 * which may create at most two more regions. The index of the first
482 * region inside the range is returned in *@start_rgn and end in *@end_rgn. 559 * region inside the range is returned in *@start_rgn and end in *@end_rgn.
483 * 560 *
484 * RETURNS: 561 * RETURNS:
485 * 0 on success, -errno on failure. 562 * 0 on success, -errno on failure.
486 */ 563 */
487 static int __init_memblock memblock_isolate_range(struct memblock_type *type, 564 static int __init_memblock memblock_isolate_range(struct memblock_type *type,
488 phys_addr_t base, phys_addr_t size, 565 phys_addr_t base, phys_addr_t size,
489 int *start_rgn, int *end_rgn) 566 int *start_rgn, int *end_rgn)
490 { 567 {
491 phys_addr_t end = base + memblock_cap_size(base, &size); 568 phys_addr_t end = base + memblock_cap_size(base, &size);
492 int i; 569 int i;
493 570
494 *start_rgn = *end_rgn = 0; 571 *start_rgn = *end_rgn = 0;
495 572
496 if (!size) 573 if (!size)
497 return 0; 574 return 0;
498 575
499 /* we'll create at most two more regions */ 576 /* we'll create at most two more regions */
500 while (type->cnt + 2 > type->max) 577 while (type->cnt + 2 > type->max)
501 if (memblock_double_array(type, base, size) < 0) 578 if (memblock_double_array(type, base, size) < 0)
502 return -ENOMEM; 579 return -ENOMEM;
503 580
504 for (i = 0; i < type->cnt; i++) { 581 for (i = 0; i < type->cnt; i++) {
505 struct memblock_region *rgn = &type->regions[i]; 582 struct memblock_region *rgn = &type->regions[i];
506 phys_addr_t rbase = rgn->base; 583 phys_addr_t rbase = rgn->base;
507 phys_addr_t rend = rbase + rgn->size; 584 phys_addr_t rend = rbase + rgn->size;
508 585
509 if (rbase >= end) 586 if (rbase >= end)
510 break; 587 break;
511 if (rend <= base) 588 if (rend <= base)
512 continue; 589 continue;
513 590
514 if (rbase < base) { 591 if (rbase < base) {
515 /* 592 /*
516 * @rgn intersects from below. Split and continue 593 * @rgn intersects from below. Split and continue
517 * to process the next region - the new top half. 594 * to process the next region - the new top half.
518 */ 595 */
519 rgn->base = base; 596 rgn->base = base;
520 rgn->size -= base - rbase; 597 rgn->size -= base - rbase;
521 type->total_size -= base - rbase; 598 type->total_size -= base - rbase;
522 memblock_insert_region(type, i, rbase, base - rbase, 599 memblock_insert_region(type, i, rbase, base - rbase,
523 memblock_get_region_node(rgn)); 600 memblock_get_region_node(rgn));
524 } else if (rend > end) { 601 } else if (rend > end) {
525 /* 602 /*
526 * @rgn intersects from above. Split and redo the 603 * @rgn intersects from above. Split and redo the
527 * current region - the new bottom half. 604 * current region - the new bottom half.
528 */ 605 */
529 rgn->base = end; 606 rgn->base = end;
530 rgn->size -= end - rbase; 607 rgn->size -= end - rbase;
531 type->total_size -= end - rbase; 608 type->total_size -= end - rbase;
532 memblock_insert_region(type, i--, rbase, end - rbase, 609 memblock_insert_region(type, i--, rbase, end - rbase,
533 memblock_get_region_node(rgn)); 610 memblock_get_region_node(rgn));
534 } else { 611 } else {
535 /* @rgn is fully contained, record it */ 612 /* @rgn is fully contained, record it */
536 if (!*end_rgn) 613 if (!*end_rgn)
537 *start_rgn = i; 614 *start_rgn = i;
538 *end_rgn = i + 1; 615 *end_rgn = i + 1;
539 } 616 }
540 } 617 }
541 618
542 return 0; 619 return 0;
543 } 620 }
544 621
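A worked example (not part of the patch): if memblock.memory holds the single region [0,128M) and [32M,64M) is isolated, the region is split into [0,32M), [32M,64M) and [64M,128M); *start_rgn and *end_rgn come back as 1 and 2, so a caller such as __memblock_remove() below can act on exactly the middle piece.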
545 static int __init_memblock __memblock_remove(struct memblock_type *type, 622 static int __init_memblock __memblock_remove(struct memblock_type *type,
546 phys_addr_t base, phys_addr_t size) 623 phys_addr_t base, phys_addr_t size)
547 { 624 {
548 int start_rgn, end_rgn; 625 int start_rgn, end_rgn;
549 int i, ret; 626 int i, ret;
550 627
551 ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); 628 ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
552 if (ret) 629 if (ret)
553 return ret; 630 return ret;
554 631
555 for (i = end_rgn - 1; i >= start_rgn; i--) 632 for (i = end_rgn - 1; i >= start_rgn; i--)
556 memblock_remove_region(type, i); 633 memblock_remove_region(type, i);
557 return 0; 634 return 0;
558 } 635 }
559 636
560 int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) 637 int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
561 { 638 {
562 return __memblock_remove(&memblock.memory, base, size); 639 return __memblock_remove(&memblock.memory, base, size);
563 } 640 }
564 641
565 int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) 642 int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
566 { 643 {
567 memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", 644 memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n",
568 (unsigned long long)base, 645 (unsigned long long)base,
569 (unsigned long long)base + size, 646 (unsigned long long)base + size,
570 (void *)_RET_IP_); 647 (void *)_RET_IP_);
571 648
572 return __memblock_remove(&memblock.reserved, base, size); 649 return __memblock_remove(&memblock.reserved, base, size);
573 } 650 }
574 651
575 int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) 652 int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
576 { 653 {
577 struct memblock_type *_rgn = &memblock.reserved; 654 struct memblock_type *_rgn = &memblock.reserved;
578 655
579 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", 656 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n",
580 (unsigned long long)base, 657 (unsigned long long)base,
581 (unsigned long long)base + size, 658 (unsigned long long)base + size,
582 (void *)_RET_IP_); 659 (void *)_RET_IP_);
583 660
584 return memblock_add_region(_rgn, base, size, MAX_NUMNODES); 661 return memblock_add_region(_rgn, base, size, MAX_NUMNODES);
585 } 662 }
586 663
587 /** 664 /**
588 * __next_free_mem_range - next function for for_each_free_mem_range() 665 * __next_free_mem_range - next function for for_each_free_mem_range()
589 * @idx: pointer to u64 loop variable 666 * @idx: pointer to u64 loop variable
590 * @nid: node selector, %MAX_NUMNODES for all nodes 667 * @nid: node selector, %MAX_NUMNODES for all nodes
591 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 668 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
592 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL 669 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
593 * @out_nid: ptr to int for nid of the range, can be %NULL 670 * @out_nid: ptr to int for nid of the range, can be %NULL
594 * 671 *
595 * Find the first free area from *@idx which matches @nid, fill the out 672 * Find the first free area from *@idx which matches @nid, fill the out
596 * parameters, and update *@idx for the next iteration. The lower 32bit of 673 * parameters, and update *@idx for the next iteration. The lower 32bit of
597 * *@idx contains index into memory region and the upper 32bit indexes the 674 * *@idx contains index into memory region and the upper 32bit indexes the
598 * areas before each reserved region. For example, if reserved regions 675 * areas before each reserved region. For example, if reserved regions
599 * look like the following, 676 * look like the following,
600 * 677 *
601 * 0:[0-16), 1:[32-48), 2:[128-130) 678 * 0:[0-16), 1:[32-48), 2:[128-130)
602 * 679 *
603 * The upper 32bit indexes the following regions. 680 * The upper 32bit indexes the following regions.
604 * 681 *
605 * 0:[0-0), 1:[16-32), 2:[48-128), 3:[130-MAX) 682 * 0:[0-0), 1:[16-32), 2:[48-128), 3:[130-MAX)
606 * 683 *
607 * As both region arrays are sorted, the function advances the two indices 684 * As both region arrays are sorted, the function advances the two indices
608 * in lockstep and returns each intersection. 685 * in lockstep and returns each intersection.
609 */ 686 */
610 void __init_memblock __next_free_mem_range(u64 *idx, int nid, 687 void __init_memblock __next_free_mem_range(u64 *idx, int nid,
611 phys_addr_t *out_start, 688 phys_addr_t *out_start,
612 phys_addr_t *out_end, int *out_nid) 689 phys_addr_t *out_end, int *out_nid)
613 { 690 {
614 struct memblock_type *mem = &memblock.memory; 691 struct memblock_type *mem = &memblock.memory;
615 struct memblock_type *rsv = &memblock.reserved; 692 struct memblock_type *rsv = &memblock.reserved;
616 int mi = *idx & 0xffffffff; 693 int mi = *idx & 0xffffffff;
617 int ri = *idx >> 32; 694 int ri = *idx >> 32;
618 695
619 for ( ; mi < mem->cnt; mi++) { 696 for ( ; mi < mem->cnt; mi++) {
620 struct memblock_region *m = &mem->regions[mi]; 697 struct memblock_region *m = &mem->regions[mi];
621 phys_addr_t m_start = m->base; 698 phys_addr_t m_start = m->base;
622 phys_addr_t m_end = m->base + m->size; 699 phys_addr_t m_end = m->base + m->size;
623 700
624 /* only memory regions are associated with nodes, check it */ 701 /* only memory regions are associated with nodes, check it */
625 if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) 702 if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
626 continue; 703 continue;
627 704
628 /* scan areas before each reservation for intersection */ 705 /* scan areas before each reservation for intersection */
629 for ( ; ri < rsv->cnt + 1; ri++) { 706 for ( ; ri < rsv->cnt + 1; ri++) {
630 struct memblock_region *r = &rsv->regions[ri]; 707 struct memblock_region *r = &rsv->regions[ri];
631 phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; 708 phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0;
632 phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; 709 phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX;
633 710
634 /* if ri advanced past mi, break out to advance mi */ 711 /* if ri advanced past mi, break out to advance mi */
635 if (r_start >= m_end) 712 if (r_start >= m_end)
636 break; 713 break;
637 /* if the two regions intersect, we're done */ 714 /* if the two regions intersect, we're done */
638 if (m_start < r_end) { 715 if (m_start < r_end) {
639 if (out_start) 716 if (out_start)
640 *out_start = max(m_start, r_start); 717 *out_start = max(m_start, r_start);
641 if (out_end) 718 if (out_end)
642 *out_end = min(m_end, r_end); 719 *out_end = min(m_end, r_end);
643 if (out_nid) 720 if (out_nid)
644 *out_nid = memblock_get_region_node(m); 721 *out_nid = memblock_get_region_node(m);
645 /* 722 /*
646 * The region which ends first is advanced 723 * The region which ends first is advanced
647 * for the next iteration. 724 * for the next iteration.
648 */ 725 */
649 if (m_end <= r_end) 726 if (m_end <= r_end)
650 mi++; 727 mi++;
651 else 728 else
652 ri++; 729 ri++;
653 *idx = (u32)mi | (u64)ri << 32; 730 *idx = (u32)mi | (u64)ri << 32;
654 return; 731 return;
655 } 732 }
656 } 733 }
657 } 734 }
658 735
659 /* signal end of iteration */ 736 /* signal end of iteration */
660 *idx = ULLONG_MAX; 737 *idx = ULLONG_MAX;
661 } 738 }
662 739
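An illustrative walk (not part of the patch; example_walk_free_ranges() is a made-up caller): with a single memory region [0,160) and the reserved layout from the comment above, the iterator visits the three gaps between reservations.

    #include <linux/memblock.h>
    #include <linux/printk.h>

    void __init example_walk_free_ranges(void)
    {
            phys_addr_t start, end;
            u64 i;

            for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
                    pr_info("free: [%llu-%llu)\n",
                            (unsigned long long)start,
                            (unsigned long long)end);
            /* reports [16-32), [48-128) and [130-160) for that layout */
    }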
663 /** 740 /**
664 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() 741 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
665 * @idx: pointer to u64 loop variable 742 * @idx: pointer to u64 loop variable
666 * @nid: node selector, %MAX_NUMNODES for all nodes 743 * @nid: node selector, %MAX_NUMNODES for all nodes
667 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL 744 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
668 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL 745 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
669 * @out_nid: ptr to int for nid of the range, can be %NULL 746 * @out_nid: ptr to int for nid of the range, can be %NULL
670 * 747 *
671 * Reverse of __next_free_mem_range(). 748 * Reverse of __next_free_mem_range().
672 */ 749 */
673 void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, 750 void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
674 phys_addr_t *out_start, 751 phys_addr_t *out_start,
675 phys_addr_t *out_end, int *out_nid) 752 phys_addr_t *out_end, int *out_nid)
676 { 753 {
677 struct memblock_type *mem = &memblock.memory; 754 struct memblock_type *mem = &memblock.memory;
678 struct memblock_type *rsv = &memblock.reserved; 755 struct memblock_type *rsv = &memblock.reserved;
679 int mi = *idx & 0xffffffff; 756 int mi = *idx & 0xffffffff;
680 int ri = *idx >> 32; 757 int ri = *idx >> 32;
681 758
682 if (*idx == (u64)ULLONG_MAX) { 759 if (*idx == (u64)ULLONG_MAX) {
683 mi = mem->cnt - 1; 760 mi = mem->cnt - 1;
684 ri = rsv->cnt; 761 ri = rsv->cnt;
685 } 762 }
686 763
687 for ( ; mi >= 0; mi--) { 764 for ( ; mi >= 0; mi--) {
688 struct memblock_region *m = &mem->regions[mi]; 765 struct memblock_region *m = &mem->regions[mi];
689 phys_addr_t m_start = m->base; 766 phys_addr_t m_start = m->base;
690 phys_addr_t m_end = m->base + m->size; 767 phys_addr_t m_end = m->base + m->size;
691 768
692 /* only memory regions are associated with nodes, check it */ 769 /* only memory regions are associated with nodes, check it */
693 if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) 770 if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
694 continue; 771 continue;
695 772
696 /* scan areas before each reservation for intersection */ 773 /* scan areas before each reservation for intersection */
697 for ( ; ri >= 0; ri--) { 774 for ( ; ri >= 0; ri--) {
698 struct memblock_region *r = &rsv->regions[ri]; 775 struct memblock_region *r = &rsv->regions[ri];
699 phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; 776 phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0;
700 phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; 777 phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX;
701 778
702 /* if ri advanced past mi, break out to advance mi */ 779 /* if ri advanced past mi, break out to advance mi */
703 if (r_end <= m_start) 780 if (r_end <= m_start)
704 break; 781 break;
705 /* if the two regions intersect, we're done */ 782 /* if the two regions intersect, we're done */
706 if (m_end > r_start) { 783 if (m_end > r_start) {
707 if (out_start) 784 if (out_start)
708 *out_start = max(m_start, r_start); 785 *out_start = max(m_start, r_start);
709 if (out_end) 786 if (out_end)
710 *out_end = min(m_end, r_end); 787 *out_end = min(m_end, r_end);
711 if (out_nid) 788 if (out_nid)
712 *out_nid = memblock_get_region_node(m); 789 *out_nid = memblock_get_region_node(m);
713 790
714 if (m_start >= r_start) 791 if (m_start >= r_start)
715 mi--; 792 mi--;
716 else 793 else
717 ri--; 794 ri--;
718 *idx = (u32)mi | (u64)ri << 32; 795 *idx = (u32)mi | (u64)ri << 32;
719 return; 796 return;
720 } 797 }
721 } 798 }
722 } 799 }
723 800
724 *idx = ULLONG_MAX; 801 *idx = ULLONG_MAX;
725 } 802 }
726 803
727 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 804 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
728 /* 805 /*
729 * Common iterator interface used to define for_each_mem_range(). 806 * Common iterator interface used to define for_each_mem_range().
730 */ 807 */
731 void __init_memblock __next_mem_pfn_range(int *idx, int nid, 808 void __init_memblock __next_mem_pfn_range(int *idx, int nid,
732 unsigned long *out_start_pfn, 809 unsigned long *out_start_pfn,
733 unsigned long *out_end_pfn, int *out_nid) 810 unsigned long *out_end_pfn, int *out_nid)
734 { 811 {
735 struct memblock_type *type = &memblock.memory; 812 struct memblock_type *type = &memblock.memory;
736 struct memblock_region *r; 813 struct memblock_region *r;
737 814
738 while (++*idx < type->cnt) { 815 while (++*idx < type->cnt) {
739 r = &type->regions[*idx]; 816 r = &type->regions[*idx];
740 817
741 if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size)) 818 if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size))
742 continue; 819 continue;
743 if (nid == MAX_NUMNODES || nid == r->nid) 820 if (nid == MAX_NUMNODES || nid == r->nid)
744 break; 821 break;
745 } 822 }
746 if (*idx >= type->cnt) { 823 if (*idx >= type->cnt) {
747 *idx = -1; 824 *idx = -1;
748 return; 825 return;
749 } 826 }
750 827
751 if (out_start_pfn) 828 if (out_start_pfn)
752 *out_start_pfn = PFN_UP(r->base); 829 *out_start_pfn = PFN_UP(r->base);
753 if (out_end_pfn) 830 if (out_end_pfn)
754 *out_end_pfn = PFN_DOWN(r->base + r->size); 831 *out_end_pfn = PFN_DOWN(r->base + r->size);
755 if (out_nid) 832 if (out_nid)
756 *out_nid = r->nid; 833 *out_nid = r->nid;
757 } 834 }
758 835
759 /** 836 /**
760 * memblock_set_node - set node ID on memblock regions 837 * memblock_set_node - set node ID on memblock regions
761 * @base: base of area to set node ID for 838 * @base: base of area to set node ID for
762 * @size: size of area to set node ID for 839 * @size: size of area to set node ID for
763 * @nid: node ID to set 840 * @nid: node ID to set
764 * 841 *
765 * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. 842 * Set the nid of memblock memory regions in [@base,@base+@size) to @nid.
766 * Regions which cross the area boundaries are split as necessary. 843 * Regions which cross the area boundaries are split as necessary.
767 * 844 *
768 * RETURNS: 845 * RETURNS:
769 * 0 on success, -errno on failure. 846 * 0 on success, -errno on failure.
770 */ 847 */
771 int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, 848 int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
772 int nid) 849 int nid)
773 { 850 {
774 struct memblock_type *type = &memblock.memory; 851 struct memblock_type *type = &memblock.memory;
775 int start_rgn, end_rgn; 852 int start_rgn, end_rgn;
776 int i, ret; 853 int i, ret;
777 854
778 ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); 855 ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
779 if (ret) 856 if (ret)
780 return ret; 857 return ret;
781 858
782 for (i = start_rgn; i < end_rgn; i++) 859 for (i = start_rgn; i < end_rgn; i++)
783 memblock_set_region_node(&type->regions[i], nid); 860 memblock_set_region_node(&type->regions[i], nid);
784 861
785 memblock_merge_regions(type); 862 memblock_merge_regions(type);
786 return 0; 863 return 0;
787 } 864 }
788 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 865 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
789 866
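A usage sketch for the node-map path above (not part of the patch; example_mark_node() is a made-up wrapper): NUMA init code, e.g. the SRAT parser, stamps node ids onto the ranges it discovers and lets memblock split regions at the boundaries.

    #include <linux/memblock.h>
    #include <linux/printk.h>

    void __init example_mark_node(phys_addr_t base, phys_addr_t size, int nid)
    {
            if (memblock_set_node(base, size, nid))
                    pr_warn("memblock: cannot set node %d for [%#llx-%#llx)\n",
                            nid, (unsigned long long)base,
                            (unsigned long long)(base + size));
    }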
790 static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, 867 static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
791 phys_addr_t align, phys_addr_t max_addr, 868 phys_addr_t align, phys_addr_t max_addr,
792 int nid) 869 int nid)
793 { 870 {
794 phys_addr_t found; 871 phys_addr_t found;
795 872
796 if (WARN_ON(!align)) 873 if (WARN_ON(!align))
797 align = __alignof__(long long); 874 align = __alignof__(long long);
798 875
799 /* align @size to avoid excessive fragmentation on reserved array */ 876 /* align @size to avoid excessive fragmentation on reserved array */
800 size = round_up(size, align); 877 size = round_up(size, align);
801 878
802 found = memblock_find_in_range_node(0, max_addr, size, align, nid); 879 found = memblock_find_in_range_node(0, max_addr, size, align, nid);
803 if (found && !memblock_reserve(found, size)) 880 if (found && !memblock_reserve(found, size))
804 return found; 881 return found;
805 882
806 return 0; 883 return 0;
807 } 884 }
808 885
809 phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) 886 phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
810 { 887 {
811 return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); 888 return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
812 } 889 }
813 890
814 phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 891 phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
815 { 892 {
816 return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); 893 return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES);
817 } 894 }
818 895
819 phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) 896 phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
820 { 897 {
821 phys_addr_t alloc; 898 phys_addr_t alloc;
822 899
823 alloc = __memblock_alloc_base(size, align, max_addr); 900 alloc = __memblock_alloc_base(size, align, max_addr);
824 901
825 if (alloc == 0) 902 if (alloc == 0)
826 panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", 903 panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
827 (unsigned long long) size, (unsigned long long) max_addr); 904 (unsigned long long) size, (unsigned long long) max_addr);
828 905
829 return alloc; 906 return alloc;
830 } 907 }
831 908
832 phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align) 909 phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
833 { 910 {
834 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); 911 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
835 } 912 }
836 913
837 phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) 914 phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
838 { 915 {
839 phys_addr_t res = memblock_alloc_nid(size, align, nid); 916 phys_addr_t res = memblock_alloc_nid(size, align, nid);
840 917
841 if (res) 918 if (res)
842 return res; 919 return res;
843 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); 920 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
844 } 921 }
845 922
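A usage sketch (not part of the patch; example_alloc_pernode() is a made-up caller): the _try_nid variant is the usual choice for per-node data, because it prefers @nid but falls back to any node, and the fallback path panics rather than return 0.

    #include <linux/memblock.h>
    #include <linux/string.h>

    void __init example_alloc_pernode(int nid)
    {
            phys_addr_t pa = memblock_alloc_try_nid(PAGE_SIZE, PAGE_SIZE, nid);
            void *p = __va(pa);     /* already reserved by the allocator */

            memset(p, 0, PAGE_SIZE);
    }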
846 923
847 /* 924 /*
848 * Remaining API functions 925 * Remaining API functions
849 */ 926 */
850 927
851 phys_addr_t __init memblock_phys_mem_size(void) 928 phys_addr_t __init memblock_phys_mem_size(void)
852 { 929 {
853 return memblock.memory.total_size; 930 return memblock.memory.total_size;
854 } 931 }
855 932
856 phys_addr_t __init memblock_mem_size(unsigned long limit_pfn) 933 phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
857 { 934 {
858 unsigned long pages = 0; 935 unsigned long pages = 0;
859 struct memblock_region *r; 936 struct memblock_region *r;
860 unsigned long start_pfn, end_pfn; 937 unsigned long start_pfn, end_pfn;
861 938
862 for_each_memblock(memory, r) { 939 for_each_memblock(memory, r) {
863 start_pfn = memblock_region_memory_base_pfn(r); 940 start_pfn = memblock_region_memory_base_pfn(r);
864 end_pfn = memblock_region_memory_end_pfn(r); 941 end_pfn = memblock_region_memory_end_pfn(r);
865 start_pfn = min_t(unsigned long, start_pfn, limit_pfn); 942 start_pfn = min_t(unsigned long, start_pfn, limit_pfn);
866 end_pfn = min_t(unsigned long, end_pfn, limit_pfn); 943 end_pfn = min_t(unsigned long, end_pfn, limit_pfn);
867 pages += end_pfn - start_pfn; 944 pages += end_pfn - start_pfn;
868 } 945 }
869 946
870 return (phys_addr_t)pages << PAGE_SHIFT; 947 return (phys_addr_t)pages << PAGE_SHIFT;
871 } 948 }
872 949
873 /* lowest address */ 950 /* lowest address */
874 phys_addr_t __init_memblock memblock_start_of_DRAM(void) 951 phys_addr_t __init_memblock memblock_start_of_DRAM(void)
875 { 952 {
876 return memblock.memory.regions[0].base; 953 return memblock.memory.regions[0].base;
877 } 954 }
878 955
879 phys_addr_t __init_memblock memblock_end_of_DRAM(void) 956 phys_addr_t __init_memblock memblock_end_of_DRAM(void)
880 { 957 {
881 int idx = memblock.memory.cnt - 1; 958 int idx = memblock.memory.cnt - 1;
882 959
883 return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); 960 return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
884 } 961 }
885 962
886 void __init memblock_enforce_memory_limit(phys_addr_t limit) 963 void __init memblock_enforce_memory_limit(phys_addr_t limit)
887 { 964 {
888 unsigned long i; 965 unsigned long i;
889 phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; 966 phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
890 967
891 if (!limit) 968 if (!limit)
892 return; 969 return;
893 970
894 /* find out max address */ 971 /* find out max address */
895 for (i = 0; i < memblock.memory.cnt; i++) { 972 for (i = 0; i < memblock.memory.cnt; i++) {
896 struct memblock_region *r = &memblock.memory.regions[i]; 973 struct memblock_region *r = &memblock.memory.regions[i];
897 974
898 if (limit <= r->size) { 975 if (limit <= r->size) {
899 max_addr = r->base + limit; 976 max_addr = r->base + limit;
900 break; 977 break;
901 } 978 }
902 limit -= r->size; 979 limit -= r->size;
903 } 980 }
904 981
905 /* truncate both memory and reserved regions */ 982 /* truncate both memory and reserved regions */
906 __memblock_remove(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX); 983 __memblock_remove(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX);
907 __memblock_remove(&memblock.reserved, max_addr, (phys_addr_t)ULLONG_MAX); 984 __memblock_remove(&memblock.reserved, max_addr, (phys_addr_t)ULLONG_MAX);
908 } 985 }
909 986
910 static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) 987 static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
911 { 988 {
912 unsigned int left = 0, right = type->cnt; 989 unsigned int left = 0, right = type->cnt;
913 990
914 do { 991 do {
915 unsigned int mid = (right + left) / 2; 992 unsigned int mid = (right + left) / 2;
916 993
917 if (addr < type->regions[mid].base) 994 if (addr < type->regions[mid].base)
918 right = mid; 995 right = mid;
919 else if (addr >= (type->regions[mid].base + 996 else if (addr >= (type->regions[mid].base +
920 type->regions[mid].size)) 997 type->regions[mid].size))
921 left = mid + 1; 998 left = mid + 1;
922 else 999 else
923 return mid; 1000 return mid;
924 } while (left < right); 1001 } while (left < right);
925 return -1; 1002 return -1;
926 } 1003 }
927 1004
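A worked example (not part of the patch): with memory regions [0,16M), [32M,48M) and [128M,130M), looking up 40M starts at mid = 1, finds 32M <= 40M < 48M and returns index 1 on the first probe, so memblock_is_memory(40M) below reports true.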
928 int __init memblock_is_reserved(phys_addr_t addr) 1005 int __init memblock_is_reserved(phys_addr_t addr)
929 { 1006 {
930 return memblock_search(&memblock.reserved, addr) != -1; 1007 return memblock_search(&memblock.reserved, addr) != -1;
931 } 1008 }
932 1009
933 int __init_memblock memblock_is_memory(phys_addr_t addr) 1010 int __init_memblock memblock_is_memory(phys_addr_t addr)
934 { 1011 {
935 return memblock_search(&memblock.memory, addr) != -1; 1012 return memblock_search(&memblock.memory, addr) != -1;
936 } 1013 }
937 1014
938 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 1015 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
939 int __init_memblock memblock_search_pfn_nid(unsigned long pfn, 1016 int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
940 unsigned long *start_pfn, unsigned long *end_pfn) 1017 unsigned long *start_pfn, unsigned long *end_pfn)
941 { 1018 {
942 struct memblock_type *type = &memblock.memory; 1019 struct memblock_type *type = &memblock.memory;
943 int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT); 1020 int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT);
944 1021
945 if (mid == -1) 1022 if (mid == -1)
946 return -1; 1023 return -1;
947 1024
948 *start_pfn = type->regions[mid].base >> PAGE_SHIFT; 1025 *start_pfn = type->regions[mid].base >> PAGE_SHIFT;
949 *end_pfn = (type->regions[mid].base + type->regions[mid].size) 1026 *end_pfn = (type->regions[mid].base + type->regions[mid].size)
950 >> PAGE_SHIFT; 1027 >> PAGE_SHIFT;
951 1028
952 return type->regions[mid].nid; 1029 return type->regions[mid].nid;
953 } 1030 }
954 #endif 1031 #endif
955 1032
956 /** 1033 /**
957 * memblock_is_region_memory - check if a region is a subset of memory 1034 * memblock_is_region_memory - check if a region is a subset of memory
958 * @base: base of region to check 1035 * @base: base of region to check
959 * @size: size of region to check 1036 * @size: size of region to check
960 * 1037 *
961 * Check if the region [@base, @base+@size) is a subset of a memory block. 1038 * Check if the region [@base, @base+@size) is a subset of a memory block.
962 * 1039 *
963 * RETURNS: 1040 * RETURNS:
964 * 0 if false, non-zero if true 1041 * 0 if false, non-zero if true
965 */ 1042 */
966 int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) 1043 int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
967 { 1044 {
968 int idx = memblock_search(&memblock.memory, base); 1045 int idx = memblock_search(&memblock.memory, base);
969 phys_addr_t end = base + memblock_cap_size(base, &size); 1046 phys_addr_t end = base + memblock_cap_size(base, &size);
970 1047
971 if (idx == -1) 1048 if (idx == -1)
972 return 0; 1049 return 0;
973 return memblock.memory.regions[idx].base <= base && 1050 return memblock.memory.regions[idx].base <= base &&
974 (memblock.memory.regions[idx].base + 1051 (memblock.memory.regions[idx].base +
975 memblock.memory.regions[idx].size) >= end; 1052 memblock.memory.regions[idx].size) >= end;
976 } 1053 }
977 1054
978 /** 1055 /**
979 * memblock_is_region_reserved - check if a region intersects reserved memory 1056 * memblock_is_region_reserved - check if a region intersects reserved memory
980 * @base: base of region to check 1057 * @base: base of region to check
981 * @size: size of region to check 1058 * @size: size of region to check
982 * 1059 *
983 * Check if the region [@base, @base+@size) intersects a reserved memory block. 1060 * Check if the region [@base, @base+@size) intersects a reserved memory block.
984 * 1061 *
985 * RETURNS: 1062 * RETURNS:
986 * 0 if false, non-zero if true 1063 * 0 if false, non-zero if true
987 */ 1064 */
988 int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) 1065 int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
989 { 1066 {
990 memblock_cap_size(base, &size); 1067 memblock_cap_size(base, &size);
991 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; 1068 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
992 } 1069 }
993 1070
994 void __init_memblock memblock_trim_memory(phys_addr_t align) 1071 void __init_memblock memblock_trim_memory(phys_addr_t align)
995 { 1072 {
996 int i; 1073 int i;
997 phys_addr_t start, end, orig_start, orig_end; 1074 phys_addr_t start, end, orig_start, orig_end;
998 struct memblock_type *mem = &memblock.memory; 1075 struct memblock_type *mem = &memblock.memory;
999 1076
1000 for (i = 0; i < mem->cnt; i++) { 1077 for (i = 0; i < mem->cnt; i++) {
1001 orig_start = mem->regions[i].base; 1078 orig_start = mem->regions[i].base;
1002 orig_end = mem->regions[i].base + mem->regions[i].size; 1079 orig_end = mem->regions[i].base + mem->regions[i].size;
1003 start = round_up(orig_start, align); 1080 start = round_up(orig_start, align);
1004 end = round_down(orig_end, align); 1081 end = round_down(orig_end, align);
1005 1082
1006 if (start == orig_start && end == orig_end) 1083 if (start == orig_start && end == orig_end)
1007 continue; 1084 continue;
1008 1085
1009 if (start < end) { 1086 if (start < end) {
1010 mem->regions[i].base = start; 1087 mem->regions[i].base = start;
1011 mem->regions[i].size = end - start; 1088 mem->regions[i].size = end - start;
1012 } else { 1089 } else {
1013 memblock_remove_region(mem, i); 1090 memblock_remove_region(mem, i);
1014 i--; 1091 i--;
1015 } 1092 }
1016 } 1093 }
1017 } 1094 }
1018 1095
1019 void __init_memblock memblock_set_current_limit(phys_addr_t limit) 1096 void __init_memblock memblock_set_current_limit(phys_addr_t limit)
1020 { 1097 {
1021 memblock.current_limit = limit; 1098 memblock.current_limit = limit;
1022 } 1099 }
1023 1100
static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
{
	unsigned long long base, size;
	int i;

	pr_info(" %s.cnt = 0x%lx\n", name, type->cnt);

	for (i = 0; i < type->cnt; i++) {
		struct memblock_region *rgn = &type->regions[i];
		char nid_buf[32] = "";

		base = rgn->base;
		size = rgn->size;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
		if (memblock_get_region_node(rgn) != MAX_NUMNODES)
			snprintf(nid_buf, sizeof(nid_buf), " on node %d",
				 memblock_get_region_node(rgn));
#endif
		pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n",
			name, i, base, base + size - 1, size, nid_buf);
	}
}

void __init_memblock __memblock_dump_all(void)
{
	pr_info("MEMBLOCK configuration:\n");
	pr_info(" memory size = %#llx reserved size = %#llx\n",
		(unsigned long long)memblock.memory.total_size,
		(unsigned long long)memblock.reserved.total_size);

	memblock_dump(&memblock.memory, "memory");
	memblock_dump(&memblock.reserved, "reserved");
}

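Callers normally reach this through the memblock_dump_all() wrapper in include/linux/memblock.h, which checks memblock_debug first; a minimal sketch of its use, assuming that wrapper:

	/* Sketch: dump the memory/reserved layout after early reservations.
	 * This is a no-op unless the kernel was booted with memblock=debug.
	 */
	memblock_dump_all();
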
void __init memblock_allow_resize(void)
{
	memblock_can_resize = 1;
}

static int __init early_memblock(char *p)
{
	if (p && strstr(p, "debug"))
		memblock_debug = 1;
	return 0;
}
early_param("memblock", early_memblock);

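Per the handler above, booting with

	memblock=debug

on the kernel command line sets memblock_debug, which enables the MEMBLOCK dump and the memblock debug printouts during boot.
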
#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK)

static int memblock_debug_show(struct seq_file *m, void *private)
{
	struct memblock_type *type = m->private;
	struct memblock_region *reg;
	int i;

	for (i = 0; i < type->cnt; i++) {
		reg = &type->regions[i];
		seq_printf(m, "%4d: ", i);
		if (sizeof(phys_addr_t) == 4)
			seq_printf(m, "0x%08lx..0x%08lx\n",
				   (unsigned long)reg->base,
				   (unsigned long)(reg->base + reg->size - 1));
		else
			seq_printf(m, "0x%016llx..0x%016llx\n",
				   (unsigned long long)reg->base,
				   (unsigned long long)(reg->base + reg->size - 1));

	}
	return 0;
}

static int memblock_debug_open(struct inode *inode, struct file *file)
{
	return single_open(file, memblock_debug_show, inode->i_private);
}

static const struct file_operations memblock_debug_fops = {
	.open = memblock_debug_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int __init memblock_init_debugfs(void)
{
	struct dentry *root = debugfs_create_dir("memblock", NULL);
	if (!root)
		return -ENXIO;
	debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops);
	debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops);

	return 0;
}
__initcall(memblock_init_debugfs);

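With CONFIG_DEBUG_FS enabled and debugfs mounted at the conventional /sys/kernel/debug, the regions created above can then be read at /sys/kernel/debug/memblock/memory and /sys/kernel/debug/memblock/reserved. Each entry follows the format printed by memblock_debug_show(); the line below shows only the shape of the output, with hypothetical values:

	   0: 0x0000000000001000..0x000000009cffffff
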
#endif /* CONFIG_DEBUG_FS */