Commit 68ad8df42e12037c3894c9706ab428bf5cd6426b
Committed by: Linus Torvalds
Parent: 2dbb51c49f
Exists in: master and 4 other branches
mm: print out the zonelists on request for manual verification
This patch prints out the zonelists during boot for manual verification by the user if the mminit_loglevel is MMINIT_VERIFY or higher.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 3 changed files with 51 additions and 0 deletions
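Usage note (not part of the commit itself): with CONFIG_DEBUG_MEMORY_INIT enabled and the mminit_loglevel boot parameter set to MMINIT_VERIFY (1) or higher (e.g. booting with "mminit_loglevel=1"), the new mminit_verify_zonelist() hook walks every populated zone of every online node and prints one KERN_DEBUG line per zonelist. A minimal sketch of the call and of plausible output, assuming a hypothetical non-NUMA machine with DMA and Normal zones; the actual lines depend on the memory layout, and NUMA kernels additionally print a "thisnode" list per populated zone:

/* Sketch only: invoke the hook once the zonelists have been built. */
mminit_verify_zonelist();

/*
 * Illustrative dmesg output, derived from the printk format strings in
 * the mm/mm_init.c hunk below:
 *
 *   mminit::zonelist general 0:Normal = 0:Normal 0:DMA
 *   mminit::zonelist general 0:DMA = 0:DMA
 */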
mm/internal.h
1 | /* internal.h: mm/ internal definitions | 1 | /* internal.h: mm/ internal definitions |
2 | * | 2 | * |
3 | * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. | 3 | * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. |
4 | * Written by David Howells (dhowells@redhat.com) | 4 | * Written by David Howells (dhowells@redhat.com) |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public License | 7 | * modify it under the terms of the GNU General Public License |
8 | * as published by the Free Software Foundation; either version | 8 | * as published by the Free Software Foundation; either version |
9 | * 2 of the License, or (at your option) any later version. | 9 | * 2 of the License, or (at your option) any later version. |
10 | */ | 10 | */ |
11 | #ifndef __MM_INTERNAL_H | 11 | #ifndef __MM_INTERNAL_H |
12 | #define __MM_INTERNAL_H | 12 | #define __MM_INTERNAL_H |
13 | 13 | ||
14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
15 | 15 | ||
16 | static inline void set_page_count(struct page *page, int v) | 16 | static inline void set_page_count(struct page *page, int v) |
17 | { | 17 | { |
18 | atomic_set(&page->_count, v); | 18 | atomic_set(&page->_count, v); |
19 | } | 19 | } |
20 | 20 | ||
21 | /* | 21 | /* |
22 | * Turn a non-refcounted page (->_count == 0) into refcounted with | 22 | * Turn a non-refcounted page (->_count == 0) into refcounted with |
23 | * a count of one. | 23 | * a count of one. |
24 | */ | 24 | */ |
25 | static inline void set_page_refcounted(struct page *page) | 25 | static inline void set_page_refcounted(struct page *page) |
26 | { | 26 | { |
27 | VM_BUG_ON(PageTail(page)); | 27 | VM_BUG_ON(PageTail(page)); |
28 | VM_BUG_ON(atomic_read(&page->_count)); | 28 | VM_BUG_ON(atomic_read(&page->_count)); |
29 | set_page_count(page, 1); | 29 | set_page_count(page, 1); |
30 | } | 30 | } |
31 | 31 | ||
32 | static inline void __put_page(struct page *page) | 32 | static inline void __put_page(struct page *page) |
33 | { | 33 | { |
34 | atomic_dec(&page->_count); | 34 | atomic_dec(&page->_count); |
35 | } | 35 | } |
36 | 36 | ||
37 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 37 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * function for dealing with page's order in buddy system. | 40 | * function for dealing with page's order in buddy system. |
41 | * zone->lock is already acquired when we use these. | 41 | * zone->lock is already acquired when we use these. |
42 | * So, we don't need atomic page->flags operations here. | 42 | * So, we don't need atomic page->flags operations here. |
43 | */ | 43 | */ |
44 | static inline unsigned long page_order(struct page *page) | 44 | static inline unsigned long page_order(struct page *page) |
45 | { | 45 | { |
46 | VM_BUG_ON(!PageBuddy(page)); | 46 | VM_BUG_ON(!PageBuddy(page)); |
47 | return page_private(page); | 47 | return page_private(page); |
48 | } | 48 | } |
49 | 49 | ||
50 | /* | 50 | /* |
51 | * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, | 51 | * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, |
52 | * so all functions starting at paging_init should be marked __init | 52 | * so all functions starting at paging_init should be marked __init |
53 | * in those cases. SPARSEMEM, however, allows for memory hotplug, | 53 | * in those cases. SPARSEMEM, however, allows for memory hotplug, |
54 | * and alloc_bootmem_node is not used. | 54 | * and alloc_bootmem_node is not used. |
55 | */ | 55 | */ |
56 | #ifdef CONFIG_SPARSEMEM | 56 | #ifdef CONFIG_SPARSEMEM |
57 | #define __paginginit __meminit | 57 | #define __paginginit __meminit |
58 | #else | 58 | #else |
59 | #define __paginginit __init | 59 | #define __paginginit __init |
60 | #endif | 60 | #endif |
61 | 61 | ||
62 | /* Memory initialisation debug and verification */ | 62 | /* Memory initialisation debug and verification */ |
63 | enum mminit_level { | 63 | enum mminit_level { |
64 | MMINIT_WARNING, | 64 | MMINIT_WARNING, |
65 | MMINIT_VERIFY, | 65 | MMINIT_VERIFY, |
66 | MMINIT_TRACE | 66 | MMINIT_TRACE |
67 | }; | 67 | }; |
68 | 68 | ||
69 | #ifdef CONFIG_DEBUG_MEMORY_INIT | 69 | #ifdef CONFIG_DEBUG_MEMORY_INIT |
70 | 70 | ||
71 | extern int mminit_loglevel; | 71 | extern int mminit_loglevel; |
72 | 72 | ||
73 | #define mminit_dprintk(level, prefix, fmt, arg...) \ | 73 | #define mminit_dprintk(level, prefix, fmt, arg...) \ |
74 | do { \ | 74 | do { \ |
75 | if (level < mminit_loglevel) { \ | 75 | if (level < mminit_loglevel) { \ |
76 | printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ | 76 | printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ |
77 | printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ | 77 | printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ |
78 | } \ | 78 | } \ |
79 | } while (0) | 79 | } while (0) |
80 | 80 | ||
81 | extern void mminit_verify_pageflags_layout(void); | 81 | extern void mminit_verify_pageflags_layout(void); |
82 | extern void mminit_verify_page_links(struct page *page, | 82 | extern void mminit_verify_page_links(struct page *page, |
83 | enum zone_type zone, unsigned long nid, unsigned long pfn); | 83 | enum zone_type zone, unsigned long nid, unsigned long pfn); |
84 | extern void mminit_verify_zonelist(void); | ||
84 | 85 | ||
85 | #else | 86 | #else |
86 | 87 | ||
87 | static inline void mminit_dprintk(enum mminit_level level, | 88 | static inline void mminit_dprintk(enum mminit_level level, |
88 | const char *prefix, const char *fmt, ...) | 89 | const char *prefix, const char *fmt, ...) |
89 | { | 90 | { |
90 | } | 91 | } |
91 | 92 | ||
92 | static inline void mminit_verify_pageflags_layout(void) | 93 | static inline void mminit_verify_pageflags_layout(void) |
93 | { | 94 | { |
94 | } | 95 | } |
95 | 96 | ||
96 | static inline void mminit_verify_page_links(struct page *page, | 97 | static inline void mminit_verify_page_links(struct page *page, |
97 | enum zone_type zone, unsigned long nid, unsigned long pfn) | 98 | enum zone_type zone, unsigned long nid, unsigned long pfn) |
99 | { | ||
100 | } | ||
101 | |||
102 | static inline void mminit_verify_zonelist(void) | ||
98 | { | 103 | { |
99 | } | 104 | } |
100 | #endif /* CONFIG_DEBUG_MEMORY_INIT */ | 105 | #endif /* CONFIG_DEBUG_MEMORY_INIT */ |
101 | 106 | ||
102 | /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ | 107 | /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ |
103 | #if defined(CONFIG_SPARSEMEM) | 108 | #if defined(CONFIG_SPARSEMEM) |
104 | extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, | 109 | extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, |
105 | unsigned long *end_pfn); | 110 | unsigned long *end_pfn); |
106 | #else | 111 | #else |
107 | static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, | 112 | static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, |
108 | unsigned long *end_pfn) | 113 | unsigned long *end_pfn) |
109 | { | 114 | { |
110 | } | 115 | } |
111 | #endif /* CONFIG_SPARSEMEM */ | 116 | #endif /* CONFIG_SPARSEMEM */ |
112 | 117 | ||
113 | #endif | 118 | #endif |
114 | 119 |
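Side note on the mm/internal.h hunk above (illustration, not part of the patch): because the !CONFIG_DEBUG_MEMORY_INIT branch supplies an empty inline mminit_verify_zonelist() stub, generic initialisation code can call the hook unconditionally and non-debug builds simply compile it away. A hypothetical caller might look like this:

/* Hypothetical call site, shown only to illustrate the stub pattern. */
static void __init example_build_zonelists(void)
{
	/* ... construct the per-node zonelists ... */
	mminit_verify_zonelist();	/* no #ifdef needed at the call site */
}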
mm/mm_init.c
1 | /* | 1 | /* |
2 | * mm_init.c - Memory initialisation verification and debugging | 2 | * mm_init.c - Memory initialisation verification and debugging |
3 | * | 3 | * |
4 | * Copyright 2008 IBM Corporation, 2008 | 4 | * Copyright 2008 IBM Corporation, 2008 |
5 | * Author Mel Gorman <mel@csn.ul.ie> | 5 | * Author Mel Gorman <mel@csn.ul.ie> |
6 | * | 6 | * |
7 | */ | 7 | */ |
8 | #include <linux/kernel.h> | 8 | #include <linux/kernel.h> |
9 | #include <linux/init.h> | 9 | #include <linux/init.h> |
10 | #include "internal.h" | 10 | #include "internal.h" |
11 | 11 | ||
12 | int __meminitdata mminit_loglevel; | 12 | int __meminitdata mminit_loglevel; |
13 | 13 | ||
14 | /* The zonelists are simply reported, validation is manual. */ | ||
15 | void mminit_verify_zonelist(void) | ||
16 | { | ||
17 | int nid; | ||
18 | |||
19 | if (mminit_loglevel < MMINIT_VERIFY) | ||
20 | return; | ||
21 | |||
22 | for_each_online_node(nid) { | ||
23 | pg_data_t *pgdat = NODE_DATA(nid); | ||
24 | struct zone *zone; | ||
25 | struct zoneref *z; | ||
26 | struct zonelist *zonelist; | ||
27 | int i, listid, zoneid; | ||
28 | |||
29 | BUG_ON(MAX_ZONELISTS > 2); | ||
30 | for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) { | ||
31 | |||
32 | /* Identify the zone and nodelist */ | ||
33 | zoneid = i % MAX_NR_ZONES; | ||
34 | listid = i / MAX_NR_ZONES; | ||
35 | zonelist = &pgdat->node_zonelists[listid]; | ||
36 | zone = &pgdat->node_zones[zoneid]; | ||
37 | if (!populated_zone(zone)) | ||
38 | continue; | ||
39 | |||
40 | /* Print information about the zonelist */ | ||
41 | printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ", | ||
42 | listid > 0 ? "thisnode" : "general", nid, | ||
43 | zone->name); | ||
44 | |||
45 | /* Iterate the zonelist */ | ||
46 | for_each_zone_zonelist(zone, z, zonelist, zoneid) { | ||
47 | #ifdef CONFIG_NUMA | ||
48 | printk(KERN_CONT "%d:%s ", | ||
49 | zone->node, zone->name); | ||
50 | #else | ||
51 | printk(KERN_CONT "0:%s ", zone->name); | ||
52 | #endif /* CONFIG_NUMA */ | ||
53 | } | ||
54 | printk(KERN_CONT "\n"); | ||
55 | } | ||
56 | } | ||
57 | } | ||
58 | |||
14 | void __init mminit_verify_pageflags_layout(void) | 59 | void __init mminit_verify_pageflags_layout(void) |
15 | { | 60 | { |
16 | int shift, width; | 61 | int shift, width; |
17 | unsigned long or_mask, add_mask; | 62 | unsigned long or_mask, add_mask; |
18 | 63 | ||
19 | shift = 8 * sizeof(unsigned long); | 64 | shift = 8 * sizeof(unsigned long); |
20 | width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH; | 65 | width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH; |
21 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", | 66 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", |
22 | "Section %d Node %d Zone %d Flags %d\n", | 67 | "Section %d Node %d Zone %d Flags %d\n", |
23 | SECTIONS_WIDTH, | 68 | SECTIONS_WIDTH, |
24 | NODES_WIDTH, | 69 | NODES_WIDTH, |
25 | ZONES_WIDTH, | 70 | ZONES_WIDTH, |
26 | NR_PAGEFLAGS); | 71 | NR_PAGEFLAGS); |
27 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", | 72 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", |
28 | "Section %d Node %d Zone %d\n", | 73 | "Section %d Node %d Zone %d\n", |
29 | #ifdef SECTIONS_SHIFT | 74 | #ifdef SECTIONS_SHIFT |
30 | SECTIONS_SHIFT, | 75 | SECTIONS_SHIFT, |
31 | #else | 76 | #else |
32 | 0, | 77 | 0, |
33 | #endif | 78 | #endif |
34 | NODES_SHIFT, | 79 | NODES_SHIFT, |
35 | ZONES_SHIFT); | 80 | ZONES_SHIFT); |
36 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", | 81 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", |
37 | "Section %lu Node %lu Zone %lu\n", | 82 | "Section %lu Node %lu Zone %lu\n", |
38 | (unsigned long)SECTIONS_PGSHIFT, | 83 | (unsigned long)SECTIONS_PGSHIFT, |
39 | (unsigned long)NODES_PGSHIFT, | 84 | (unsigned long)NODES_PGSHIFT, |
40 | (unsigned long)ZONES_PGSHIFT); | 85 | (unsigned long)ZONES_PGSHIFT); |
41 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid", | 86 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid", |
42 | "Zone ID: %lu -> %lu\n", | 87 | "Zone ID: %lu -> %lu\n", |
43 | (unsigned long)ZONEID_PGOFF, | 88 | (unsigned long)ZONEID_PGOFF, |
44 | (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT)); | 89 | (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT)); |
45 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", | 90 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", |
46 | "location: %d -> %d unused %d -> %d flags %d -> %d\n", | 91 | "location: %d -> %d unused %d -> %d flags %d -> %d\n", |
47 | shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); | 92 | shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); |
48 | #ifdef NODE_NOT_IN_PAGE_FLAGS | 93 | #ifdef NODE_NOT_IN_PAGE_FLAGS |
49 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", | 94 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", |
50 | "Node not in page flags"); | 95 | "Node not in page flags"); |
51 | #endif | 96 | #endif |
52 | 97 | ||
53 | if (SECTIONS_WIDTH) { | 98 | if (SECTIONS_WIDTH) { |
54 | shift -= SECTIONS_WIDTH; | 99 | shift -= SECTIONS_WIDTH; |
55 | BUG_ON(shift != SECTIONS_PGSHIFT); | 100 | BUG_ON(shift != SECTIONS_PGSHIFT); |
56 | } | 101 | } |
57 | if (NODES_WIDTH) { | 102 | if (NODES_WIDTH) { |
58 | shift -= NODES_WIDTH; | 103 | shift -= NODES_WIDTH; |
59 | BUG_ON(shift != NODES_PGSHIFT); | 104 | BUG_ON(shift != NODES_PGSHIFT); |
60 | } | 105 | } |
61 | if (ZONES_WIDTH) { | 106 | if (ZONES_WIDTH) { |
62 | shift -= ZONES_WIDTH; | 107 | shift -= ZONES_WIDTH; |
63 | BUG_ON(shift != ZONES_PGSHIFT); | 108 | BUG_ON(shift != ZONES_PGSHIFT); |
64 | } | 109 | } |
65 | 110 | ||
66 | /* Check for bitmask overlaps */ | 111 | /* Check for bitmask overlaps */ |
67 | or_mask = (ZONES_MASK << ZONES_PGSHIFT) | | 112 | or_mask = (ZONES_MASK << ZONES_PGSHIFT) | |
68 | (NODES_MASK << NODES_PGSHIFT) | | 113 | (NODES_MASK << NODES_PGSHIFT) | |
69 | (SECTIONS_MASK << SECTIONS_PGSHIFT); | 114 | (SECTIONS_MASK << SECTIONS_PGSHIFT); |
70 | add_mask = (ZONES_MASK << ZONES_PGSHIFT) + | 115 | add_mask = (ZONES_MASK << ZONES_PGSHIFT) + |
71 | (NODES_MASK << NODES_PGSHIFT) + | 116 | (NODES_MASK << NODES_PGSHIFT) + |
72 | (SECTIONS_MASK << SECTIONS_PGSHIFT); | 117 | (SECTIONS_MASK << SECTIONS_PGSHIFT); |
73 | BUG_ON(or_mask != add_mask); | 118 | BUG_ON(or_mask != add_mask); |
74 | } | 119 | } |
75 | 120 | ||
76 | void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone, | 121 | void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone, |
77 | unsigned long nid, unsigned long pfn) | 122 | unsigned long nid, unsigned long pfn) |
78 | { | 123 | { |
79 | BUG_ON(page_to_nid(page) != nid); | 124 | BUG_ON(page_to_nid(page) != nid); |
80 | BUG_ON(page_zonenum(page) != zone); | 125 | BUG_ON(page_zonenum(page) != zone); |
81 | BUG_ON(page_to_pfn(page) != pfn); | 126 | BUG_ON(page_to_pfn(page) != pfn); |
82 | } | 127 | } |
83 | 128 | ||
84 | static __init int set_mminit_loglevel(char *str) | 129 | static __init int set_mminit_loglevel(char *str) |
85 | { | 130 | { |
86 | get_option(&str, &mminit_loglevel); | 131 | get_option(&str, &mminit_loglevel); |
87 | return 0; | 132 | return 0; |
88 | } | 133 | } |
89 | early_param("mminit_loglevel", set_mminit_loglevel); | 134 | early_param("mminit_loglevel", set_mminit_loglevel); |
90 | 135 |
mm/page_alloc.c
1 | /* | 1 | /* |
2 | * linux/mm/page_alloc.c | 2 | * linux/mm/page_alloc.c |
3 | * | 3 | * |
4 | * Manages the free list, the system allocates free pages here. | 4 | * Manages the free list, the system allocates free pages here. |
5 | * Note that kmalloc() lives in slab.c | 5 | * Note that kmalloc() lives in slab.c |
6 | * | 6 | * |
7 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | 7 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
8 | * Swap reorganised 29.12.95, Stephen Tweedie | 8 | * Swap reorganised 29.12.95, Stephen Tweedie |
9 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 | 9 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 |
10 | * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 | 10 | * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 |
11 | * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 | 11 | * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 |
12 | * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 | 12 | * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 |
13 | * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 | 13 | * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 |
14 | * (lots of bits borrowed from Ingo Molnar & Andrew Morton) | 14 | * (lots of bits borrowed from Ingo Molnar & Andrew Morton) |
15 | */ | 15 | */ |
16 | 16 | ||
17 | #include <linux/stddef.h> | 17 | #include <linux/stddef.h> |
18 | #include <linux/mm.h> | 18 | #include <linux/mm.h> |
19 | #include <linux/swap.h> | 19 | #include <linux/swap.h> |
20 | #include <linux/interrupt.h> | 20 | #include <linux/interrupt.h> |
21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/jiffies.h> | 22 | #include <linux/jiffies.h> |
23 | #include <linux/bootmem.h> | 23 | #include <linux/bootmem.h> |
24 | #include <linux/compiler.h> | 24 | #include <linux/compiler.h> |
25 | #include <linux/kernel.h> | 25 | #include <linux/kernel.h> |
26 | #include <linux/module.h> | 26 | #include <linux/module.h> |
27 | #include <linux/suspend.h> | 27 | #include <linux/suspend.h> |
28 | #include <linux/pagevec.h> | 28 | #include <linux/pagevec.h> |
29 | #include <linux/blkdev.h> | 29 | #include <linux/blkdev.h> |
30 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
31 | #include <linux/oom.h> | 31 | #include <linux/oom.h> |
32 | #include <linux/notifier.h> | 32 | #include <linux/notifier.h> |
33 | #include <linux/topology.h> | 33 | #include <linux/topology.h> |
34 | #include <linux/sysctl.h> | 34 | #include <linux/sysctl.h> |
35 | #include <linux/cpu.h> | 35 | #include <linux/cpu.h> |
36 | #include <linux/cpuset.h> | 36 | #include <linux/cpuset.h> |
37 | #include <linux/memory_hotplug.h> | 37 | #include <linux/memory_hotplug.h> |
38 | #include <linux/nodemask.h> | 38 | #include <linux/nodemask.h> |
39 | #include <linux/vmalloc.h> | 39 | #include <linux/vmalloc.h> |
40 | #include <linux/mempolicy.h> | 40 | #include <linux/mempolicy.h> |
41 | #include <linux/stop_machine.h> | 41 | #include <linux/stop_machine.h> |
42 | #include <linux/sort.h> | 42 | #include <linux/sort.h> |
43 | #include <linux/pfn.h> | 43 | #include <linux/pfn.h> |
44 | #include <linux/backing-dev.h> | 44 | #include <linux/backing-dev.h> |
45 | #include <linux/fault-inject.h> | 45 | #include <linux/fault-inject.h> |
46 | #include <linux/page-isolation.h> | 46 | #include <linux/page-isolation.h> |
47 | #include <linux/memcontrol.h> | 47 | #include <linux/memcontrol.h> |
48 | #include <linux/debugobjects.h> | 48 | #include <linux/debugobjects.h> |
49 | 49 | ||
50 | #include <asm/tlbflush.h> | 50 | #include <asm/tlbflush.h> |
51 | #include <asm/div64.h> | 51 | #include <asm/div64.h> |
52 | #include "internal.h" | 52 | #include "internal.h" |
53 | 53 | ||
54 | /* | 54 | /* |
55 | * Array of node states. | 55 | * Array of node states. |
56 | */ | 56 | */ |
57 | nodemask_t node_states[NR_NODE_STATES] __read_mostly = { | 57 | nodemask_t node_states[NR_NODE_STATES] __read_mostly = { |
58 | [N_POSSIBLE] = NODE_MASK_ALL, | 58 | [N_POSSIBLE] = NODE_MASK_ALL, |
59 | [N_ONLINE] = { { [0] = 1UL } }, | 59 | [N_ONLINE] = { { [0] = 1UL } }, |
60 | #ifndef CONFIG_NUMA | 60 | #ifndef CONFIG_NUMA |
61 | [N_NORMAL_MEMORY] = { { [0] = 1UL } }, | 61 | [N_NORMAL_MEMORY] = { { [0] = 1UL } }, |
62 | #ifdef CONFIG_HIGHMEM | 62 | #ifdef CONFIG_HIGHMEM |
63 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, | 63 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, |
64 | #endif | 64 | #endif |
65 | [N_CPU] = { { [0] = 1UL } }, | 65 | [N_CPU] = { { [0] = 1UL } }, |
66 | #endif /* NUMA */ | 66 | #endif /* NUMA */ |
67 | }; | 67 | }; |
68 | EXPORT_SYMBOL(node_states); | 68 | EXPORT_SYMBOL(node_states); |
69 | 69 | ||
70 | unsigned long totalram_pages __read_mostly; | 70 | unsigned long totalram_pages __read_mostly; |
71 | unsigned long totalreserve_pages __read_mostly; | 71 | unsigned long totalreserve_pages __read_mostly; |
72 | long nr_swap_pages; | 72 | long nr_swap_pages; |
73 | int percpu_pagelist_fraction; | 73 | int percpu_pagelist_fraction; |
74 | 74 | ||
75 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 75 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
76 | int pageblock_order __read_mostly; | 76 | int pageblock_order __read_mostly; |
77 | #endif | 77 | #endif |
78 | 78 | ||
79 | static void __free_pages_ok(struct page *page, unsigned int order); | 79 | static void __free_pages_ok(struct page *page, unsigned int order); |
80 | 80 | ||
81 | /* | 81 | /* |
82 | * results with 256, 32 in the lowmem_reserve sysctl: | 82 | * results with 256, 32 in the lowmem_reserve sysctl: |
83 | * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) | 83 | * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) |
84 | * 1G machine -> (16M dma, 784M normal, 224M high) | 84 | * 1G machine -> (16M dma, 784M normal, 224M high) |
85 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA | 85 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA |
86 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL | 86 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL |
87 | * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA | 87 | * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA |
88 | * | 88 | * |
89 | * TBD: should special case ZONE_DMA32 machines here - in those we normally | 89 | * TBD: should special case ZONE_DMA32 machines here - in those we normally |
90 | * don't need any ZONE_NORMAL reservation | 90 | * don't need any ZONE_NORMAL reservation |
91 | */ | 91 | */ |
92 | int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { | 92 | int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { |
93 | #ifdef CONFIG_ZONE_DMA | 93 | #ifdef CONFIG_ZONE_DMA |
94 | 256, | 94 | 256, |
95 | #endif | 95 | #endif |
96 | #ifdef CONFIG_ZONE_DMA32 | 96 | #ifdef CONFIG_ZONE_DMA32 |
97 | 256, | 97 | 256, |
98 | #endif | 98 | #endif |
99 | #ifdef CONFIG_HIGHMEM | 99 | #ifdef CONFIG_HIGHMEM |
100 | 32, | 100 | 32, |
101 | #endif | 101 | #endif |
102 | 32, | 102 | 32, |
103 | }; | 103 | }; |
104 | 104 | ||
105 | EXPORT_SYMBOL(totalram_pages); | 105 | EXPORT_SYMBOL(totalram_pages); |
106 | 106 | ||
107 | static char * const zone_names[MAX_NR_ZONES] = { | 107 | static char * const zone_names[MAX_NR_ZONES] = { |
108 | #ifdef CONFIG_ZONE_DMA | 108 | #ifdef CONFIG_ZONE_DMA |
109 | "DMA", | 109 | "DMA", |
110 | #endif | 110 | #endif |
111 | #ifdef CONFIG_ZONE_DMA32 | 111 | #ifdef CONFIG_ZONE_DMA32 |
112 | "DMA32", | 112 | "DMA32", |
113 | #endif | 113 | #endif |
114 | "Normal", | 114 | "Normal", |
115 | #ifdef CONFIG_HIGHMEM | 115 | #ifdef CONFIG_HIGHMEM |
116 | "HighMem", | 116 | "HighMem", |
117 | #endif | 117 | #endif |
118 | "Movable", | 118 | "Movable", |
119 | }; | 119 | }; |
120 | 120 | ||
121 | int min_free_kbytes = 1024; | 121 | int min_free_kbytes = 1024; |
122 | 122 | ||
123 | unsigned long __meminitdata nr_kernel_pages; | 123 | unsigned long __meminitdata nr_kernel_pages; |
124 | unsigned long __meminitdata nr_all_pages; | 124 | unsigned long __meminitdata nr_all_pages; |
125 | static unsigned long __meminitdata dma_reserve; | 125 | static unsigned long __meminitdata dma_reserve; |
126 | 126 | ||
127 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | 127 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP |
128 | /* | 128 | /* |
129 | * MAX_ACTIVE_REGIONS determines the maximum number of distinct | 129 | * MAX_ACTIVE_REGIONS determines the maximum number of distinct |
130 | * ranges of memory (RAM) that may be registered with add_active_range(). | 130 | * ranges of memory (RAM) that may be registered with add_active_range(). |
131 | * Ranges passed to add_active_range() will be merged if possible | 131 | * Ranges passed to add_active_range() will be merged if possible |
132 | * so the number of times add_active_range() can be called is | 132 | * so the number of times add_active_range() can be called is |
133 | * related to the number of nodes and the number of holes | 133 | * related to the number of nodes and the number of holes |
134 | */ | 134 | */ |
135 | #ifdef CONFIG_MAX_ACTIVE_REGIONS | 135 | #ifdef CONFIG_MAX_ACTIVE_REGIONS |
136 | /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ | 136 | /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ |
137 | #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS | 137 | #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS |
138 | #else | 138 | #else |
139 | #if MAX_NUMNODES >= 32 | 139 | #if MAX_NUMNODES >= 32 |
140 | /* If there can be many nodes, allow up to 50 holes per node */ | 140 | /* If there can be many nodes, allow up to 50 holes per node */ |
141 | #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) | 141 | #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) |
142 | #else | 142 | #else |
143 | /* By default, allow up to 256 distinct regions */ | 143 | /* By default, allow up to 256 distinct regions */ |
144 | #define MAX_ACTIVE_REGIONS 256 | 144 | #define MAX_ACTIVE_REGIONS 256 |
145 | #endif | 145 | #endif |
146 | #endif | 146 | #endif |
147 | 147 | ||
148 | static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; | 148 | static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; |
149 | static int __meminitdata nr_nodemap_entries; | 149 | static int __meminitdata nr_nodemap_entries; |
150 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | 150 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; |
151 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | 151 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; |
152 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | 152 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE |
153 | static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; | 153 | static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; |
154 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; | 154 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; |
155 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | 155 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ |
156 | unsigned long __initdata required_kernelcore; | 156 | unsigned long __initdata required_kernelcore; |
157 | static unsigned long __initdata required_movablecore; | 157 | static unsigned long __initdata required_movablecore; |
158 | unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 158 | unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
159 | 159 | ||
160 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ | 160 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
161 | int movable_zone; | 161 | int movable_zone; |
162 | EXPORT_SYMBOL(movable_zone); | 162 | EXPORT_SYMBOL(movable_zone); |
163 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | 163 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ |
164 | 164 | ||
165 | #if MAX_NUMNODES > 1 | 165 | #if MAX_NUMNODES > 1 |
166 | int nr_node_ids __read_mostly = MAX_NUMNODES; | 166 | int nr_node_ids __read_mostly = MAX_NUMNODES; |
167 | EXPORT_SYMBOL(nr_node_ids); | 167 | EXPORT_SYMBOL(nr_node_ids); |
168 | #endif | 168 | #endif |
169 | 169 | ||
170 | int page_group_by_mobility_disabled __read_mostly; | 170 | int page_group_by_mobility_disabled __read_mostly; |
171 | 171 | ||
172 | static void set_pageblock_migratetype(struct page *page, int migratetype) | 172 | static void set_pageblock_migratetype(struct page *page, int migratetype) |
173 | { | 173 | { |
174 | set_pageblock_flags_group(page, (unsigned long)migratetype, | 174 | set_pageblock_flags_group(page, (unsigned long)migratetype, |
175 | PB_migrate, PB_migrate_end); | 175 | PB_migrate, PB_migrate_end); |
176 | } | 176 | } |
177 | 177 | ||
178 | #ifdef CONFIG_DEBUG_VM | 178 | #ifdef CONFIG_DEBUG_VM |
179 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 179 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
180 | { | 180 | { |
181 | int ret = 0; | 181 | int ret = 0; |
182 | unsigned seq; | 182 | unsigned seq; |
183 | unsigned long pfn = page_to_pfn(page); | 183 | unsigned long pfn = page_to_pfn(page); |
184 | 184 | ||
185 | do { | 185 | do { |
186 | seq = zone_span_seqbegin(zone); | 186 | seq = zone_span_seqbegin(zone); |
187 | if (pfn >= zone->zone_start_pfn + zone->spanned_pages) | 187 | if (pfn >= zone->zone_start_pfn + zone->spanned_pages) |
188 | ret = 1; | 188 | ret = 1; |
189 | else if (pfn < zone->zone_start_pfn) | 189 | else if (pfn < zone->zone_start_pfn) |
190 | ret = 1; | 190 | ret = 1; |
191 | } while (zone_span_seqretry(zone, seq)); | 191 | } while (zone_span_seqretry(zone, seq)); |
192 | 192 | ||
193 | return ret; | 193 | return ret; |
194 | } | 194 | } |
195 | 195 | ||
196 | static int page_is_consistent(struct zone *zone, struct page *page) | 196 | static int page_is_consistent(struct zone *zone, struct page *page) |
197 | { | 197 | { |
198 | if (!pfn_valid_within(page_to_pfn(page))) | 198 | if (!pfn_valid_within(page_to_pfn(page))) |
199 | return 0; | 199 | return 0; |
200 | if (zone != page_zone(page)) | 200 | if (zone != page_zone(page)) |
201 | return 0; | 201 | return 0; |
202 | 202 | ||
203 | return 1; | 203 | return 1; |
204 | } | 204 | } |
205 | /* | 205 | /* |
206 | * Temporary debugging check for pages not lying within a given zone. | 206 | * Temporary debugging check for pages not lying within a given zone. |
207 | */ | 207 | */ |
208 | static int bad_range(struct zone *zone, struct page *page) | 208 | static int bad_range(struct zone *zone, struct page *page) |
209 | { | 209 | { |
210 | if (page_outside_zone_boundaries(zone, page)) | 210 | if (page_outside_zone_boundaries(zone, page)) |
211 | return 1; | 211 | return 1; |
212 | if (!page_is_consistent(zone, page)) | 212 | if (!page_is_consistent(zone, page)) |
213 | return 1; | 213 | return 1; |
214 | 214 | ||
215 | return 0; | 215 | return 0; |
216 | } | 216 | } |
217 | #else | 217 | #else |
218 | static inline int bad_range(struct zone *zone, struct page *page) | 218 | static inline int bad_range(struct zone *zone, struct page *page) |
219 | { | 219 | { |
220 | return 0; | 220 | return 0; |
221 | } | 221 | } |
222 | #endif | 222 | #endif |
223 | 223 | ||
224 | static void bad_page(struct page *page) | 224 | static void bad_page(struct page *page) |
225 | { | 225 | { |
226 | void *pc = page_get_page_cgroup(page); | 226 | void *pc = page_get_page_cgroup(page); |
227 | 227 | ||
228 | printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG | 228 | printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG |
229 | "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", | 229 | "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", |
230 | current->comm, page, (int)(2*sizeof(unsigned long)), | 230 | current->comm, page, (int)(2*sizeof(unsigned long)), |
231 | (unsigned long)page->flags, page->mapping, | 231 | (unsigned long)page->flags, page->mapping, |
232 | page_mapcount(page), page_count(page)); | 232 | page_mapcount(page), page_count(page)); |
233 | if (pc) { | 233 | if (pc) { |
234 | printk(KERN_EMERG "cgroup:%p\n", pc); | 234 | printk(KERN_EMERG "cgroup:%p\n", pc); |
235 | page_reset_bad_cgroup(page); | 235 | page_reset_bad_cgroup(page); |
236 | } | 236 | } |
237 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" | 237 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" |
238 | KERN_EMERG "Backtrace:\n"); | 238 | KERN_EMERG "Backtrace:\n"); |
239 | dump_stack(); | 239 | dump_stack(); |
240 | page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; | 240 | page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; |
241 | set_page_count(page, 0); | 241 | set_page_count(page, 0); |
242 | reset_page_mapcount(page); | 242 | reset_page_mapcount(page); |
243 | page->mapping = NULL; | 243 | page->mapping = NULL; |
244 | add_taint(TAINT_BAD_PAGE); | 244 | add_taint(TAINT_BAD_PAGE); |
245 | } | 245 | } |
246 | 246 | ||
247 | /* | 247 | /* |
248 | * Higher-order pages are called "compound pages". They are structured thusly: | 248 | * Higher-order pages are called "compound pages". They are structured thusly: |
249 | * | 249 | * |
250 | * The first PAGE_SIZE page is called the "head page". | 250 | * The first PAGE_SIZE page is called the "head page". |
251 | * | 251 | * |
252 | * The remaining PAGE_SIZE pages are called "tail pages". | 252 | * The remaining PAGE_SIZE pages are called "tail pages". |
253 | * | 253 | * |
254 | * All pages have PG_compound set. All pages have their ->private pointing at | 254 | * All pages have PG_compound set. All pages have their ->private pointing at |
255 | * the head page (even the head page has this). | 255 | * the head page (even the head page has this). |
256 | * | 256 | * |
257 | * The first tail page's ->lru.next holds the address of the compound page's | 257 | * The first tail page's ->lru.next holds the address of the compound page's |
258 | * put_page() function. Its ->lru.prev holds the order of allocation. | 258 | * put_page() function. Its ->lru.prev holds the order of allocation. |
259 | * This usage means that zero-order pages may not be compound. | 259 | * This usage means that zero-order pages may not be compound. |
260 | */ | 260 | */ |
261 | 261 | ||
262 | static void free_compound_page(struct page *page) | 262 | static void free_compound_page(struct page *page) |
263 | { | 263 | { |
264 | __free_pages_ok(page, compound_order(page)); | 264 | __free_pages_ok(page, compound_order(page)); |
265 | } | 265 | } |
266 | 266 | ||
267 | static void prep_compound_page(struct page *page, unsigned long order) | 267 | static void prep_compound_page(struct page *page, unsigned long order) |
268 | { | 268 | { |
269 | int i; | 269 | int i; |
270 | int nr_pages = 1 << order; | 270 | int nr_pages = 1 << order; |
271 | 271 | ||
272 | set_compound_page_dtor(page, free_compound_page); | 272 | set_compound_page_dtor(page, free_compound_page); |
273 | set_compound_order(page, order); | 273 | set_compound_order(page, order); |
274 | __SetPageHead(page); | 274 | __SetPageHead(page); |
275 | for (i = 1; i < nr_pages; i++) { | 275 | for (i = 1; i < nr_pages; i++) { |
276 | struct page *p = page + i; | 276 | struct page *p = page + i; |
277 | 277 | ||
278 | __SetPageTail(p); | 278 | __SetPageTail(p); |
279 | p->first_page = page; | 279 | p->first_page = page; |
280 | } | 280 | } |
281 | } | 281 | } |
282 | 282 | ||
283 | static void destroy_compound_page(struct page *page, unsigned long order) | 283 | static void destroy_compound_page(struct page *page, unsigned long order) |
284 | { | 284 | { |
285 | int i; | 285 | int i; |
286 | int nr_pages = 1 << order; | 286 | int nr_pages = 1 << order; |
287 | 287 | ||
288 | if (unlikely(compound_order(page) != order)) | 288 | if (unlikely(compound_order(page) != order)) |
289 | bad_page(page); | 289 | bad_page(page); |
290 | 290 | ||
291 | if (unlikely(!PageHead(page))) | 291 | if (unlikely(!PageHead(page))) |
292 | bad_page(page); | 292 | bad_page(page); |
293 | __ClearPageHead(page); | 293 | __ClearPageHead(page); |
294 | for (i = 1; i < nr_pages; i++) { | 294 | for (i = 1; i < nr_pages; i++) { |
295 | struct page *p = page + i; | 295 | struct page *p = page + i; |
296 | 296 | ||
297 | if (unlikely(!PageTail(p) | | 297 | if (unlikely(!PageTail(p) | |
298 | (p->first_page != page))) | 298 | (p->first_page != page))) |
299 | bad_page(page); | 299 | bad_page(page); |
300 | __ClearPageTail(p); | 300 | __ClearPageTail(p); |
301 | } | 301 | } |
302 | } | 302 | } |
303 | 303 | ||
304 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | 304 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) |
305 | { | 305 | { |
306 | int i; | 306 | int i; |
307 | 307 | ||
308 | /* | 308 | /* |
309 | * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO | 309 | * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO |
310 | * and __GFP_HIGHMEM from hard or soft interrupt context. | 310 | * and __GFP_HIGHMEM from hard or soft interrupt context. |
311 | */ | 311 | */ |
312 | VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); | 312 | VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); |
313 | for (i = 0; i < (1 << order); i++) | 313 | for (i = 0; i < (1 << order); i++) |
314 | clear_highpage(page + i); | 314 | clear_highpage(page + i); |
315 | } | 315 | } |
316 | 316 | ||
317 | static inline void set_page_order(struct page *page, int order) | 317 | static inline void set_page_order(struct page *page, int order) |
318 | { | 318 | { |
319 | set_page_private(page, order); | 319 | set_page_private(page, order); |
320 | __SetPageBuddy(page); | 320 | __SetPageBuddy(page); |
321 | } | 321 | } |
322 | 322 | ||
323 | static inline void rmv_page_order(struct page *page) | 323 | static inline void rmv_page_order(struct page *page) |
324 | { | 324 | { |
325 | __ClearPageBuddy(page); | 325 | __ClearPageBuddy(page); |
326 | set_page_private(page, 0); | 326 | set_page_private(page, 0); |
327 | } | 327 | } |
328 | 328 | ||
329 | /* | 329 | /* |
330 | * Locate the struct page for both the matching buddy in our | 330 | * Locate the struct page for both the matching buddy in our |
331 | * pair (buddy1) and the combined O(n+1) page they form (page). | 331 | * pair (buddy1) and the combined O(n+1) page they form (page). |
332 | * | 332 | * |
333 | * 1) Any buddy B1 will have an order O twin B2 which satisfies | 333 | * 1) Any buddy B1 will have an order O twin B2 which satisfies |
334 | * the following equation: | 334 | * the following equation: |
335 | * B2 = B1 ^ (1 << O) | 335 | * B2 = B1 ^ (1 << O) |
336 | * For example, if the starting buddy (buddy2) is #8 its order | 336 | * For example, if the starting buddy (buddy2) is #8 its order |
337 | * 1 buddy is #10: | 337 | * 1 buddy is #10: |
338 | * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 | 338 | * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 |
339 | * | 339 | * |
340 | * 2) Any buddy B will have an order O+1 parent P which | 340 | * 2) Any buddy B will have an order O+1 parent P which |
341 | * satisfies the following equation: | 341 | * satisfies the following equation: |
342 | * P = B & ~(1 << O) | 342 | * P = B & ~(1 << O) |
343 | * | 343 | * |
344 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER | 344 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER |
345 | */ | 345 | */ |
346 | static inline struct page * | 346 | static inline struct page * |
347 | __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) | 347 | __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) |
348 | { | 348 | { |
349 | unsigned long buddy_idx = page_idx ^ (1 << order); | 349 | unsigned long buddy_idx = page_idx ^ (1 << order); |
350 | 350 | ||
351 | return page + (buddy_idx - page_idx); | 351 | return page + (buddy_idx - page_idx); |
352 | } | 352 | } |
353 | 353 | ||
354 | static inline unsigned long | 354 | static inline unsigned long |
355 | __find_combined_index(unsigned long page_idx, unsigned int order) | 355 | __find_combined_index(unsigned long page_idx, unsigned int order) |
356 | { | 356 | { |
357 | return (page_idx & ~(1 << order)); | 357 | return (page_idx & ~(1 << order)); |
358 | } | 358 | } |
359 | 359 | ||
360 | /* | 360 | /* |
361 | * This function checks whether a page is free && is the buddy | 361 | * This function checks whether a page is free && is the buddy |
362 | * we can do coalesce a page and its buddy if | 362 | * we can do coalesce a page and its buddy if |
363 | * (a) the buddy is not in a hole && | 363 | * (a) the buddy is not in a hole && |
364 | * (b) the buddy is in the buddy system && | 364 | * (b) the buddy is in the buddy system && |
365 | * (c) a page and its buddy have the same order && | 365 | * (c) a page and its buddy have the same order && |
366 | * (d) a page and its buddy are in the same zone. | 366 | * (d) a page and its buddy are in the same zone. |
367 | * | 367 | * |
368 | * For recording whether a page is in the buddy system, we use PG_buddy. | 368 | * For recording whether a page is in the buddy system, we use PG_buddy. |
369 | * Setting, clearing, and testing PG_buddy is serialized by zone->lock. | 369 | * Setting, clearing, and testing PG_buddy is serialized by zone->lock. |
370 | * | 370 | * |
371 | * For recording page's order, we use page_private(page). | 371 | * For recording page's order, we use page_private(page). |
372 | */ | 372 | */ |
373 | static inline int page_is_buddy(struct page *page, struct page *buddy, | 373 | static inline int page_is_buddy(struct page *page, struct page *buddy, |
374 | int order) | 374 | int order) |
375 | { | 375 | { |
376 | if (!pfn_valid_within(page_to_pfn(buddy))) | 376 | if (!pfn_valid_within(page_to_pfn(buddy))) |
377 | return 0; | 377 | return 0; |
378 | 378 | ||
379 | if (page_zone_id(page) != page_zone_id(buddy)) | 379 | if (page_zone_id(page) != page_zone_id(buddy)) |
380 | return 0; | 380 | return 0; |
381 | 381 | ||
382 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 382 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
383 | BUG_ON(page_count(buddy) != 0); | 383 | BUG_ON(page_count(buddy) != 0); |
384 | return 1; | 384 | return 1; |
385 | } | 385 | } |
386 | return 0; | 386 | return 0; |
387 | } | 387 | } |
388 | 388 | ||
389 | /* | 389 | /* |
390 | * Freeing function for a buddy system allocator. | 390 | * Freeing function for a buddy system allocator. |
391 | * | 391 | * |
392 | * The concept of a buddy system is to maintain direct-mapped table | 392 | * The concept of a buddy system is to maintain direct-mapped table |
393 | * (containing bit values) for memory blocks of various "orders". | 393 | * (containing bit values) for memory blocks of various "orders". |
394 | * The bottom level table contains the map for the smallest allocatable | 394 | * The bottom level table contains the map for the smallest allocatable |
395 | * units of memory (here, pages), and each level above it describes | 395 | * units of memory (here, pages), and each level above it describes |
396 | * pairs of units from the levels below, hence, "buddies". | 396 | * pairs of units from the levels below, hence, "buddies". |
397 | * At a high level, all that happens here is marking the table entry | 397 | * At a high level, all that happens here is marking the table entry |
398 | * at the bottom level available, and propagating the changes upward | 398 | * at the bottom level available, and propagating the changes upward |
399 | * as necessary, plus some accounting needed to play nicely with other | 399 | * as necessary, plus some accounting needed to play nicely with other |
400 | * parts of the VM system. | 400 | * parts of the VM system. |
401 | * At each level, we keep a list of pages, which are heads of continuous | 401 | * At each level, we keep a list of pages, which are heads of continuous |
402 | * free pages of length of (1 << order) and marked with PG_buddy. Page's | 402 | * free pages of length of (1 << order) and marked with PG_buddy. Page's |
403 | * order is recorded in page_private(page) field. | 403 | * order is recorded in page_private(page) field. |
404 | * So when we are allocating or freeing one, we can derive the state of the | 404 | * So when we are allocating or freeing one, we can derive the state of the |
405 | * other. That is, if we allocate a small block, and both were | 405 | * other. That is, if we allocate a small block, and both were |
406 | * free, the remainder of the region must be split into blocks. | 406 | * free, the remainder of the region must be split into blocks. |
407 | * If a block is freed, and its buddy is also free, then this | 407 | * If a block is freed, and its buddy is also free, then this |
408 | * triggers coalescing into a block of larger size. | 408 | * triggers coalescing into a block of larger size. |
409 | * | 409 | * |
410 | * -- wli | 410 | * -- wli |
411 | */ | 411 | */ |
412 | 412 | ||
413 | static inline void __free_one_page(struct page *page, | 413 | static inline void __free_one_page(struct page *page, |
414 | struct zone *zone, unsigned int order) | 414 | struct zone *zone, unsigned int order) |
415 | { | 415 | { |
416 | unsigned long page_idx; | 416 | unsigned long page_idx; |
417 | int order_size = 1 << order; | 417 | int order_size = 1 << order; |
418 | int migratetype = get_pageblock_migratetype(page); | 418 | int migratetype = get_pageblock_migratetype(page); |
419 | 419 | ||
420 | if (unlikely(PageCompound(page))) | 420 | if (unlikely(PageCompound(page))) |
421 | destroy_compound_page(page, order); | 421 | destroy_compound_page(page, order); |
422 | 422 | ||
423 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 423 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
424 | 424 | ||
425 | VM_BUG_ON(page_idx & (order_size - 1)); | 425 | VM_BUG_ON(page_idx & (order_size - 1)); |
426 | VM_BUG_ON(bad_range(zone, page)); | 426 | VM_BUG_ON(bad_range(zone, page)); |
427 | 427 | ||
428 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); | 428 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); |
429 | while (order < MAX_ORDER-1) { | 429 | while (order < MAX_ORDER-1) { |
430 | unsigned long combined_idx; | 430 | unsigned long combined_idx; |
431 | struct page *buddy; | 431 | struct page *buddy; |
432 | 432 | ||
433 | buddy = __page_find_buddy(page, page_idx, order); | 433 | buddy = __page_find_buddy(page, page_idx, order); |
434 | if (!page_is_buddy(page, buddy, order)) | 434 | if (!page_is_buddy(page, buddy, order)) |
435 | break; /* Move the buddy up one level. */ | 435 | break; /* Move the buddy up one level. */ |
436 | 436 | ||
437 | list_del(&buddy->lru); | 437 | list_del(&buddy->lru); |
438 | zone->free_area[order].nr_free--; | 438 | zone->free_area[order].nr_free--; |
439 | rmv_page_order(buddy); | 439 | rmv_page_order(buddy); |
440 | combined_idx = __find_combined_index(page_idx, order); | 440 | combined_idx = __find_combined_index(page_idx, order); |
441 | page = page + (combined_idx - page_idx); | 441 | page = page + (combined_idx - page_idx); |
442 | page_idx = combined_idx; | 442 | page_idx = combined_idx; |
443 | order++; | 443 | order++; |
444 | } | 444 | } |
445 | set_page_order(page, order); | 445 | set_page_order(page, order); |
446 | list_add(&page->lru, | 446 | list_add(&page->lru, |
447 | &zone->free_area[order].free_list[migratetype]); | 447 | &zone->free_area[order].free_list[migratetype]); |
448 | zone->free_area[order].nr_free++; | 448 | zone->free_area[order].nr_free++; |
449 | } | 449 | } |
450 | 450 | ||
451 | static inline int free_pages_check(struct page *page) | 451 | static inline int free_pages_check(struct page *page) |
452 | { | 452 | { |
453 | if (unlikely(page_mapcount(page) | | 453 | if (unlikely(page_mapcount(page) | |
454 | (page->mapping != NULL) | | 454 | (page->mapping != NULL) | |
455 | (page_get_page_cgroup(page) != NULL) | | 455 | (page_get_page_cgroup(page) != NULL) | |
456 | (page_count(page) != 0) | | 456 | (page_count(page) != 0) | |
457 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) | 457 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) |
458 | bad_page(page); | 458 | bad_page(page); |
459 | if (PageDirty(page)) | 459 | if (PageDirty(page)) |
460 | __ClearPageDirty(page); | 460 | __ClearPageDirty(page); |
461 | /* | 461 | /* |
462 | * For now, we report if PG_reserved was found set, but do not | 462 | * For now, we report if PG_reserved was found set, but do not |
463 | * clear it, and do not free the page. But we shall soon need | 463 | * clear it, and do not free the page. But we shall soon need |
464 | * to do more, for when the ZERO_PAGE count wraps negative. | 464 | * to do more, for when the ZERO_PAGE count wraps negative. |
465 | */ | 465 | */ |
466 | return PageReserved(page); | 466 | return PageReserved(page); |
467 | } | 467 | } |
468 | 468 | ||
469 | /* | 469 | /* |
470 | * Frees a list of pages. | 470 | * Frees a list of pages. |
471 | * Assumes all pages on list are in same zone, and of same order. | 471 | * Assumes all pages on list are in same zone, and of same order. |
472 | * count is the number of pages to free. | 472 | * count is the number of pages to free. |
473 | * | 473 | * |
474 | * If the zone was previously in an "all pages pinned" state then look to | 474 | * If the zone was previously in an "all pages pinned" state then look to |
475 | * see if this freeing clears that state. | 475 | * see if this freeing clears that state. |
476 | * | 476 | * |
477 | * And clear the zone's pages_scanned counter, to hold off the "all pages are | 477 | * And clear the zone's pages_scanned counter, to hold off the "all pages are |
478 | * pinned" detection logic. | 478 | * pinned" detection logic. |
479 | */ | 479 | */ |
480 | static void free_pages_bulk(struct zone *zone, int count, | 480 | static void free_pages_bulk(struct zone *zone, int count, |
481 | struct list_head *list, int order) | 481 | struct list_head *list, int order) |
482 | { | 482 | { |
483 | spin_lock(&zone->lock); | 483 | spin_lock(&zone->lock); |
484 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 484 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
485 | zone->pages_scanned = 0; | 485 | zone->pages_scanned = 0; |
486 | while (count--) { | 486 | while (count--) { |
487 | struct page *page; | 487 | struct page *page; |
488 | 488 | ||
489 | VM_BUG_ON(list_empty(list)); | 489 | VM_BUG_ON(list_empty(list)); |
490 | page = list_entry(list->prev, struct page, lru); | 490 | page = list_entry(list->prev, struct page, lru); |
491 | /* have to delete it as __free_one_page list manipulates */ | 491 | /* have to delete it as __free_one_page list manipulates */ |
492 | list_del(&page->lru); | 492 | list_del(&page->lru); |
493 | __free_one_page(page, zone, order); | 493 | __free_one_page(page, zone, order); |
494 | } | 494 | } |
495 | spin_unlock(&zone->lock); | 495 | spin_unlock(&zone->lock); |
496 | } | 496 | } |
497 | 497 | ||
498 | static void free_one_page(struct zone *zone, struct page *page, int order) | 498 | static void free_one_page(struct zone *zone, struct page *page, int order) |
499 | { | 499 | { |
500 | spin_lock(&zone->lock); | 500 | spin_lock(&zone->lock); |
501 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 501 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
502 | zone->pages_scanned = 0; | 502 | zone->pages_scanned = 0; |
503 | __free_one_page(page, zone, order); | 503 | __free_one_page(page, zone, order); |
504 | spin_unlock(&zone->lock); | 504 | spin_unlock(&zone->lock); |
505 | } | 505 | } |
506 | 506 | ||
507 | static void __free_pages_ok(struct page *page, unsigned int order) | 507 | static void __free_pages_ok(struct page *page, unsigned int order) |
508 | { | 508 | { |
509 | unsigned long flags; | 509 | unsigned long flags; |
510 | int i; | 510 | int i; |
511 | int reserved = 0; | 511 | int reserved = 0; |
512 | 512 | ||
513 | for (i = 0 ; i < (1 << order) ; ++i) | 513 | for (i = 0 ; i < (1 << order) ; ++i) |
514 | reserved += free_pages_check(page + i); | 514 | reserved += free_pages_check(page + i); |
515 | if (reserved) | 515 | if (reserved) |
516 | return; | 516 | return; |
517 | 517 | ||
518 | if (!PageHighMem(page)) { | 518 | if (!PageHighMem(page)) { |
519 | debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); | 519 | debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); |
520 | debug_check_no_obj_freed(page_address(page), | 520 | debug_check_no_obj_freed(page_address(page), |
521 | PAGE_SIZE << order); | 521 | PAGE_SIZE << order); |
522 | } | 522 | } |
523 | arch_free_page(page, order); | 523 | arch_free_page(page, order); |
524 | kernel_map_pages(page, 1 << order, 0); | 524 | kernel_map_pages(page, 1 << order, 0); |
525 | 525 | ||
526 | local_irq_save(flags); | 526 | local_irq_save(flags); |
527 | __count_vm_events(PGFREE, 1 << order); | 527 | __count_vm_events(PGFREE, 1 << order); |
528 | free_one_page(page_zone(page), page, order); | 528 | free_one_page(page_zone(page), page, order); |
529 | local_irq_restore(flags); | 529 | local_irq_restore(flags); |
530 | } | 530 | } |
531 | 531 | ||
532 | /* | 532 | /* |
533 | * permit the bootmem allocator to evade page validation on high-order frees | 533 | * permit the bootmem allocator to evade page validation on high-order frees |
534 | */ | 534 | */ |
535 | void __free_pages_bootmem(struct page *page, unsigned int order) | 535 | void __free_pages_bootmem(struct page *page, unsigned int order) |
536 | { | 536 | { |
537 | if (order == 0) { | 537 | if (order == 0) { |
538 | __ClearPageReserved(page); | 538 | __ClearPageReserved(page); |
539 | set_page_count(page, 0); | 539 | set_page_count(page, 0); |
540 | set_page_refcounted(page); | 540 | set_page_refcounted(page); |
541 | __free_page(page); | 541 | __free_page(page); |
542 | } else { | 542 | } else { |
543 | int loop; | 543 | int loop; |
544 | 544 | ||
545 | prefetchw(page); | 545 | prefetchw(page); |
546 | for (loop = 0; loop < BITS_PER_LONG; loop++) { | 546 | for (loop = 0; loop < BITS_PER_LONG; loop++) { |
547 | struct page *p = &page[loop]; | 547 | struct page *p = &page[loop]; |
548 | 548 | ||
549 | if (loop + 1 < BITS_PER_LONG) | 549 | if (loop + 1 < BITS_PER_LONG) |
550 | prefetchw(p + 1); | 550 | prefetchw(p + 1); |
551 | __ClearPageReserved(p); | 551 | __ClearPageReserved(p); |
552 | set_page_count(p, 0); | 552 | set_page_count(p, 0); |
553 | } | 553 | } |
554 | 554 | ||
555 | set_page_refcounted(page); | 555 | set_page_refcounted(page); |
556 | __free_pages(page, order); | 556 | __free_pages(page, order); |
557 | } | 557 | } |
558 | } | 558 | } |
559 | 559 | ||
560 | 560 | ||
561 | /* | 561 | /* |
562 | * The order of subdivision here is critical for the IO subsystem. | 562 | * The order of subdivision here is critical for the IO subsystem. |
563 | * Please do not alter this order without good reasons and regression | 563 | * Please do not alter this order without good reasons and regression |
564 | * testing. Specifically, as large blocks of memory are subdivided, | 564 | * testing. Specifically, as large blocks of memory are subdivided, |
565 | * the order in which smaller blocks are delivered depends on the order | 565 | * the order in which smaller blocks are delivered depends on the order |
566 | * they're subdivided in this function. This is the primary factor | 566 | * they're subdivided in this function. This is the primary factor |
567 | * influencing the order in which pages are delivered to the IO | 567 | * influencing the order in which pages are delivered to the IO |
568 | * subsystem according to empirical testing, and this is also justified | 568 | * subsystem according to empirical testing, and this is also justified |
569 | * by considering the behavior of a buddy system containing a single | 569 | * by considering the behavior of a buddy system containing a single |
570 | * large block of memory acted on by a series of small allocations. | 570 | * large block of memory acted on by a series of small allocations. |
571 | * This behavior is a critical factor in sglist merging's success. | 571 | * This behavior is a critical factor in sglist merging's success. |
572 | * | 572 | * |
573 | * -- wli | 573 | * -- wli |
574 | */ | 574 | */ |
575 | static inline void expand(struct zone *zone, struct page *page, | 575 | static inline void expand(struct zone *zone, struct page *page, |
576 | int low, int high, struct free_area *area, | 576 | int low, int high, struct free_area *area, |
577 | int migratetype) | 577 | int migratetype) |
578 | { | 578 | { |
579 | unsigned long size = 1 << high; | 579 | unsigned long size = 1 << high; |
580 | 580 | ||
581 | while (high > low) { | 581 | while (high > low) { |
582 | area--; | 582 | area--; |
583 | high--; | 583 | high--; |
584 | size >>= 1; | 584 | size >>= 1; |
585 | VM_BUG_ON(bad_range(zone, &page[size])); | 585 | VM_BUG_ON(bad_range(zone, &page[size])); |
586 | list_add(&page[size].lru, &area->free_list[migratetype]); | 586 | list_add(&page[size].lru, &area->free_list[migratetype]); |
587 | area->nr_free++; | 587 | area->nr_free++; |
588 | set_page_order(&page[size], high); | 588 | set_page_order(&page[size], high); |
589 | } | 589 | } |
590 | } | 590 | } |
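To make the subdivision order concrete: a minimal user-space sketch (plain C, not kernel code; the orders are invented for illustration) that walks the same loop as expand() and shows which buddies are handed back to the free lists when an order-3 block satisfies an order-0 request.

#include <stdio.h>

/* Illustrative only: mirrors the while (high > low) loop in expand().
 * An order-3 block (8 pages) is split down to a requested order-0 page. */
int main(void)
{
	int low = 0, high = 3;			/* requested order vs. order found */
	unsigned long size = 1UL << high;	/* 8 pages */

	while (high > low) {
		high--;
		size >>= 1;
		/* &page[size] is the buddy that goes back on free_area[high] */
		printf("free buddy at page offset %lu, order %d\n", size, high);
	}
	printf("page[0] is returned to the caller at order %d\n", low);
	return 0;
}

Run as written this prints offsets 4, 2 and 1 at orders 2, 1 and 0, which is exactly the delivery order the comment above is protecting.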
591 | 591 | ||
592 | /* | 592 | /* |
593 | * This page is about to be returned from the page allocator | 593 | * This page is about to be returned from the page allocator |
594 | */ | 594 | */ |
595 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | 595 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) |
596 | { | 596 | { |
597 | if (unlikely(page_mapcount(page) | | 597 | if (unlikely(page_mapcount(page) | |
598 | (page->mapping != NULL) | | 598 | (page->mapping != NULL) | |
599 | (page_get_page_cgroup(page) != NULL) | | 599 | (page_get_page_cgroup(page) != NULL) | |
600 | (page_count(page) != 0) | | 600 | (page_count(page) != 0) | |
601 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) | 601 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) |
602 | bad_page(page); | 602 | bad_page(page); |
603 | 603 | ||
604 | /* | 604 | /* |
605 | * For now, we report if PG_reserved was found set, but do not | 605 | * For now, we report if PG_reserved was found set, but do not |
606 | * clear it, and do not allocate the page: as a safety net. | 606 | * clear it, and do not allocate the page: as a safety net. |
607 | */ | 607 | */ |
608 | if (PageReserved(page)) | 608 | if (PageReserved(page)) |
609 | return 1; | 609 | return 1; |
610 | 610 | ||
611 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | | 611 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | |
612 | 1 << PG_referenced | 1 << PG_arch_1 | | 612 | 1 << PG_referenced | 1 << PG_arch_1 | |
613 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); | 613 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); |
614 | set_page_private(page, 0); | 614 | set_page_private(page, 0); |
615 | set_page_refcounted(page); | 615 | set_page_refcounted(page); |
616 | 616 | ||
617 | arch_alloc_page(page, order); | 617 | arch_alloc_page(page, order); |
618 | kernel_map_pages(page, 1 << order, 1); | 618 | kernel_map_pages(page, 1 << order, 1); |
619 | 619 | ||
620 | if (gfp_flags & __GFP_ZERO) | 620 | if (gfp_flags & __GFP_ZERO) |
621 | prep_zero_page(page, order, gfp_flags); | 621 | prep_zero_page(page, order, gfp_flags); |
622 | 622 | ||
623 | if (order && (gfp_flags & __GFP_COMP)) | 623 | if (order && (gfp_flags & __GFP_COMP)) |
624 | prep_compound_page(page, order); | 624 | prep_compound_page(page, order); |
625 | 625 | ||
626 | return 0; | 626 | return 0; |
627 | } | 627 | } |
628 | 628 | ||
629 | /* | 629 | /* |
630 | * Go through the free lists for the given migratetype and remove | 630 | * Go through the free lists for the given migratetype and remove |
631 | * the smallest available page from the freelists | 631 | * the smallest available page from the freelists |
632 | */ | 632 | */ |
633 | static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | 633 | static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, |
634 | int migratetype) | 634 | int migratetype) |
635 | { | 635 | { |
636 | unsigned int current_order; | 636 | unsigned int current_order; |
637 | struct free_area * area; | 637 | struct free_area * area; |
638 | struct page *page; | 638 | struct page *page; |
639 | 639 | ||
640 | /* Find a page of the appropriate size in the preferred list */ | 640 | /* Find a page of the appropriate size in the preferred list */ |
641 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { | 641 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { |
642 | area = &(zone->free_area[current_order]); | 642 | area = &(zone->free_area[current_order]); |
643 | if (list_empty(&area->free_list[migratetype])) | 643 | if (list_empty(&area->free_list[migratetype])) |
644 | continue; | 644 | continue; |
645 | 645 | ||
646 | page = list_entry(area->free_list[migratetype].next, | 646 | page = list_entry(area->free_list[migratetype].next, |
647 | struct page, lru); | 647 | struct page, lru); |
648 | list_del(&page->lru); | 648 | list_del(&page->lru); |
649 | rmv_page_order(page); | 649 | rmv_page_order(page); |
650 | area->nr_free--; | 650 | area->nr_free--; |
651 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); | 651 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); |
652 | expand(zone, page, order, current_order, area, migratetype); | 652 | expand(zone, page, order, current_order, area, migratetype); |
653 | return page; | 653 | return page; |
654 | } | 654 | } |
655 | 655 | ||
656 | return NULL; | 656 | return NULL; |
657 | } | 657 | } |
658 | 658 | ||
659 | 659 | ||
660 | /* | 660 | /* |
661 | * This array describes the order lists are fallen back to when | 661 | * This array describes the order lists are fallen back to when |
662 | * the free lists for the desirable migrate type are depleted | 662 | * the free lists for the desirable migrate type are depleted |
663 | */ | 663 | */ |
664 | static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { | 664 | static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { |
665 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | 665 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, |
666 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | 666 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, |
667 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | 667 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, |
668 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ | 668 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ |
669 | }; | 669 | }; |
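Read row by row, the table gives the order in which __rmqueue_fallback() below tries other migratetypes. A minimal user-space sketch of that lookup follows; the enum values are assumptions mirroring include/linux/mmzone.h, not something defined in this hunk.

#include <stdio.h>

/* Illustrative only: assumed migratetype ordering (see mmzone.h). */
enum { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,
       MIGRATE_RESERVE, MIGRATE_TYPES };

static const char * const names[] = {
	"unmovable", "reclaimable", "movable", "reserve"
};

static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES - 1] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE },
};

int main(void)
{
	int i;

	/* An unmovable request whose own free lists are empty tries: */
	for (i = 0; i < MIGRATE_TYPES - 1; i++)
		printf("fallback %d: %s\n", i, names[fallbacks[MIGRATE_UNMOVABLE][i]]);
	return 0;
}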
670 | 670 | ||
671 | /* | 671 | /* |
672 | * Move the free pages in a range to the free lists of the requested type. | 672 | * Move the free pages in a range to the free lists of the requested type. |
673 | * Note that start_page and end_page are not aligned on a pageblock | 673 | * Note that start_page and end_page are not aligned on a pageblock |
674 | * boundary. If alignment is required, use move_freepages_block() | 674 | * boundary. If alignment is required, use move_freepages_block() |
675 | */ | 675 | */ |
676 | int move_freepages(struct zone *zone, | 676 | int move_freepages(struct zone *zone, |
677 | struct page *start_page, struct page *end_page, | 677 | struct page *start_page, struct page *end_page, |
678 | int migratetype) | 678 | int migratetype) |
679 | { | 679 | { |
680 | struct page *page; | 680 | struct page *page; |
681 | unsigned long order; | 681 | unsigned long order; |
682 | int pages_moved = 0; | 682 | int pages_moved = 0; |
683 | 683 | ||
684 | #ifndef CONFIG_HOLES_IN_ZONE | 684 | #ifndef CONFIG_HOLES_IN_ZONE |
685 | /* | 685 | /* |
686 | * page_zone is not safe to call in this context when | 686 | * page_zone is not safe to call in this context when |
687 | * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant | 687 | * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant |
688 | * anyway as we check zone boundaries in move_freepages_block(). | 688 | * anyway as we check zone boundaries in move_freepages_block(). |
689 | * Remove at a later date when no bug reports exist related to | 689 | * Remove at a later date when no bug reports exist related to |
690 | * grouping pages by mobility | 690 | * grouping pages by mobility |
691 | */ | 691 | */ |
692 | BUG_ON(page_zone(start_page) != page_zone(end_page)); | 692 | BUG_ON(page_zone(start_page) != page_zone(end_page)); |
693 | #endif | 693 | #endif |
694 | 694 | ||
695 | for (page = start_page; page <= end_page;) { | 695 | for (page = start_page; page <= end_page;) { |
696 | if (!pfn_valid_within(page_to_pfn(page))) { | 696 | if (!pfn_valid_within(page_to_pfn(page))) { |
697 | page++; | 697 | page++; |
698 | continue; | 698 | continue; |
699 | } | 699 | } |
700 | 700 | ||
701 | if (!PageBuddy(page)) { | 701 | if (!PageBuddy(page)) { |
702 | page++; | 702 | page++; |
703 | continue; | 703 | continue; |
704 | } | 704 | } |
705 | 705 | ||
706 | order = page_order(page); | 706 | order = page_order(page); |
707 | list_del(&page->lru); | 707 | list_del(&page->lru); |
708 | list_add(&page->lru, | 708 | list_add(&page->lru, |
709 | &zone->free_area[order].free_list[migratetype]); | 709 | &zone->free_area[order].free_list[migratetype]); |
710 | page += 1 << order; | 710 | page += 1 << order; |
711 | pages_moved += 1 << order; | 711 | pages_moved += 1 << order; |
712 | } | 712 | } |
713 | 713 | ||
714 | return pages_moved; | 714 | return pages_moved; |
715 | } | 715 | } |
716 | 716 | ||
717 | int move_freepages_block(struct zone *zone, struct page *page, int migratetype) | 717 | int move_freepages_block(struct zone *zone, struct page *page, int migratetype) |
718 | { | 718 | { |
719 | unsigned long start_pfn, end_pfn; | 719 | unsigned long start_pfn, end_pfn; |
720 | struct page *start_page, *end_page; | 720 | struct page *start_page, *end_page; |
721 | 721 | ||
722 | start_pfn = page_to_pfn(page); | 722 | start_pfn = page_to_pfn(page); |
723 | start_pfn = start_pfn & ~(pageblock_nr_pages-1); | 723 | start_pfn = start_pfn & ~(pageblock_nr_pages-1); |
724 | start_page = pfn_to_page(start_pfn); | 724 | start_page = pfn_to_page(start_pfn); |
725 | end_page = start_page + pageblock_nr_pages - 1; | 725 | end_page = start_page + pageblock_nr_pages - 1; |
726 | end_pfn = start_pfn + pageblock_nr_pages - 1; | 726 | end_pfn = start_pfn + pageblock_nr_pages - 1; |
727 | 727 | ||
728 | /* Do not cross zone boundaries */ | 728 | /* Do not cross zone boundaries */ |
729 | if (start_pfn < zone->zone_start_pfn) | 729 | if (start_pfn < zone->zone_start_pfn) |
730 | start_page = page; | 730 | start_page = page; |
731 | if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) | 731 | if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) |
732 | return 0; | 732 | return 0; |
733 | 733 | ||
734 | return move_freepages(zone, start_page, end_page, migratetype); | 734 | return move_freepages(zone, start_page, end_page, migratetype); |
735 | } | 735 | } |
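The masking at the top of move_freepages_block() simply rounds the pfn down to the start of its pageblock. A small user-space sketch, assuming 512-page (order-9) pageblocks purely for illustration (the real value is configuration dependent):

#include <stdio.h>

/* Illustrative only: the pageblock rounding used by move_freepages_block(). */
int main(void)
{
	unsigned long pageblock_nr_pages = 512;	/* assumed, order-9 pageblocks */
	unsigned long pfn = 1000;		/* hypothetical page frame number */
	unsigned long start_pfn = pfn & ~(pageblock_nr_pages - 1);
	unsigned long end_pfn = start_pfn + pageblock_nr_pages - 1;

	printf("pfn %lu lies in pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
	return 0;
}

With these numbers, pfn 1000 maps to the pageblock spanning pfns 512-1023.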
736 | 736 | ||
737 | /* Remove an element from the buddy allocator from the fallback list */ | 737 | /* Remove an element from the buddy allocator from the fallback list */ |
738 | static struct page *__rmqueue_fallback(struct zone *zone, int order, | 738 | static struct page *__rmqueue_fallback(struct zone *zone, int order, |
739 | int start_migratetype) | 739 | int start_migratetype) |
740 | { | 740 | { |
741 | struct free_area * area; | 741 | struct free_area * area; |
742 | int current_order; | 742 | int current_order; |
743 | struct page *page; | 743 | struct page *page; |
744 | int migratetype, i; | 744 | int migratetype, i; |
745 | 745 | ||
746 | /* Find the largest possible block of pages in the other list */ | 746 | /* Find the largest possible block of pages in the other list */ |
747 | for (current_order = MAX_ORDER-1; current_order >= order; | 747 | for (current_order = MAX_ORDER-1; current_order >= order; |
748 | --current_order) { | 748 | --current_order) { |
749 | for (i = 0; i < MIGRATE_TYPES - 1; i++) { | 749 | for (i = 0; i < MIGRATE_TYPES - 1; i++) { |
750 | migratetype = fallbacks[start_migratetype][i]; | 750 | migratetype = fallbacks[start_migratetype][i]; |
751 | 751 | ||
752 | /* MIGRATE_RESERVE handled later if necessary */ | 752 | /* MIGRATE_RESERVE handled later if necessary */ |
753 | if (migratetype == MIGRATE_RESERVE) | 753 | if (migratetype == MIGRATE_RESERVE) |
754 | continue; | 754 | continue; |
755 | 755 | ||
756 | area = &(zone->free_area[current_order]); | 756 | area = &(zone->free_area[current_order]); |
757 | if (list_empty(&area->free_list[migratetype])) | 757 | if (list_empty(&area->free_list[migratetype])) |
758 | continue; | 758 | continue; |
759 | 759 | ||
760 | page = list_entry(area->free_list[migratetype].next, | 760 | page = list_entry(area->free_list[migratetype].next, |
761 | struct page, lru); | 761 | struct page, lru); |
762 | area->nr_free--; | 762 | area->nr_free--; |
763 | 763 | ||
764 | /* | 764 | /* |
765 | * If breaking a large block of pages, move all free | 765 | * If breaking a large block of pages, move all free |
766 | * pages to the preferred allocation list. If falling | 766 | * pages to the preferred allocation list. If falling |
767 | * back for a reclaimable kernel allocation, be more | 767 | * back for a reclaimable kernel allocation, be more |
768 | * aggressive about taking ownership of free pages | 768 | * aggressive about taking ownership of free pages |
769 | */ | 769 | */ |
770 | if (unlikely(current_order >= (pageblock_order >> 1)) || | 770 | if (unlikely(current_order >= (pageblock_order >> 1)) || |
771 | start_migratetype == MIGRATE_RECLAIMABLE) { | 771 | start_migratetype == MIGRATE_RECLAIMABLE) { |
772 | unsigned long pages; | 772 | unsigned long pages; |
773 | pages = move_freepages_block(zone, page, | 773 | pages = move_freepages_block(zone, page, |
774 | start_migratetype); | 774 | start_migratetype); |
775 | 775 | ||
776 | /* Claim the whole block if over half of it is free */ | 776 | /* Claim the whole block if over half of it is free */ |
777 | if (pages >= (1 << (pageblock_order-1))) | 777 | if (pages >= (1 << (pageblock_order-1))) |
778 | set_pageblock_migratetype(page, | 778 | set_pageblock_migratetype(page, |
779 | start_migratetype); | 779 | start_migratetype); |
780 | 780 | ||
781 | migratetype = start_migratetype; | 781 | migratetype = start_migratetype; |
782 | } | 782 | } |
783 | 783 | ||
784 | /* Remove the page from the freelists */ | 784 | /* Remove the page from the freelists */ |
785 | list_del(&page->lru); | 785 | list_del(&page->lru); |
786 | rmv_page_order(page); | 786 | rmv_page_order(page); |
787 | __mod_zone_page_state(zone, NR_FREE_PAGES, | 787 | __mod_zone_page_state(zone, NR_FREE_PAGES, |
788 | -(1UL << order)); | 788 | -(1UL << order)); |
789 | 789 | ||
790 | if (current_order == pageblock_order) | 790 | if (current_order == pageblock_order) |
791 | set_pageblock_migratetype(page, | 791 | set_pageblock_migratetype(page, |
792 | start_migratetype); | 792 | start_migratetype); |
793 | 793 | ||
794 | expand(zone, page, order, current_order, area, migratetype); | 794 | expand(zone, page, order, current_order, area, migratetype); |
795 | return page; | 795 | return page; |
796 | } | 796 | } |
797 | } | 797 | } |
798 | 798 | ||
799 | /* Use MIGRATE_RESERVE rather than fail an allocation */ | 799 | /* Use MIGRATE_RESERVE rather than fail an allocation */ |
800 | return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); | 800 | return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); |
801 | } | 801 | } |
802 | 802 | ||
803 | /* | 803 | /* |
804 | * Do the hard work of removing an element from the buddy allocator. | 804 | * Do the hard work of removing an element from the buddy allocator. |
805 | * Call me with the zone->lock already held. | 805 | * Call me with the zone->lock already held. |
806 | */ | 806 | */ |
807 | static struct page *__rmqueue(struct zone *zone, unsigned int order, | 807 | static struct page *__rmqueue(struct zone *zone, unsigned int order, |
808 | int migratetype) | 808 | int migratetype) |
809 | { | 809 | { |
810 | struct page *page; | 810 | struct page *page; |
811 | 811 | ||
812 | page = __rmqueue_smallest(zone, order, migratetype); | 812 | page = __rmqueue_smallest(zone, order, migratetype); |
813 | 813 | ||
814 | if (unlikely(!page)) | 814 | if (unlikely(!page)) |
815 | page = __rmqueue_fallback(zone, order, migratetype); | 815 | page = __rmqueue_fallback(zone, order, migratetype); |
816 | 816 | ||
817 | return page; | 817 | return page; |
818 | } | 818 | } |
819 | 819 | ||
820 | /* | 820 | /* |
821 | * Obtain a specified number of elements from the buddy allocator, all under | 821 | * Obtain a specified number of elements from the buddy allocator, all under |
822 | * a single hold of the lock, for efficiency. Add them to the supplied list. | 822 | * a single hold of the lock, for efficiency. Add them to the supplied list. |
823 | * Returns the number of new pages which were placed at *list. | 823 | * Returns the number of new pages which were placed at *list. |
824 | */ | 824 | */ |
825 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 825 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
826 | unsigned long count, struct list_head *list, | 826 | unsigned long count, struct list_head *list, |
827 | int migratetype) | 827 | int migratetype) |
828 | { | 828 | { |
829 | int i; | 829 | int i; |
830 | 830 | ||
831 | spin_lock(&zone->lock); | 831 | spin_lock(&zone->lock); |
832 | for (i = 0; i < count; ++i) { | 832 | for (i = 0; i < count; ++i) { |
833 | struct page *page = __rmqueue(zone, order, migratetype); | 833 | struct page *page = __rmqueue(zone, order, migratetype); |
834 | if (unlikely(page == NULL)) | 834 | if (unlikely(page == NULL)) |
835 | break; | 835 | break; |
836 | 836 | ||
837 | /* | 837 | /* |
838 | * Split buddy pages returned by expand() are received here | 838 | * Split buddy pages returned by expand() are received here |
839 | * in physical page order. The page is added to the callers and | 839 | * in physical page order. The page is added to the callers and |
840 | * list and the list head then moves forward. From the callers | 840 | * list and the list head then moves forward. From the callers |
841 | * perspective, the linked list is ordered by page number in | 841 | * perspective, the linked list is ordered by page number in |
842 | * some conditions. This is useful for IO devices that can | 842 | * some conditions. This is useful for IO devices that can |
843 | * merge IO requests if the physical pages are ordered | 843 | * merge IO requests if the physical pages are ordered |
844 | * properly. | 844 | * properly. |
845 | */ | 845 | */ |
846 | list_add(&page->lru, list); | 846 | list_add(&page->lru, list); |
847 | set_page_private(page, migratetype); | 847 | set_page_private(page, migratetype); |
848 | list = &page->lru; | 848 | list = &page->lru; |
849 | } | 849 | } |
850 | spin_unlock(&zone->lock); | 850 | spin_unlock(&zone->lock); |
851 | return i; | 851 | return i; |
852 | } | 852 | } |
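The way the loop advances "list" after each list_add() is what keeps the per-cpu list in ascending physical order. A hypothetical trace (pfns invented for illustration):

/*
 * Suppose __rmqueue() hands back pages at pfn 100, 101 and 102:
 *
 *   list_add(page 100, list = head)   head -> 100
 *   list = &(page 100)->lru
 *   list_add(page 101, list)          head -> 100 -> 101
 *   list = &(page 101)->lru
 *   list_add(page 102, list)          head -> 100 -> 101 -> 102
 *
 * Each page is inserted after the one added before it, so the final
 * list is ordered by ascending page frame number.
 */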
853 | 853 | ||
854 | #ifdef CONFIG_NUMA | 854 | #ifdef CONFIG_NUMA |
855 | /* | 855 | /* |
856 | * Called from the vmstat counter updater to drain pagesets of this | 856 | * Called from the vmstat counter updater to drain pagesets of this |
857 | * currently executing processor on remote nodes after they have | 857 | * currently executing processor on remote nodes after they have |
858 | * expired. | 858 | * expired. |
859 | * | 859 | * |
860 | * Note that this function must be called with the thread pinned to | 860 | * Note that this function must be called with the thread pinned to |
861 | * a single processor. | 861 | * a single processor. |
862 | */ | 862 | */ |
863 | void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | 863 | void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) |
864 | { | 864 | { |
865 | unsigned long flags; | 865 | unsigned long flags; |
866 | int to_drain; | 866 | int to_drain; |
867 | 867 | ||
868 | local_irq_save(flags); | 868 | local_irq_save(flags); |
869 | if (pcp->count >= pcp->batch) | 869 | if (pcp->count >= pcp->batch) |
870 | to_drain = pcp->batch; | 870 | to_drain = pcp->batch; |
871 | else | 871 | else |
872 | to_drain = pcp->count; | 872 | to_drain = pcp->count; |
873 | free_pages_bulk(zone, to_drain, &pcp->list, 0); | 873 | free_pages_bulk(zone, to_drain, &pcp->list, 0); |
874 | pcp->count -= to_drain; | 874 | pcp->count -= to_drain; |
875 | local_irq_restore(flags); | 875 | local_irq_restore(flags); |
876 | } | 876 | } |
877 | #endif | 877 | #endif |
878 | 878 | ||
879 | /* | 879 | /* |
880 | * Drain pages of the indicated processor. | 880 | * Drain pages of the indicated processor. |
881 | * | 881 | * |
882 | * The processor must either be the current processor, with the | 882 | * The processor must either be the current processor, with the |
883 | * calling thread pinned to it, or a processor that is not | 883 | * calling thread pinned to it, or a processor that is not |
884 | * online. | 884 | * online. |
885 | */ | 885 | */ |
886 | static void drain_pages(unsigned int cpu) | 886 | static void drain_pages(unsigned int cpu) |
887 | { | 887 | { |
888 | unsigned long flags; | 888 | unsigned long flags; |
889 | struct zone *zone; | 889 | struct zone *zone; |
890 | 890 | ||
891 | for_each_zone(zone) { | 891 | for_each_zone(zone) { |
892 | struct per_cpu_pageset *pset; | 892 | struct per_cpu_pageset *pset; |
893 | struct per_cpu_pages *pcp; | 893 | struct per_cpu_pages *pcp; |
894 | 894 | ||
895 | if (!populated_zone(zone)) | 895 | if (!populated_zone(zone)) |
896 | continue; | 896 | continue; |
897 | 897 | ||
898 | pset = zone_pcp(zone, cpu); | 898 | pset = zone_pcp(zone, cpu); |
899 | 899 | ||
900 | pcp = &pset->pcp; | 900 | pcp = &pset->pcp; |
901 | local_irq_save(flags); | 901 | local_irq_save(flags); |
902 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); | 902 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); |
903 | pcp->count = 0; | 903 | pcp->count = 0; |
904 | local_irq_restore(flags); | 904 | local_irq_restore(flags); |
905 | } | 905 | } |
906 | } | 906 | } |
907 | 907 | ||
908 | /* | 908 | /* |
909 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. | 909 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. |
910 | */ | 910 | */ |
911 | void drain_local_pages(void *arg) | 911 | void drain_local_pages(void *arg) |
912 | { | 912 | { |
913 | drain_pages(smp_processor_id()); | 913 | drain_pages(smp_processor_id()); |
914 | } | 914 | } |
915 | 915 | ||
916 | /* | 916 | /* |
917 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator | 917 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator |
918 | */ | 918 | */ |
919 | void drain_all_pages(void) | 919 | void drain_all_pages(void) |
920 | { | 920 | { |
921 | on_each_cpu(drain_local_pages, NULL, 1); | 921 | on_each_cpu(drain_local_pages, NULL, 1); |
922 | } | 922 | } |
923 | 923 | ||
924 | #ifdef CONFIG_HIBERNATION | 924 | #ifdef CONFIG_HIBERNATION |
925 | 925 | ||
926 | void mark_free_pages(struct zone *zone) | 926 | void mark_free_pages(struct zone *zone) |
927 | { | 927 | { |
928 | unsigned long pfn, max_zone_pfn; | 928 | unsigned long pfn, max_zone_pfn; |
929 | unsigned long flags; | 929 | unsigned long flags; |
930 | int order, t; | 930 | int order, t; |
931 | struct list_head *curr; | 931 | struct list_head *curr; |
932 | 932 | ||
933 | if (!zone->spanned_pages) | 933 | if (!zone->spanned_pages) |
934 | return; | 934 | return; |
935 | 935 | ||
936 | spin_lock_irqsave(&zone->lock, flags); | 936 | spin_lock_irqsave(&zone->lock, flags); |
937 | 937 | ||
938 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 938 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; |
939 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 939 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
940 | if (pfn_valid(pfn)) { | 940 | if (pfn_valid(pfn)) { |
941 | struct page *page = pfn_to_page(pfn); | 941 | struct page *page = pfn_to_page(pfn); |
942 | 942 | ||
943 | if (!swsusp_page_is_forbidden(page)) | 943 | if (!swsusp_page_is_forbidden(page)) |
944 | swsusp_unset_page_free(page); | 944 | swsusp_unset_page_free(page); |
945 | } | 945 | } |
946 | 946 | ||
947 | for_each_migratetype_order(order, t) { | 947 | for_each_migratetype_order(order, t) { |
948 | list_for_each(curr, &zone->free_area[order].free_list[t]) { | 948 | list_for_each(curr, &zone->free_area[order].free_list[t]) { |
949 | unsigned long i; | 949 | unsigned long i; |
950 | 950 | ||
951 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); | 951 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); |
952 | for (i = 0; i < (1UL << order); i++) | 952 | for (i = 0; i < (1UL << order); i++) |
953 | swsusp_set_page_free(pfn_to_page(pfn + i)); | 953 | swsusp_set_page_free(pfn_to_page(pfn + i)); |
954 | } | 954 | } |
955 | } | 955 | } |
956 | spin_unlock_irqrestore(&zone->lock, flags); | 956 | spin_unlock_irqrestore(&zone->lock, flags); |
957 | } | 957 | } |
958 | #endif /* CONFIG_HIBERNATION */ | 958 | #endif /* CONFIG_HIBERNATION */ |
959 | 959 | ||
960 | /* | 960 | /* |
961 | * Free a 0-order page | 961 | * Free a 0-order page |
962 | */ | 962 | */ |
963 | static void free_hot_cold_page(struct page *page, int cold) | 963 | static void free_hot_cold_page(struct page *page, int cold) |
964 | { | 964 | { |
965 | struct zone *zone = page_zone(page); | 965 | struct zone *zone = page_zone(page); |
966 | struct per_cpu_pages *pcp; | 966 | struct per_cpu_pages *pcp; |
967 | unsigned long flags; | 967 | unsigned long flags; |
968 | 968 | ||
969 | if (PageAnon(page)) | 969 | if (PageAnon(page)) |
970 | page->mapping = NULL; | 970 | page->mapping = NULL; |
971 | if (free_pages_check(page)) | 971 | if (free_pages_check(page)) |
972 | return; | 972 | return; |
973 | 973 | ||
974 | if (!PageHighMem(page)) { | 974 | if (!PageHighMem(page)) { |
975 | debug_check_no_locks_freed(page_address(page), PAGE_SIZE); | 975 | debug_check_no_locks_freed(page_address(page), PAGE_SIZE); |
976 | debug_check_no_obj_freed(page_address(page), PAGE_SIZE); | 976 | debug_check_no_obj_freed(page_address(page), PAGE_SIZE); |
977 | } | 977 | } |
978 | arch_free_page(page, 0); | 978 | arch_free_page(page, 0); |
979 | kernel_map_pages(page, 1, 0); | 979 | kernel_map_pages(page, 1, 0); |
980 | 980 | ||
981 | pcp = &zone_pcp(zone, get_cpu())->pcp; | 981 | pcp = &zone_pcp(zone, get_cpu())->pcp; |
982 | local_irq_save(flags); | 982 | local_irq_save(flags); |
983 | __count_vm_event(PGFREE); | 983 | __count_vm_event(PGFREE); |
984 | if (cold) | 984 | if (cold) |
985 | list_add_tail(&page->lru, &pcp->list); | 985 | list_add_tail(&page->lru, &pcp->list); |
986 | else | 986 | else |
987 | list_add(&page->lru, &pcp->list); | 987 | list_add(&page->lru, &pcp->list); |
988 | set_page_private(page, get_pageblock_migratetype(page)); | 988 | set_page_private(page, get_pageblock_migratetype(page)); |
989 | pcp->count++; | 989 | pcp->count++; |
990 | if (pcp->count >= pcp->high) { | 990 | if (pcp->count >= pcp->high) { |
991 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 991 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); |
992 | pcp->count -= pcp->batch; | 992 | pcp->count -= pcp->batch; |
993 | } | 993 | } |
994 | local_irq_restore(flags); | 994 | local_irq_restore(flags); |
995 | put_cpu(); | 995 | put_cpu(); |
996 | } | 996 | } |
997 | 997 | ||
998 | void free_hot_page(struct page *page) | 998 | void free_hot_page(struct page *page) |
999 | { | 999 | { |
1000 | free_hot_cold_page(page, 0); | 1000 | free_hot_cold_page(page, 0); |
1001 | } | 1001 | } |
1002 | 1002 | ||
1003 | void free_cold_page(struct page *page) | 1003 | void free_cold_page(struct page *page) |
1004 | { | 1004 | { |
1005 | free_hot_cold_page(page, 1); | 1005 | free_hot_cold_page(page, 1); |
1006 | } | 1006 | } |
1007 | 1007 | ||
1008 | /* | 1008 | /* |
1009 | * split_page takes a non-compound higher-order page, and splits it into | 1009 | * split_page takes a non-compound higher-order page, and splits it into |
1010 | * n (1<<order) sub-pages: page[0..n-1] | 1010 | * n (1<<order) sub-pages: page[0..n-1] |
1011 | * Each sub-page must be freed individually. | 1011 | * Each sub-page must be freed individually. |
1012 | * | 1012 | * |
1013 | * Note: this is probably too low level an operation for use in drivers. | 1013 | * Note: this is probably too low level an operation for use in drivers. |
1014 | * Please consult with lkml before using this in your driver. | 1014 | * Please consult with lkml before using this in your driver. |
1015 | */ | 1015 | */ |
1016 | void split_page(struct page *page, unsigned int order) | 1016 | void split_page(struct page *page, unsigned int order) |
1017 | { | 1017 | { |
1018 | int i; | 1018 | int i; |
1019 | 1019 | ||
1020 | VM_BUG_ON(PageCompound(page)); | 1020 | VM_BUG_ON(PageCompound(page)); |
1021 | VM_BUG_ON(!page_count(page)); | 1021 | VM_BUG_ON(!page_count(page)); |
1022 | for (i = 1; i < (1 << order); i++) | 1022 | for (i = 1; i < (1 << order); i++) |
1023 | set_page_refcounted(page + i); | 1023 | set_page_refcounted(page + i); |
1024 | } | 1024 | } |
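A hedged usage sketch (not part of this patch; error handling trimmed) of the split_page() contract, i.e. physically contiguous pages that can later be freed independently:

/* Illustrative kernel-context sketch only. */
struct page *page = alloc_pages(GFP_KERNEL, 2);	/* order-2: four pages */
if (page) {
	split_page(page, 2);		/* page[0..3] become independent order-0 pages */
	/* ... hand the four pages out separately ... */
	__free_page(&page[3]);		/* each sub-page is then freed on its own */
}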
1025 | 1025 | ||
1026 | /* | 1026 | /* |
1027 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But | 1027 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But |
1028 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 1028 | * we cheat by calling it from here, in the order > 0 path. Saves a branch |
1029 | * or two. | 1029 | * or two. |
1030 | */ | 1030 | */ |
1031 | static struct page *buffered_rmqueue(struct zone *preferred_zone, | 1031 | static struct page *buffered_rmqueue(struct zone *preferred_zone, |
1032 | struct zone *zone, int order, gfp_t gfp_flags) | 1032 | struct zone *zone, int order, gfp_t gfp_flags) |
1033 | { | 1033 | { |
1034 | unsigned long flags; | 1034 | unsigned long flags; |
1035 | struct page *page; | 1035 | struct page *page; |
1036 | int cold = !!(gfp_flags & __GFP_COLD); | 1036 | int cold = !!(gfp_flags & __GFP_COLD); |
1037 | int cpu; | 1037 | int cpu; |
1038 | int migratetype = allocflags_to_migratetype(gfp_flags); | 1038 | int migratetype = allocflags_to_migratetype(gfp_flags); |
1039 | 1039 | ||
1040 | again: | 1040 | again: |
1041 | cpu = get_cpu(); | 1041 | cpu = get_cpu(); |
1042 | if (likely(order == 0)) { | 1042 | if (likely(order == 0)) { |
1043 | struct per_cpu_pages *pcp; | 1043 | struct per_cpu_pages *pcp; |
1044 | 1044 | ||
1045 | pcp = &zone_pcp(zone, cpu)->pcp; | 1045 | pcp = &zone_pcp(zone, cpu)->pcp; |
1046 | local_irq_save(flags); | 1046 | local_irq_save(flags); |
1047 | if (!pcp->count) { | 1047 | if (!pcp->count) { |
1048 | pcp->count = rmqueue_bulk(zone, 0, | 1048 | pcp->count = rmqueue_bulk(zone, 0, |
1049 | pcp->batch, &pcp->list, migratetype); | 1049 | pcp->batch, &pcp->list, migratetype); |
1050 | if (unlikely(!pcp->count)) | 1050 | if (unlikely(!pcp->count)) |
1051 | goto failed; | 1051 | goto failed; |
1052 | } | 1052 | } |
1053 | 1053 | ||
1054 | /* Find a page of the appropriate migrate type */ | 1054 | /* Find a page of the appropriate migrate type */ |
1055 | if (cold) { | 1055 | if (cold) { |
1056 | list_for_each_entry_reverse(page, &pcp->list, lru) | 1056 | list_for_each_entry_reverse(page, &pcp->list, lru) |
1057 | if (page_private(page) == migratetype) | 1057 | if (page_private(page) == migratetype) |
1058 | break; | 1058 | break; |
1059 | } else { | 1059 | } else { |
1060 | list_for_each_entry(page, &pcp->list, lru) | 1060 | list_for_each_entry(page, &pcp->list, lru) |
1061 | if (page_private(page) == migratetype) | 1061 | if (page_private(page) == migratetype) |
1062 | break; | 1062 | break; |
1063 | } | 1063 | } |
1064 | 1064 | ||
1065 | /* Allocate more to the pcp list if necessary */ | 1065 | /* Allocate more to the pcp list if necessary */ |
1066 | if (unlikely(&page->lru == &pcp->list)) { | 1066 | if (unlikely(&page->lru == &pcp->list)) { |
1067 | pcp->count += rmqueue_bulk(zone, 0, | 1067 | pcp->count += rmqueue_bulk(zone, 0, |
1068 | pcp->batch, &pcp->list, migratetype); | 1068 | pcp->batch, &pcp->list, migratetype); |
1069 | page = list_entry(pcp->list.next, struct page, lru); | 1069 | page = list_entry(pcp->list.next, struct page, lru); |
1070 | } | 1070 | } |
1071 | 1071 | ||
1072 | list_del(&page->lru); | 1072 | list_del(&page->lru); |
1073 | pcp->count--; | 1073 | pcp->count--; |
1074 | } else { | 1074 | } else { |
1075 | spin_lock_irqsave(&zone->lock, flags); | 1075 | spin_lock_irqsave(&zone->lock, flags); |
1076 | page = __rmqueue(zone, order, migratetype); | 1076 | page = __rmqueue(zone, order, migratetype); |
1077 | spin_unlock(&zone->lock); | 1077 | spin_unlock(&zone->lock); |
1078 | if (!page) | 1078 | if (!page) |
1079 | goto failed; | 1079 | goto failed; |
1080 | } | 1080 | } |
1081 | 1081 | ||
1082 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1082 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
1083 | zone_statistics(preferred_zone, zone); | 1083 | zone_statistics(preferred_zone, zone); |
1084 | local_irq_restore(flags); | 1084 | local_irq_restore(flags); |
1085 | put_cpu(); | 1085 | put_cpu(); |
1086 | 1086 | ||
1087 | VM_BUG_ON(bad_range(zone, page)); | 1087 | VM_BUG_ON(bad_range(zone, page)); |
1088 | if (prep_new_page(page, order, gfp_flags)) | 1088 | if (prep_new_page(page, order, gfp_flags)) |
1089 | goto again; | 1089 | goto again; |
1090 | return page; | 1090 | return page; |
1091 | 1091 | ||
1092 | failed: | 1092 | failed: |
1093 | local_irq_restore(flags); | 1093 | local_irq_restore(flags); |
1094 | put_cpu(); | 1094 | put_cpu(); |
1095 | return NULL; | 1095 | return NULL; |
1096 | } | 1096 | } |
1097 | 1097 | ||
1098 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ | 1098 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ |
1099 | #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ | 1099 | #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ |
1100 | #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ | 1100 | #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ |
1101 | #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ | 1101 | #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ |
1102 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | 1102 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ |
1103 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 1103 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
1104 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 1104 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
1105 | 1105 | ||
1106 | #ifdef CONFIG_FAIL_PAGE_ALLOC | 1106 | #ifdef CONFIG_FAIL_PAGE_ALLOC |
1107 | 1107 | ||
1108 | static struct fail_page_alloc_attr { | 1108 | static struct fail_page_alloc_attr { |
1109 | struct fault_attr attr; | 1109 | struct fault_attr attr; |
1110 | 1110 | ||
1111 | u32 ignore_gfp_highmem; | 1111 | u32 ignore_gfp_highmem; |
1112 | u32 ignore_gfp_wait; | 1112 | u32 ignore_gfp_wait; |
1113 | u32 min_order; | 1113 | u32 min_order; |
1114 | 1114 | ||
1115 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 1115 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
1116 | 1116 | ||
1117 | struct dentry *ignore_gfp_highmem_file; | 1117 | struct dentry *ignore_gfp_highmem_file; |
1118 | struct dentry *ignore_gfp_wait_file; | 1118 | struct dentry *ignore_gfp_wait_file; |
1119 | struct dentry *min_order_file; | 1119 | struct dentry *min_order_file; |
1120 | 1120 | ||
1121 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | 1121 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ |
1122 | 1122 | ||
1123 | } fail_page_alloc = { | 1123 | } fail_page_alloc = { |
1124 | .attr = FAULT_ATTR_INITIALIZER, | 1124 | .attr = FAULT_ATTR_INITIALIZER, |
1125 | .ignore_gfp_wait = 1, | 1125 | .ignore_gfp_wait = 1, |
1126 | .ignore_gfp_highmem = 1, | 1126 | .ignore_gfp_highmem = 1, |
1127 | .min_order = 1, | 1127 | .min_order = 1, |
1128 | }; | 1128 | }; |
1129 | 1129 | ||
1130 | static int __init setup_fail_page_alloc(char *str) | 1130 | static int __init setup_fail_page_alloc(char *str) |
1131 | { | 1131 | { |
1132 | return setup_fault_attr(&fail_page_alloc.attr, str); | 1132 | return setup_fault_attr(&fail_page_alloc.attr, str); |
1133 | } | 1133 | } |
1134 | __setup("fail_page_alloc=", setup_fail_page_alloc); | 1134 | __setup("fail_page_alloc=", setup_fail_page_alloc); |
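For reference, the string handed to setup_fault_attr() follows the generic fault-injection format described in Documentation/fault-injection (this is an assumption about that helper, not something visible in this hunk): <interval>,<probability>,<space>,<times>. An illustrative boot line such as

	fail_page_alloc=1,100,0,-1

would, per that documentation, make every eligible allocation (order >= min_order and not excluded by the ignore-gfp knobs above) a candidate to fail.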
1135 | 1135 | ||
1136 | static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 1136 | static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
1137 | { | 1137 | { |
1138 | if (order < fail_page_alloc.min_order) | 1138 | if (order < fail_page_alloc.min_order) |
1139 | return 0; | 1139 | return 0; |
1140 | if (gfp_mask & __GFP_NOFAIL) | 1140 | if (gfp_mask & __GFP_NOFAIL) |
1141 | return 0; | 1141 | return 0; |
1142 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) | 1142 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) |
1143 | return 0; | 1143 | return 0; |
1144 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) | 1144 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) |
1145 | return 0; | 1145 | return 0; |
1146 | 1146 | ||
1147 | return should_fail(&fail_page_alloc.attr, 1 << order); | 1147 | return should_fail(&fail_page_alloc.attr, 1 << order); |
1148 | } | 1148 | } |
1149 | 1149 | ||
1150 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 1150 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
1151 | 1151 | ||
1152 | static int __init fail_page_alloc_debugfs(void) | 1152 | static int __init fail_page_alloc_debugfs(void) |
1153 | { | 1153 | { |
1154 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | 1154 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
1155 | struct dentry *dir; | 1155 | struct dentry *dir; |
1156 | int err; | 1156 | int err; |
1157 | 1157 | ||
1158 | err = init_fault_attr_dentries(&fail_page_alloc.attr, | 1158 | err = init_fault_attr_dentries(&fail_page_alloc.attr, |
1159 | "fail_page_alloc"); | 1159 | "fail_page_alloc"); |
1160 | if (err) | 1160 | if (err) |
1161 | return err; | 1161 | return err; |
1162 | dir = fail_page_alloc.attr.dentries.dir; | 1162 | dir = fail_page_alloc.attr.dentries.dir; |
1163 | 1163 | ||
1164 | fail_page_alloc.ignore_gfp_wait_file = | 1164 | fail_page_alloc.ignore_gfp_wait_file = |
1165 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | 1165 | debugfs_create_bool("ignore-gfp-wait", mode, dir, |
1166 | &fail_page_alloc.ignore_gfp_wait); | 1166 | &fail_page_alloc.ignore_gfp_wait); |
1167 | 1167 | ||
1168 | fail_page_alloc.ignore_gfp_highmem_file = | 1168 | fail_page_alloc.ignore_gfp_highmem_file = |
1169 | debugfs_create_bool("ignore-gfp-highmem", mode, dir, | 1169 | debugfs_create_bool("ignore-gfp-highmem", mode, dir, |
1170 | &fail_page_alloc.ignore_gfp_highmem); | 1170 | &fail_page_alloc.ignore_gfp_highmem); |
1171 | fail_page_alloc.min_order_file = | 1171 | fail_page_alloc.min_order_file = |
1172 | debugfs_create_u32("min-order", mode, dir, | 1172 | debugfs_create_u32("min-order", mode, dir, |
1173 | &fail_page_alloc.min_order); | 1173 | &fail_page_alloc.min_order); |
1174 | 1174 | ||
1175 | if (!fail_page_alloc.ignore_gfp_wait_file || | 1175 | if (!fail_page_alloc.ignore_gfp_wait_file || |
1176 | !fail_page_alloc.ignore_gfp_highmem_file || | 1176 | !fail_page_alloc.ignore_gfp_highmem_file || |
1177 | !fail_page_alloc.min_order_file) { | 1177 | !fail_page_alloc.min_order_file) { |
1178 | err = -ENOMEM; | 1178 | err = -ENOMEM; |
1179 | debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); | 1179 | debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); |
1180 | debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); | 1180 | debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); |
1181 | debugfs_remove(fail_page_alloc.min_order_file); | 1181 | debugfs_remove(fail_page_alloc.min_order_file); |
1182 | cleanup_fault_attr_dentries(&fail_page_alloc.attr); | 1182 | cleanup_fault_attr_dentries(&fail_page_alloc.attr); |
1183 | } | 1183 | } |
1184 | 1184 | ||
1185 | return err; | 1185 | return err; |
1186 | } | 1186 | } |
1187 | 1187 | ||
1188 | late_initcall(fail_page_alloc_debugfs); | 1188 | late_initcall(fail_page_alloc_debugfs); |
1189 | 1189 | ||
1190 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | 1190 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ |
1191 | 1191 | ||
1192 | #else /* CONFIG_FAIL_PAGE_ALLOC */ | 1192 | #else /* CONFIG_FAIL_PAGE_ALLOC */ |
1193 | 1193 | ||
1194 | static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 1194 | static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
1195 | { | 1195 | { |
1196 | return 0; | 1196 | return 0; |
1197 | } | 1197 | } |
1198 | 1198 | ||
1199 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | 1199 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
1200 | 1200 | ||
1201 | /* | 1201 | /* |
1202 | * Return 1 if free pages are above 'mark'. This takes into account the order | 1202 | * Return 1 if free pages are above 'mark'. This takes into account the order |
1203 | * of the allocation. | 1203 | * of the allocation. |
1204 | */ | 1204 | */ |
1205 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1205 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1206 | int classzone_idx, int alloc_flags) | 1206 | int classzone_idx, int alloc_flags) |
1207 | { | 1207 | { |
1208 | /* free_pages may go negative - that's OK */ | 1208 | /* free_pages may go negative - that's OK */ |
1209 | long min = mark; | 1209 | long min = mark; |
1210 | long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; | 1210 | long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; |
1211 | int o; | 1211 | int o; |
1212 | 1212 | ||
1213 | if (alloc_flags & ALLOC_HIGH) | 1213 | if (alloc_flags & ALLOC_HIGH) |
1214 | min -= min / 2; | 1214 | min -= min / 2; |
1215 | if (alloc_flags & ALLOC_HARDER) | 1215 | if (alloc_flags & ALLOC_HARDER) |
1216 | min -= min / 4; | 1216 | min -= min / 4; |
1217 | 1217 | ||
1218 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | 1218 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) |
1219 | return 0; | 1219 | return 0; |
1220 | for (o = 0; o < order; o++) { | 1220 | for (o = 0; o < order; o++) { |
1221 | /* At the next order, this order's pages become unavailable */ | 1221 | /* At the next order, this order's pages become unavailable */ |
1222 | free_pages -= z->free_area[o].nr_free << o; | 1222 | free_pages -= z->free_area[o].nr_free << o; |
1223 | 1223 | ||
1224 | /* Require fewer higher order pages to be free */ | 1224 | /* Require fewer higher order pages to be free */ |
1225 | min >>= 1; | 1225 | min >>= 1; |
1226 | 1226 | ||
1227 | if (free_pages <= min) | 1227 | if (free_pages <= min) |
1228 | return 0; | 1228 | return 0; |
1229 | } | 1229 | } |
1230 | return 1; | 1230 | return 1; |
1231 | } | 1231 | } |
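A worked example with invented numbers shows what the loop demands: mark = 128, an order-2 request, 200 free pages, no lowmem reserve, and neither ALLOC_HIGH nor ALLOC_HARDER set. The user-space sketch below mirrors the arithmetic:

#include <stdio.h>

/* Illustrative only: zone_watermark_ok() arithmetic with made-up numbers. */
int main(void)
{
	long min = 128;				/* the watermark ('mark') */
	long free_pages = 200 - (1 << 2) + 1;	/* order-2 request: 197 */
	long nr_free[2] = { 40, 20 };		/* hypothetical order-0/order-1 counts */
	int o, ok = 1;

	if (free_pages <= min)			/* base (order-0) check */
		ok = 0;
	for (o = 0; ok && o < 2; o++) {
		free_pages -= nr_free[o] << o;	/* pages of lower order cannot help */
		min >>= 1;			/* require fewer higher-order pages */
		if (free_pages <= min)
			ok = 0;
	}
	printf("watermark %s\n", ok ? "ok" : "not met");
	return 0;
}

With these numbers the checks see 197 > 128, then 157 > 64 and 117 > 32, so the order-2 allocation is allowed to proceed.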
1232 | 1232 | ||
1233 | #ifdef CONFIG_NUMA | 1233 | #ifdef CONFIG_NUMA |
1234 | /* | 1234 | /* |
1235 | * zlc_setup - Setup for "zonelist cache". Uses cached zone data to | 1235 | * zlc_setup - Setup for "zonelist cache". Uses cached zone data to |
1236 | * skip over zones that are not allowed by the cpuset, or that have | 1236 | * skip over zones that are not allowed by the cpuset, or that have |
1237 | * been recently (in last second) found to be nearly full. See further | 1237 | * been recently (in last second) found to be nearly full. See further |
1238 | * comments in mmzone.h. Reduces cache footprint of zonelist scans | 1238 | * comments in mmzone.h. Reduces cache footprint of zonelist scans |
1239 | * that have to skip over a lot of full or unallowed zones. | 1239 | * that have to skip over a lot of full or unallowed zones. |
1240 | * | 1240 | * |
1241 | * If the zonelist cache is present in the passed in zonelist, then | 1241 | * If the zonelist cache is present in the passed in zonelist, then |
1242 | * returns a pointer to the allowed node mask (either the current | 1242 | * returns a pointer to the allowed node mask (either the current |
1243 | * task's mems_allowed, or node_states[N_HIGH_MEMORY].) | 1243 | * task's mems_allowed, or node_states[N_HIGH_MEMORY].) |
1244 | * | 1244 | * |
1245 | * If the zonelist cache is not available for this zonelist, does | 1245 | * If the zonelist cache is not available for this zonelist, does |
1246 | * nothing and returns NULL. | 1246 | * nothing and returns NULL. |
1247 | * | 1247 | * |
1248 | * If the fullzones BITMAP in the zonelist cache is stale (more than | 1248 | * If the fullzones BITMAP in the zonelist cache is stale (more than |
1249 | * a second since last zap'd) then we zap it out (clear its bits.) | 1249 | * a second since last zap'd) then we zap it out (clear its bits.) |
1250 | * | 1250 | * |
1251 | * We hold off even calling zlc_setup, until after we've checked the | 1251 | * We hold off even calling zlc_setup, until after we've checked the |
1252 | * first zone in the zonelist, on the theory that most allocations will | 1252 | * first zone in the zonelist, on the theory that most allocations will |
1253 | * be satisfied from that first zone, so best to examine that zone as | 1253 | * be satisfied from that first zone, so best to examine that zone as |
1254 | * quickly as we can. | 1254 | * quickly as we can. |
1255 | */ | 1255 | */ |
1256 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | 1256 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) |
1257 | { | 1257 | { |
1258 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | 1258 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ |
1259 | nodemask_t *allowednodes; /* zonelist_cache approximation */ | 1259 | nodemask_t *allowednodes; /* zonelist_cache approximation */ |
1260 | 1260 | ||
1261 | zlc = zonelist->zlcache_ptr; | 1261 | zlc = zonelist->zlcache_ptr; |
1262 | if (!zlc) | 1262 | if (!zlc) |
1263 | return NULL; | 1263 | return NULL; |
1264 | 1264 | ||
1265 | if (time_after(jiffies, zlc->last_full_zap + HZ)) { | 1265 | if (time_after(jiffies, zlc->last_full_zap + HZ)) { |
1266 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 1266 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
1267 | zlc->last_full_zap = jiffies; | 1267 | zlc->last_full_zap = jiffies; |
1268 | } | 1268 | } |
1269 | 1269 | ||
1270 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | 1270 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? |
1271 | &cpuset_current_mems_allowed : | 1271 | &cpuset_current_mems_allowed : |
1272 | &node_states[N_HIGH_MEMORY]; | 1272 | &node_states[N_HIGH_MEMORY]; |
1273 | return allowednodes; | 1273 | return allowednodes; |
1274 | } | 1274 | } |
1275 | 1275 | ||
1276 | /* | 1276 | /* |
1277 | * Given 'z' scanning a zonelist, run a couple of quick checks to see | 1277 | * Given 'z' scanning a zonelist, run a couple of quick checks to see |
1278 | * if it is worth looking at further for free memory: | 1278 | * if it is worth looking at further for free memory: |
1279 | * 1) Check that the zone isn't thought to be full (doesn't have its | 1279 | * 1) Check that the zone isn't thought to be full (doesn't have its |
1280 | * bit set in the zonelist_cache fullzones BITMAP). | 1280 | * bit set in the zonelist_cache fullzones BITMAP). |
1281 | * 2) Check that the zone's node (obtained from the zonelist_cache | 1281 | * 2) Check that the zone's node (obtained from the zonelist_cache |
1282 | * z_to_n[] mapping) is allowed in the passed in allowednodes mask. | 1282 | * z_to_n[] mapping) is allowed in the passed in allowednodes mask. |
1283 | * Return true (non-zero) if zone is worth looking at further, or | 1283 | * Return true (non-zero) if zone is worth looking at further, or |
1284 | * else return false (zero) if it is not. | 1284 | * else return false (zero) if it is not. |
1285 | * | 1285 | * |
1286 | * This check -ignores- the distinction between various watermarks, | 1286 | * This check -ignores- the distinction between various watermarks, |
1287 | * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is | 1287 | * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is |
1288 | * found to be full for any variation of these watermarks, it will | 1288 | * found to be full for any variation of these watermarks, it will |
1289 | * be considered full for up to one second by all requests, unless | 1289 | * be considered full for up to one second by all requests, unless |
1290 | * we are so low on memory on all allowed nodes that we are forced | 1290 | * we are so low on memory on all allowed nodes that we are forced |
1291 | * into the second scan of the zonelist. | 1291 | * into the second scan of the zonelist. |
1292 | * | 1292 | * |
1293 | * In the second scan we ignore this zonelist cache and exactly | 1293 | * In the second scan we ignore this zonelist cache and exactly |
1294 | * apply the watermarks to all zones, even if it is slower to do so. | 1294 | * apply the watermarks to all zones, even if it is slower to do so. |
1295 | * We are low on memory in the second scan, and should leave no stone | 1295 | * We are low on memory in the second scan, and should leave no stone |
1296 | * unturned looking for a free page. | 1296 | * unturned looking for a free page. |
1297 | */ | 1297 | */ |
1298 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, | 1298 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, |
1299 | nodemask_t *allowednodes) | 1299 | nodemask_t *allowednodes) |
1300 | { | 1300 | { |
1301 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | 1301 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ |
1302 | int i; /* index of *z in zonelist zones */ | 1302 | int i; /* index of *z in zonelist zones */ |
1303 | int n; /* node that zone *z is on */ | 1303 | int n; /* node that zone *z is on */ |
1304 | 1304 | ||
1305 | zlc = zonelist->zlcache_ptr; | 1305 | zlc = zonelist->zlcache_ptr; |
1306 | if (!zlc) | 1306 | if (!zlc) |
1307 | return 1; | 1307 | return 1; |
1308 | 1308 | ||
1309 | i = z - zonelist->_zonerefs; | 1309 | i = z - zonelist->_zonerefs; |
1310 | n = zlc->z_to_n[i]; | 1310 | n = zlc->z_to_n[i]; |
1311 | 1311 | ||
1312 | /* This zone is worth trying if it is allowed but not full */ | 1312 | /* This zone is worth trying if it is allowed but not full */ |
1313 | return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); | 1313 | return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); |
1314 | } | 1314 | } |
1315 | 1315 | ||
1316 | /* | 1316 | /* |
1317 | * Given 'z' scanning a zonelist, set the corresponding bit in | 1317 | * Given 'z' scanning a zonelist, set the corresponding bit in |
1318 | * zlc->fullzones, so that subsequent attempts to allocate a page | 1318 | * zlc->fullzones, so that subsequent attempts to allocate a page |
1319 | * from that zone don't waste time re-examining it. | 1319 | * from that zone don't waste time re-examining it. |
1320 | */ | 1320 | */ |
1321 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | 1321 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) |
1322 | { | 1322 | { |
1323 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | 1323 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ |
1324 | int i; /* index of *z in zonelist zones */ | 1324 | int i; /* index of *z in zonelist zones */ |
1325 | 1325 | ||
1326 | zlc = zonelist->zlcache_ptr; | 1326 | zlc = zonelist->zlcache_ptr; |
1327 | if (!zlc) | 1327 | if (!zlc) |
1328 | return; | 1328 | return; |
1329 | 1329 | ||
1330 | i = z - zonelist->_zonerefs; | 1330 | i = z - zonelist->_zonerefs; |
1331 | 1331 | ||
1332 | set_bit(i, zlc->fullzones); | 1332 | set_bit(i, zlc->fullzones); |
1333 | } | 1333 | } |
1334 | 1334 | ||
1335 | #else /* CONFIG_NUMA */ | 1335 | #else /* CONFIG_NUMA */ |
1336 | 1336 | ||
1337 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | 1337 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) |
1338 | { | 1338 | { |
1339 | return NULL; | 1339 | return NULL; |
1340 | } | 1340 | } |
1341 | 1341 | ||
1342 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, | 1342 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, |
1343 | nodemask_t *allowednodes) | 1343 | nodemask_t *allowednodes) |
1344 | { | 1344 | { |
1345 | return 1; | 1345 | return 1; |
1346 | } | 1346 | } |
1347 | 1347 | ||
1348 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | 1348 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) |
1349 | { | 1349 | { |
1350 | } | 1350 | } |
1351 | #endif /* CONFIG_NUMA */ | 1351 | #endif /* CONFIG_NUMA */ |
1352 | 1352 | ||
1353 | /* | 1353 | /* |
1354 | * get_page_from_freelist goes through the zonelist trying to allocate | 1354 | * get_page_from_freelist goes through the zonelist trying to allocate |
1355 | * a page. | 1355 | * a page. |
1356 | */ | 1356 | */ |
1357 | static struct page * | 1357 | static struct page * |
1358 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 1358 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, |
1359 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags) | 1359 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags) |
1360 | { | 1360 | { |
1361 | struct zoneref *z; | 1361 | struct zoneref *z; |
1362 | struct page *page = NULL; | 1362 | struct page *page = NULL; |
1363 | int classzone_idx; | 1363 | int classzone_idx; |
1364 | struct zone *zone, *preferred_zone; | 1364 | struct zone *zone, *preferred_zone; |
1365 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | 1365 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ |
1366 | int zlc_active = 0; /* set if using zonelist_cache */ | 1366 | int zlc_active = 0; /* set if using zonelist_cache */ |
1367 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | 1367 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ |
1368 | 1368 | ||
1369 | (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, | 1369 | (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, |
1370 | &preferred_zone); | 1370 | &preferred_zone); |
1371 | if (!preferred_zone) | 1371 | if (!preferred_zone) |
1372 | return NULL; | 1372 | return NULL; |
1373 | 1373 | ||
1374 | classzone_idx = zone_idx(preferred_zone); | 1374 | classzone_idx = zone_idx(preferred_zone); |
1375 | 1375 | ||
1376 | zonelist_scan: | 1376 | zonelist_scan: |
1377 | /* | 1377 | /* |
1378 | * Scan zonelist, looking for a zone with enough free. | 1378 | * Scan zonelist, looking for a zone with enough free. |
1379 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1379 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1380 | */ | 1380 | */ |
1381 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 1381 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
1382 | high_zoneidx, nodemask) { | 1382 | high_zoneidx, nodemask) { |
1383 | if (NUMA_BUILD && zlc_active && | 1383 | if (NUMA_BUILD && zlc_active && |
1384 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1384 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1385 | continue; | 1385 | continue; |
1386 | if ((alloc_flags & ALLOC_CPUSET) && | 1386 | if ((alloc_flags & ALLOC_CPUSET) && |
1387 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1387 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1388 | goto try_next_zone; | 1388 | goto try_next_zone; |
1389 | 1389 | ||
1390 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1390 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
1391 | unsigned long mark; | 1391 | unsigned long mark; |
1392 | if (alloc_flags & ALLOC_WMARK_MIN) | 1392 | if (alloc_flags & ALLOC_WMARK_MIN) |
1393 | mark = zone->pages_min; | 1393 | mark = zone->pages_min; |
1394 | else if (alloc_flags & ALLOC_WMARK_LOW) | 1394 | else if (alloc_flags & ALLOC_WMARK_LOW) |
1395 | mark = zone->pages_low; | 1395 | mark = zone->pages_low; |
1396 | else | 1396 | else |
1397 | mark = zone->pages_high; | 1397 | mark = zone->pages_high; |
1398 | if (!zone_watermark_ok(zone, order, mark, | 1398 | if (!zone_watermark_ok(zone, order, mark, |
1399 | classzone_idx, alloc_flags)) { | 1399 | classzone_idx, alloc_flags)) { |
1400 | if (!zone_reclaim_mode || | 1400 | if (!zone_reclaim_mode || |
1401 | !zone_reclaim(zone, gfp_mask, order)) | 1401 | !zone_reclaim(zone, gfp_mask, order)) |
1402 | goto this_zone_full; | 1402 | goto this_zone_full; |
1403 | } | 1403 | } |
1404 | } | 1404 | } |
1405 | 1405 | ||
1406 | page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); | 1406 | page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); |
1407 | if (page) | 1407 | if (page) |
1408 | break; | 1408 | break; |
1409 | this_zone_full: | 1409 | this_zone_full: |
1410 | if (NUMA_BUILD) | 1410 | if (NUMA_BUILD) |
1411 | zlc_mark_zone_full(zonelist, z); | 1411 | zlc_mark_zone_full(zonelist, z); |
1412 | try_next_zone: | 1412 | try_next_zone: |
1413 | if (NUMA_BUILD && !did_zlc_setup) { | 1413 | if (NUMA_BUILD && !did_zlc_setup) { |
1414 | /* we do zlc_setup after the first zone is tried */ | 1414 | /* we do zlc_setup after the first zone is tried */ |
1415 | allowednodes = zlc_setup(zonelist, alloc_flags); | 1415 | allowednodes = zlc_setup(zonelist, alloc_flags); |
1416 | zlc_active = 1; | 1416 | zlc_active = 1; |
1417 | did_zlc_setup = 1; | 1417 | did_zlc_setup = 1; |
1418 | } | 1418 | } |
1419 | } | 1419 | } |
1420 | 1420 | ||
1421 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | 1421 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { |
1422 | /* Disable zlc cache for second zonelist scan */ | 1422 | /* Disable zlc cache for second zonelist scan */ |
1423 | zlc_active = 0; | 1423 | zlc_active = 0; |
1424 | goto zonelist_scan; | 1424 | goto zonelist_scan; |
1425 | } | 1425 | } |
1426 | return page; | 1426 | return page; |
1427 | } | 1427 | } |
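The scan above makes two decisions per zone: which watermark to test (min, low or high, picked from alloc_flags) and, on NUMA, whether the zonelist cache already marked the zone full; if the whole first pass fails while the cache is active, the cache is disabled and the list is scanned once more. As a rough illustration of the watermark selection only, a small standalone C sketch follows; the ALLOC_* values and the simplified free-page test are stand-ins, not the kernel's definitions.

#include <stdio.h>

/* Illustrative flag bits; the real ALLOC_* values live in mm/page_alloc.c. */
#define ALLOC_WMARK_MIN   0x01
#define ALLOC_WMARK_LOW   0x02
#define ALLOC_WMARK_HIGH  0x04

struct fake_zone {
	unsigned long pages_min, pages_low, pages_high, free_pages;
};

/* Pick the watermark the same way the scan above does, then apply a
 * simplified "enough free pages?" test (the real zone_watermark_ok()
 * also accounts for the order and lowmem_reserve). */
static int sketch_watermark_ok(const struct fake_zone *z, int alloc_flags)
{
	unsigned long mark;

	if (alloc_flags & ALLOC_WMARK_MIN)
		mark = z->pages_min;
	else if (alloc_flags & ALLOC_WMARK_LOW)
		mark = z->pages_low;
	else
		mark = z->pages_high;

	return z->free_pages > mark;
}

int main(void)
{
	struct fake_zone z = { .pages_min = 32, .pages_low = 64,
			       .pages_high = 96, .free_pages = 80 };

	printf("min ok: %d, low ok: %d, high ok: %d\n",
	       sketch_watermark_ok(&z, ALLOC_WMARK_MIN),
	       sketch_watermark_ok(&z, ALLOC_WMARK_LOW),
	       sketch_watermark_ok(&z, ALLOC_WMARK_HIGH));
	return 0;
}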
1428 | 1428 | ||
1429 | /* | 1429 | /* |
1430 | * This is the 'heart' of the zoned buddy allocator. | 1430 | * This is the 'heart' of the zoned buddy allocator. |
1431 | */ | 1431 | */ |
1432 | static struct page * | 1432 | static struct page * |
1433 | __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, | 1433 | __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, |
1434 | struct zonelist *zonelist, nodemask_t *nodemask) | 1434 | struct zonelist *zonelist, nodemask_t *nodemask) |
1435 | { | 1435 | { |
1436 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 1436 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
1437 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 1437 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
1438 | struct zoneref *z; | 1438 | struct zoneref *z; |
1439 | struct zone *zone; | 1439 | struct zone *zone; |
1440 | struct page *page; | 1440 | struct page *page; |
1441 | struct reclaim_state reclaim_state; | 1441 | struct reclaim_state reclaim_state; |
1442 | struct task_struct *p = current; | 1442 | struct task_struct *p = current; |
1443 | int do_retry; | 1443 | int do_retry; |
1444 | int alloc_flags; | 1444 | int alloc_flags; |
1445 | unsigned long did_some_progress; | 1445 | unsigned long did_some_progress; |
1446 | unsigned long pages_reclaimed = 0; | 1446 | unsigned long pages_reclaimed = 0; |
1447 | 1447 | ||
1448 | might_sleep_if(wait); | 1448 | might_sleep_if(wait); |
1449 | 1449 | ||
1450 | if (should_fail_alloc_page(gfp_mask, order)) | 1450 | if (should_fail_alloc_page(gfp_mask, order)) |
1451 | return NULL; | 1451 | return NULL; |
1452 | 1452 | ||
1453 | restart: | 1453 | restart: |
1454 | z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ | 1454 | z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ |
1455 | 1455 | ||
1456 | if (unlikely(!z->zone)) { | 1456 | if (unlikely(!z->zone)) { |
1457 | /* | 1457 | /* |
1458 | * Happens if we have an empty zonelist as a result of | 1458 | * Happens if we have an empty zonelist as a result of |
1459 | * GFP_THISNODE being used on a memoryless node | 1459 | * GFP_THISNODE being used on a memoryless node |
1460 | */ | 1460 | */ |
1461 | return NULL; | 1461 | return NULL; |
1462 | } | 1462 | } |
1463 | 1463 | ||
1464 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 1464 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
1465 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); | 1465 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); |
1466 | if (page) | 1466 | if (page) |
1467 | goto got_pg; | 1467 | goto got_pg; |
1468 | 1468 | ||
1469 | /* | 1469 | /* |
1470 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | 1470 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and |
1471 | * __GFP_NOWARN set) should not cause reclaim since the subsystem | 1471 | * __GFP_NOWARN set) should not cause reclaim since the subsystem |
1472 | * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim | 1472 | * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim |
1473 | * using a larger set of nodes after it has established that the | 1473 | * using a larger set of nodes after it has established that the |
1474 | * allowed per node queues are empty and that nodes are | 1474 | * allowed per node queues are empty and that nodes are |
1475 | * over allocated. | 1475 | * over allocated. |
1476 | */ | 1476 | */ |
1477 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 1477 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) |
1478 | goto nopage; | 1478 | goto nopage; |
1479 | 1479 | ||
1480 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 1480 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) |
1481 | wakeup_kswapd(zone, order); | 1481 | wakeup_kswapd(zone, order); |
1482 | 1482 | ||
1483 | /* | 1483 | /* |
1484 | * OK, we're below the kswapd watermark and have kicked background | 1484 | * OK, we're below the kswapd watermark and have kicked background |
1485 | * reclaim. Now things get more complex, so set up alloc_flags according | 1485 | * reclaim. Now things get more complex, so set up alloc_flags according |
1486 | * to how we want to proceed. | 1486 | * to how we want to proceed. |
1487 | * | 1487 | * |
1488 | * The caller may dip into page reserves a bit more if the caller | 1488 | * The caller may dip into page reserves a bit more if the caller |
1489 | * cannot run direct reclaim, or if the caller has realtime scheduling | 1489 | * cannot run direct reclaim, or if the caller has realtime scheduling |
1490 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | 1490 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will |
1491 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | 1491 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). |
1492 | */ | 1492 | */ |
1493 | alloc_flags = ALLOC_WMARK_MIN; | 1493 | alloc_flags = ALLOC_WMARK_MIN; |
1494 | if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) | 1494 | if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) |
1495 | alloc_flags |= ALLOC_HARDER; | 1495 | alloc_flags |= ALLOC_HARDER; |
1496 | if (gfp_mask & __GFP_HIGH) | 1496 | if (gfp_mask & __GFP_HIGH) |
1497 | alloc_flags |= ALLOC_HIGH; | 1497 | alloc_flags |= ALLOC_HIGH; |
1498 | if (wait) | 1498 | if (wait) |
1499 | alloc_flags |= ALLOC_CPUSET; | 1499 | alloc_flags |= ALLOC_CPUSET; |
1500 | 1500 | ||
1501 | /* | 1501 | /* |
1502 | * Go through the zonelist again. Let __GFP_HIGH and allocations | 1502 | * Go through the zonelist again. Let __GFP_HIGH and allocations |
1503 | * coming from realtime tasks go deeper into reserves. | 1503 | * coming from realtime tasks go deeper into reserves. |
1504 | * | 1504 | * |
1505 | * This is the last chance, in general, before the goto nopage. | 1505 | * This is the last chance, in general, before the goto nopage. |
1506 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. | 1506 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. |
1507 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1507 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1508 | */ | 1508 | */ |
1509 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 1509 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, |
1510 | high_zoneidx, alloc_flags); | 1510 | high_zoneidx, alloc_flags); |
1511 | if (page) | 1511 | if (page) |
1512 | goto got_pg; | 1512 | goto got_pg; |
1513 | 1513 | ||
1514 | /* This allocation should allow future memory freeing. */ | 1514 | /* This allocation should allow future memory freeing. */ |
1515 | 1515 | ||
1516 | rebalance: | 1516 | rebalance: |
1517 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) | 1517 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) |
1518 | && !in_interrupt()) { | 1518 | && !in_interrupt()) { |
1519 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { | 1519 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { |
1520 | nofail_alloc: | 1520 | nofail_alloc: |
1521 | /* go through the zonelist yet again, ignoring mins */ | 1521 | /* go through the zonelist yet again, ignoring mins */ |
1522 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 1522 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
1523 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); | 1523 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); |
1524 | if (page) | 1524 | if (page) |
1525 | goto got_pg; | 1525 | goto got_pg; |
1526 | if (gfp_mask & __GFP_NOFAIL) { | 1526 | if (gfp_mask & __GFP_NOFAIL) { |
1527 | congestion_wait(WRITE, HZ/50); | 1527 | congestion_wait(WRITE, HZ/50); |
1528 | goto nofail_alloc; | 1528 | goto nofail_alloc; |
1529 | } | 1529 | } |
1530 | } | 1530 | } |
1531 | goto nopage; | 1531 | goto nopage; |
1532 | } | 1532 | } |
1533 | 1533 | ||
1534 | /* Atomic allocations - we can't balance anything */ | 1534 | /* Atomic allocations - we can't balance anything */ |
1535 | if (!wait) | 1535 | if (!wait) |
1536 | goto nopage; | 1536 | goto nopage; |
1537 | 1537 | ||
1538 | cond_resched(); | 1538 | cond_resched(); |
1539 | 1539 | ||
1540 | /* We now go into synchronous reclaim */ | 1540 | /* We now go into synchronous reclaim */ |
1541 | cpuset_memory_pressure_bump(); | 1541 | cpuset_memory_pressure_bump(); |
1542 | p->flags |= PF_MEMALLOC; | 1542 | p->flags |= PF_MEMALLOC; |
1543 | reclaim_state.reclaimed_slab = 0; | 1543 | reclaim_state.reclaimed_slab = 0; |
1544 | p->reclaim_state = &reclaim_state; | 1544 | p->reclaim_state = &reclaim_state; |
1545 | 1545 | ||
1546 | did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); | 1546 | did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); |
1547 | 1547 | ||
1548 | p->reclaim_state = NULL; | 1548 | p->reclaim_state = NULL; |
1549 | p->flags &= ~PF_MEMALLOC; | 1549 | p->flags &= ~PF_MEMALLOC; |
1550 | 1550 | ||
1551 | cond_resched(); | 1551 | cond_resched(); |
1552 | 1552 | ||
1553 | if (order != 0) | 1553 | if (order != 0) |
1554 | drain_all_pages(); | 1554 | drain_all_pages(); |
1555 | 1555 | ||
1556 | if (likely(did_some_progress)) { | 1556 | if (likely(did_some_progress)) { |
1557 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 1557 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
1558 | zonelist, high_zoneidx, alloc_flags); | 1558 | zonelist, high_zoneidx, alloc_flags); |
1559 | if (page) | 1559 | if (page) |
1560 | goto got_pg; | 1560 | goto got_pg; |
1561 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | 1561 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { |
1562 | if (!try_set_zone_oom(zonelist, gfp_mask)) { | 1562 | if (!try_set_zone_oom(zonelist, gfp_mask)) { |
1563 | schedule_timeout_uninterruptible(1); | 1563 | schedule_timeout_uninterruptible(1); |
1564 | goto restart; | 1564 | goto restart; |
1565 | } | 1565 | } |
1566 | 1566 | ||
1567 | /* | 1567 | /* |
1568 | * Go through the zonelist yet one more time, keep | 1568 | * Go through the zonelist yet one more time, keep |
1569 | * very high watermark here, this is only to catch | 1569 | * very high watermark here, this is only to catch |
1570 | * a parallel oom killing, we must fail if we're still | 1570 | * a parallel oom killing, we must fail if we're still |
1571 | * under heavy pressure. | 1571 | * under heavy pressure. |
1572 | */ | 1572 | */ |
1573 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | 1573 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, |
1574 | order, zonelist, high_zoneidx, | 1574 | order, zonelist, high_zoneidx, |
1575 | ALLOC_WMARK_HIGH|ALLOC_CPUSET); | 1575 | ALLOC_WMARK_HIGH|ALLOC_CPUSET); |
1576 | if (page) { | 1576 | if (page) { |
1577 | clear_zonelist_oom(zonelist, gfp_mask); | 1577 | clear_zonelist_oom(zonelist, gfp_mask); |
1578 | goto got_pg; | 1578 | goto got_pg; |
1579 | } | 1579 | } |
1580 | 1580 | ||
1581 | /* The OOM killer will not help higher order allocs so fail */ | 1581 | /* The OOM killer will not help higher order allocs so fail */ |
1582 | if (order > PAGE_ALLOC_COSTLY_ORDER) { | 1582 | if (order > PAGE_ALLOC_COSTLY_ORDER) { |
1583 | clear_zonelist_oom(zonelist, gfp_mask); | 1583 | clear_zonelist_oom(zonelist, gfp_mask); |
1584 | goto nopage; | 1584 | goto nopage; |
1585 | } | 1585 | } |
1586 | 1586 | ||
1587 | out_of_memory(zonelist, gfp_mask, order); | 1587 | out_of_memory(zonelist, gfp_mask, order); |
1588 | clear_zonelist_oom(zonelist, gfp_mask); | 1588 | clear_zonelist_oom(zonelist, gfp_mask); |
1589 | goto restart; | 1589 | goto restart; |
1590 | } | 1590 | } |
1591 | 1591 | ||
1592 | /* | 1592 | /* |
1593 | * Don't let big-order allocations loop unless the caller explicitly | 1593 | * Don't let big-order allocations loop unless the caller explicitly |
1594 | * requests that. Wait for some write requests to complete then retry. | 1594 | * requests that. Wait for some write requests to complete then retry. |
1595 | * | 1595 | * |
1596 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | 1596 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER |
1597 | * means __GFP_NOFAIL, but that may not be true in other | 1597 | * means __GFP_NOFAIL, but that may not be true in other |
1598 | * implementations. | 1598 | * implementations. |
1599 | * | 1599 | * |
1600 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is | 1600 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is |
1601 | * specified, then we retry until we no longer reclaim any pages | 1601 | * specified, then we retry until we no longer reclaim any pages |
1602 | * (above), or we've reclaimed an order of pages at least as | 1602 | * (above), or we've reclaimed an order of pages at least as |
1603 | * large as the allocation's order. In both cases, if the | 1603 | * large as the allocation's order. In both cases, if the |
1604 | * allocation still fails, we stop retrying. | 1604 | * allocation still fails, we stop retrying. |
1605 | */ | 1605 | */ |
1606 | pages_reclaimed += did_some_progress; | 1606 | pages_reclaimed += did_some_progress; |
1607 | do_retry = 0; | 1607 | do_retry = 0; |
1608 | if (!(gfp_mask & __GFP_NORETRY)) { | 1608 | if (!(gfp_mask & __GFP_NORETRY)) { |
1609 | if (order <= PAGE_ALLOC_COSTLY_ORDER) { | 1609 | if (order <= PAGE_ALLOC_COSTLY_ORDER) { |
1610 | do_retry = 1; | 1610 | do_retry = 1; |
1611 | } else { | 1611 | } else { |
1612 | if (gfp_mask & __GFP_REPEAT && | 1612 | if (gfp_mask & __GFP_REPEAT && |
1613 | pages_reclaimed < (1 << order)) | 1613 | pages_reclaimed < (1 << order)) |
1614 | do_retry = 1; | 1614 | do_retry = 1; |
1615 | } | 1615 | } |
1616 | if (gfp_mask & __GFP_NOFAIL) | 1616 | if (gfp_mask & __GFP_NOFAIL) |
1617 | do_retry = 1; | 1617 | do_retry = 1; |
1618 | } | 1618 | } |
1619 | if (do_retry) { | 1619 | if (do_retry) { |
1620 | congestion_wait(WRITE, HZ/50); | 1620 | congestion_wait(WRITE, HZ/50); |
1621 | goto rebalance; | 1621 | goto rebalance; |
1622 | } | 1622 | } |
1623 | 1623 | ||
1624 | nopage: | 1624 | nopage: |
1625 | if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { | 1625 | if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { |
1626 | printk(KERN_WARNING "%s: page allocation failure." | 1626 | printk(KERN_WARNING "%s: page allocation failure." |
1627 | " order:%d, mode:0x%x\n", | 1627 | " order:%d, mode:0x%x\n", |
1628 | p->comm, order, gfp_mask); | 1628 | p->comm, order, gfp_mask); |
1629 | dump_stack(); | 1629 | dump_stack(); |
1630 | show_mem(); | 1630 | show_mem(); |
1631 | } | 1631 | } |
1632 | got_pg: | 1632 | got_pg: |
1633 | return page; | 1633 | return page; |
1634 | } | 1634 | } |
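Most of the slow path above is plumbing around a small retry predicate: small orders keep retrying, costly orders retry only while __GFP_REPEAT is set and less than one order's worth of pages has been reclaimed, and __GFP_NOFAIL never gives up. A standalone sketch of just that predicate, with hypothetical flag values standing in for the real __GFP_* bits:

#include <stdio.h>

/* Hypothetical stand-ins for the real gfp bits and the costly-order cutoff. */
#define SKETCH_NORETRY		0x1
#define SKETCH_REPEAT		0x2
#define SKETCH_NOFAIL		0x4
#define SKETCH_COSTLY_ORDER	3

/* Mirrors the do_retry decision in the slow path: small orders always
 * retry, costly orders retry under "repeat" until a full order's worth
 * of pages has been reclaimed, and "nofail" retries unconditionally. */
static int should_retry(unsigned int gfp, unsigned int order,
			unsigned long pages_reclaimed)
{
	if (gfp & SKETCH_NORETRY)
		return 0;
	if (order <= SKETCH_COSTLY_ORDER)
		return 1;
	if ((gfp & SKETCH_REPEAT) && pages_reclaimed < (1UL << order))
		return 1;
	if (gfp & SKETCH_NOFAIL)
		return 1;
	return 0;
}

int main(void)
{
	printf("order 0:                %d\n", should_retry(0, 0, 0));
	printf("order 5, repeat, 8 pg:  %d\n", should_retry(SKETCH_REPEAT, 5, 8));
	printf("order 5, repeat, 64 pg: %d\n", should_retry(SKETCH_REPEAT, 5, 64));
	return 0;
}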
1635 | 1635 | ||
1636 | struct page * | 1636 | struct page * |
1637 | __alloc_pages(gfp_t gfp_mask, unsigned int order, | 1637 | __alloc_pages(gfp_t gfp_mask, unsigned int order, |
1638 | struct zonelist *zonelist) | 1638 | struct zonelist *zonelist) |
1639 | { | 1639 | { |
1640 | return __alloc_pages_internal(gfp_mask, order, zonelist, NULL); | 1640 | return __alloc_pages_internal(gfp_mask, order, zonelist, NULL); |
1641 | } | 1641 | } |
1642 | 1642 | ||
1643 | struct page * | 1643 | struct page * |
1644 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | 1644 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, |
1645 | struct zonelist *zonelist, nodemask_t *nodemask) | 1645 | struct zonelist *zonelist, nodemask_t *nodemask) |
1646 | { | 1646 | { |
1647 | return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask); | 1647 | return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask); |
1648 | } | 1648 | } |
1649 | 1649 | ||
1650 | EXPORT_SYMBOL(__alloc_pages); | 1650 | EXPORT_SYMBOL(__alloc_pages); |
1651 | 1651 | ||
1652 | /* | 1652 | /* |
1653 | * Common helper functions. | 1653 | * Common helper functions. |
1654 | */ | 1654 | */ |
1655 | unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) | 1655 | unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) |
1656 | { | 1656 | { |
1657 | struct page * page; | 1657 | struct page * page; |
1658 | page = alloc_pages(gfp_mask, order); | 1658 | page = alloc_pages(gfp_mask, order); |
1659 | if (!page) | 1659 | if (!page) |
1660 | return 0; | 1660 | return 0; |
1661 | return (unsigned long) page_address(page); | 1661 | return (unsigned long) page_address(page); |
1662 | } | 1662 | } |
1663 | 1663 | ||
1664 | EXPORT_SYMBOL(__get_free_pages); | 1664 | EXPORT_SYMBOL(__get_free_pages); |
1665 | 1665 | ||
1666 | unsigned long get_zeroed_page(gfp_t gfp_mask) | 1666 | unsigned long get_zeroed_page(gfp_t gfp_mask) |
1667 | { | 1667 | { |
1668 | struct page * page; | 1668 | struct page * page; |
1669 | 1669 | ||
1670 | /* | 1670 | /* |
1671 | * get_zeroed_page() returns a 32-bit address, which cannot represent | 1671 | * get_zeroed_page() returns a 32-bit address, which cannot represent |
1672 | * a highmem page | 1672 | * a highmem page |
1673 | */ | 1673 | */ |
1674 | VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); | 1674 | VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); |
1675 | 1675 | ||
1676 | page = alloc_pages(gfp_mask | __GFP_ZERO, 0); | 1676 | page = alloc_pages(gfp_mask | __GFP_ZERO, 0); |
1677 | if (page) | 1677 | if (page) |
1678 | return (unsigned long) page_address(page); | 1678 | return (unsigned long) page_address(page); |
1679 | return 0; | 1679 | return 0; |
1680 | } | 1680 | } |
1681 | 1681 | ||
1682 | EXPORT_SYMBOL(get_zeroed_page); | 1682 | EXPORT_SYMBOL(get_zeroed_page); |
1683 | 1683 | ||
1684 | void __pagevec_free(struct pagevec *pvec) | 1684 | void __pagevec_free(struct pagevec *pvec) |
1685 | { | 1685 | { |
1686 | int i = pagevec_count(pvec); | 1686 | int i = pagevec_count(pvec); |
1687 | 1687 | ||
1688 | while (--i >= 0) | 1688 | while (--i >= 0) |
1689 | free_hot_cold_page(pvec->pages[i], pvec->cold); | 1689 | free_hot_cold_page(pvec->pages[i], pvec->cold); |
1690 | } | 1690 | } |
1691 | 1691 | ||
1692 | void __free_pages(struct page *page, unsigned int order) | 1692 | void __free_pages(struct page *page, unsigned int order) |
1693 | { | 1693 | { |
1694 | if (put_page_testzero(page)) { | 1694 | if (put_page_testzero(page)) { |
1695 | if (order == 0) | 1695 | if (order == 0) |
1696 | free_hot_page(page); | 1696 | free_hot_page(page); |
1697 | else | 1697 | else |
1698 | __free_pages_ok(page, order); | 1698 | __free_pages_ok(page, order); |
1699 | } | 1699 | } |
1700 | } | 1700 | } |
1701 | 1701 | ||
1702 | EXPORT_SYMBOL(__free_pages); | 1702 | EXPORT_SYMBOL(__free_pages); |
1703 | 1703 | ||
1704 | void free_pages(unsigned long addr, unsigned int order) | 1704 | void free_pages(unsigned long addr, unsigned int order) |
1705 | { | 1705 | { |
1706 | if (addr != 0) { | 1706 | if (addr != 0) { |
1707 | VM_BUG_ON(!virt_addr_valid((void *)addr)); | 1707 | VM_BUG_ON(!virt_addr_valid((void *)addr)); |
1708 | __free_pages(virt_to_page((void *)addr), order); | 1708 | __free_pages(virt_to_page((void *)addr), order); |
1709 | } | 1709 | } |
1710 | } | 1710 | } |
1711 | 1711 | ||
1712 | EXPORT_SYMBOL(free_pages); | 1712 | EXPORT_SYMBOL(free_pages); |
1713 | 1713 | ||
1714 | static unsigned int nr_free_zone_pages(int offset) | 1714 | static unsigned int nr_free_zone_pages(int offset) |
1715 | { | 1715 | { |
1716 | struct zoneref *z; | 1716 | struct zoneref *z; |
1717 | struct zone *zone; | 1717 | struct zone *zone; |
1718 | 1718 | ||
1719 | /* Just pick one node, since fallback list is circular */ | 1719 | /* Just pick one node, since fallback list is circular */ |
1720 | unsigned int sum = 0; | 1720 | unsigned int sum = 0; |
1721 | 1721 | ||
1722 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); | 1722 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); |
1723 | 1723 | ||
1724 | for_each_zone_zonelist(zone, z, zonelist, offset) { | 1724 | for_each_zone_zonelist(zone, z, zonelist, offset) { |
1725 | unsigned long size = zone->present_pages; | 1725 | unsigned long size = zone->present_pages; |
1726 | unsigned long high = zone->pages_high; | 1726 | unsigned long high = zone->pages_high; |
1727 | if (size > high) | 1727 | if (size > high) |
1728 | sum += size - high; | 1728 | sum += size - high; |
1729 | } | 1729 | } |
1730 | 1730 | ||
1731 | return sum; | 1731 | return sum; |
1732 | } | 1732 | } |
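nr_free_zone_pages() walks one node's fallback zonelist and counts, per zone, the pages notionally allocatable above the high watermark. A minimal sketch of that arithmetic over a plain array of zones (struct fake_zone and the sample numbers are invented for illustration):

#include <stdio.h>

struct fake_zone { unsigned long present_pages, pages_high; };

/* Sum, over the zones, the pages sitting above the high watermark;
 * zones smaller than their watermark contribute nothing. */
static unsigned long sketch_nr_free_zone_pages(const struct fake_zone *z, int n)
{
	unsigned long sum = 0;
	int i;

	for (i = 0; i < n; i++)
		if (z[i].present_pages > z[i].pages_high)
			sum += z[i].present_pages - z[i].pages_high;
	return sum;
}

int main(void)
{
	struct fake_zone zones[] = { { 4096, 96 }, { 262144, 512 }, { 64, 96 } };

	printf("%lu allocatable pages\n", sketch_nr_free_zone_pages(zones, 3));
	return 0;
}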
1733 | 1733 | ||
1734 | /* | 1734 | /* |
1735 | * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL | 1735 | * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL |
1736 | */ | 1736 | */ |
1737 | unsigned int nr_free_buffer_pages(void) | 1737 | unsigned int nr_free_buffer_pages(void) |
1738 | { | 1738 | { |
1739 | return nr_free_zone_pages(gfp_zone(GFP_USER)); | 1739 | return nr_free_zone_pages(gfp_zone(GFP_USER)); |
1740 | } | 1740 | } |
1741 | EXPORT_SYMBOL_GPL(nr_free_buffer_pages); | 1741 | EXPORT_SYMBOL_GPL(nr_free_buffer_pages); |
1742 | 1742 | ||
1743 | /* | 1743 | /* |
1744 | * Amount of free RAM allocatable within all zones | 1744 | * Amount of free RAM allocatable within all zones |
1745 | */ | 1745 | */ |
1746 | unsigned int nr_free_pagecache_pages(void) | 1746 | unsigned int nr_free_pagecache_pages(void) |
1747 | { | 1747 | { |
1748 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); | 1748 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); |
1749 | } | 1749 | } |
1750 | 1750 | ||
1751 | static inline void show_node(struct zone *zone) | 1751 | static inline void show_node(struct zone *zone) |
1752 | { | 1752 | { |
1753 | if (NUMA_BUILD) | 1753 | if (NUMA_BUILD) |
1754 | printk("Node %d ", zone_to_nid(zone)); | 1754 | printk("Node %d ", zone_to_nid(zone)); |
1755 | } | 1755 | } |
1756 | 1756 | ||
1757 | void si_meminfo(struct sysinfo *val) | 1757 | void si_meminfo(struct sysinfo *val) |
1758 | { | 1758 | { |
1759 | val->totalram = totalram_pages; | 1759 | val->totalram = totalram_pages; |
1760 | val->sharedram = 0; | 1760 | val->sharedram = 0; |
1761 | val->freeram = global_page_state(NR_FREE_PAGES); | 1761 | val->freeram = global_page_state(NR_FREE_PAGES); |
1762 | val->bufferram = nr_blockdev_pages(); | 1762 | val->bufferram = nr_blockdev_pages(); |
1763 | val->totalhigh = totalhigh_pages; | 1763 | val->totalhigh = totalhigh_pages; |
1764 | val->freehigh = nr_free_highpages(); | 1764 | val->freehigh = nr_free_highpages(); |
1765 | val->mem_unit = PAGE_SIZE; | 1765 | val->mem_unit = PAGE_SIZE; |
1766 | } | 1766 | } |
1767 | 1767 | ||
1768 | EXPORT_SYMBOL(si_meminfo); | 1768 | EXPORT_SYMBOL(si_meminfo); |
1769 | 1769 | ||
1770 | #ifdef CONFIG_NUMA | 1770 | #ifdef CONFIG_NUMA |
1771 | void si_meminfo_node(struct sysinfo *val, int nid) | 1771 | void si_meminfo_node(struct sysinfo *val, int nid) |
1772 | { | 1772 | { |
1773 | pg_data_t *pgdat = NODE_DATA(nid); | 1773 | pg_data_t *pgdat = NODE_DATA(nid); |
1774 | 1774 | ||
1775 | val->totalram = pgdat->node_present_pages; | 1775 | val->totalram = pgdat->node_present_pages; |
1776 | val->freeram = node_page_state(nid, NR_FREE_PAGES); | 1776 | val->freeram = node_page_state(nid, NR_FREE_PAGES); |
1777 | #ifdef CONFIG_HIGHMEM | 1777 | #ifdef CONFIG_HIGHMEM |
1778 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; | 1778 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; |
1779 | val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], | 1779 | val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], |
1780 | NR_FREE_PAGES); | 1780 | NR_FREE_PAGES); |
1781 | #else | 1781 | #else |
1782 | val->totalhigh = 0; | 1782 | val->totalhigh = 0; |
1783 | val->freehigh = 0; | 1783 | val->freehigh = 0; |
1784 | #endif | 1784 | #endif |
1785 | val->mem_unit = PAGE_SIZE; | 1785 | val->mem_unit = PAGE_SIZE; |
1786 | } | 1786 | } |
1787 | #endif | 1787 | #endif |
1788 | 1788 | ||
1789 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 1789 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
1790 | 1790 | ||
1791 | /* | 1791 | /* |
1792 | * Show free area list (used inside shift_scroll-lock stuff) | 1792 | * Show free area list (used inside shift_scroll-lock stuff) |
1793 | * We also calculate the percentage fragmentation. We do this by counting the | 1793 | * We also calculate the percentage fragmentation. We do this by counting the |
1794 | * memory on each free list with the exception of the first item on the list. | 1794 | * memory on each free list with the exception of the first item on the list. |
1795 | */ | 1795 | */ |
1796 | void show_free_areas(void) | 1796 | void show_free_areas(void) |
1797 | { | 1797 | { |
1798 | int cpu; | 1798 | int cpu; |
1799 | struct zone *zone; | 1799 | struct zone *zone; |
1800 | 1800 | ||
1801 | for_each_zone(zone) { | 1801 | for_each_zone(zone) { |
1802 | if (!populated_zone(zone)) | 1802 | if (!populated_zone(zone)) |
1803 | continue; | 1803 | continue; |
1804 | 1804 | ||
1805 | show_node(zone); | 1805 | show_node(zone); |
1806 | printk("%s per-cpu:\n", zone->name); | 1806 | printk("%s per-cpu:\n", zone->name); |
1807 | 1807 | ||
1808 | for_each_online_cpu(cpu) { | 1808 | for_each_online_cpu(cpu) { |
1809 | struct per_cpu_pageset *pageset; | 1809 | struct per_cpu_pageset *pageset; |
1810 | 1810 | ||
1811 | pageset = zone_pcp(zone, cpu); | 1811 | pageset = zone_pcp(zone, cpu); |
1812 | 1812 | ||
1813 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", | 1813 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", |
1814 | cpu, pageset->pcp.high, | 1814 | cpu, pageset->pcp.high, |
1815 | pageset->pcp.batch, pageset->pcp.count); | 1815 | pageset->pcp.batch, pageset->pcp.count); |
1816 | } | 1816 | } |
1817 | } | 1817 | } |
1818 | 1818 | ||
1819 | printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" | 1819 | printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" |
1820 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", | 1820 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", |
1821 | global_page_state(NR_ACTIVE), | 1821 | global_page_state(NR_ACTIVE), |
1822 | global_page_state(NR_INACTIVE), | 1822 | global_page_state(NR_INACTIVE), |
1823 | global_page_state(NR_FILE_DIRTY), | 1823 | global_page_state(NR_FILE_DIRTY), |
1824 | global_page_state(NR_WRITEBACK), | 1824 | global_page_state(NR_WRITEBACK), |
1825 | global_page_state(NR_UNSTABLE_NFS), | 1825 | global_page_state(NR_UNSTABLE_NFS), |
1826 | global_page_state(NR_FREE_PAGES), | 1826 | global_page_state(NR_FREE_PAGES), |
1827 | global_page_state(NR_SLAB_RECLAIMABLE) + | 1827 | global_page_state(NR_SLAB_RECLAIMABLE) + |
1828 | global_page_state(NR_SLAB_UNRECLAIMABLE), | 1828 | global_page_state(NR_SLAB_UNRECLAIMABLE), |
1829 | global_page_state(NR_FILE_MAPPED), | 1829 | global_page_state(NR_FILE_MAPPED), |
1830 | global_page_state(NR_PAGETABLE), | 1830 | global_page_state(NR_PAGETABLE), |
1831 | global_page_state(NR_BOUNCE)); | 1831 | global_page_state(NR_BOUNCE)); |
1832 | 1832 | ||
1833 | for_each_zone(zone) { | 1833 | for_each_zone(zone) { |
1834 | int i; | 1834 | int i; |
1835 | 1835 | ||
1836 | if (!populated_zone(zone)) | 1836 | if (!populated_zone(zone)) |
1837 | continue; | 1837 | continue; |
1838 | 1838 | ||
1839 | show_node(zone); | 1839 | show_node(zone); |
1840 | printk("%s" | 1840 | printk("%s" |
1841 | " free:%lukB" | 1841 | " free:%lukB" |
1842 | " min:%lukB" | 1842 | " min:%lukB" |
1843 | " low:%lukB" | 1843 | " low:%lukB" |
1844 | " high:%lukB" | 1844 | " high:%lukB" |
1845 | " active:%lukB" | 1845 | " active:%lukB" |
1846 | " inactive:%lukB" | 1846 | " inactive:%lukB" |
1847 | " present:%lukB" | 1847 | " present:%lukB" |
1848 | " pages_scanned:%lu" | 1848 | " pages_scanned:%lu" |
1849 | " all_unreclaimable? %s" | 1849 | " all_unreclaimable? %s" |
1850 | "\n", | 1850 | "\n", |
1851 | zone->name, | 1851 | zone->name, |
1852 | K(zone_page_state(zone, NR_FREE_PAGES)), | 1852 | K(zone_page_state(zone, NR_FREE_PAGES)), |
1853 | K(zone->pages_min), | 1853 | K(zone->pages_min), |
1854 | K(zone->pages_low), | 1854 | K(zone->pages_low), |
1855 | K(zone->pages_high), | 1855 | K(zone->pages_high), |
1856 | K(zone_page_state(zone, NR_ACTIVE)), | 1856 | K(zone_page_state(zone, NR_ACTIVE)), |
1857 | K(zone_page_state(zone, NR_INACTIVE)), | 1857 | K(zone_page_state(zone, NR_INACTIVE)), |
1858 | K(zone->present_pages), | 1858 | K(zone->present_pages), |
1859 | zone->pages_scanned, | 1859 | zone->pages_scanned, |
1860 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") | 1860 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") |
1861 | ); | 1861 | ); |
1862 | printk("lowmem_reserve[]:"); | 1862 | printk("lowmem_reserve[]:"); |
1863 | for (i = 0; i < MAX_NR_ZONES; i++) | 1863 | for (i = 0; i < MAX_NR_ZONES; i++) |
1864 | printk(" %lu", zone->lowmem_reserve[i]); | 1864 | printk(" %lu", zone->lowmem_reserve[i]); |
1865 | printk("\n"); | 1865 | printk("\n"); |
1866 | } | 1866 | } |
1867 | 1867 | ||
1868 | for_each_zone(zone) { | 1868 | for_each_zone(zone) { |
1869 | unsigned long nr[MAX_ORDER], flags, order, total = 0; | 1869 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
1870 | 1870 | ||
1871 | if (!populated_zone(zone)) | 1871 | if (!populated_zone(zone)) |
1872 | continue; | 1872 | continue; |
1873 | 1873 | ||
1874 | show_node(zone); | 1874 | show_node(zone); |
1875 | printk("%s: ", zone->name); | 1875 | printk("%s: ", zone->name); |
1876 | 1876 | ||
1877 | spin_lock_irqsave(&zone->lock, flags); | 1877 | spin_lock_irqsave(&zone->lock, flags); |
1878 | for (order = 0; order < MAX_ORDER; order++) { | 1878 | for (order = 0; order < MAX_ORDER; order++) { |
1879 | nr[order] = zone->free_area[order].nr_free; | 1879 | nr[order] = zone->free_area[order].nr_free; |
1880 | total += nr[order] << order; | 1880 | total += nr[order] << order; |
1881 | } | 1881 | } |
1882 | spin_unlock_irqrestore(&zone->lock, flags); | 1882 | spin_unlock_irqrestore(&zone->lock, flags); |
1883 | for (order = 0; order < MAX_ORDER; order++) | 1883 | for (order = 0; order < MAX_ORDER; order++) |
1884 | printk("%lu*%lukB ", nr[order], K(1UL) << order); | 1884 | printk("%lu*%lukB ", nr[order], K(1UL) << order); |
1885 | printk("= %lukB\n", K(total)); | 1885 | printk("= %lukB\n", K(total)); |
1886 | } | 1886 | } |
1887 | 1887 | ||
1888 | printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); | 1888 | printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); |
1889 | 1889 | ||
1890 | show_swap_cache_info(); | 1890 | show_swap_cache_info(); |
1891 | } | 1891 | } |
1892 | 1892 | ||
1893 | static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) | 1893 | static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) |
1894 | { | 1894 | { |
1895 | zoneref->zone = zone; | 1895 | zoneref->zone = zone; |
1896 | zoneref->zone_idx = zone_idx(zone); | 1896 | zoneref->zone_idx = zone_idx(zone); |
1897 | } | 1897 | } |
1898 | 1898 | ||
1899 | /* | 1899 | /* |
1900 | * Builds allocation fallback zone lists. | 1900 | * Builds allocation fallback zone lists. |
1901 | * | 1901 | * |
1902 | * Add all populated zones of a node to the zonelist. | 1902 | * Add all populated zones of a node to the zonelist. |
1903 | */ | 1903 | */ |
1904 | static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, | 1904 | static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, |
1905 | int nr_zones, enum zone_type zone_type) | 1905 | int nr_zones, enum zone_type zone_type) |
1906 | { | 1906 | { |
1907 | struct zone *zone; | 1907 | struct zone *zone; |
1908 | 1908 | ||
1909 | BUG_ON(zone_type >= MAX_NR_ZONES); | 1909 | BUG_ON(zone_type >= MAX_NR_ZONES); |
1910 | zone_type++; | 1910 | zone_type++; |
1911 | 1911 | ||
1912 | do { | 1912 | do { |
1913 | zone_type--; | 1913 | zone_type--; |
1914 | zone = pgdat->node_zones + zone_type; | 1914 | zone = pgdat->node_zones + zone_type; |
1915 | if (populated_zone(zone)) { | 1915 | if (populated_zone(zone)) { |
1916 | zoneref_set_zone(zone, | 1916 | zoneref_set_zone(zone, |
1917 | &zonelist->_zonerefs[nr_zones++]); | 1917 | &zonelist->_zonerefs[nr_zones++]); |
1918 | check_highest_zone(zone_type); | 1918 | check_highest_zone(zone_type); |
1919 | } | 1919 | } |
1920 | 1920 | ||
1921 | } while (zone_type); | 1921 | } while (zone_type); |
1922 | return nr_zones; | 1922 | return nr_zones; |
1923 | } | 1923 | } |
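build_zonelists_node() appends a node's populated zones from the highest allowed zone type down to ZONE_DMA. A compact sketch of that ordering, with plain arrays standing in for pg_data_t and struct zoneref (all names and sizes are illustrative):

#include <stdio.h>

#define SKETCH_MAX_ZONES 4	/* e.g. DMA, DMA32, NORMAL, HIGHMEM */

/* Append the indices of populated zones, highest type first, starting at
 * slot nr; return the new fill count.  Mirrors the do/while loop that
 * decrements zone_type in build_zonelists_node(). */
static int append_node_zones(const unsigned long present[SKETCH_MAX_ZONES],
			     int zonelist[], int nr, int highest_type)
{
	int type = highest_type + 1;

	do {
		type--;
		if (present[type])
			zonelist[nr++] = type;
	} while (type);

	return nr;
}

int main(void)
{
	unsigned long present[SKETCH_MAX_ZONES] = { 4096, 0, 262144, 32768 };
	int zonelist[SKETCH_MAX_ZONES];
	int i, n = append_node_zones(present, zonelist, 0, SKETCH_MAX_ZONES - 1);

	for (i = 0; i < n; i++)
		printf("%d ", zonelist[i]);	/* prints "3 2 0 " */
	printf("\n");
	return 0;
}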
1924 | 1924 | ||
1925 | 1925 | ||
1926 | /* | 1926 | /* |
1927 | * zonelist_order: | 1927 | * zonelist_order: |
1928 | * 0 = automatic detection of better ordering. | 1928 | * 0 = automatic detection of better ordering. |
1929 | * 1 = order by ([node] distance, -zonetype) | 1929 | * 1 = order by ([node] distance, -zonetype) |
1930 | * 2 = order by (-zonetype, [node] distance) | 1930 | * 2 = order by (-zonetype, [node] distance) |
1931 | * | 1931 | * |
1932 | * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create | 1932 | * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create |
1933 | * the same zonelist. So only NUMA can configure this param. | 1933 | * the same zonelist. So only NUMA can configure this param. |
1934 | */ | 1934 | */ |
1935 | #define ZONELIST_ORDER_DEFAULT 0 | 1935 | #define ZONELIST_ORDER_DEFAULT 0 |
1936 | #define ZONELIST_ORDER_NODE 1 | 1936 | #define ZONELIST_ORDER_NODE 1 |
1937 | #define ZONELIST_ORDER_ZONE 2 | 1937 | #define ZONELIST_ORDER_ZONE 2 |
1938 | 1938 | ||
1939 | /* zonelist order in the kernel. | 1939 | /* zonelist order in the kernel. |
1940 | * set_zonelist_order() will set this to NODE or ZONE. | 1940 | * set_zonelist_order() will set this to NODE or ZONE. |
1941 | */ | 1941 | */ |
1942 | static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; | 1942 | static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; |
1943 | static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; | 1943 | static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; |
1944 | 1944 | ||
1945 | 1945 | ||
1946 | #ifdef CONFIG_NUMA | 1946 | #ifdef CONFIG_NUMA |
1947 | /* The value the user specified, possibly changed by config */ | 1947 | /* The value the user specified, possibly changed by config */ |
1948 | static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; | 1948 | static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; |
1949 | /* string for sysctl */ | 1949 | /* string for sysctl */ |
1950 | #define NUMA_ZONELIST_ORDER_LEN 16 | 1950 | #define NUMA_ZONELIST_ORDER_LEN 16 |
1951 | char numa_zonelist_order[16] = "default"; | 1951 | char numa_zonelist_order[16] = "default"; |
1952 | 1952 | ||
1953 | /* | 1953 | /* |
1954 | * interface for configuring zonelist ordering. | 1954 | * interface for configuring zonelist ordering. |
1955 | * command line option "numa_zonelist_order" | 1955 | * command line option "numa_zonelist_order" |
1956 | * = "[dD]efault" - default, automatic configuration. | 1956 | * = "[dD]efault" - default, automatic configuration. |
1957 | * = "[nN]ode" - order by node locality, then by zone within node | 1957 | * = "[nN]ode" - order by node locality, then by zone within node |
1958 | * = "[zZ]one" - order by zone, then by locality within zone | 1958 | * = "[zZ]one" - order by zone, then by locality within zone |
1959 | */ | 1959 | */ |
1960 | 1960 | ||
1961 | static int __parse_numa_zonelist_order(char *s) | 1961 | static int __parse_numa_zonelist_order(char *s) |
1962 | { | 1962 | { |
1963 | if (*s == 'd' || *s == 'D') { | 1963 | if (*s == 'd' || *s == 'D') { |
1964 | user_zonelist_order = ZONELIST_ORDER_DEFAULT; | 1964 | user_zonelist_order = ZONELIST_ORDER_DEFAULT; |
1965 | } else if (*s == 'n' || *s == 'N') { | 1965 | } else if (*s == 'n' || *s == 'N') { |
1966 | user_zonelist_order = ZONELIST_ORDER_NODE; | 1966 | user_zonelist_order = ZONELIST_ORDER_NODE; |
1967 | } else if (*s == 'z' || *s == 'Z') { | 1967 | } else if (*s == 'z' || *s == 'Z') { |
1968 | user_zonelist_order = ZONELIST_ORDER_ZONE; | 1968 | user_zonelist_order = ZONELIST_ORDER_ZONE; |
1969 | } else { | 1969 | } else { |
1970 | printk(KERN_WARNING | 1970 | printk(KERN_WARNING |
1971 | "Ignoring invalid numa_zonelist_order value: " | 1971 | "Ignoring invalid numa_zonelist_order value: " |
1972 | "%s\n", s); | 1972 | "%s\n", s); |
1973 | return -EINVAL; | 1973 | return -EINVAL; |
1974 | } | 1974 | } |
1975 | return 0; | 1975 | return 0; |
1976 | } | 1976 | } |
1977 | 1977 | ||
1978 | static __init int setup_numa_zonelist_order(char *s) | 1978 | static __init int setup_numa_zonelist_order(char *s) |
1979 | { | 1979 | { |
1980 | if (s) | 1980 | if (s) |
1981 | return __parse_numa_zonelist_order(s); | 1981 | return __parse_numa_zonelist_order(s); |
1982 | return 0; | 1982 | return 0; |
1983 | } | 1983 | } |
1984 | early_param("numa_zonelist_order", setup_numa_zonelist_order); | 1984 | early_param("numa_zonelist_order", setup_numa_zonelist_order); |
1985 | 1985 | ||
1986 | /* | 1986 | /* |
1987 | * sysctl handler for numa_zonelist_order | 1987 | * sysctl handler for numa_zonelist_order |
1988 | */ | 1988 | */ |
1989 | int numa_zonelist_order_handler(ctl_table *table, int write, | 1989 | int numa_zonelist_order_handler(ctl_table *table, int write, |
1990 | struct file *file, void __user *buffer, size_t *length, | 1990 | struct file *file, void __user *buffer, size_t *length, |
1991 | loff_t *ppos) | 1991 | loff_t *ppos) |
1992 | { | 1992 | { |
1993 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; | 1993 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; |
1994 | int ret; | 1994 | int ret; |
1995 | 1995 | ||
1996 | if (write) | 1996 | if (write) |
1997 | strncpy(saved_string, (char*)table->data, | 1997 | strncpy(saved_string, (char*)table->data, |
1998 | NUMA_ZONELIST_ORDER_LEN); | 1998 | NUMA_ZONELIST_ORDER_LEN); |
1999 | ret = proc_dostring(table, write, file, buffer, length, ppos); | 1999 | ret = proc_dostring(table, write, file, buffer, length, ppos); |
2000 | if (ret) | 2000 | if (ret) |
2001 | return ret; | 2001 | return ret; |
2002 | if (write) { | 2002 | if (write) { |
2003 | int oldval = user_zonelist_order; | 2003 | int oldval = user_zonelist_order; |
2004 | if (__parse_numa_zonelist_order((char*)table->data)) { | 2004 | if (__parse_numa_zonelist_order((char*)table->data)) { |
2005 | /* | 2005 | /* |
2006 | * bogus value. restore saved string | 2006 | * bogus value. restore saved string |
2007 | */ | 2007 | */ |
2008 | strncpy((char*)table->data, saved_string, | 2008 | strncpy((char*)table->data, saved_string, |
2009 | NUMA_ZONELIST_ORDER_LEN); | 2009 | NUMA_ZONELIST_ORDER_LEN); |
2010 | user_zonelist_order = oldval; | 2010 | user_zonelist_order = oldval; |
2011 | } else if (oldval != user_zonelist_order) | 2011 | } else if (oldval != user_zonelist_order) |
2012 | build_all_zonelists(); | 2012 | build_all_zonelists(); |
2013 | } | 2013 | } |
2014 | return 0; | 2014 | return 0; |
2015 | } | 2015 | } |
2016 | 2016 | ||
2017 | 2017 | ||
2018 | #define MAX_NODE_LOAD (num_online_nodes()) | 2018 | #define MAX_NODE_LOAD (num_online_nodes()) |
2019 | static int node_load[MAX_NUMNODES]; | 2019 | static int node_load[MAX_NUMNODES]; |
2020 | 2020 | ||
2021 | /** | 2021 | /** |
2022 | * find_next_best_node - find the next node that should appear in a given node's fallback list | 2022 | * find_next_best_node - find the next node that should appear in a given node's fallback list |
2023 | * @node: node whose fallback list we're appending | 2023 | * @node: node whose fallback list we're appending |
2024 | * @used_node_mask: nodemask_t of already used nodes | 2024 | * @used_node_mask: nodemask_t of already used nodes |
2025 | * | 2025 | * |
2026 | * We use a number of factors to determine which is the next node that should | 2026 | * We use a number of factors to determine which is the next node that should |
2027 | * appear on a given node's fallback list. The node should not have appeared | 2027 | * appear on a given node's fallback list. The node should not have appeared |
2028 | * already in @node's fallback list, and it should be the next closest node | 2028 | * already in @node's fallback list, and it should be the next closest node |
2029 | * according to the distance array (which contains arbitrary distance values | 2029 | * according to the distance array (which contains arbitrary distance values |
2030 | * from each node to each node in the system), and should also prefer nodes | 2030 | * from each node to each node in the system), and should also prefer nodes |
2031 | * with no CPUs, since presumably they'll have very little allocation pressure | 2031 | * with no CPUs, since presumably they'll have very little allocation pressure |
2032 | * on them otherwise. | 2032 | * on them otherwise. |
2033 | * It returns -1 if no node is found. | 2033 | * It returns -1 if no node is found. |
2034 | */ | 2034 | */ |
2035 | static int find_next_best_node(int node, nodemask_t *used_node_mask) | 2035 | static int find_next_best_node(int node, nodemask_t *used_node_mask) |
2036 | { | 2036 | { |
2037 | int n, val; | 2037 | int n, val; |
2038 | int min_val = INT_MAX; | 2038 | int min_val = INT_MAX; |
2039 | int best_node = -1; | 2039 | int best_node = -1; |
2040 | node_to_cpumask_ptr(tmp, 0); | 2040 | node_to_cpumask_ptr(tmp, 0); |
2041 | 2041 | ||
2042 | /* Use the local node if we haven't already */ | 2042 | /* Use the local node if we haven't already */ |
2043 | if (!node_isset(node, *used_node_mask)) { | 2043 | if (!node_isset(node, *used_node_mask)) { |
2044 | node_set(node, *used_node_mask); | 2044 | node_set(node, *used_node_mask); |
2045 | return node; | 2045 | return node; |
2046 | } | 2046 | } |
2047 | 2047 | ||
2048 | for_each_node_state(n, N_HIGH_MEMORY) { | 2048 | for_each_node_state(n, N_HIGH_MEMORY) { |
2049 | 2049 | ||
2050 | /* Don't want a node to appear more than once */ | 2050 | /* Don't want a node to appear more than once */ |
2051 | if (node_isset(n, *used_node_mask)) | 2051 | if (node_isset(n, *used_node_mask)) |
2052 | continue; | 2052 | continue; |
2053 | 2053 | ||
2054 | /* Use the distance array to find the distance */ | 2054 | /* Use the distance array to find the distance */ |
2055 | val = node_distance(node, n); | 2055 | val = node_distance(node, n); |
2056 | 2056 | ||
2057 | /* Penalize nodes under us ("prefer the next node") */ | 2057 | /* Penalize nodes under us ("prefer the next node") */ |
2058 | val += (n < node); | 2058 | val += (n < node); |
2059 | 2059 | ||
2060 | /* Give preference to headless and unused nodes */ | 2060 | /* Give preference to headless and unused nodes */ |
2061 | node_to_cpumask_ptr_next(tmp, n); | 2061 | node_to_cpumask_ptr_next(tmp, n); |
2062 | if (!cpus_empty(*tmp)) | 2062 | if (!cpus_empty(*tmp)) |
2063 | val += PENALTY_FOR_NODE_WITH_CPUS; | 2063 | val += PENALTY_FOR_NODE_WITH_CPUS; |
2064 | 2064 | ||
2065 | /* Slight preference for less loaded node */ | 2065 | /* Slight preference for less loaded node */ |
2066 | val *= (MAX_NODE_LOAD*MAX_NUMNODES); | 2066 | val *= (MAX_NODE_LOAD*MAX_NUMNODES); |
2067 | val += node_load[n]; | 2067 | val += node_load[n]; |
2068 | 2068 | ||
2069 | if (val < min_val) { | 2069 | if (val < min_val) { |
2070 | min_val = val; | 2070 | min_val = val; |
2071 | best_node = n; | 2071 | best_node = n; |
2072 | } | 2072 | } |
2073 | } | 2073 | } |
2074 | 2074 | ||
2075 | if (best_node >= 0) | 2075 | if (best_node >= 0) |
2076 | node_set(best_node, *used_node_mask); | 2076 | node_set(best_node, *used_node_mask); |
2077 | 2077 | ||
2078 | return best_node; | 2078 | return best_node; |
2079 | } | 2079 | } |
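The candidate score in find_next_best_node() folds together the node distance, a "prefer the next node" bias, a penalty for nodes that have CPUs, and the current node load, with the smallest score winning. A sketch of just that scoring arithmetic; the two constants are invented stand-ins for PENALTY_FOR_NODE_WITH_CPUS and the MAX_NODE_LOAD * MAX_NUMNODES multiplier:

#include <stdio.h>

#define SKETCH_CPU_PENALTY	1
#define SKETCH_LOAD_SCALE	(8 * 64)	/* stand-in for MAX_NODE_LOAD * MAX_NUMNODES */

/* Lower score wins: distance dominates, the load term only breaks ties
 * within one distance/penalty tier. */
static int node_score(int distance, int candidate_below_us,
		      int candidate_has_cpus, int candidate_load)
{
	int val = distance;

	val += candidate_below_us ? 1 : 0;	/* "prefer the next node" */
	if (candidate_has_cpus)
		val += SKETCH_CPU_PENALTY;	/* prefer headless nodes */
	val *= SKETCH_LOAD_SCALE;		/* keep load subordinate to distance */
	val += candidate_load;			/* round-robin within a tier */

	return val;
}

int main(void)
{
	/* A near node with CPUs versus a farther headless, unloaded node. */
	printf("near+cpus: %d, far+headless: %d\n",
	       node_score(10, 0, 1, 3), node_score(20, 1, 0, 0));
	return 0;
}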
2080 | 2080 | ||
2081 | 2081 | ||
2082 | /* | 2082 | /* |
2083 | * Build zonelists ordered by node and zones within node. | 2083 | * Build zonelists ordered by node and zones within node. |
2084 | * This results in maximum locality--normal zone overflows into local | 2084 | * This results in maximum locality--normal zone overflows into local |
2085 | * DMA zone, if any--but risks exhausting DMA zone. | 2085 | * DMA zone, if any--but risks exhausting DMA zone. |
2086 | */ | 2086 | */ |
2087 | static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | 2087 | static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) |
2088 | { | 2088 | { |
2089 | int j; | 2089 | int j; |
2090 | struct zonelist *zonelist; | 2090 | struct zonelist *zonelist; |
2091 | 2091 | ||
2092 | zonelist = &pgdat->node_zonelists[0]; | 2092 | zonelist = &pgdat->node_zonelists[0]; |
2093 | for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) | 2093 | for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) |
2094 | ; | 2094 | ; |
2095 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, | 2095 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, |
2096 | MAX_NR_ZONES - 1); | 2096 | MAX_NR_ZONES - 1); |
2097 | zonelist->_zonerefs[j].zone = NULL; | 2097 | zonelist->_zonerefs[j].zone = NULL; |
2098 | zonelist->_zonerefs[j].zone_idx = 0; | 2098 | zonelist->_zonerefs[j].zone_idx = 0; |
2099 | } | 2099 | } |
2100 | 2100 | ||
2101 | /* | 2101 | /* |
2102 | * Build gfp_thisnode zonelists | 2102 | * Build gfp_thisnode zonelists |
2103 | */ | 2103 | */ |
2104 | static void build_thisnode_zonelists(pg_data_t *pgdat) | 2104 | static void build_thisnode_zonelists(pg_data_t *pgdat) |
2105 | { | 2105 | { |
2106 | int j; | 2106 | int j; |
2107 | struct zonelist *zonelist; | 2107 | struct zonelist *zonelist; |
2108 | 2108 | ||
2109 | zonelist = &pgdat->node_zonelists[1]; | 2109 | zonelist = &pgdat->node_zonelists[1]; |
2110 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); | 2110 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); |
2111 | zonelist->_zonerefs[j].zone = NULL; | 2111 | zonelist->_zonerefs[j].zone = NULL; |
2112 | zonelist->_zonerefs[j].zone_idx = 0; | 2112 | zonelist->_zonerefs[j].zone_idx = 0; |
2113 | } | 2113 | } |
2114 | 2114 | ||
2115 | /* | 2115 | /* |
2116 | * Build zonelists ordered by zone and nodes within zones. | 2116 | * Build zonelists ordered by zone and nodes within zones. |
2117 | * This results in conserving DMA zone[s] until all Normal memory is | 2117 | * This results in conserving DMA zone[s] until all Normal memory is |
2118 | * exhausted, but results in overflowing to remote node while memory | 2118 | * exhausted, but results in overflowing to remote node while memory |
2119 | * may still exist in local DMA zone. | 2119 | * may still exist in local DMA zone. |
2120 | */ | 2120 | */ |
2121 | static int node_order[MAX_NUMNODES]; | 2121 | static int node_order[MAX_NUMNODES]; |
2122 | 2122 | ||
2123 | static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) | 2123 | static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) |
2124 | { | 2124 | { |
2125 | int pos, j, node; | 2125 | int pos, j, node; |
2126 | int zone_type; /* needs to be signed */ | 2126 | int zone_type; /* needs to be signed */ |
2127 | struct zone *z; | 2127 | struct zone *z; |
2128 | struct zonelist *zonelist; | 2128 | struct zonelist *zonelist; |
2129 | 2129 | ||
2130 | zonelist = &pgdat->node_zonelists[0]; | 2130 | zonelist = &pgdat->node_zonelists[0]; |
2131 | pos = 0; | 2131 | pos = 0; |
2132 | for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { | 2132 | for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { |
2133 | for (j = 0; j < nr_nodes; j++) { | 2133 | for (j = 0; j < nr_nodes; j++) { |
2134 | node = node_order[j]; | 2134 | node = node_order[j]; |
2135 | z = &NODE_DATA(node)->node_zones[zone_type]; | 2135 | z = &NODE_DATA(node)->node_zones[zone_type]; |
2136 | if (populated_zone(z)) { | 2136 | if (populated_zone(z)) { |
2137 | zoneref_set_zone(z, | 2137 | zoneref_set_zone(z, |
2138 | &zonelist->_zonerefs[pos++]); | 2138 | &zonelist->_zonerefs[pos++]); |
2139 | check_highest_zone(zone_type); | 2139 | check_highest_zone(zone_type); |
2140 | } | 2140 | } |
2141 | } | 2141 | } |
2142 | } | 2142 | } |
2143 | zonelist->_zonerefs[pos].zone = NULL; | 2143 | zonelist->_zonerefs[pos].zone = NULL; |
2144 | zonelist->_zonerefs[pos].zone_idx = 0; | 2144 | zonelist->_zonerefs[pos].zone_idx = 0; |
2145 | } | 2145 | } |
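Zone ordering inverts the nesting used by node ordering: the zone type is the outer loop and the remembered node order the inner one, so every node's Normal zone precedes any node's DMA zone in the fallback list. A sketch of that nesting with integer IDs in place of struct zone (illustrative only):

#include <stdio.h>

#define SKETCH_MAX_ZONES 3	/* e.g. DMA, NORMAL, HIGHMEM */

/* present[node][type] != 0 means that zone exists on that node.  Walk
 * zone types from highest to lowest, and within each type walk the
 * nodes in the precomputed node_order[]. */
static void print_zone_order(const int node_order[], int nr_nodes,
			     const unsigned long present[][SKETCH_MAX_ZONES])
{
	int type, j;

	for (type = SKETCH_MAX_ZONES - 1; type >= 0; type--)
		for (j = 0; j < nr_nodes; j++)
			if (present[node_order[j]][type])
				printf("node %d zone %d\n", node_order[j], type);
}

int main(void)
{
	int order[] = { 0, 1 };
	unsigned long present[2][SKETCH_MAX_ZONES] = {
		{ 4096, 262144, 0 },	/* node 0: DMA + Normal */
		{ 0, 524288, 0 },	/* node 1: Normal only   */
	};

	print_zone_order(order, 2, present);
	return 0;
}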
2146 | 2146 | ||
2147 | static int default_zonelist_order(void) | 2147 | static int default_zonelist_order(void) |
2148 | { | 2148 | { |
2149 | int nid, zone_type; | 2149 | int nid, zone_type; |
2150 | unsigned long low_kmem_size,total_size; | 2150 | unsigned long low_kmem_size,total_size; |
2151 | struct zone *z; | 2151 | struct zone *z; |
2152 | int average_size; | 2152 | int average_size; |
2153 | /* | 2153 | /* |
2154 | * ZONE_DMA and ZONE_DMA32 can be very small areas in the system. | 2154 | * ZONE_DMA and ZONE_DMA32 can be very small areas in the system. |
2155 | * If they are really small and used heavily, the system can fall | 2155 | * If they are really small and used heavily, the system can fall |
2156 | * into OOM very easily. | 2156 | * into OOM very easily. |
2157 | * This function detects the ZONE_DMA/DMA32 size and configures the zone order. | 2157 | * This function detects the ZONE_DMA/DMA32 size and configures the zone order. |
2158 | */ | 2158 | */ |
2159 | /* Is there ZONE_NORMAL ? (e.g. ppc has only a DMA zone..) */ | 2159 | /* Is there ZONE_NORMAL ? (e.g. ppc has only a DMA zone..) */ |
2160 | low_kmem_size = 0; | 2160 | low_kmem_size = 0; |
2161 | total_size = 0; | 2161 | total_size = 0; |
2162 | for_each_online_node(nid) { | 2162 | for_each_online_node(nid) { |
2163 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { | 2163 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { |
2164 | z = &NODE_DATA(nid)->node_zones[zone_type]; | 2164 | z = &NODE_DATA(nid)->node_zones[zone_type]; |
2165 | if (populated_zone(z)) { | 2165 | if (populated_zone(z)) { |
2166 | if (zone_type < ZONE_NORMAL) | 2166 | if (zone_type < ZONE_NORMAL) |
2167 | low_kmem_size += z->present_pages; | 2167 | low_kmem_size += z->present_pages; |
2168 | total_size += z->present_pages; | 2168 | total_size += z->present_pages; |
2169 | } | 2169 | } |
2170 | } | 2170 | } |
2171 | } | 2171 | } |
2172 | if (!low_kmem_size || /* there is no DMA area. */ | 2172 | if (!low_kmem_size || /* there is no DMA area. */ |
2173 | low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ | 2173 | low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ |
2174 | return ZONELIST_ORDER_NODE; | 2174 | return ZONELIST_ORDER_NODE; |
2175 | /* | 2175 | /* |
2176 | * look into each node's config. | 2176 | * look into each node's config. |
2177 | * If there is a node whose DMA/DMA32 memory is a very large share of | 2177 | * If there is a node whose DMA/DMA32 memory is a very large share of |
2178 | * its local memory, NODE_ORDER may be suitable. | 2178 | * its local memory, NODE_ORDER may be suitable. |
2179 | */ | 2179 | */ |
2180 | average_size = total_size / | 2180 | average_size = total_size / |
2181 | (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); | 2181 | (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); |
2182 | for_each_online_node(nid) { | 2182 | for_each_online_node(nid) { |
2183 | low_kmem_size = 0; | 2183 | low_kmem_size = 0; |
2184 | total_size = 0; | 2184 | total_size = 0; |
2185 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { | 2185 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { |
2186 | z = &NODE_DATA(nid)->node_zones[zone_type]; | 2186 | z = &NODE_DATA(nid)->node_zones[zone_type]; |
2187 | if (populated_zone(z)) { | 2187 | if (populated_zone(z)) { |
2188 | if (zone_type < ZONE_NORMAL) | 2188 | if (zone_type < ZONE_NORMAL) |
2189 | low_kmem_size += z->present_pages; | 2189 | low_kmem_size += z->present_pages; |
2190 | total_size += z->present_pages; | 2190 | total_size += z->present_pages; |
2191 | } | 2191 | } |
2192 | } | 2192 | } |
2193 | if (low_kmem_size && | 2193 | if (low_kmem_size && |
2194 | total_size > average_size && /* ignore small node */ | 2194 | total_size > average_size && /* ignore small node */ |
2195 | low_kmem_size > total_size * 70/100) | 2195 | low_kmem_size > total_size * 70/100) |
2196 | return ZONELIST_ORDER_NODE; | 2196 | return ZONELIST_ORDER_NODE; |
2197 | } | 2197 | } |
2198 | return ZONELIST_ORDER_ZONE; | 2198 | return ZONELIST_ORDER_ZONE; |
2199 | } | 2199 | } |
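default_zonelist_order() reduces to two ratios: globally, whether DMA/DMA32 is absent or already more than half of all memory, and per node, whether a larger-than-average node is more than 70% DMA. A condensed sketch of that decision for a single node, assuming the same 50% and 70% thresholds (the kernel loops over every online node instead):

#include <stdio.h>

enum sketch_order { SKETCH_ORDER_NODE, SKETCH_ORDER_ZONE };

/* Globally: no DMA zone, or DMA being most of memory, means there is
 * nothing worth conserving, so order by node.  Per node: a big node
 * that is >70% DMA also favours node order; otherwise conserve DMA
 * with zone order. */
static enum sketch_order pick_order(unsigned long dma_pages,
				    unsigned long total_pages,
				    unsigned long node_dma,
				    unsigned long node_total,
				    unsigned long average_node_size)
{
	if (!dma_pages || dma_pages > total_pages / 2)
		return SKETCH_ORDER_NODE;

	if (node_dma && node_total > average_node_size &&
	    node_dma > node_total * 70 / 100)
		return SKETCH_ORDER_NODE;

	return SKETCH_ORDER_ZONE;
}

int main(void)
{
	enum sketch_order o = pick_order(4096, 4UL << 20, 4096, 2UL << 20, 1UL << 20);

	printf("%s\n", o == SKETCH_ORDER_ZONE ? "Zone" : "Node");
	return 0;
}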
2200 | 2200 | ||
2201 | static void set_zonelist_order(void) | 2201 | static void set_zonelist_order(void) |
2202 | { | 2202 | { |
2203 | if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) | 2203 | if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) |
2204 | current_zonelist_order = default_zonelist_order(); | 2204 | current_zonelist_order = default_zonelist_order(); |
2205 | else | 2205 | else |
2206 | current_zonelist_order = user_zonelist_order; | 2206 | current_zonelist_order = user_zonelist_order; |
2207 | } | 2207 | } |
2208 | 2208 | ||
2209 | static void build_zonelists(pg_data_t *pgdat) | 2209 | static void build_zonelists(pg_data_t *pgdat) |
2210 | { | 2210 | { |
2211 | int j, node, load; | 2211 | int j, node, load; |
2212 | enum zone_type i; | 2212 | enum zone_type i; |
2213 | nodemask_t used_mask; | 2213 | nodemask_t used_mask; |
2214 | int local_node, prev_node; | 2214 | int local_node, prev_node; |
2215 | struct zonelist *zonelist; | 2215 | struct zonelist *zonelist; |
2216 | int order = current_zonelist_order; | 2216 | int order = current_zonelist_order; |
2217 | 2217 | ||
2218 | /* initialize zonelists */ | 2218 | /* initialize zonelists */ |
2219 | for (i = 0; i < MAX_ZONELISTS; i++) { | 2219 | for (i = 0; i < MAX_ZONELISTS; i++) { |
2220 | zonelist = pgdat->node_zonelists + i; | 2220 | zonelist = pgdat->node_zonelists + i; |
2221 | zonelist->_zonerefs[0].zone = NULL; | 2221 | zonelist->_zonerefs[0].zone = NULL; |
2222 | zonelist->_zonerefs[0].zone_idx = 0; | 2222 | zonelist->_zonerefs[0].zone_idx = 0; |
2223 | } | 2223 | } |
2224 | 2224 | ||
2225 | /* NUMA-aware ordering of nodes */ | 2225 | /* NUMA-aware ordering of nodes */ |
2226 | local_node = pgdat->node_id; | 2226 | local_node = pgdat->node_id; |
2227 | load = num_online_nodes(); | 2227 | load = num_online_nodes(); |
2228 | prev_node = local_node; | 2228 | prev_node = local_node; |
2229 | nodes_clear(used_mask); | 2229 | nodes_clear(used_mask); |
2230 | 2230 | ||
2231 | memset(node_load, 0, sizeof(node_load)); | 2231 | memset(node_load, 0, sizeof(node_load)); |
2232 | memset(node_order, 0, sizeof(node_order)); | 2232 | memset(node_order, 0, sizeof(node_order)); |
2233 | j = 0; | 2233 | j = 0; |
2234 | 2234 | ||
2235 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { | 2235 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { |
2236 | int distance = node_distance(local_node, node); | 2236 | int distance = node_distance(local_node, node); |
2237 | 2237 | ||
2238 | /* | 2238 | /* |
2239 | * If another node is sufficiently far away then it is better | 2239 | * If another node is sufficiently far away then it is better |
2240 | * to reclaim pages in a zone before going off node. | 2240 | * to reclaim pages in a zone before going off node. |
2241 | */ | 2241 | */ |
2242 | if (distance > RECLAIM_DISTANCE) | 2242 | if (distance > RECLAIM_DISTANCE) |
2243 | zone_reclaim_mode = 1; | 2243 | zone_reclaim_mode = 1; |
2244 | 2244 | ||
2245 | /* | 2245 | /* |
2246 | * We don't want to pressure a particular node. | 2246 | * We don't want to pressure a particular node. |
2247 | * So we add a penalty to the first node in the same | 2247 | * So we add a penalty to the first node in the same |
2248 | * distance group to make it round-robin. | 2248 | * distance group to make it round-robin. |
2249 | */ | 2249 | */ |
2250 | if (distance != node_distance(local_node, prev_node)) | 2250 | if (distance != node_distance(local_node, prev_node)) |
2251 | node_load[node] = load; | 2251 | node_load[node] = load; |
2252 | 2252 | ||
2253 | prev_node = node; | 2253 | prev_node = node; |
2254 | load--; | 2254 | load--; |
2255 | if (order == ZONELIST_ORDER_NODE) | 2255 | if (order == ZONELIST_ORDER_NODE) |
2256 | build_zonelists_in_node_order(pgdat, node); | 2256 | build_zonelists_in_node_order(pgdat, node); |
2257 | else | 2257 | else |
2258 | node_order[j++] = node; /* remember order */ | 2258 | node_order[j++] = node; /* remember order */ |
2259 | } | 2259 | } |
2260 | 2260 | ||
2261 | if (order == ZONELIST_ORDER_ZONE) { | 2261 | if (order == ZONELIST_ORDER_ZONE) { |
2262 | /* calculate node order -- i.e., DMA last! */ | 2262 | /* calculate node order -- i.e., DMA last! */ |
2263 | build_zonelists_in_zone_order(pgdat, j); | 2263 | build_zonelists_in_zone_order(pgdat, j); |
2264 | } | 2264 | } |
2265 | 2265 | ||
2266 | build_thisnode_zonelists(pgdat); | 2266 | build_thisnode_zonelists(pgdat); |
2267 | } | 2267 | } |
2268 | 2268 | ||
2269 | /* Construct the zonelist performance cache - see further mmzone.h */ | 2269 | /* Construct the zonelist performance cache - see further mmzone.h */ |
2270 | static void build_zonelist_cache(pg_data_t *pgdat) | 2270 | static void build_zonelist_cache(pg_data_t *pgdat) |
2271 | { | 2271 | { |
2272 | struct zonelist *zonelist; | 2272 | struct zonelist *zonelist; |
2273 | struct zonelist_cache *zlc; | 2273 | struct zonelist_cache *zlc; |
2274 | struct zoneref *z; | 2274 | struct zoneref *z; |
2275 | 2275 | ||
2276 | zonelist = &pgdat->node_zonelists[0]; | 2276 | zonelist = &pgdat->node_zonelists[0]; |
2277 | zonelist->zlcache_ptr = zlc = &zonelist->zlcache; | 2277 | zonelist->zlcache_ptr = zlc = &zonelist->zlcache; |
2278 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 2278 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
2279 | for (z = zonelist->_zonerefs; z->zone; z++) | 2279 | for (z = zonelist->_zonerefs; z->zone; z++) |
2280 | zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); | 2280 | zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); |
2281 | } | 2281 | } |
2282 | 2282 | ||
2283 | 2283 | ||
2284 | #else /* CONFIG_NUMA */ | 2284 | #else /* CONFIG_NUMA */ |
2285 | 2285 | ||
2286 | static void set_zonelist_order(void) | 2286 | static void set_zonelist_order(void) |
2287 | { | 2287 | { |
2288 | current_zonelist_order = ZONELIST_ORDER_ZONE; | 2288 | current_zonelist_order = ZONELIST_ORDER_ZONE; |
2289 | } | 2289 | } |
2290 | 2290 | ||
2291 | static void build_zonelists(pg_data_t *pgdat) | 2291 | static void build_zonelists(pg_data_t *pgdat) |
2292 | { | 2292 | { |
2293 | int node, local_node; | 2293 | int node, local_node; |
2294 | enum zone_type j; | 2294 | enum zone_type j; |
2295 | struct zonelist *zonelist; | 2295 | struct zonelist *zonelist; |
2296 | 2296 | ||
2297 | local_node = pgdat->node_id; | 2297 | local_node = pgdat->node_id; |
2298 | 2298 | ||
2299 | zonelist = &pgdat->node_zonelists[0]; | 2299 | zonelist = &pgdat->node_zonelists[0]; |
2300 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); | 2300 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); |
2301 | 2301 | ||
2302 | /* | 2302 | /* |
2303 | * Now we build the zonelist so that it contains the zones | 2303 | * Now we build the zonelist so that it contains the zones |
2304 | * of all the other nodes. | 2304 | * of all the other nodes. |
2305 | * We don't want to pressure a particular node, so when | 2305 | * We don't want to pressure a particular node, so when |
2306 | * building the zones for node N, we make sure that the | 2306 | * building the zones for node N, we make sure that the |
2307 | * zones coming right after the local ones are those from | 2307 | * zones coming right after the local ones are those from |
2308 | * node N+1, wrapping around past the last node | 2309 | * node N+1, wrapping around past the last node |
2309 | */ | 2309 | */ |
2310 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { | 2310 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { |
2311 | if (!node_online(node)) | 2311 | if (!node_online(node)) |
2312 | continue; | 2312 | continue; |
2313 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, | 2313 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, |
2314 | MAX_NR_ZONES - 1); | 2314 | MAX_NR_ZONES - 1); |
2315 | } | 2315 | } |
2316 | for (node = 0; node < local_node; node++) { | 2316 | for (node = 0; node < local_node; node++) { |
2317 | if (!node_online(node)) | 2317 | if (!node_online(node)) |
2318 | continue; | 2318 | continue; |
2319 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, | 2319 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, |
2320 | MAX_NR_ZONES - 1); | 2320 | MAX_NR_ZONES - 1); |
2321 | } | 2321 | } |
2322 | 2322 | ||
2323 | zonelist->_zonerefs[j].zone = NULL; | 2323 | zonelist->_zonerefs[j].zone = NULL; |
2324 | zonelist->_zonerefs[j].zone_idx = 0; | 2324 | zonelist->_zonerefs[j].zone_idx = 0; |
2325 | } | 2325 | } |
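
As a quick check of the ordering above, here is a minimal user-space sketch (the node count and local node id are hypothetical, not part of this patch) showing the order in which the two loops visit nodes; the fallback list starts with the local node and then wraps around past the last node:

#include <stdio.h>

#define MAX_NUMNODES	4	/* hypothetical number of nodes */

int main(void)
{
	int local_node = 2;	/* hypothetical local node id */
	int node;

	printf("%d ", local_node);		/* local zones come first */
	for (node = local_node + 1; node < MAX_NUMNODES; node++)
		printf("%d ", node);		/* nodes after the local one */
	for (node = 0; node < local_node; node++)
		printf("%d ", node);		/* wrap around to the lower nodes */
	printf("\n");				/* prints: 2 3 0 1 */
	return 0;
}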
2326 | 2326 | ||
2327 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ | 2327 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ |
2328 | static void build_zonelist_cache(pg_data_t *pgdat) | 2328 | static void build_zonelist_cache(pg_data_t *pgdat) |
2329 | { | 2329 | { |
2330 | pgdat->node_zonelists[0].zlcache_ptr = NULL; | 2330 | pgdat->node_zonelists[0].zlcache_ptr = NULL; |
2331 | } | 2331 | } |
2332 | 2332 | ||
2333 | #endif /* CONFIG_NUMA */ | 2333 | #endif /* CONFIG_NUMA */ |
2334 | 2334 | ||
2335 | /* the int return value is just for stop_machine_run() */ | 2335 | /* the int return value is just for stop_machine_run() */ |
2336 | static int __build_all_zonelists(void *dummy) | 2336 | static int __build_all_zonelists(void *dummy) |
2337 | { | 2337 | { |
2338 | int nid; | 2338 | int nid; |
2339 | 2339 | ||
2340 | for_each_online_node(nid) { | 2340 | for_each_online_node(nid) { |
2341 | pg_data_t *pgdat = NODE_DATA(nid); | 2341 | pg_data_t *pgdat = NODE_DATA(nid); |
2342 | 2342 | ||
2343 | build_zonelists(pgdat); | 2343 | build_zonelists(pgdat); |
2344 | build_zonelist_cache(pgdat); | 2344 | build_zonelist_cache(pgdat); |
2345 | } | 2345 | } |
2346 | return 0; | 2346 | return 0; |
2347 | } | 2347 | } |
2348 | 2348 | ||
2349 | void build_all_zonelists(void) | 2349 | void build_all_zonelists(void) |
2350 | { | 2350 | { |
2351 | set_zonelist_order(); | 2351 | set_zonelist_order(); |
2352 | 2352 | ||
2353 | if (system_state == SYSTEM_BOOTING) { | 2353 | if (system_state == SYSTEM_BOOTING) { |
2354 | __build_all_zonelists(NULL); | 2354 | __build_all_zonelists(NULL); |
2355 | mminit_verify_zonelist(); | ||
2355 | cpuset_init_current_mems_allowed(); | 2356 | cpuset_init_current_mems_allowed(); |
2356 | } else { | 2357 | } else { |
2357 | /* we have to stop all cpus to guarantee there is no user | 2358 | /* we have to stop all cpus to guarantee there is no user |
2358 | of zonelist */ | 2359 | of zonelist */ |
2359 | stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); | 2360 | stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); |
2360 | /* cpuset refresh routine should be here */ | 2361 | /* cpuset refresh routine should be here */ |
2361 | } | 2362 | } |
2362 | vm_total_pages = nr_free_pagecache_pages(); | 2363 | vm_total_pages = nr_free_pagecache_pages(); |
2363 | /* | 2364 | /* |
2364 | * Disable grouping by mobility if the number of pages in the | 2365 | * Disable grouping by mobility if the number of pages in the |
2365 | * system is too low to allow the mechanism to work. It would be | 2366 | * system is too low to allow the mechanism to work. It would be |
2366 | * more accurate, but expensive to check per-zone. This check is | 2367 | * more accurate, but expensive to check per-zone. This check is |
2367 | * made on memory-hotadd so a system can start with mobility | 2368 | * made on memory-hotadd so a system can start with mobility |
2368 | * disabled and enable it later | 2369 | * disabled and enable it later |
2369 | */ | 2370 | */ |
2370 | if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) | 2371 | if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) |
2371 | page_group_by_mobility_disabled = 1; | 2372 | page_group_by_mobility_disabled = 1; |
2372 | else | 2373 | else |
2373 | page_group_by_mobility_disabled = 0; | 2374 | page_group_by_mobility_disabled = 0; |
2374 | 2375 | ||
2375 | printk("Built %i zonelists in %s order, mobility grouping %s. " | 2376 | printk("Built %i zonelists in %s order, mobility grouping %s. " |
2376 | "Total pages: %ld\n", | 2377 | "Total pages: %ld\n", |
2377 | num_online_nodes(), | 2378 | num_online_nodes(), |
2378 | zonelist_order_name[current_zonelist_order], | 2379 | zonelist_order_name[current_zonelist_order], |
2379 | page_group_by_mobility_disabled ? "off" : "on", | 2380 | page_group_by_mobility_disabled ? "off" : "on", |
2380 | vm_total_pages); | 2381 | vm_total_pages); |
2381 | #ifdef CONFIG_NUMA | 2382 | #ifdef CONFIG_NUMA |
2382 | printk("Policy zone: %s\n", zone_names[policy_zone]); | 2383 | printk("Policy zone: %s\n", zone_names[policy_zone]); |
2383 | #endif | 2384 | #endif |
2384 | } | 2385 | } |
2385 | 2386 | ||
2386 | /* | 2387 | /* |
2387 | * Helper functions to size the waitqueue hash table. | 2388 | * Helper functions to size the waitqueue hash table. |
2388 | * Essentially these want to choose hash table sizes sufficiently | 2389 | * Essentially these want to choose hash table sizes sufficiently |
2389 | * large so that collisions trying to wait on pages are rare. | 2390 | * large so that collisions trying to wait on pages are rare. |
2390 | * But in fact, the number of active page waitqueues on typical | 2391 | * But in fact, the number of active page waitqueues on typical |
2391 | * systems is ridiculously low, less than 200. So this is even | 2392 | * systems is ridiculously low, less than 200. So this is even |
2392 | * conservative, even though it seems large. | 2393 | * conservative, even though it seems large. |
2393 | * | 2394 | * |
2394 | * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to | 2395 | * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to |
2395 | * waitqueues, i.e. the size of the waitq table given the number of pages. | 2396 | * waitqueues, i.e. the size of the waitq table given the number of pages. |
2396 | */ | 2397 | */ |
2397 | #define PAGES_PER_WAITQUEUE 256 | 2398 | #define PAGES_PER_WAITQUEUE 256 |
2398 | 2399 | ||
2399 | #ifndef CONFIG_MEMORY_HOTPLUG | 2400 | #ifndef CONFIG_MEMORY_HOTPLUG |
2400 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) | 2401 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) |
2401 | { | 2402 | { |
2402 | unsigned long size = 1; | 2403 | unsigned long size = 1; |
2403 | 2404 | ||
2404 | pages /= PAGES_PER_WAITQUEUE; | 2405 | pages /= PAGES_PER_WAITQUEUE; |
2405 | 2406 | ||
2406 | while (size < pages) | 2407 | while (size < pages) |
2407 | size <<= 1; | 2408 | size <<= 1; |
2408 | 2409 | ||
2409 | /* | 2410 | /* |
2410 | * Once we have dozens or even hundreds of threads sleeping | 2411 | * Once we have dozens or even hundreds of threads sleeping |
2411 | * on IO we've got bigger problems than wait queue collision. | 2412 | * on IO we've got bigger problems than wait queue collision. |
2412 | * Limit the size of the wait table to a reasonable size. | 2413 | * Limit the size of the wait table to a reasonable size. |
2413 | */ | 2414 | */ |
2414 | size = min(size, 4096UL); | 2415 | size = min(size, 4096UL); |
2415 | 2416 | ||
2416 | return max(size, 4UL); | 2417 | return max(size, 4UL); |
2417 | } | 2418 | } |
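
The sizing rule above is easy to verify outside the kernel. A minimal user-space sketch (assuming 4K pages; the sample values are only illustrative) reproduces the pages-to-entries calculation and shows that the table saturates at 4096 entries once a zone reaches (512K + 256) pages:

#include <stdio.h>

#define PAGES_PER_WAITQUEUE	256

/* user-space copy of the sizing rule above, for checking the arithmetic */
static unsigned long wait_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096UL)
		size = 4096UL;
	return size < 4UL ? 4UL : size;
}

int main(void)
{
	printf("%lu\n", wait_entries(32768));			/* 128MB of 4K pages -> 128 */
	printf("%lu\n", wait_entries(512 * 1024 + 256));	/* saturates at 4096 */
	return 0;
}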
2418 | #else | 2419 | #else |
2419 | /* | 2420 | /* |
2420 | * A zone's size might be changed by hot-add, so it is not possible to determine | 2421 | * A zone's size might be changed by hot-add, so it is not possible to determine |
2421 | * a suitable size for its wait_table. So we use the maximum size now. | 2422 | * a suitable size for its wait_table. So we use the maximum size now. |
2422 | * | 2423 | * |
2423 | * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: | 2424 | * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: |
2424 | * | 2425 | * |
2425 | * i386 (preemption config) : 4096 x 16 = 64Kbyte. | 2426 | * i386 (preemption config) : 4096 x 16 = 64Kbyte. |
2426 | * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. | 2427 | * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. |
2427 | * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. | 2428 | * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. |
2428 | * | 2429 | * |
2429 | * The maximum number of entries is used when a zone's memory is (512K + 256) pages | 2430 | * The maximum number of entries is used when a zone's memory is (512K + 256) pages |
2430 | * or more, per the calculation above. It corresponds to: | 2431 | * or more, per the calculation above. It corresponds to: |
2431 | * | 2432 | * |
2432 | * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. | 2433 | * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. |
2433 | * ia64(16K page size) : = ( 8G + 4M)byte. | 2434 | * ia64(16K page size) : = ( 8G + 4M)byte. |
2434 | * powerpc (64K page size) : = (32G +16M)byte. | 2435 | * powerpc (64K page size) : = (32G +16M)byte. |
2435 | */ | 2436 | */ |
2436 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) | 2437 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) |
2437 | { | 2438 | { |
2438 | return 4096UL; | 2439 | return 4096UL; |
2439 | } | 2440 | } |
2440 | #endif | 2441 | #endif |
2441 | 2442 | ||
2442 | /* | 2443 | /* |
2443 | * This is an integer logarithm so that shifts can be used later | 2444 | * This is an integer logarithm so that shifts can be used later |
2444 | * to extract the more random high bits from the multiplicative | 2445 | * to extract the more random high bits from the multiplicative |
2445 | * hash function before the remainder is taken. | 2446 | * hash function before the remainder is taken. |
2446 | */ | 2447 | */ |
2447 | static inline unsigned long wait_table_bits(unsigned long size) | 2448 | static inline unsigned long wait_table_bits(unsigned long size) |
2448 | { | 2449 | { |
2449 | return ffz(~size); | 2450 | return ffz(~size); |
2450 | } | 2451 | } |
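
Since wait_table_hash_nr_entries() always returns a power of two, ffz(~size) is simply the position of the lowest set bit, i.e. log2(size). A small user-space stand-in (the loop below is only an illustration, not the kernel's ffz()) makes that relationship explicit:

#include <stdio.h>

/* index of the lowest set bit; equals log2(size) for a power-of-two size */
static unsigned long table_bits(unsigned long size)
{
	unsigned long bits = 0;

	while (!(size & 1UL)) {
		size >>= 1;
		bits++;
	}
	return bits;
}

int main(void)
{
	printf("%lu\n", table_bits(4096));	/* 12 */
	printf("%lu\n", table_bits(4));		/* 2 */
	return 0;
}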
2451 | 2452 | ||
2452 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 2453 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) |
2453 | 2454 | ||
2454 | /* | 2455 | /* |
2455 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number | 2456 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number |
2456 | * of blocks reserved is based on zone->pages_min. The memory within the | 2457 | * of blocks reserved is based on zone->pages_min. The memory within the |
2457 | * reserve will tend to store contiguous free pages. Setting min_free_kbytes | 2458 | * reserve will tend to store contiguous free pages. Setting min_free_kbytes |
2458 | * higher will lead to a bigger reserve which will get freed as contiguous | 2459 | * higher will lead to a bigger reserve which will get freed as contiguous |
2459 | * blocks as reclaim kicks in | 2460 | * blocks as reclaim kicks in |
2460 | */ | 2461 | */ |
2461 | static void setup_zone_migrate_reserve(struct zone *zone) | 2462 | static void setup_zone_migrate_reserve(struct zone *zone) |
2462 | { | 2463 | { |
2463 | unsigned long start_pfn, pfn, end_pfn; | 2464 | unsigned long start_pfn, pfn, end_pfn; |
2464 | struct page *page; | 2465 | struct page *page; |
2465 | unsigned long reserve, block_migratetype; | 2466 | unsigned long reserve, block_migratetype; |
2466 | 2467 | ||
2467 | /* Get the start pfn, end pfn and the number of blocks to reserve */ | 2468 | /* Get the start pfn, end pfn and the number of blocks to reserve */ |
2468 | start_pfn = zone->zone_start_pfn; | 2469 | start_pfn = zone->zone_start_pfn; |
2469 | end_pfn = start_pfn + zone->spanned_pages; | 2470 | end_pfn = start_pfn + zone->spanned_pages; |
2470 | reserve = roundup(zone->pages_min, pageblock_nr_pages) >> | 2471 | reserve = roundup(zone->pages_min, pageblock_nr_pages) >> |
2471 | pageblock_order; | 2472 | pageblock_order; |
2472 | 2473 | ||
2473 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 2474 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
2474 | if (!pfn_valid(pfn)) | 2475 | if (!pfn_valid(pfn)) |
2475 | continue; | 2476 | continue; |
2476 | page = pfn_to_page(pfn); | 2477 | page = pfn_to_page(pfn); |
2477 | 2478 | ||
2478 | /* Blocks with reserved pages will never be freed, skip them. */ | 2479 | /* Blocks with reserved pages will never be freed, skip them. */ |
2479 | if (PageReserved(page)) | 2480 | if (PageReserved(page)) |
2480 | continue; | 2481 | continue; |
2481 | 2482 | ||
2482 | block_migratetype = get_pageblock_migratetype(page); | 2483 | block_migratetype = get_pageblock_migratetype(page); |
2483 | 2484 | ||
2484 | /* If this block is reserved, account for it */ | 2485 | /* If this block is reserved, account for it */ |
2485 | if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { | 2486 | if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { |
2486 | reserve--; | 2487 | reserve--; |
2487 | continue; | 2488 | continue; |
2488 | } | 2489 | } |
2489 | 2490 | ||
2490 | /* Suitable for reserving if this block is movable */ | 2491 | /* Suitable for reserving if this block is movable */ |
2491 | if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { | 2492 | if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { |
2492 | set_pageblock_migratetype(page, MIGRATE_RESERVE); | 2493 | set_pageblock_migratetype(page, MIGRATE_RESERVE); |
2493 | move_freepages_block(zone, page, MIGRATE_RESERVE); | 2494 | move_freepages_block(zone, page, MIGRATE_RESERVE); |
2494 | reserve--; | 2495 | reserve--; |
2495 | continue; | 2496 | continue; |
2496 | } | 2497 | } |
2497 | 2498 | ||
2498 | /* | 2499 | /* |
2499 | * If the reserve is met and this is a previous reserved block, | 2500 | * If the reserve is met and this is a previous reserved block, |
2500 | * take it back | 2501 | * take it back |
2501 | */ | 2502 | */ |
2502 | if (block_migratetype == MIGRATE_RESERVE) { | 2503 | if (block_migratetype == MIGRATE_RESERVE) { |
2503 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 2504 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
2504 | move_freepages_block(zone, page, MIGRATE_MOVABLE); | 2505 | move_freepages_block(zone, page, MIGRATE_MOVABLE); |
2505 | } | 2506 | } |
2506 | } | 2507 | } |
2507 | } | 2508 | } |
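
The number of reserved pageblocks is the zone's pages_min watermark rounded up to whole pageblocks. A minimal sketch of that arithmetic (the pageblock order and watermark below are hypothetical, chosen only to show the rounding):

#include <stdio.h>

#define PAGEBLOCK_ORDER		10	/* hypothetical: 1024-page blocks */
#define PAGEBLOCK_NR_PAGES	(1UL << PAGEBLOCK_ORDER)

#define ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned long pages_min = 1386;	/* hypothetical zone->pages_min */
	unsigned long reserve;

	/* round the watermark up to whole pageblocks, then count blocks */
	reserve = ROUNDUP(pages_min, PAGEBLOCK_NR_PAGES) >> PAGEBLOCK_ORDER;
	printf("%lu blocks reserved\n", reserve);	/* prints: 2 blocks reserved */
	return 0;
}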
2508 | 2509 | ||
2509 | /* | 2510 | /* |
2510 | * Initially all pages are reserved - free ones are freed | 2511 | * Initially all pages are reserved - free ones are freed |
2511 | * up by free_all_bootmem() once the early boot process is | 2512 | * up by free_all_bootmem() once the early boot process is |
2512 | * done. Non-atomic initialization, single-pass. | 2513 | * done. Non-atomic initialization, single-pass. |
2513 | */ | 2514 | */ |
2514 | void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | 2515 | void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, |
2515 | unsigned long start_pfn, enum memmap_context context) | 2516 | unsigned long start_pfn, enum memmap_context context) |
2516 | { | 2517 | { |
2517 | struct page *page; | 2518 | struct page *page; |
2518 | unsigned long end_pfn = start_pfn + size; | 2519 | unsigned long end_pfn = start_pfn + size; |
2519 | unsigned long pfn; | 2520 | unsigned long pfn; |
2520 | struct zone *z; | 2521 | struct zone *z; |
2521 | 2522 | ||
2522 | z = &NODE_DATA(nid)->node_zones[zone]; | 2523 | z = &NODE_DATA(nid)->node_zones[zone]; |
2523 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 2524 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
2524 | /* | 2525 | /* |
2525 | * There can be holes in boot-time mem_map[]s | 2526 | * There can be holes in boot-time mem_map[]s |
2526 | * handed to this function. They do not | 2527 | * handed to this function. They do not |
2527 | * exist on hotplugged memory. | 2528 | * exist on hotplugged memory. |
2528 | */ | 2529 | */ |
2529 | if (context == MEMMAP_EARLY) { | 2530 | if (context == MEMMAP_EARLY) { |
2530 | if (!early_pfn_valid(pfn)) | 2531 | if (!early_pfn_valid(pfn)) |
2531 | continue; | 2532 | continue; |
2532 | if (!early_pfn_in_nid(pfn, nid)) | 2533 | if (!early_pfn_in_nid(pfn, nid)) |
2533 | continue; | 2534 | continue; |
2534 | } | 2535 | } |
2535 | page = pfn_to_page(pfn); | 2536 | page = pfn_to_page(pfn); |
2536 | set_page_links(page, zone, nid, pfn); | 2537 | set_page_links(page, zone, nid, pfn); |
2537 | mminit_verify_page_links(page, zone, nid, pfn); | 2538 | mminit_verify_page_links(page, zone, nid, pfn); |
2538 | init_page_count(page); | 2539 | init_page_count(page); |
2539 | reset_page_mapcount(page); | 2540 | reset_page_mapcount(page); |
2540 | SetPageReserved(page); | 2541 | SetPageReserved(page); |
2541 | /* | 2542 | /* |
2542 | * Mark the block movable so that blocks are reserved for | 2543 | * Mark the block movable so that blocks are reserved for |
2543 | * movable at startup. This will force kernel allocations | 2544 | * movable at startup. This will force kernel allocations |
2544 | * to reserve their blocks rather than leaking throughout | 2545 | * to reserve their blocks rather than leaking throughout |
2545 | * the address space during boot when many long-lived | 2546 | * the address space during boot when many long-lived |
2546 | * kernel allocations are made. Later some blocks near | 2547 | * kernel allocations are made. Later some blocks near |
2547 | * the start are marked MIGRATE_RESERVE by | 2548 | * the start are marked MIGRATE_RESERVE by |
2548 | * setup_zone_migrate_reserve() | 2549 | * setup_zone_migrate_reserve() |
2549 | * | 2550 | * |
2550 | * The bitmap covers the zone's valid pfn range, but the memmap | 2551 | * The bitmap covers the zone's valid pfn range, but the memmap |
2551 | * may include invalid pages (added for alignment). | 2552 | * may include invalid pages (added for alignment). |
2552 | * Check here so that set_pageblock_migratetype() is not called | 2553 | * Check here so that set_pageblock_migratetype() is not called |
2553 | * for a pfn outside the zone. | 2554 | * for a pfn outside the zone. |
2554 | */ | 2555 | */ |
2555 | if ((z->zone_start_pfn <= pfn) | 2556 | if ((z->zone_start_pfn <= pfn) |
2556 | && (pfn < z->zone_start_pfn + z->spanned_pages) | 2557 | && (pfn < z->zone_start_pfn + z->spanned_pages) |
2557 | && !(pfn & (pageblock_nr_pages - 1))) | 2558 | && !(pfn & (pageblock_nr_pages - 1))) |
2558 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 2559 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
2559 | 2560 | ||
2560 | INIT_LIST_HEAD(&page->lru); | 2561 | INIT_LIST_HEAD(&page->lru); |
2561 | #ifdef WANT_PAGE_VIRTUAL | 2562 | #ifdef WANT_PAGE_VIRTUAL |
2562 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ | 2563 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ |
2563 | if (!is_highmem_idx(zone)) | 2564 | if (!is_highmem_idx(zone)) |
2564 | set_page_address(page, __va(pfn << PAGE_SHIFT)); | 2565 | set_page_address(page, __va(pfn << PAGE_SHIFT)); |
2565 | #endif | 2566 | #endif |
2566 | } | 2567 | } |
2567 | } | 2568 | } |
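
The pfn & (pageblock_nr_pages - 1) test above only fires on pfns that start a pageblock, so set_pageblock_migratetype() runs once per block rather than once per page. A small sketch of the alignment test (hypothetical pageblock size and pfn range):

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES	1024UL	/* hypothetical order-10 pageblocks */

int main(void)
{
	unsigned long pfn;

	/* only pfns at a pageblock boundary pass the mask test */
	for (pfn = 2040; pfn < 2060; pfn++)
		if (!(pfn & (PAGEBLOCK_NR_PAGES - 1)))
			printf("pageblock starts at pfn %lu\n", pfn);	/* 2048 */
	return 0;
}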
2568 | 2569 | ||
2569 | static void __meminit zone_init_free_lists(struct zone *zone) | 2570 | static void __meminit zone_init_free_lists(struct zone *zone) |
2570 | { | 2571 | { |
2571 | int order, t; | 2572 | int order, t; |
2572 | for_each_migratetype_order(order, t) { | 2573 | for_each_migratetype_order(order, t) { |
2573 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); | 2574 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); |
2574 | zone->free_area[order].nr_free = 0; | 2575 | zone->free_area[order].nr_free = 0; |
2575 | } | 2576 | } |
2576 | } | 2577 | } |
2577 | 2578 | ||
2578 | #ifndef __HAVE_ARCH_MEMMAP_INIT | 2579 | #ifndef __HAVE_ARCH_MEMMAP_INIT |
2579 | #define memmap_init(size, nid, zone, start_pfn) \ | 2580 | #define memmap_init(size, nid, zone, start_pfn) \ |
2580 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) | 2581 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) |
2581 | #endif | 2582 | #endif |
2582 | 2583 | ||
2583 | static int zone_batchsize(struct zone *zone) | 2584 | static int zone_batchsize(struct zone *zone) |
2584 | { | 2585 | { |
2585 | int batch; | 2586 | int batch; |
2586 | 2587 | ||
2587 | /* | 2588 | /* |
2588 | * The per-cpu-pages pools are set to around 1000th of the | 2589 | * The per-cpu-pages pools are set to around 1000th of the |
2589 | * size of the zone. But no more than 1/2 of a meg. | 2590 | * size of the zone. But no more than 1/2 of a meg. |
2590 | * | 2591 | * |
2591 | * OK, so we don't know how big the cache is. So guess. | 2592 | * OK, so we don't know how big the cache is. So guess. |
2592 | */ | 2593 | */ |
2593 | batch = zone->present_pages / 1024; | 2594 | batch = zone->present_pages / 1024; |
2594 | if (batch * PAGE_SIZE > 512 * 1024) | 2595 | if (batch * PAGE_SIZE > 512 * 1024) |
2595 | batch = (512 * 1024) / PAGE_SIZE; | 2596 | batch = (512 * 1024) / PAGE_SIZE; |
2596 | batch /= 4; /* We effectively *= 4 below */ | 2597 | batch /= 4; /* We effectively *= 4 below */ |
2597 | if (batch < 1) | 2598 | if (batch < 1) |
2598 | batch = 1; | 2599 | batch = 1; |
2599 | 2600 | ||
2600 | /* | 2601 | /* |
2601 | * Clamp the batch to a 2^n - 1 value. Having a power | 2602 | * Clamp the batch to a 2^n - 1 value. Having a power |
2602 | * of 2 value was found to be more likely to have | 2603 | * of 2 value was found to be more likely to have |
2603 | * suboptimal cache aliasing properties in some cases. | 2604 | * suboptimal cache aliasing properties in some cases. |
2604 | * | 2605 | * |
2605 | * For example if 2 tasks are alternately allocating | 2606 | * For example if 2 tasks are alternately allocating |
2606 | * batches of pages, one task can end up with a lot | 2607 | * batches of pages, one task can end up with a lot |
2607 | * of pages of one half of the possible page colors | 2608 | * of pages of one half of the possible page colors |
2608 | * and the other with pages of the other colors. | 2609 | * and the other with pages of the other colors. |
2609 | */ | 2610 | */ |
2610 | batch = (1 << (fls(batch + batch/2)-1)) - 1; | 2611 | batch = (1 << (fls(batch + batch/2)-1)) - 1; |
2611 | 2612 | ||
2612 | return batch; | 2613 | return batch; |
2613 | } | 2614 | } |
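
To see what zone_batchsize() actually produces, here is a minimal user-space sketch (assuming 4K pages and a hypothetical 1GB zone; fls_user() is only a stand-in for the kernel's fls()):

#include <stdio.h>

#define PAGE_SIZE	4096	/* assuming 4K pages */

/* user-space stand-in for fls(): position of the highest set bit, 1-based */
static int fls_user(int x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

static int batchsize(unsigned long present_pages)
{
	int batch = present_pages / 1024;

	if (batch * PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / PAGE_SIZE;
	batch /= 4;
	if (batch < 1)
		batch = 1;
	/* clamp to a 2^n - 1 value, as in the comment above */
	return (1 << (fls_user(batch + batch / 2) - 1)) - 1;
}

int main(void)
{
	printf("%d\n", batchsize(262144));	/* 1GB zone of 4K pages -> 31 */
	return 0;
}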
2614 | 2615 | ||
2615 | inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | 2616 | inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) |
2616 | { | 2617 | { |
2617 | struct per_cpu_pages *pcp; | 2618 | struct per_cpu_pages *pcp; |
2618 | 2619 | ||
2619 | memset(p, 0, sizeof(*p)); | 2620 | memset(p, 0, sizeof(*p)); |
2620 | 2621 | ||
2621 | pcp = &p->pcp; | 2622 | pcp = &p->pcp; |
2622 | pcp->count = 0; | 2623 | pcp->count = 0; |
2623 | pcp->high = 6 * batch; | 2624 | pcp->high = 6 * batch; |
2624 | pcp->batch = max(1UL, 1 * batch); | 2625 | pcp->batch = max(1UL, 1 * batch); |
2625 | INIT_LIST_HEAD(&pcp->list); | 2626 | INIT_LIST_HEAD(&pcp->list); |
2626 | } | 2627 | } |
2627 | 2628 | ||
2628 | /* | 2629 | /* |
2629 | * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist | 2630 | * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist |
2630 | * to the value high for the pageset p. | 2631 | * to the value high for the pageset p. |
2631 | */ | 2632 | */ |
2632 | 2633 | ||
2633 | static void setup_pagelist_highmark(struct per_cpu_pageset *p, | 2634 | static void setup_pagelist_highmark(struct per_cpu_pageset *p, |
2634 | unsigned long high) | 2635 | unsigned long high) |
2635 | { | 2636 | { |
2636 | struct per_cpu_pages *pcp; | 2637 | struct per_cpu_pages *pcp; |
2637 | 2638 | ||
2638 | pcp = &p->pcp; | 2639 | pcp = &p->pcp; |
2639 | pcp->high = high; | 2640 | pcp->high = high; |
2640 | pcp->batch = max(1UL, high/4); | 2641 | pcp->batch = max(1UL, high/4); |
2641 | if ((high/4) > (PAGE_SHIFT * 8)) | 2642 | if ((high/4) > (PAGE_SHIFT * 8)) |
2642 | pcp->batch = PAGE_SHIFT * 8; | 2643 | pcp->batch = PAGE_SHIFT * 8; |
2643 | } | 2644 | } |
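
With the cap above, the batch never exceeds PAGE_SHIFT * 8 (96 with 4K pages), whatever high water mark is requested. A quick sketch of that clamping (the high value is hypothetical):

#include <stdio.h>

#define PAGE_SHIFT	12	/* assuming 4K pages */

int main(void)
{
	unsigned long high = 1024;	/* hypothetical per-cpu high water mark */
	unsigned long batch = high / 4 > 1 ? high / 4 : 1;

	if (high / 4 > PAGE_SHIFT * 8)
		batch = PAGE_SHIFT * 8;	/* cap at 96 with 4K pages */
	printf("high=%lu batch=%lu\n", high, batch);	/* high=1024 batch=96 */
	return 0;
}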
2644 | 2645 | ||
2645 | 2646 | ||
2646 | #ifdef CONFIG_NUMA | 2647 | #ifdef CONFIG_NUMA |
2647 | /* | 2648 | /* |
2648 | * Boot pageset table. One per cpu which is going to be used for all | 2649 | * Boot pageset table. One per cpu which is going to be used for all |
2649 | * zones and all nodes. The parameters will be set in such a way | 2650 | * zones and all nodes. The parameters will be set in such a way |
2650 | * that an item put on a list will immediately be handed over to | 2651 | * that an item put on a list will immediately be handed over to |
2651 | * the buddy list. This is safe since pageset manipulation is done | 2652 | * the buddy list. This is safe since pageset manipulation is done |
2652 | * with interrupts disabled. | 2653 | * with interrupts disabled. |
2653 | * | 2654 | * |
2654 | * Some NUMA counter updates may also be caught by the boot pagesets. | 2655 | * Some NUMA counter updates may also be caught by the boot pagesets. |
2655 | * | 2656 | * |
2656 | * The boot_pagesets must be kept even after bootup is complete for | 2657 | * The boot_pagesets must be kept even after bootup is complete for |
2657 | * unused processors and/or zones. They do play a role for bootstrapping | 2658 | * unused processors and/or zones. They do play a role for bootstrapping |
2658 | * hotplugged processors. | 2659 | * hotplugged processors. |
2659 | * | 2660 | * |
2660 | * zoneinfo_show() and maybe other functions do | 2661 | * zoneinfo_show() and maybe other functions do |
2661 | * not check if the processor is online before following the pageset pointer. | 2662 | * not check if the processor is online before following the pageset pointer. |
2662 | * Other parts of the kernel may not check if the zone is available. | 2663 | * Other parts of the kernel may not check if the zone is available. |
2663 | */ | 2664 | */ |
2664 | static struct per_cpu_pageset boot_pageset[NR_CPUS]; | 2665 | static struct per_cpu_pageset boot_pageset[NR_CPUS]; |
2665 | 2666 | ||
2666 | /* | 2667 | /* |
2667 | * Dynamically allocate memory for the | 2668 | * Dynamically allocate memory for the |
2668 | * per cpu pageset array in struct zone. | 2669 | * per cpu pageset array in struct zone. |
2669 | */ | 2670 | */ |
2670 | static int __cpuinit process_zones(int cpu) | 2671 | static int __cpuinit process_zones(int cpu) |
2671 | { | 2672 | { |
2672 | struct zone *zone, *dzone; | 2673 | struct zone *zone, *dzone; |
2673 | int node = cpu_to_node(cpu); | 2674 | int node = cpu_to_node(cpu); |
2674 | 2675 | ||
2675 | node_set_state(node, N_CPU); /* this node has a cpu */ | 2676 | node_set_state(node, N_CPU); /* this node has a cpu */ |
2676 | 2677 | ||
2677 | for_each_zone(zone) { | 2678 | for_each_zone(zone) { |
2678 | 2679 | ||
2679 | if (!populated_zone(zone)) | 2680 | if (!populated_zone(zone)) |
2680 | continue; | 2681 | continue; |
2681 | 2682 | ||
2682 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 2683 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), |
2683 | GFP_KERNEL, node); | 2684 | GFP_KERNEL, node); |
2684 | if (!zone_pcp(zone, cpu)) | 2685 | if (!zone_pcp(zone, cpu)) |
2685 | goto bad; | 2686 | goto bad; |
2686 | 2687 | ||
2687 | setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); | 2688 | setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); |
2688 | 2689 | ||
2689 | if (percpu_pagelist_fraction) | 2690 | if (percpu_pagelist_fraction) |
2690 | setup_pagelist_highmark(zone_pcp(zone, cpu), | 2691 | setup_pagelist_highmark(zone_pcp(zone, cpu), |
2691 | (zone->present_pages / percpu_pagelist_fraction)); | 2692 | (zone->present_pages / percpu_pagelist_fraction)); |
2692 | } | 2693 | } |
2693 | 2694 | ||
2694 | return 0; | 2695 | return 0; |
2695 | bad: | 2696 | bad: |
2696 | for_each_zone(dzone) { | 2697 | for_each_zone(dzone) { |
2697 | if (!populated_zone(dzone)) | 2698 | if (!populated_zone(dzone)) |
2698 | continue; | 2699 | continue; |
2699 | if (dzone == zone) | 2700 | if (dzone == zone) |
2700 | break; | 2701 | break; |
2701 | kfree(zone_pcp(dzone, cpu)); | 2702 | kfree(zone_pcp(dzone, cpu)); |
2702 | zone_pcp(dzone, cpu) = NULL; | 2703 | zone_pcp(dzone, cpu) = NULL; |
2703 | } | 2704 | } |
2704 | return -ENOMEM; | 2705 | return -ENOMEM; |
2705 | } | 2706 | } |
2706 | 2707 | ||
2707 | static inline void free_zone_pagesets(int cpu) | 2708 | static inline void free_zone_pagesets(int cpu) |
2708 | { | 2709 | { |
2709 | struct zone *zone; | 2710 | struct zone *zone; |
2710 | 2711 | ||
2711 | for_each_zone(zone) { | 2712 | for_each_zone(zone) { |
2712 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); | 2713 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); |
2713 | 2714 | ||
2714 | /* Free per_cpu_pageset if it is slab allocated */ | 2715 | /* Free per_cpu_pageset if it is slab allocated */ |
2715 | if (pset != &boot_pageset[cpu]) | 2716 | if (pset != &boot_pageset[cpu]) |
2716 | kfree(pset); | 2717 | kfree(pset); |
2717 | zone_pcp(zone, cpu) = NULL; | 2718 | zone_pcp(zone, cpu) = NULL; |
2718 | } | 2719 | } |
2719 | } | 2720 | } |
2720 | 2721 | ||
2721 | static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, | 2722 | static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, |
2722 | unsigned long action, | 2723 | unsigned long action, |
2723 | void *hcpu) | 2724 | void *hcpu) |
2724 | { | 2725 | { |
2725 | int cpu = (long)hcpu; | 2726 | int cpu = (long)hcpu; |
2726 | int ret = NOTIFY_OK; | 2727 | int ret = NOTIFY_OK; |
2727 | 2728 | ||
2728 | switch (action) { | 2729 | switch (action) { |
2729 | case CPU_UP_PREPARE: | 2730 | case CPU_UP_PREPARE: |
2730 | case CPU_UP_PREPARE_FROZEN: | 2731 | case CPU_UP_PREPARE_FROZEN: |
2731 | if (process_zones(cpu)) | 2732 | if (process_zones(cpu)) |
2732 | ret = NOTIFY_BAD; | 2733 | ret = NOTIFY_BAD; |
2733 | break; | 2734 | break; |
2734 | case CPU_UP_CANCELED: | 2735 | case CPU_UP_CANCELED: |
2735 | case CPU_UP_CANCELED_FROZEN: | 2736 | case CPU_UP_CANCELED_FROZEN: |
2736 | case CPU_DEAD: | 2737 | case CPU_DEAD: |
2737 | case CPU_DEAD_FROZEN: | 2738 | case CPU_DEAD_FROZEN: |
2738 | free_zone_pagesets(cpu); | 2739 | free_zone_pagesets(cpu); |
2739 | break; | 2740 | break; |
2740 | default: | 2741 | default: |
2741 | break; | 2742 | break; |
2742 | } | 2743 | } |
2743 | return ret; | 2744 | return ret; |
2744 | } | 2745 | } |
2745 | 2746 | ||
2746 | static struct notifier_block __cpuinitdata pageset_notifier = | 2747 | static struct notifier_block __cpuinitdata pageset_notifier = |
2747 | { &pageset_cpuup_callback, NULL, 0 }; | 2748 | { &pageset_cpuup_callback, NULL, 0 }; |
2748 | 2749 | ||
2749 | void __init setup_per_cpu_pageset(void) | 2750 | void __init setup_per_cpu_pageset(void) |
2750 | { | 2751 | { |
2751 | int err; | 2752 | int err; |
2752 | 2753 | ||
2753 | /* Initialize per_cpu_pageset for cpu 0. | 2754 | /* Initialize per_cpu_pageset for cpu 0. |
2754 | * A cpuup callback will do this for every cpu | 2755 | * A cpuup callback will do this for every cpu |
2755 | * as it comes online | 2756 | * as it comes online |
2756 | */ | 2757 | */ |
2757 | err = process_zones(smp_processor_id()); | 2758 | err = process_zones(smp_processor_id()); |
2758 | BUG_ON(err); | 2759 | BUG_ON(err); |
2759 | register_cpu_notifier(&pageset_notifier); | 2760 | register_cpu_notifier(&pageset_notifier); |
2760 | } | 2761 | } |
2761 | 2762 | ||
2762 | #endif | 2763 | #endif |
2763 | 2764 | ||
2764 | static noinline __init_refok | 2765 | static noinline __init_refok |
2765 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 2766 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
2766 | { | 2767 | { |
2767 | int i; | 2768 | int i; |
2768 | struct pglist_data *pgdat = zone->zone_pgdat; | 2769 | struct pglist_data *pgdat = zone->zone_pgdat; |
2769 | size_t alloc_size; | 2770 | size_t alloc_size; |
2770 | 2771 | ||
2771 | /* | 2772 | /* |
2772 | * The per-page waitqueue mechanism uses hashed waitqueues | 2773 | * The per-page waitqueue mechanism uses hashed waitqueues |
2773 | * per zone. | 2774 | * per zone. |
2774 | */ | 2775 | */ |
2775 | zone->wait_table_hash_nr_entries = | 2776 | zone->wait_table_hash_nr_entries = |
2776 | wait_table_hash_nr_entries(zone_size_pages); | 2777 | wait_table_hash_nr_entries(zone_size_pages); |
2777 | zone->wait_table_bits = | 2778 | zone->wait_table_bits = |
2778 | wait_table_bits(zone->wait_table_hash_nr_entries); | 2779 | wait_table_bits(zone->wait_table_hash_nr_entries); |
2779 | alloc_size = zone->wait_table_hash_nr_entries | 2780 | alloc_size = zone->wait_table_hash_nr_entries |
2780 | * sizeof(wait_queue_head_t); | 2781 | * sizeof(wait_queue_head_t); |
2781 | 2782 | ||
2782 | if (!slab_is_available()) { | 2783 | if (!slab_is_available()) { |
2783 | zone->wait_table = (wait_queue_head_t *) | 2784 | zone->wait_table = (wait_queue_head_t *) |
2784 | alloc_bootmem_node(pgdat, alloc_size); | 2785 | alloc_bootmem_node(pgdat, alloc_size); |
2785 | } else { | 2786 | } else { |
2786 | /* | 2787 | /* |
2787 | * This case means that a zone whose size was 0 gets new memory | 2788 | * This case means that a zone whose size was 0 gets new memory |
2788 | * via memory hot-add. | 2789 | * via memory hot-add. |
2789 | * But it may be the case that a new node was hot-added. In | 2790 | * But it may be the case that a new node was hot-added. In |
2790 | * this case vmalloc() will not be able to use this new node's | 2791 | * this case vmalloc() will not be able to use this new node's |
2791 | * memory - this wait_table must be initialized to use this new | 2792 | * memory - this wait_table must be initialized to use this new |
2792 | * node itself as well. | 2793 | * node itself as well. |
2793 | * To use this new node's memory, further consideration will be | 2794 | * To use this new node's memory, further consideration will be |
2794 | * necessary. | 2795 | * necessary. |
2795 | */ | 2796 | */ |
2796 | zone->wait_table = vmalloc(alloc_size); | 2797 | zone->wait_table = vmalloc(alloc_size); |
2797 | } | 2798 | } |
2798 | if (!zone->wait_table) | 2799 | if (!zone->wait_table) |
2799 | return -ENOMEM; | 2800 | return -ENOMEM; |
2800 | 2801 | ||
2801 | for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) | 2802 | for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) |
2802 | init_waitqueue_head(zone->wait_table + i); | 2803 | init_waitqueue_head(zone->wait_table + i); |
2803 | 2804 | ||
2804 | return 0; | 2805 | return 0; |
2805 | } | 2806 | } |
2806 | 2807 | ||
2807 | static __meminit void zone_pcp_init(struct zone *zone) | 2808 | static __meminit void zone_pcp_init(struct zone *zone) |
2808 | { | 2809 | { |
2809 | int cpu; | 2810 | int cpu; |
2810 | unsigned long batch = zone_batchsize(zone); | 2811 | unsigned long batch = zone_batchsize(zone); |
2811 | 2812 | ||
2812 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 2813 | for (cpu = 0; cpu < NR_CPUS; cpu++) { |
2813 | #ifdef CONFIG_NUMA | 2814 | #ifdef CONFIG_NUMA |
2814 | /* Early boot. Slab allocator not functional yet */ | 2815 | /* Early boot. Slab allocator not functional yet */ |
2815 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; | 2816 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; |
2816 | setup_pageset(&boot_pageset[cpu],0); | 2817 | setup_pageset(&boot_pageset[cpu],0); |
2817 | #else | 2818 | #else |
2818 | setup_pageset(zone_pcp(zone,cpu), batch); | 2819 | setup_pageset(zone_pcp(zone,cpu), batch); |
2819 | #endif | 2820 | #endif |
2820 | } | 2821 | } |
2821 | if (zone->present_pages) | 2822 | if (zone->present_pages) |
2822 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | 2823 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", |
2823 | zone->name, zone->present_pages, batch); | 2824 | zone->name, zone->present_pages, batch); |
2824 | } | 2825 | } |
2825 | 2826 | ||
2826 | __meminit int init_currently_empty_zone(struct zone *zone, | 2827 | __meminit int init_currently_empty_zone(struct zone *zone, |
2827 | unsigned long zone_start_pfn, | 2828 | unsigned long zone_start_pfn, |
2828 | unsigned long size, | 2829 | unsigned long size, |
2829 | enum memmap_context context) | 2830 | enum memmap_context context) |
2830 | { | 2831 | { |
2831 | struct pglist_data *pgdat = zone->zone_pgdat; | 2832 | struct pglist_data *pgdat = zone->zone_pgdat; |
2832 | int ret; | 2833 | int ret; |
2833 | ret = zone_wait_table_init(zone, size); | 2834 | ret = zone_wait_table_init(zone, size); |
2834 | if (ret) | 2835 | if (ret) |
2835 | return ret; | 2836 | return ret; |
2836 | pgdat->nr_zones = zone_idx(zone) + 1; | 2837 | pgdat->nr_zones = zone_idx(zone) + 1; |
2837 | 2838 | ||
2838 | zone->zone_start_pfn = zone_start_pfn; | 2839 | zone->zone_start_pfn = zone_start_pfn; |
2839 | 2840 | ||
2840 | mminit_dprintk(MMINIT_TRACE, "memmap_init", | 2841 | mminit_dprintk(MMINIT_TRACE, "memmap_init", |
2841 | "Initialising map node %d zone %lu pfns %lu -> %lu\n", | 2842 | "Initialising map node %d zone %lu pfns %lu -> %lu\n", |
2842 | pgdat->node_id, | 2843 | pgdat->node_id, |
2843 | (unsigned long)zone_idx(zone), | 2844 | (unsigned long)zone_idx(zone), |
2844 | zone_start_pfn, (zone_start_pfn + size)); | 2845 | zone_start_pfn, (zone_start_pfn + size)); |
2845 | 2846 | ||
2846 | zone_init_free_lists(zone); | 2847 | zone_init_free_lists(zone); |
2847 | 2848 | ||
2848 | return 0; | 2849 | return 0; |
2849 | } | 2850 | } |
2850 | 2851 | ||
2851 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | 2852 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP |
2852 | /* | 2853 | /* |
2853 | * Basic iterator support. Return the first range of PFNs for a node | 2854 | * Basic iterator support. Return the first range of PFNs for a node |
2854 | * Note: nid == MAX_NUMNODES returns first region regardless of node | 2855 | * Note: nid == MAX_NUMNODES returns first region regardless of node |
2855 | */ | 2856 | */ |
2856 | static int __meminit first_active_region_index_in_nid(int nid) | 2857 | static int __meminit first_active_region_index_in_nid(int nid) |
2857 | { | 2858 | { |
2858 | int i; | 2859 | int i; |
2859 | 2860 | ||
2860 | for (i = 0; i < nr_nodemap_entries; i++) | 2861 | for (i = 0; i < nr_nodemap_entries; i++) |
2861 | if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) | 2862 | if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) |
2862 | return i; | 2863 | return i; |
2863 | 2864 | ||
2864 | return -1; | 2865 | return -1; |
2865 | } | 2866 | } |
2866 | 2867 | ||
2867 | /* | 2868 | /* |
2868 | * Basic iterator support. Return the next active range of PFNs for a node | 2869 | * Basic iterator support. Return the next active range of PFNs for a node |
2869 | * Note: nid == MAX_NUMNODES returns next region regardless of node | 2870 | * Note: nid == MAX_NUMNODES returns next region regardless of node |
2870 | */ | 2871 | */ |
2871 | static int __meminit next_active_region_index_in_nid(int index, int nid) | 2872 | static int __meminit next_active_region_index_in_nid(int index, int nid) |
2872 | { | 2873 | { |
2873 | for (index = index + 1; index < nr_nodemap_entries; index++) | 2874 | for (index = index + 1; index < nr_nodemap_entries; index++) |
2874 | if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) | 2875 | if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) |
2875 | return index; | 2876 | return index; |
2876 | 2877 | ||
2877 | return -1; | 2878 | return -1; |
2878 | } | 2879 | } |
2879 | 2880 | ||
2880 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID | 2881 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID |
2881 | /* | 2882 | /* |
2882 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. | 2883 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. |
2883 | * Architectures may implement their own version but if add_active_range() | 2884 | * Architectures may implement their own version but if add_active_range() |
2884 | * was used and there are no special requirements, this is a convenient | 2885 | * was used and there are no special requirements, this is a convenient |
2885 | * alternative | 2886 | * alternative |
2886 | */ | 2887 | */ |
2887 | int __meminit early_pfn_to_nid(unsigned long pfn) | 2888 | int __meminit early_pfn_to_nid(unsigned long pfn) |
2888 | { | 2889 | { |
2889 | int i; | 2890 | int i; |
2890 | 2891 | ||
2891 | for (i = 0; i < nr_nodemap_entries; i++) { | 2892 | for (i = 0; i < nr_nodemap_entries; i++) { |
2892 | unsigned long start_pfn = early_node_map[i].start_pfn; | 2893 | unsigned long start_pfn = early_node_map[i].start_pfn; |
2893 | unsigned long end_pfn = early_node_map[i].end_pfn; | 2894 | unsigned long end_pfn = early_node_map[i].end_pfn; |
2894 | 2895 | ||
2895 | if (start_pfn <= pfn && pfn < end_pfn) | 2896 | if (start_pfn <= pfn && pfn < end_pfn) |
2896 | return early_node_map[i].nid; | 2897 | return early_node_map[i].nid; |
2897 | } | 2898 | } |
2898 | 2899 | ||
2899 | return 0; | 2900 | return 0; |
2900 | } | 2901 | } |
2901 | #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ | 2902 | #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ |
2902 | 2903 | ||
2903 | /* Basic iterator support to walk early_node_map[] */ | 2904 | /* Basic iterator support to walk early_node_map[] */ |
2904 | #define for_each_active_range_index_in_nid(i, nid) \ | 2905 | #define for_each_active_range_index_in_nid(i, nid) \ |
2905 | for (i = first_active_region_index_in_nid(nid); i != -1; \ | 2906 | for (i = first_active_region_index_in_nid(nid); i != -1; \ |
2906 | i = next_active_region_index_in_nid(i, nid)) | 2907 | i = next_active_region_index_in_nid(i, nid)) |
2907 | 2908 | ||
2908 | /** | 2909 | /** |
2909 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range | 2910 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range |
2910 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. | 2911 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. |
2911 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node | 2912 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node |
2912 | * | 2913 | * |
2913 | * If an architecture guarantees that all ranges registered with | 2914 | * If an architecture guarantees that all ranges registered with |
2914 | * add_active_ranges() contain no holes and may be freed, | 2915 | * add_active_ranges() contain no holes and may be freed, |
2915 | * this function may be used instead of calling free_bootmem() manually. | 2916 | * this function may be used instead of calling free_bootmem() manually. |
2916 | */ | 2917 | */ |
2917 | void __init free_bootmem_with_active_regions(int nid, | 2918 | void __init free_bootmem_with_active_regions(int nid, |
2918 | unsigned long max_low_pfn) | 2919 | unsigned long max_low_pfn) |
2919 | { | 2920 | { |
2920 | int i; | 2921 | int i; |
2921 | 2922 | ||
2922 | for_each_active_range_index_in_nid(i, nid) { | 2923 | for_each_active_range_index_in_nid(i, nid) { |
2923 | unsigned long size_pages = 0; | 2924 | unsigned long size_pages = 0; |
2924 | unsigned long end_pfn = early_node_map[i].end_pfn; | 2925 | unsigned long end_pfn = early_node_map[i].end_pfn; |
2925 | 2926 | ||
2926 | if (early_node_map[i].start_pfn >= max_low_pfn) | 2927 | if (early_node_map[i].start_pfn >= max_low_pfn) |
2927 | continue; | 2928 | continue; |
2928 | 2929 | ||
2929 | if (end_pfn > max_low_pfn) | 2930 | if (end_pfn > max_low_pfn) |
2930 | end_pfn = max_low_pfn; | 2931 | end_pfn = max_low_pfn; |
2931 | 2932 | ||
2932 | size_pages = end_pfn - early_node_map[i].start_pfn; | 2933 | size_pages = end_pfn - early_node_map[i].start_pfn; |
2933 | free_bootmem_node(NODE_DATA(early_node_map[i].nid), | 2934 | free_bootmem_node(NODE_DATA(early_node_map[i].nid), |
2934 | PFN_PHYS(early_node_map[i].start_pfn), | 2935 | PFN_PHYS(early_node_map[i].start_pfn), |
2935 | size_pages << PAGE_SHIFT); | 2936 | size_pages << PAGE_SHIFT); |
2936 | } | 2937 | } |
2937 | } | 2938 | } |
2938 | 2939 | ||
2939 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) | 2940 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) |
2940 | { | 2941 | { |
2941 | int i; | 2942 | int i; |
2942 | int ret; | 2943 | int ret; |
2943 | 2944 | ||
2944 | for_each_active_range_index_in_nid(i, nid) { | 2945 | for_each_active_range_index_in_nid(i, nid) { |
2945 | ret = work_fn(early_node_map[i].start_pfn, | 2946 | ret = work_fn(early_node_map[i].start_pfn, |
2946 | early_node_map[i].end_pfn, data); | 2947 | early_node_map[i].end_pfn, data); |
2947 | if (ret) | 2948 | if (ret) |
2948 | break; | 2949 | break; |
2949 | } | 2950 | } |
2950 | } | 2951 | } |
2951 | /** | 2952 | /** |
2952 | * sparse_memory_present_with_active_regions - Call memory_present for each active range | 2953 | * sparse_memory_present_with_active_regions - Call memory_present for each active range |
2953 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. | 2954 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. |
2954 | * | 2955 | * |
2955 | * If an architecture guarantees that all ranges registered with | 2956 | * If an architecture guarantees that all ranges registered with |
2956 | * add_active_ranges() contain no holes and may be freed, this | 2957 | * add_active_ranges() contain no holes and may be freed, this |
2957 | * function may be used instead of calling memory_present() manually. | 2958 | * function may be used instead of calling memory_present() manually. |
2958 | */ | 2959 | */ |
2959 | void __init sparse_memory_present_with_active_regions(int nid) | 2960 | void __init sparse_memory_present_with_active_regions(int nid) |
2960 | { | 2961 | { |
2961 | int i; | 2962 | int i; |
2962 | 2963 | ||
2963 | for_each_active_range_index_in_nid(i, nid) | 2964 | for_each_active_range_index_in_nid(i, nid) |
2964 | memory_present(early_node_map[i].nid, | 2965 | memory_present(early_node_map[i].nid, |
2965 | early_node_map[i].start_pfn, | 2966 | early_node_map[i].start_pfn, |
2966 | early_node_map[i].end_pfn); | 2967 | early_node_map[i].end_pfn); |
2967 | } | 2968 | } |
2968 | 2969 | ||
2969 | /** | 2970 | /** |
2970 | * push_node_boundaries - Push node boundaries to at least the requested boundary | 2971 | * push_node_boundaries - Push node boundaries to at least the requested boundary |
2971 | * @nid: The nid of the node to push the boundary for | 2972 | * @nid: The nid of the node to push the boundary for |
2972 | * @start_pfn: The start pfn of the node | 2973 | * @start_pfn: The start pfn of the node |
2973 | * @end_pfn: The end pfn of the node | 2974 | * @end_pfn: The end pfn of the node |
2974 | * | 2975 | * |
2975 | * In reserve-based hot-add, mem_map is allocated that is unused until hotadd | 2976 | * In reserve-based hot-add, mem_map is allocated that is unused until hotadd |
2976 | * time. Specifically, on x86_64, SRAT will report ranges that can potentially | 2977 | * time. Specifically, on x86_64, SRAT will report ranges that can potentially |
2977 | * be hotplugged even though no physical memory exists. This function allows | 2978 | * be hotplugged even though no physical memory exists. This function allows |
2978 | * an arch to push out the node boundaries so mem_map is allocated that can | 2979 | * an arch to push out the node boundaries so mem_map is allocated that can |
2979 | * be used later. | 2980 | * be used later. |
2980 | */ | 2981 | */ |
2981 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | 2982 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE |
2982 | void __init push_node_boundaries(unsigned int nid, | 2983 | void __init push_node_boundaries(unsigned int nid, |
2983 | unsigned long start_pfn, unsigned long end_pfn) | 2984 | unsigned long start_pfn, unsigned long end_pfn) |
2984 | { | 2985 | { |
2985 | mminit_dprintk(MMINIT_TRACE, "zoneboundary", | 2986 | mminit_dprintk(MMINIT_TRACE, "zoneboundary", |
2986 | "Entering push_node_boundaries(%u, %lu, %lu)\n", | 2987 | "Entering push_node_boundaries(%u, %lu, %lu)\n", |
2987 | nid, start_pfn, end_pfn); | 2988 | nid, start_pfn, end_pfn); |
2988 | 2989 | ||
2989 | /* Initialise the boundary for this node if necessary */ | 2990 | /* Initialise the boundary for this node if necessary */ |
2990 | if (node_boundary_end_pfn[nid] == 0) | 2991 | if (node_boundary_end_pfn[nid] == 0) |
2991 | node_boundary_start_pfn[nid] = -1UL; | 2992 | node_boundary_start_pfn[nid] = -1UL; |
2992 | 2993 | ||
2993 | /* Update the boundaries */ | 2994 | /* Update the boundaries */ |
2994 | if (node_boundary_start_pfn[nid] > start_pfn) | 2995 | if (node_boundary_start_pfn[nid] > start_pfn) |
2995 | node_boundary_start_pfn[nid] = start_pfn; | 2996 | node_boundary_start_pfn[nid] = start_pfn; |
2996 | if (node_boundary_end_pfn[nid] < end_pfn) | 2997 | if (node_boundary_end_pfn[nid] < end_pfn) |
2997 | node_boundary_end_pfn[nid] = end_pfn; | 2998 | node_boundary_end_pfn[nid] = end_pfn; |
2998 | } | 2999 | } |
2999 | 3000 | ||
3000 | /* If necessary, push the node boundary out for reserve hotadd */ | 3001 | /* If necessary, push the node boundary out for reserve hotadd */ |
3001 | static void __meminit account_node_boundary(unsigned int nid, | 3002 | static void __meminit account_node_boundary(unsigned int nid, |
3002 | unsigned long *start_pfn, unsigned long *end_pfn) | 3003 | unsigned long *start_pfn, unsigned long *end_pfn) |
3003 | { | 3004 | { |
3004 | mminit_dprintk(MMINIT_TRACE, "zoneboundary", | 3005 | mminit_dprintk(MMINIT_TRACE, "zoneboundary", |
3005 | "Entering account_node_boundary(%u, %lu, %lu)\n", | 3006 | "Entering account_node_boundary(%u, %lu, %lu)\n", |
3006 | nid, *start_pfn, *end_pfn); | 3007 | nid, *start_pfn, *end_pfn); |
3007 | 3008 | ||
3008 | /* Return if boundary information has not been provided */ | 3009 | /* Return if boundary information has not been provided */ |
3009 | if (node_boundary_end_pfn[nid] == 0) | 3010 | if (node_boundary_end_pfn[nid] == 0) |
3010 | return; | 3011 | return; |
3011 | 3012 | ||
3012 | /* Check the boundaries and update if necessary */ | 3013 | /* Check the boundaries and update if necessary */ |
3013 | if (node_boundary_start_pfn[nid] < *start_pfn) | 3014 | if (node_boundary_start_pfn[nid] < *start_pfn) |
3014 | *start_pfn = node_boundary_start_pfn[nid]; | 3015 | *start_pfn = node_boundary_start_pfn[nid]; |
3015 | if (node_boundary_end_pfn[nid] > *end_pfn) | 3016 | if (node_boundary_end_pfn[nid] > *end_pfn) |
3016 | *end_pfn = node_boundary_end_pfn[nid]; | 3017 | *end_pfn = node_boundary_end_pfn[nid]; |
3017 | } | 3018 | } |
3018 | #else | 3019 | #else |
3019 | void __init push_node_boundaries(unsigned int nid, | 3020 | void __init push_node_boundaries(unsigned int nid, |
3020 | unsigned long start_pfn, unsigned long end_pfn) {} | 3021 | unsigned long start_pfn, unsigned long end_pfn) {} |
3021 | 3022 | ||
3022 | static void __meminit account_node_boundary(unsigned int nid, | 3023 | static void __meminit account_node_boundary(unsigned int nid, |
3023 | unsigned long *start_pfn, unsigned long *end_pfn) {} | 3024 | unsigned long *start_pfn, unsigned long *end_pfn) {} |
3024 | #endif | 3025 | #endif |
3025 | 3026 | ||
3026 | 3027 | ||
3027 | /** | 3028 | /** |
3028 | * get_pfn_range_for_nid - Return the start and end page frames for a node | 3029 | * get_pfn_range_for_nid - Return the start and end page frames for a node |
3029 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. | 3030 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. |
3030 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn. | 3031 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn. |
3031 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn. | 3032 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn. |
3032 | * | 3033 | * |
3033 | * It returns the start and end page frame of a node based on information | 3034 | * It returns the start and end page frame of a node based on information |
3034 | * provided by an arch calling add_active_range(). If called for a node | 3035 | * provided by an arch calling add_active_range(). If called for a node |
3035 | * with no available memory, a warning is printed and the start and end | 3036 | * with no available memory, a warning is printed and the start and end |
3036 | * PFNs will be 0. | 3037 | * PFNs will be 0. |
3037 | */ | 3038 | */ |
3038 | void __meminit get_pfn_range_for_nid(unsigned int nid, | 3039 | void __meminit get_pfn_range_for_nid(unsigned int nid, |
3039 | unsigned long *start_pfn, unsigned long *end_pfn) | 3040 | unsigned long *start_pfn, unsigned long *end_pfn) |
3040 | { | 3041 | { |
3041 | int i; | 3042 | int i; |
3042 | *start_pfn = -1UL; | 3043 | *start_pfn = -1UL; |
3043 | *end_pfn = 0; | 3044 | *end_pfn = 0; |
3044 | 3045 | ||
3045 | for_each_active_range_index_in_nid(i, nid) { | 3046 | for_each_active_range_index_in_nid(i, nid) { |
3046 | *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); | 3047 | *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); |
3047 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); | 3048 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); |
3048 | } | 3049 | } |
3049 | 3050 | ||
3050 | if (*start_pfn == -1UL) | 3051 | if (*start_pfn == -1UL) |
3051 | *start_pfn = 0; | 3052 | *start_pfn = 0; |
3052 | 3053 | ||
3053 | /* Push the node boundaries out if requested */ | 3054 | /* Push the node boundaries out if requested */ |
3054 | account_node_boundary(nid, start_pfn, end_pfn); | 3055 | account_node_boundary(nid, start_pfn, end_pfn); |
3055 | } | 3056 | } |
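
For intuition, a minimal userspace sketch of the reduction in get_pfn_range_for_nid(), with a tiny invented stand-in for early_node_map[] (every name and PFN value below is made up): the min/max pass yields the node's overall start and end PFNs, and a node with nothing registered comes back as 0/0.

#include <stdio.h>

/* Toy stand-in for the early node map; every value below is invented. */
struct range { int nid; unsigned long start_pfn, end_pfn; };

static struct range map[] = {
        { 0, 0x100, 0x400 },
        { 1, 0x400, 0x800 },
        { 0, 0x900, 0xa00 },            /* node 0 has a hole from 0x400 to 0x900 */
};

static void pfn_range_for_nid(int nid, unsigned long *start, unsigned long *end)
{
        *start = -1UL;
        *end = 0;
        for (unsigned int i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
                if (map[i].nid != nid)
                        continue;
                if (map[i].start_pfn < *start)
                        *start = map[i].start_pfn;
                if (map[i].end_pfn > *end)
                        *end = map[i].end_pfn;
        }
        if (*start == -1UL)             /* nothing registered for this node */
                *start = 0;
}

int main(void)
{
        unsigned long s, e;

        pfn_range_for_nid(0, &s, &e);
        printf("node 0 spans pfn %#lx-%#lx\n", s, e);   /* 0x100-0xa00 */
        return 0;
}
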
3056 | 3057 | ||
3057 | /* | 3058 | /* |
3058 | * This finds a zone that can be used for ZONE_MOVABLE pages. The | 3059 | * This finds a zone that can be used for ZONE_MOVABLE pages. The |
3059 | * assumption is made that zones within a node are ordered in monotonic | 3060 | * assumption is made that zones within a node are ordered in monotonic |
3060 | * increasing memory addresses so that the "highest" populated zone is used | 3061 | * increasing memory addresses so that the "highest" populated zone is used |
3061 | */ | 3062 | */ |
3062 | void __init find_usable_zone_for_movable(void) | 3063 | void __init find_usable_zone_for_movable(void) |
3063 | { | 3064 | { |
3064 | int zone_index; | 3065 | int zone_index; |
3065 | for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { | 3066 | for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { |
3066 | if (zone_index == ZONE_MOVABLE) | 3067 | if (zone_index == ZONE_MOVABLE) |
3067 | continue; | 3068 | continue; |
3068 | 3069 | ||
3069 | if (arch_zone_highest_possible_pfn[zone_index] > | 3070 | if (arch_zone_highest_possible_pfn[zone_index] > |
3070 | arch_zone_lowest_possible_pfn[zone_index]) | 3071 | arch_zone_lowest_possible_pfn[zone_index]) |
3071 | break; | 3072 | break; |
3072 | } | 3073 | } |
3073 | 3074 | ||
3074 | VM_BUG_ON(zone_index == -1); | 3075 | VM_BUG_ON(zone_index == -1); |
3075 | movable_zone = zone_index; | 3076 | movable_zone = zone_index; |
3076 | } | 3077 | } |
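
A hedged sketch of the same top-down scan, with invented arrays standing in for arch_zone_lowest_possible_pfn[] and arch_zone_highest_possible_pfn[]: the highest zone whose PFN range is non-empty, skipping ZONE_MOVABLE itself, is the zone that ZONE_MOVABLE will be carved out of.

#include <stdio.h>

enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, ZONE_MOVABLE, NR_ZONES };

/* Invented per-zone PFN limits; HIGHMEM is empty on this imaginary machine. */
static unsigned long lowest[NR_ZONES]  = { 0x000, 0x100, 0x800, 0 };
static unsigned long highest[NR_ZONES] = { 0x100, 0x800, 0x800, 0 };

int main(void)
{
        int zi;

        for (zi = NR_ZONES - 1; zi >= 0; zi--) {
                if (zi == ZONE_MOVABLE)
                        continue;               /* never donate from itself */
                if (highest[zi] > lowest[zi])
                        break;                  /* first populated zone from the top */
        }
        printf("ZONE_MOVABLE is carved from zone index %d\n", zi);     /* 1 = NORMAL */
        return 0;
}
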
3077 | 3078 | ||
3078 | /* | 3079 | /* |
3079 | * The zone ranges provided by the architecture do not include ZONE_MOVABLE | 3080 | * The zone ranges provided by the architecture do not include ZONE_MOVABLE |
3080 | * because it is sized independent of architecture. Unlike the other zones, | 3081 | * because it is sized independent of architecture. Unlike the other zones, |
3081 | * the starting point for ZONE_MOVABLE is not fixed. It may be different | 3082 | * the starting point for ZONE_MOVABLE is not fixed. It may be different |
3082 | * in each node depending on the size of each node and how evenly kernelcore | 3083 | * in each node depending on the size of each node and how evenly kernelcore |
3083 | * is distributed. This helper function adjusts the zone ranges | 3084 | * is distributed. This helper function adjusts the zone ranges |
3084 | * provided by the architecture for a given node by using the end of the | 3085 | * provided by the architecture for a given node by using the end of the |
3085 | * highest usable zone for ZONE_MOVABLE. This preserves the assumption that | 3086 | * highest usable zone for ZONE_MOVABLE. This preserves the assumption that |
3086 | * zones within a node are in order of monotonically increasing memory addresses | 3087 | * zones within a node are in order of monotonically increasing memory addresses |
3087 | */ | 3088 | */ |
3088 | void __meminit adjust_zone_range_for_zone_movable(int nid, | 3089 | void __meminit adjust_zone_range_for_zone_movable(int nid, |
3089 | unsigned long zone_type, | 3090 | unsigned long zone_type, |
3090 | unsigned long node_start_pfn, | 3091 | unsigned long node_start_pfn, |
3091 | unsigned long node_end_pfn, | 3092 | unsigned long node_end_pfn, |
3092 | unsigned long *zone_start_pfn, | 3093 | unsigned long *zone_start_pfn, |
3093 | unsigned long *zone_end_pfn) | 3094 | unsigned long *zone_end_pfn) |
3094 | { | 3095 | { |
3095 | /* Only adjust if ZONE_MOVABLE is on this node */ | 3096 | /* Only adjust if ZONE_MOVABLE is on this node */ |
3096 | if (zone_movable_pfn[nid]) { | 3097 | if (zone_movable_pfn[nid]) { |
3097 | /* Size ZONE_MOVABLE */ | 3098 | /* Size ZONE_MOVABLE */ |
3098 | if (zone_type == ZONE_MOVABLE) { | 3099 | if (zone_type == ZONE_MOVABLE) { |
3099 | *zone_start_pfn = zone_movable_pfn[nid]; | 3100 | *zone_start_pfn = zone_movable_pfn[nid]; |
3100 | *zone_end_pfn = min(node_end_pfn, | 3101 | *zone_end_pfn = min(node_end_pfn, |
3101 | arch_zone_highest_possible_pfn[movable_zone]); | 3102 | arch_zone_highest_possible_pfn[movable_zone]); |
3102 | 3103 | ||
3103 | /* Adjust for ZONE_MOVABLE starting within this range */ | 3104 | /* Adjust for ZONE_MOVABLE starting within this range */ |
3104 | } else if (*zone_start_pfn < zone_movable_pfn[nid] && | 3105 | } else if (*zone_start_pfn < zone_movable_pfn[nid] && |
3105 | *zone_end_pfn > zone_movable_pfn[nid]) { | 3106 | *zone_end_pfn > zone_movable_pfn[nid]) { |
3106 | *zone_end_pfn = zone_movable_pfn[nid]; | 3107 | *zone_end_pfn = zone_movable_pfn[nid]; |
3107 | 3108 | ||
3108 | /* Check if this whole range is within ZONE_MOVABLE */ | 3109 | /* Check if this whole range is within ZONE_MOVABLE */ |
3109 | } else if (*zone_start_pfn >= zone_movable_pfn[nid]) | 3110 | } else if (*zone_start_pfn >= zone_movable_pfn[nid]) |
3110 | *zone_start_pfn = *zone_end_pfn; | 3111 | *zone_start_pfn = *zone_end_pfn; |
3111 | } | 3112 | } |
3112 | } | 3113 | } |
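
To make the three branches concrete, a minimal sketch with made-up PFNs (ZONE_MOVABLE starting at pfn 0x600 on a node that ends at 0x900): the MOVABLE zone itself gets [0x600, 0x900), a zone straddling 0x600 is cut short at it, and a zone lying entirely above 0x600 collapses to empty.

#include <stdio.h>

/* Invented numbers; movable_start plays the role of zone_movable_pfn[nid]. */
static void clamp(unsigned long movable_start, int is_movable_zone,
                  unsigned long node_end,
                  unsigned long *zs, unsigned long *ze)
{
        if (is_movable_zone) {                  /* size ZONE_MOVABLE itself */
                *zs = movable_start;
                *ze = node_end;
        } else if (*zs < movable_start && *ze > movable_start) {
                *ze = movable_start;            /* zone straddles the boundary */
        } else if (*zs >= movable_start) {
                *zs = *ze;                      /* zone is entirely movable: empty it */
        }
}

int main(void)
{
        unsigned long zs = 0x100, ze = 0x900;

        clamp(0x600, 0, 0x900, &zs, &ze);
        printf("kernel zone now %#lx-%#lx\n", zs, ze);          /* 0x100-0x600 */

        zs = 0x600; ze = 0x900;
        clamp(0x600, 1, 0x900, &zs, &ze);
        printf("movable zone now %#lx-%#lx\n", zs, ze);         /* 0x600-0x900 */

        zs = 0x700; ze = 0x900;
        clamp(0x600, 0, 0x900, &zs, &ze);
        printf("fully-movable zone now %#lx-%#lx (empty)\n", zs, ze);
        return 0;
}
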
3113 | 3114 | ||
3114 | /* | 3115 | /* |
3115 | * Return the number of pages a zone spans in a node, including holes | 3116 | * Return the number of pages a zone spans in a node, including holes |
3116 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() | 3117 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() |
3117 | */ | 3118 | */ |
3118 | static unsigned long __meminit zone_spanned_pages_in_node(int nid, | 3119 | static unsigned long __meminit zone_spanned_pages_in_node(int nid, |
3119 | unsigned long zone_type, | 3120 | unsigned long zone_type, |
3120 | unsigned long *ignored) | 3121 | unsigned long *ignored) |
3121 | { | 3122 | { |
3122 | unsigned long node_start_pfn, node_end_pfn; | 3123 | unsigned long node_start_pfn, node_end_pfn; |
3123 | unsigned long zone_start_pfn, zone_end_pfn; | 3124 | unsigned long zone_start_pfn, zone_end_pfn; |
3124 | 3125 | ||
3125 | /* Get the start and end of the node and zone */ | 3126 | /* Get the start and end of the node and zone */ |
3126 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | 3127 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); |
3127 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; | 3128 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; |
3128 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; | 3129 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; |
3129 | adjust_zone_range_for_zone_movable(nid, zone_type, | 3130 | adjust_zone_range_for_zone_movable(nid, zone_type, |
3130 | node_start_pfn, node_end_pfn, | 3131 | node_start_pfn, node_end_pfn, |
3131 | &zone_start_pfn, &zone_end_pfn); | 3132 | &zone_start_pfn, &zone_end_pfn); |
3132 | 3133 | ||
3133 | /* Check that this node has pages within the zone's required range */ | 3134 | /* Check that this node has pages within the zone's required range */ |
3134 | if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) | 3135 | if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) |
3135 | return 0; | 3136 | return 0; |
3136 | 3137 | ||
3137 | /* Move the zone boundaries inside the node if necessary */ | 3138 | /* Move the zone boundaries inside the node if necessary */ |
3138 | zone_end_pfn = min(zone_end_pfn, node_end_pfn); | 3139 | zone_end_pfn = min(zone_end_pfn, node_end_pfn); |
3139 | zone_start_pfn = max(zone_start_pfn, node_start_pfn); | 3140 | zone_start_pfn = max(zone_start_pfn, node_start_pfn); |
3140 | 3141 | ||
3141 | /* Return the spanned pages */ | 3142 | /* Return the spanned pages */ |
3142 | return zone_end_pfn - zone_start_pfn; | 3143 | return zone_end_pfn - zone_start_pfn; |
3143 | } | 3144 | } |
3144 | 3145 | ||
3145 | /* | 3146 | /* |
3146 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | 3147 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, |
3147 | * then all holes in the requested range will be accounted for. | 3148 | * then all holes in the requested range will be accounted for. |
3148 | */ | 3149 | */ |
3149 | unsigned long __meminit __absent_pages_in_range(int nid, | 3150 | unsigned long __meminit __absent_pages_in_range(int nid, |
3150 | unsigned long range_start_pfn, | 3151 | unsigned long range_start_pfn, |
3151 | unsigned long range_end_pfn) | 3152 | unsigned long range_end_pfn) |
3152 | { | 3153 | { |
3153 | int i = 0; | 3154 | int i = 0; |
3154 | unsigned long prev_end_pfn = 0, hole_pages = 0; | 3155 | unsigned long prev_end_pfn = 0, hole_pages = 0; |
3155 | unsigned long start_pfn; | 3156 | unsigned long start_pfn; |
3156 | 3157 | ||
3157 | /* Find the end_pfn of the first active range of pfns in the node */ | 3158 | /* Find the end_pfn of the first active range of pfns in the node */ |
3158 | i = first_active_region_index_in_nid(nid); | 3159 | i = first_active_region_index_in_nid(nid); |
3159 | if (i == -1) | 3160 | if (i == -1) |
3160 | return 0; | 3161 | return 0; |
3161 | 3162 | ||
3162 | prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn); | 3163 | prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn); |
3163 | 3164 | ||
3164 | /* Account for ranges before physical memory on this node */ | 3165 | /* Account for ranges before physical memory on this node */ |
3165 | if (early_node_map[i].start_pfn > range_start_pfn) | 3166 | if (early_node_map[i].start_pfn > range_start_pfn) |
3166 | hole_pages = prev_end_pfn - range_start_pfn; | 3167 | hole_pages = prev_end_pfn - range_start_pfn; |
3167 | 3168 | ||
3168 | /* Find all holes for the zone within the node */ | 3169 | /* Find all holes for the zone within the node */ |
3169 | for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { | 3170 | for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { |
3170 | 3171 | ||
3171 | /* No need to continue if prev_end_pfn is outside the zone */ | 3172 | /* No need to continue if prev_end_pfn is outside the zone */ |
3172 | if (prev_end_pfn >= range_end_pfn) | 3173 | if (prev_end_pfn >= range_end_pfn) |
3173 | break; | 3174 | break; |
3174 | 3175 | ||
3175 | /* Make sure the end of the zone is not within the hole */ | 3176 | /* Make sure the end of the zone is not within the hole */ |
3176 | start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); | 3177 | start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); |
3177 | prev_end_pfn = max(prev_end_pfn, range_start_pfn); | 3178 | prev_end_pfn = max(prev_end_pfn, range_start_pfn); |
3178 | 3179 | ||
3179 | /* Update the hole size count and move on */ | 3180 | /* Update the hole size count and move on */ |
3180 | if (start_pfn > range_start_pfn) { | 3181 | if (start_pfn > range_start_pfn) { |
3181 | BUG_ON(prev_end_pfn > start_pfn); | 3182 | BUG_ON(prev_end_pfn > start_pfn); |
3182 | hole_pages += start_pfn - prev_end_pfn; | 3183 | hole_pages += start_pfn - prev_end_pfn; |
3183 | } | 3184 | } |
3184 | prev_end_pfn = early_node_map[i].end_pfn; | 3185 | prev_end_pfn = early_node_map[i].end_pfn; |
3185 | } | 3186 | } |
3186 | 3187 | ||
3187 | /* Account for ranges past physical memory on this node */ | 3188 | /* Account for ranges past physical memory on this node */ |
3188 | if (range_end_pfn > prev_end_pfn) | 3189 | if (range_end_pfn > prev_end_pfn) |
3189 | hole_pages += range_end_pfn - | 3190 | hole_pages += range_end_pfn - |
3190 | max(range_start_pfn, prev_end_pfn); | 3191 | max(range_start_pfn, prev_end_pfn); |
3191 | 3192 | ||
3192 | return hole_pages; | 3193 | return hole_pages; |
3193 | } | 3194 | } |
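
A simplified, userspace-only sketch of the hole walk above, assuming the active ranges (invented values) are already sorted by start_pfn: every gap between the requested range's edges and the registered ranges is counted as hole pages.

#include <stdio.h>

struct range { unsigned long start_pfn, end_pfn; };

/* Invented, sorted active ranges inside one node. */
static struct range active[] = { { 0x100, 0x200 }, { 0x300, 0x380 } };

static unsigned long holes_in(unsigned long range_start, unsigned long range_end)
{
        unsigned long prev_end = range_start, holes = 0;

        for (unsigned int i = 0; i < sizeof(active) / sizeof(active[0]); i++) {
                unsigned long s = active[i].start_pfn;
                unsigned long e = active[i].end_pfn;

                if (prev_end >= range_end)
                        break;
                if (s > range_end)
                        s = range_end;          /* clamp to the queried range */
                if (e > range_end)
                        e = range_end;
                if (s > prev_end)
                        holes += s - prev_end;  /* gap before this active range */
                if (e > prev_end)
                        prev_end = e;
        }
        if (range_end > prev_end)
                holes += range_end - prev_end;  /* gap after the last range */
        return holes;
}

int main(void)
{
        /* Holes: 0x0-0x100, 0x200-0x300 and 0x380-0x400. */
        printf("hole pages: %#lx\n", holes_in(0, 0x400));       /* 0x280 */
        return 0;
}
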
3194 | 3195 | ||
3195 | /** | 3196 | /** |
3196 | * absent_pages_in_range - Return number of page frames in holes within a range | 3197 | * absent_pages_in_range - Return number of page frames in holes within a range |
3197 | * @start_pfn: The start PFN to start searching for holes | 3198 | * @start_pfn: The start PFN to start searching for holes |
3198 | * @end_pfn: The end PFN to stop searching for holes | 3199 | * @end_pfn: The end PFN to stop searching for holes |
3199 | * | 3200 | * |
3200 | * It returns the number of page frames in memory holes within a range. | 3201 | * It returns the number of page frames in memory holes within a range. |
3201 | */ | 3202 | */ |
3202 | unsigned long __init absent_pages_in_range(unsigned long start_pfn, | 3203 | unsigned long __init absent_pages_in_range(unsigned long start_pfn, |
3203 | unsigned long end_pfn) | 3204 | unsigned long end_pfn) |
3204 | { | 3205 | { |
3205 | return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); | 3206 | return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); |
3206 | } | 3207 | } |
3207 | 3208 | ||
3208 | /* Return the number of page frames in holes in a zone on a node */ | 3209 | /* Return the number of page frames in holes in a zone on a node */ |
3209 | static unsigned long __meminit zone_absent_pages_in_node(int nid, | 3210 | static unsigned long __meminit zone_absent_pages_in_node(int nid, |
3210 | unsigned long zone_type, | 3211 | unsigned long zone_type, |
3211 | unsigned long *ignored) | 3212 | unsigned long *ignored) |
3212 | { | 3213 | { |
3213 | unsigned long node_start_pfn, node_end_pfn; | 3214 | unsigned long node_start_pfn, node_end_pfn; |
3214 | unsigned long zone_start_pfn, zone_end_pfn; | 3215 | unsigned long zone_start_pfn, zone_end_pfn; |
3215 | 3216 | ||
3216 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | 3217 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); |
3217 | zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], | 3218 | zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], |
3218 | node_start_pfn); | 3219 | node_start_pfn); |
3219 | zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], | 3220 | zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], |
3220 | node_end_pfn); | 3221 | node_end_pfn); |
3221 | 3222 | ||
3222 | adjust_zone_range_for_zone_movable(nid, zone_type, | 3223 | adjust_zone_range_for_zone_movable(nid, zone_type, |
3223 | node_start_pfn, node_end_pfn, | 3224 | node_start_pfn, node_end_pfn, |
3224 | &zone_start_pfn, &zone_end_pfn); | 3225 | &zone_start_pfn, &zone_end_pfn); |
3225 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); | 3226 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); |
3226 | } | 3227 | } |
3227 | 3228 | ||
3228 | #else | 3229 | #else |
3229 | static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, | 3230 | static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, |
3230 | unsigned long zone_type, | 3231 | unsigned long zone_type, |
3231 | unsigned long *zones_size) | 3232 | unsigned long *zones_size) |
3232 | { | 3233 | { |
3233 | return zones_size[zone_type]; | 3234 | return zones_size[zone_type]; |
3234 | } | 3235 | } |
3235 | 3236 | ||
3236 | static inline unsigned long __meminit zone_absent_pages_in_node(int nid, | 3237 | static inline unsigned long __meminit zone_absent_pages_in_node(int nid, |
3237 | unsigned long zone_type, | 3238 | unsigned long zone_type, |
3238 | unsigned long *zholes_size) | 3239 | unsigned long *zholes_size) |
3239 | { | 3240 | { |
3240 | if (!zholes_size) | 3241 | if (!zholes_size) |
3241 | return 0; | 3242 | return 0; |
3242 | 3243 | ||
3243 | return zholes_size[zone_type]; | 3244 | return zholes_size[zone_type]; |
3244 | } | 3245 | } |
3245 | 3246 | ||
3246 | #endif | 3247 | #endif |
3247 | 3248 | ||
3248 | static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | 3249 | static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, |
3249 | unsigned long *zones_size, unsigned long *zholes_size) | 3250 | unsigned long *zones_size, unsigned long *zholes_size) |
3250 | { | 3251 | { |
3251 | unsigned long realtotalpages, totalpages = 0; | 3252 | unsigned long realtotalpages, totalpages = 0; |
3252 | enum zone_type i; | 3253 | enum zone_type i; |
3253 | 3254 | ||
3254 | for (i = 0; i < MAX_NR_ZONES; i++) | 3255 | for (i = 0; i < MAX_NR_ZONES; i++) |
3255 | totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, | 3256 | totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, |
3256 | zones_size); | 3257 | zones_size); |
3257 | pgdat->node_spanned_pages = totalpages; | 3258 | pgdat->node_spanned_pages = totalpages; |
3258 | 3259 | ||
3259 | realtotalpages = totalpages; | 3260 | realtotalpages = totalpages; |
3260 | for (i = 0; i < MAX_NR_ZONES; i++) | 3261 | for (i = 0; i < MAX_NR_ZONES; i++) |
3261 | realtotalpages -= | 3262 | realtotalpages -= |
3262 | zone_absent_pages_in_node(pgdat->node_id, i, | 3263 | zone_absent_pages_in_node(pgdat->node_id, i, |
3263 | zholes_size); | 3264 | zholes_size); |
3264 | pgdat->node_present_pages = realtotalpages; | 3265 | pgdat->node_present_pages = realtotalpages; |
3265 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, | 3266 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, |
3266 | realtotalpages); | 3267 | realtotalpages); |
3267 | } | 3268 | } |
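
For a quick numeric check of the spanned/present relationship, a toy version of the two passes above with invented per-zone figures: a node spanning 1,048,576 pages with a 262,144-page hole ends up with node_present_pages of 786,432.

#include <stdio.h>

int main(void)
{
        /* Made-up figures for one node's zones: spanned pages and pages lost to holes. */
        unsigned long spanned[] = { 4096, 1044480 };    /* e.g. DMA + NORMAL */
        unsigned long holes[]   = { 0,    262144  };
        unsigned long total = 0, real;

        for (int i = 0; i < 2; i++)
                total += spanned[i];            /* node_spanned_pages */
        real = total;
        for (int i = 0; i < 2; i++)
                real -= holes[i];               /* node_present_pages */

        printf("node_spanned_pages=%lu node_present_pages=%lu\n", total, real);
        return 0;
}
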
3268 | 3269 | ||
3269 | #ifndef CONFIG_SPARSEMEM | 3270 | #ifndef CONFIG_SPARSEMEM |
3270 | /* | 3271 | /* |
3271 | * Calculate the size of the zone->blockflags rounded to an unsigned long | 3272 | * Calculate the size of the zone->blockflags rounded to an unsigned long |
3272 | * Start by making sure zonesize is a multiple of pageblock_order by rounding | 3273 | * Start by making sure zonesize is a multiple of pageblock_order by rounding |
3273 | * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally | 3274 | * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally |
3274 | * round what is now in bits up to the nearest long in bits, then return it in | 3275 | * round what is now in bits up to the nearest long in bits, then return it in |
3275 | * bytes. | 3276 | * bytes. |
3276 | */ | 3277 | */ |
3277 | static unsigned long __init usemap_size(unsigned long zonesize) | 3278 | static unsigned long __init usemap_size(unsigned long zonesize) |
3278 | { | 3279 | { |
3279 | unsigned long usemapsize; | 3280 | unsigned long usemapsize; |
3280 | 3281 | ||
3281 | usemapsize = roundup(zonesize, pageblock_nr_pages); | 3282 | usemapsize = roundup(zonesize, pageblock_nr_pages); |
3282 | usemapsize = usemapsize >> pageblock_order; | 3283 | usemapsize = usemapsize >> pageblock_order; |
3283 | usemapsize *= NR_PAGEBLOCK_BITS; | 3284 | usemapsize *= NR_PAGEBLOCK_BITS; |
3284 | usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); | 3285 | usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); |
3285 | 3286 | ||
3286 | return usemapsize / 8; | 3287 | return usemapsize / 8; |
3287 | } | 3288 | } |
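
A back-of-the-envelope check of that rounding, assuming hypothetical values of pageblock_order = 10 (so pageblock_nr_pages = 1024), NR_PAGEBLOCK_BITS = 4 and a 64-bit long: a 1,000,000-page zone rounds up to 977 pageblocks, needing 3,908 bits, 3,968 bits after rounding to a long boundary, i.e. 496 bytes.

#include <stdio.h>

/* Hypothetical configuration values, not taken from any particular kernel build. */
#define PAGEBLOCK_ORDER         10
#define PAGEBLOCK_NR_PAGES      (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS       4

static unsigned long roundup_ul(unsigned long x, unsigned long y)
{
        return ((x + y - 1) / y) * y;
}

static unsigned long usemap_bytes(unsigned long zonesize)
{
        unsigned long bits;

        bits = roundup_ul(zonesize, PAGEBLOCK_NR_PAGES) >> PAGEBLOCK_ORDER;
        bits *= NR_PAGEBLOCK_BITS;
        bits = roundup_ul(bits, 8 * sizeof(unsigned long));
        return bits / 8;                        /* bytes */
}

int main(void)
{
        printf("%lu bytes of pageblock flags\n", usemap_bytes(1000000UL));      /* 496 */
        return 0;
}
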
3288 | 3289 | ||
3289 | static void __init setup_usemap(struct pglist_data *pgdat, | 3290 | static void __init setup_usemap(struct pglist_data *pgdat, |
3290 | struct zone *zone, unsigned long zonesize) | 3291 | struct zone *zone, unsigned long zonesize) |
3291 | { | 3292 | { |
3292 | unsigned long usemapsize = usemap_size(zonesize); | 3293 | unsigned long usemapsize = usemap_size(zonesize); |
3293 | zone->pageblock_flags = NULL; | 3294 | zone->pageblock_flags = NULL; |
3294 | if (usemapsize) { | 3295 | if (usemapsize) { |
3295 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); | 3296 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); |
3296 | memset(zone->pageblock_flags, 0, usemapsize); | 3297 | memset(zone->pageblock_flags, 0, usemapsize); |
3297 | } | 3298 | } |
3298 | } | 3299 | } |
3299 | #else | 3300 | #else |
3300 | static inline void setup_usemap(struct pglist_data *pgdat, | 3301 | static inline void setup_usemap(struct pglist_data *pgdat, |
3301 | struct zone *zone, unsigned long zonesize) {} | 3302 | struct zone *zone, unsigned long zonesize) {} |
3302 | #endif /* CONFIG_SPARSEMEM */ | 3303 | #endif /* CONFIG_SPARSEMEM */ |
3303 | 3304 | ||
3304 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 3305 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
3305 | 3306 | ||
3306 | /* Return a sensible default order for the pageblock size. */ | 3307 | /* Return a sensible default order for the pageblock size. */ |
3307 | static inline int pageblock_default_order(void) | 3308 | static inline int pageblock_default_order(void) |
3308 | { | 3309 | { |
3309 | if (HPAGE_SHIFT > PAGE_SHIFT) | 3310 | if (HPAGE_SHIFT > PAGE_SHIFT) |
3310 | return HUGETLB_PAGE_ORDER; | 3311 | return HUGETLB_PAGE_ORDER; |
3311 | 3312 | ||
3312 | return MAX_ORDER-1; | 3313 | return MAX_ORDER-1; |
3313 | } | 3314 | } |
3314 | 3315 | ||
3315 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ | 3316 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ |
3316 | static inline void __init set_pageblock_order(unsigned int order) | 3317 | static inline void __init set_pageblock_order(unsigned int order) |
3317 | { | 3318 | { |
3318 | /* Check that pageblock_nr_pages has not already been setup */ | 3319 | /* Check that pageblock_nr_pages has not already been setup */ |
3319 | if (pageblock_order) | 3320 | if (pageblock_order) |
3320 | return; | 3321 | return; |
3321 | 3322 | ||
3322 | /* | 3323 | /* |
3323 | * Assume the largest contiguous order of interest is a huge page. | 3324 | * Assume the largest contiguous order of interest is a huge page. |
3324 | * This value may be variable depending on boot parameters on IA64 | 3325 | * This value may be variable depending on boot parameters on IA64 |
3325 | */ | 3326 | */ |
3326 | pageblock_order = order; | 3327 | pageblock_order = order; |
3327 | } | 3328 | } |
3328 | #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | 3329 | #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ |
3329 | 3330 | ||
3330 | /* | 3331 | /* |
3331 | * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() | 3332 | * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() |
3332 | * and pageblock_default_order() are unused as pageblock_order is set | 3333 | * and pageblock_default_order() are unused as pageblock_order is set |
3333 | * at compile-time. See include/linux/pageblock-flags.h for the values of | 3334 | * at compile-time. See include/linux/pageblock-flags.h for the values of |
3334 | * pageblock_order based on the kernel config | 3335 | * pageblock_order based on the kernel config |
3335 | */ | 3336 | */ |
3336 | static inline int pageblock_default_order(unsigned int order) | 3337 | static inline int pageblock_default_order(unsigned int order) |
3337 | { | 3338 | { |
3338 | return MAX_ORDER-1; | 3339 | return MAX_ORDER-1; |
3339 | } | 3340 | } |
3340 | #define set_pageblock_order(x) do {} while (0) | 3341 | #define set_pageblock_order(x) do {} while (0) |
3341 | 3342 | ||
3342 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | 3343 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ |
3343 | 3344 | ||
3344 | /* | 3345 | /* |
3345 | * Set up the zone data structures: | 3346 | * Set up the zone data structures: |
3346 | * - mark all pages reserved | 3347 | * - mark all pages reserved |
3347 | * - mark all memory queues empty | 3348 | * - mark all memory queues empty |
3348 | * - clear the memory bitmaps | 3349 | * - clear the memory bitmaps |
3349 | */ | 3350 | */ |
3350 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, | 3351 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, |
3351 | unsigned long *zones_size, unsigned long *zholes_size) | 3352 | unsigned long *zones_size, unsigned long *zholes_size) |
3352 | { | 3353 | { |
3353 | enum zone_type j; | 3354 | enum zone_type j; |
3354 | int nid = pgdat->node_id; | 3355 | int nid = pgdat->node_id; |
3355 | unsigned long zone_start_pfn = pgdat->node_start_pfn; | 3356 | unsigned long zone_start_pfn = pgdat->node_start_pfn; |
3356 | int ret; | 3357 | int ret; |
3357 | 3358 | ||
3358 | pgdat_resize_init(pgdat); | 3359 | pgdat_resize_init(pgdat); |
3359 | pgdat->nr_zones = 0; | 3360 | pgdat->nr_zones = 0; |
3360 | init_waitqueue_head(&pgdat->kswapd_wait); | 3361 | init_waitqueue_head(&pgdat->kswapd_wait); |
3361 | pgdat->kswapd_max_order = 0; | 3362 | pgdat->kswapd_max_order = 0; |
3362 | 3363 | ||
3363 | for (j = 0; j < MAX_NR_ZONES; j++) { | 3364 | for (j = 0; j < MAX_NR_ZONES; j++) { |
3364 | struct zone *zone = pgdat->node_zones + j; | 3365 | struct zone *zone = pgdat->node_zones + j; |
3365 | unsigned long size, realsize, memmap_pages; | 3366 | unsigned long size, realsize, memmap_pages; |
3366 | 3367 | ||
3367 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 3368 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
3368 | realsize = size - zone_absent_pages_in_node(nid, j, | 3369 | realsize = size - zone_absent_pages_in_node(nid, j, |
3369 | zholes_size); | 3370 | zholes_size); |
3370 | 3371 | ||
3371 | /* | 3372 | /* |
3372 | * Adjust realsize so that it accounts for how much memory | 3373 | * Adjust realsize so that it accounts for how much memory |
3373 | * is used by this zone for memmap. This affects the watermark | 3374 | * is used by this zone for memmap. This affects the watermark |
3374 | * and per-cpu initialisations | 3375 | * and per-cpu initialisations |
3375 | */ | 3376 | */ |
3376 | memmap_pages = | 3377 | memmap_pages = |
3377 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; | 3378 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; |
3378 | if (realsize >= memmap_pages) { | 3379 | if (realsize >= memmap_pages) { |
3379 | realsize -= memmap_pages; | 3380 | realsize -= memmap_pages; |
3380 | mminit_dprintk(MMINIT_TRACE, "memmap_init", | 3381 | mminit_dprintk(MMINIT_TRACE, "memmap_init", |
3381 | "%s zone: %lu pages used for memmap\n", | 3382 | "%s zone: %lu pages used for memmap\n", |
3382 | zone_names[j], memmap_pages); | 3383 | zone_names[j], memmap_pages); |
3383 | } else | 3384 | } else |
3384 | printk(KERN_WARNING | 3385 | printk(KERN_WARNING |
3385 | " %s zone: %lu pages exceeds realsize %lu\n", | 3386 | " %s zone: %lu pages exceeds realsize %lu\n", |
3386 | zone_names[j], memmap_pages, realsize); | 3387 | zone_names[j], memmap_pages, realsize); |
3387 | 3388 | ||
3388 | /* Account for reserved pages */ | 3389 | /* Account for reserved pages */ |
3389 | if (j == 0 && realsize > dma_reserve) { | 3390 | if (j == 0 && realsize > dma_reserve) { |
3390 | realsize -= dma_reserve; | 3391 | realsize -= dma_reserve; |
3391 | mminit_dprintk(MMINIT_TRACE, "memmap_init", | 3392 | mminit_dprintk(MMINIT_TRACE, "memmap_init", |
3392 | "%s zone: %lu pages reserved\n", | 3393 | "%s zone: %lu pages reserved\n", |
3393 | zone_names[0], dma_reserve); | 3394 | zone_names[0], dma_reserve); |
3394 | } | 3395 | } |
3395 | 3396 | ||
3396 | if (!is_highmem_idx(j)) | 3397 | if (!is_highmem_idx(j)) |
3397 | nr_kernel_pages += realsize; | 3398 | nr_kernel_pages += realsize; |
3398 | nr_all_pages += realsize; | 3399 | nr_all_pages += realsize; |
3399 | 3400 | ||
3400 | zone->spanned_pages = size; | 3401 | zone->spanned_pages = size; |
3401 | zone->present_pages = realsize; | 3402 | zone->present_pages = realsize; |
3402 | #ifdef CONFIG_NUMA | 3403 | #ifdef CONFIG_NUMA |
3403 | zone->node = nid; | 3404 | zone->node = nid; |
3404 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 3405 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) |
3405 | / 100; | 3406 | / 100; |
3406 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; | 3407 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; |
3407 | #endif | 3408 | #endif |
3408 | zone->name = zone_names[j]; | 3409 | zone->name = zone_names[j]; |
3409 | spin_lock_init(&zone->lock); | 3410 | spin_lock_init(&zone->lock); |
3410 | spin_lock_init(&zone->lru_lock); | 3411 | spin_lock_init(&zone->lru_lock); |
3411 | zone_seqlock_init(zone); | 3412 | zone_seqlock_init(zone); |
3412 | zone->zone_pgdat = pgdat; | 3413 | zone->zone_pgdat = pgdat; |
3413 | 3414 | ||
3414 | zone->prev_priority = DEF_PRIORITY; | 3415 | zone->prev_priority = DEF_PRIORITY; |
3415 | 3416 | ||
3416 | zone_pcp_init(zone); | 3417 | zone_pcp_init(zone); |
3417 | INIT_LIST_HEAD(&zone->active_list); | 3418 | INIT_LIST_HEAD(&zone->active_list); |
3418 | INIT_LIST_HEAD(&zone->inactive_list); | 3419 | INIT_LIST_HEAD(&zone->inactive_list); |
3419 | zone->nr_scan_active = 0; | 3420 | zone->nr_scan_active = 0; |
3420 | zone->nr_scan_inactive = 0; | 3421 | zone->nr_scan_inactive = 0; |
3421 | zap_zone_vm_stats(zone); | 3422 | zap_zone_vm_stats(zone); |
3422 | zone->flags = 0; | 3423 | zone->flags = 0; |
3423 | if (!size) | 3424 | if (!size) |
3424 | continue; | 3425 | continue; |
3425 | 3426 | ||
3426 | set_pageblock_order(pageblock_default_order()); | 3427 | set_pageblock_order(pageblock_default_order()); |
3427 | setup_usemap(pgdat, zone, size); | 3428 | setup_usemap(pgdat, zone, size); |
3428 | ret = init_currently_empty_zone(zone, zone_start_pfn, | 3429 | ret = init_currently_empty_zone(zone, zone_start_pfn, |
3429 | size, MEMMAP_EARLY); | 3430 | size, MEMMAP_EARLY); |
3430 | BUG_ON(ret); | 3431 | BUG_ON(ret); |
3431 | memmap_init(size, nid, j, zone_start_pfn); | 3432 | memmap_init(size, nid, j, zone_start_pfn); |
3432 | zone_start_pfn += size; | 3433 | zone_start_pfn += size; |
3433 | } | 3434 | } |
3434 | } | 3435 | } |
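
A rough numeric illustration of the memmap accounting in the loop above, under assumed values (4 KiB pages and a 56-byte struct page, neither taken from any particular configuration): a 262,144-page zone pays 3,584 pages for its memmap, leaving a realsize of 258,560.

#include <stdio.h>

#define PAGE_SHIFT              12              /* assumed 4 KiB pages */
#define PAGE_SIZE               (1UL << PAGE_SHIFT)
#define STRUCT_PAGE_SIZE        56UL            /* assumed sizeof(struct page) */
#define PAGE_ALIGN_UP(x)        (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long size = 262144;            /* spanned pages in one zone (made up) */
        unsigned long memmap_pages, realsize = size;

        memmap_pages = PAGE_ALIGN_UP(size * STRUCT_PAGE_SIZE) >> PAGE_SHIFT;
        if (realsize >= memmap_pages)
                realsize -= memmap_pages;

        printf("memmap uses %lu pages, %lu pages remain usable\n",
               memmap_pages, realsize);         /* 3584 and 258560 */
        return 0;
}
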
3435 | 3436 | ||
3436 | static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | 3437 | static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) |
3437 | { | 3438 | { |
3438 | /* Skip empty nodes */ | 3439 | /* Skip empty nodes */ |
3439 | if (!pgdat->node_spanned_pages) | 3440 | if (!pgdat->node_spanned_pages) |
3440 | return; | 3441 | return; |
3441 | 3442 | ||
3442 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | 3443 | #ifdef CONFIG_FLAT_NODE_MEM_MAP |
3443 | /* ia64 gets its own node_mem_map, before this, without bootmem */ | 3444 | /* ia64 gets its own node_mem_map, before this, without bootmem */ |
3444 | if (!pgdat->node_mem_map) { | 3445 | if (!pgdat->node_mem_map) { |
3445 | unsigned long size, start, end; | 3446 | unsigned long size, start, end; |
3446 | struct page *map; | 3447 | struct page *map; |
3447 | 3448 | ||
3448 | /* | 3449 | /* |
3449 | * The zone's endpoints aren't required to be MAX_ORDER | 3450 | * The zone's endpoints aren't required to be MAX_ORDER |
3450 | * aligned, but the node_mem_map endpoints must be, in order | 3451 | * aligned, but the node_mem_map endpoints must be, in order |
3451 | * for the buddy allocator to function correctly. | 3452 | * for the buddy allocator to function correctly. |
3452 | */ | 3453 | */ |
3453 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); | 3454 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); |
3454 | end = pgdat->node_start_pfn + pgdat->node_spanned_pages; | 3455 | end = pgdat->node_start_pfn + pgdat->node_spanned_pages; |
3455 | end = ALIGN(end, MAX_ORDER_NR_PAGES); | 3456 | end = ALIGN(end, MAX_ORDER_NR_PAGES); |
3456 | size = (end - start) * sizeof(struct page); | 3457 | size = (end - start) * sizeof(struct page); |
3457 | map = alloc_remap(pgdat->node_id, size); | 3458 | map = alloc_remap(pgdat->node_id, size); |
3458 | if (!map) | 3459 | if (!map) |
3459 | map = alloc_bootmem_node(pgdat, size); | 3460 | map = alloc_bootmem_node(pgdat, size); |
3460 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); | 3461 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); |
3461 | } | 3462 | } |
3462 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 3463 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
3463 | /* | 3464 | /* |
3464 | * With no DISCONTIG, the global mem_map is just set as node 0's | 3465 | * With no DISCONTIG, the global mem_map is just set as node 0's |
3465 | */ | 3466 | */ |
3466 | if (pgdat == NODE_DATA(0)) { | 3467 | if (pgdat == NODE_DATA(0)) { |
3467 | mem_map = NODE_DATA(0)->node_mem_map; | 3468 | mem_map = NODE_DATA(0)->node_mem_map; |
3468 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | 3469 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP |
3469 | if (page_to_pfn(mem_map) != pgdat->node_start_pfn) | 3470 | if (page_to_pfn(mem_map) != pgdat->node_start_pfn) |
3470 | mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); | 3471 | mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); |
3471 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | 3472 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ |
3472 | } | 3473 | } |
3473 | #endif | 3474 | #endif |
3474 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | 3475 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ |
3475 | } | 3476 | } |
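
To make the alignment requirement concrete, a small sketch with invented numbers (MAX_ORDER_NR_PAGES = 1024, a node starting at pfn 0x10203 and spanning 0x5000 pages): the map is sized over the rounded-out window and node_mem_map then points back at the node's real first pfn inside it.

#include <stdio.h>

#define MAX_ORDER_NR_PAGES      1024UL          /* invented for this sketch */
#define ALIGN_UP(x, a)          (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long node_start_pfn = 0x10203, spanned = 0x5000;
        unsigned long start, end, map_pages;

        start = node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);     /* round down */
        end = ALIGN_UP(node_start_pfn + spanned, MAX_ORDER_NR_PAGES);
        map_pages = end - start;

        printf("map covers %#lx-%#lx (%lu struct pages), node_mem_map offset %lu\n",
               start, end, map_pages, node_start_pfn - start);
        return 0;
}
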
3476 | 3477 | ||
3477 | void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, | 3478 | void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, |
3478 | unsigned long *zones_size, unsigned long node_start_pfn, | 3479 | unsigned long *zones_size, unsigned long node_start_pfn, |
3479 | unsigned long *zholes_size) | 3480 | unsigned long *zholes_size) |
3480 | { | 3481 | { |
3481 | pgdat->node_id = nid; | 3482 | pgdat->node_id = nid; |
3482 | pgdat->node_start_pfn = node_start_pfn; | 3483 | pgdat->node_start_pfn = node_start_pfn; |
3483 | calculate_node_totalpages(pgdat, zones_size, zholes_size); | 3484 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
3484 | 3485 | ||
3485 | alloc_node_mem_map(pgdat); | 3486 | alloc_node_mem_map(pgdat); |
3486 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | 3487 | #ifdef CONFIG_FLAT_NODE_MEM_MAP |
3487 | printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", | 3488 | printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", |
3488 | nid, (unsigned long)pgdat, | 3489 | nid, (unsigned long)pgdat, |
3489 | (unsigned long)pgdat->node_mem_map); | 3490 | (unsigned long)pgdat->node_mem_map); |
3490 | #endif | 3491 | #endif |
3491 | 3492 | ||
3492 | free_area_init_core(pgdat, zones_size, zholes_size); | 3493 | free_area_init_core(pgdat, zones_size, zholes_size); |
3493 | } | 3494 | } |
3494 | 3495 | ||
3495 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | 3496 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP |
3496 | 3497 | ||
3497 | #if MAX_NUMNODES > 1 | 3498 | #if MAX_NUMNODES > 1 |
3498 | /* | 3499 | /* |
3499 | * Figure out the number of possible node ids. | 3500 | * Figure out the number of possible node ids. |
3500 | */ | 3501 | */ |
3501 | static void __init setup_nr_node_ids(void) | 3502 | static void __init setup_nr_node_ids(void) |
3502 | { | 3503 | { |
3503 | unsigned int node; | 3504 | unsigned int node; |
3504 | unsigned int highest = 0; | 3505 | unsigned int highest = 0; |
3505 | 3506 | ||
3506 | for_each_node_mask(node, node_possible_map) | 3507 | for_each_node_mask(node, node_possible_map) |
3507 | highest = node; | 3508 | highest = node; |
3508 | nr_node_ids = highest + 1; | 3509 | nr_node_ids = highest + 1; |
3509 | } | 3510 | } |
3510 | #else | 3511 | #else |
3511 | static inline void setup_nr_node_ids(void) | 3512 | static inline void setup_nr_node_ids(void) |
3512 | { | 3513 | { |
3513 | } | 3514 | } |
3514 | #endif | 3515 | #endif |
3515 | 3516 | ||
3516 | /** | 3517 | /** |
3517 | * add_active_range - Register a range of PFNs backed by physical memory | 3518 | * add_active_range - Register a range of PFNs backed by physical memory |
3518 | * @nid: The node ID the range resides on | 3519 | * @nid: The node ID the range resides on |
3519 | * @start_pfn: The start PFN of the available physical memory | 3520 | * @start_pfn: The start PFN of the available physical memory |
3520 | * @end_pfn: The end PFN of the available physical memory | 3521 | * @end_pfn: The end PFN of the available physical memory |
3521 | * | 3522 | * |
3522 | * These ranges are stored in an early_node_map[] and later used by | 3523 | * These ranges are stored in an early_node_map[] and later used by |
3523 | * free_area_init_nodes() to calculate zone sizes and holes. If the | 3524 | * free_area_init_nodes() to calculate zone sizes and holes. If the |
3524 | * range spans a memory hole, it is up to the architecture to ensure | 3525 | * range spans a memory hole, it is up to the architecture to ensure |
3525 | * the memory is not freed by the bootmem allocator. If possible | 3526 | * the memory is not freed by the bootmem allocator. If possible |
3526 | * the range being registered will be merged with existing ranges. | 3527 | * the range being registered will be merged with existing ranges. |
3527 | */ | 3528 | */ |
3528 | void __init add_active_range(unsigned int nid, unsigned long start_pfn, | 3529 | void __init add_active_range(unsigned int nid, unsigned long start_pfn, |
3529 | unsigned long end_pfn) | 3530 | unsigned long end_pfn) |
3530 | { | 3531 | { |
3531 | int i; | 3532 | int i; |
3532 | 3533 | ||
3533 | mminit_dprintk(MMINIT_TRACE, "memory_register", | 3534 | mminit_dprintk(MMINIT_TRACE, "memory_register", |
3534 | "Entering add_active_range(%d, %#lx, %#lx) " | 3535 | "Entering add_active_range(%d, %#lx, %#lx) " |
3535 | "%d entries of %d used\n", | 3536 | "%d entries of %d used\n", |
3536 | nid, start_pfn, end_pfn, | 3537 | nid, start_pfn, end_pfn, |
3537 | nr_nodemap_entries, MAX_ACTIVE_REGIONS); | 3538 | nr_nodemap_entries, MAX_ACTIVE_REGIONS); |
3538 | 3539 | ||
3539 | mminit_validate_memmodel_limits(&start_pfn, &end_pfn); | 3540 | mminit_validate_memmodel_limits(&start_pfn, &end_pfn); |
3540 | 3541 | ||
3541 | /* Merge with existing active regions if possible */ | 3542 | /* Merge with existing active regions if possible */ |
3542 | for (i = 0; i < nr_nodemap_entries; i++) { | 3543 | for (i = 0; i < nr_nodemap_entries; i++) { |
3543 | if (early_node_map[i].nid != nid) | 3544 | if (early_node_map[i].nid != nid) |
3544 | continue; | 3545 | continue; |
3545 | 3546 | ||
3546 | /* Skip if an existing region covers this new one */ | 3547 | /* Skip if an existing region covers this new one */ |
3547 | if (start_pfn >= early_node_map[i].start_pfn && | 3548 | if (start_pfn >= early_node_map[i].start_pfn && |
3548 | end_pfn <= early_node_map[i].end_pfn) | 3549 | end_pfn <= early_node_map[i].end_pfn) |
3549 | return; | 3550 | return; |
3550 | 3551 | ||
3551 | /* Merge forward if suitable */ | 3552 | /* Merge forward if suitable */ |
3552 | if (start_pfn <= early_node_map[i].end_pfn && | 3553 | if (start_pfn <= early_node_map[i].end_pfn && |
3553 | end_pfn > early_node_map[i].end_pfn) { | 3554 | end_pfn > early_node_map[i].end_pfn) { |
3554 | early_node_map[i].end_pfn = end_pfn; | 3555 | early_node_map[i].end_pfn = end_pfn; |
3555 | return; | 3556 | return; |
3556 | } | 3557 | } |
3557 | 3558 | ||
3558 | /* Merge backward if suitable */ | 3559 | /* Merge backward if suitable */ |
3559 | if (start_pfn < early_node_map[i].end_pfn && | 3560 | if (start_pfn < early_node_map[i].end_pfn && |
3560 | end_pfn >= early_node_map[i].start_pfn) { | 3561 | end_pfn >= early_node_map[i].start_pfn) { |
3561 | early_node_map[i].start_pfn = start_pfn; | 3562 | early_node_map[i].start_pfn = start_pfn; |
3562 | return; | 3563 | return; |
3563 | } | 3564 | } |
3564 | } | 3565 | } |
3565 | 3566 | ||
3566 | /* Check that early_node_map is large enough */ | 3567 | /* Check that early_node_map is large enough */ |
3567 | if (i >= MAX_ACTIVE_REGIONS) { | 3568 | if (i >= MAX_ACTIVE_REGIONS) { |
3568 | printk(KERN_CRIT "More than %d memory regions, truncating\n", | 3569 | printk(KERN_CRIT "More than %d memory regions, truncating\n", |
3569 | MAX_ACTIVE_REGIONS); | 3570 | MAX_ACTIVE_REGIONS); |
3570 | return; | 3571 | return; |
3571 | } | 3572 | } |
3572 | 3573 | ||
3573 | early_node_map[i].nid = nid; | 3574 | early_node_map[i].nid = nid; |
3574 | early_node_map[i].start_pfn = start_pfn; | 3575 | early_node_map[i].start_pfn = start_pfn; |
3575 | early_node_map[i].end_pfn = end_pfn; | 3576 | early_node_map[i].end_pfn = end_pfn; |
3576 | nr_nodemap_entries = i + 1; | 3577 | nr_nodemap_entries = i + 1; |
3577 | } | 3578 | } |
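
A cut-down, userspace sketch of the merge rules above on a toy table (it ignores the table-full truncation path, and all values are invented): a new range that overlaps or abuts an existing entry for the same node extends that entry instead of consuming a new slot.

#include <stdio.h>

struct range { int nid; unsigned long start, end; };

static struct range map[16];
static int nr;

static void add_range(int nid, unsigned long start, unsigned long end)
{
        for (int i = 0; i < nr; i++) {
                if (map[i].nid != nid)
                        continue;
                if (start >= map[i].start && end <= map[i].end)
                        return;                         /* already covered */
                if (start <= map[i].end && end > map[i].end) {
                        map[i].end = end;               /* merge forward */
                        return;
                }
                if (start < map[i].start && end >= map[i].start) {
                        map[i].start = start;           /* merge backward */
                        return;
                }
        }
        map[nr++] = (struct range){ nid, start, end };  /* brand new entry */
}

int main(void)
{
        add_range(0, 0x100, 0x200);
        add_range(0, 0x200, 0x300);     /* abuts: merged forward into one entry */
        add_range(0, 0x800, 0x900);     /* disjoint: new entry */
        printf("%d entries, first spans %#lx-%#lx\n", nr, map[0].start, map[0].end);
        return 0;
}
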
3578 | 3579 | ||
3579 | /** | 3580 | /** |
3580 | * remove_active_range - Shrink an existing registered range of PFNs | 3581 | * remove_active_range - Shrink an existing registered range of PFNs |
3581 | * @nid: The node id the range is on that should be shrunk | 3582 | * @nid: The node id the range is on that should be shrunk |
3582 | * @start_pfn: The new start PFN of the range | 3583 | * @start_pfn: The new start PFN of the range |
3583 | * @end_pfn: The new end PFN of the range | 3584 | * @end_pfn: The new end PFN of the range |
3584 | * | 3585 | * |
3585 | * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node. | 3586 | * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node. |
3586 | * The map is kept near the end physical page range that has already been | 3587 | * The map is kept near the end physical page range that has already been |
3587 | * registered. This function allows an arch to shrink an existing registered | 3588 | * registered. This function allows an arch to shrink an existing registered |
3588 | * range. | 3589 | * range. |
3589 | */ | 3590 | */ |
3590 | void __init remove_active_range(unsigned int nid, unsigned long start_pfn, | 3591 | void __init remove_active_range(unsigned int nid, unsigned long start_pfn, |
3591 | unsigned long end_pfn) | 3592 | unsigned long end_pfn) |
3592 | { | 3593 | { |
3593 | int i, j; | 3594 | int i, j; |
3594 | int removed = 0; | 3595 | int removed = 0; |
3595 | 3596 | ||
3596 | printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", | 3597 | printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", |
3597 | nid, start_pfn, end_pfn); | 3598 | nid, start_pfn, end_pfn); |
3598 | 3599 | ||
3599 | /* Find the old active region end and shrink */ | 3600 | /* Find the old active region end and shrink */ |
3600 | for_each_active_range_index_in_nid(i, nid) { | 3601 | for_each_active_range_index_in_nid(i, nid) { |
3601 | if (early_node_map[i].start_pfn >= start_pfn && | 3602 | if (early_node_map[i].start_pfn >= start_pfn && |
3602 | early_node_map[i].end_pfn <= end_pfn) { | 3603 | early_node_map[i].end_pfn <= end_pfn) { |
3603 | /* clear it */ | 3604 | /* clear it */ |
3604 | early_node_map[i].start_pfn = 0; | 3605 | early_node_map[i].start_pfn = 0; |
3605 | early_node_map[i].end_pfn = 0; | 3606 | early_node_map[i].end_pfn = 0; |
3606 | removed = 1; | 3607 | removed = 1; |
3607 | continue; | 3608 | continue; |
3608 | } | 3609 | } |
3609 | if (early_node_map[i].start_pfn < start_pfn && | 3610 | if (early_node_map[i].start_pfn < start_pfn && |
3610 | early_node_map[i].end_pfn > start_pfn) { | 3611 | early_node_map[i].end_pfn > start_pfn) { |
3611 | unsigned long temp_end_pfn = early_node_map[i].end_pfn; | 3612 | unsigned long temp_end_pfn = early_node_map[i].end_pfn; |
3612 | early_node_map[i].end_pfn = start_pfn; | 3613 | early_node_map[i].end_pfn = start_pfn; |
3613 | if (temp_end_pfn > end_pfn) | 3614 | if (temp_end_pfn > end_pfn) |
3614 | add_active_range(nid, end_pfn, temp_end_pfn); | 3615 | add_active_range(nid, end_pfn, temp_end_pfn); |
3615 | continue; | 3616 | continue; |
3616 | } | 3617 | } |
3617 | if (early_node_map[i].start_pfn >= start_pfn && | 3618 | if (early_node_map[i].start_pfn >= start_pfn && |
3618 | early_node_map[i].end_pfn > end_pfn && | 3619 | early_node_map[i].end_pfn > end_pfn && |
3619 | early_node_map[i].start_pfn < end_pfn) { | 3620 | early_node_map[i].start_pfn < end_pfn) { |
3620 | early_node_map[i].start_pfn = end_pfn; | 3621 | early_node_map[i].start_pfn = end_pfn; |
3621 | continue; | 3622 | continue; |
3622 | } | 3623 | } |
3623 | } | 3624 | } |
3624 | 3625 | ||
3625 | if (!removed) | 3626 | if (!removed) |
3626 | return; | 3627 | return; |
3627 | 3628 | ||
3628 | /* remove the blank ones */ | 3629 | /* remove the blank ones */ |
3629 | for (i = nr_nodemap_entries - 1; i > 0; i--) { | 3630 | for (i = nr_nodemap_entries - 1; i > 0; i--) { |
3630 | if (early_node_map[i].nid != nid) | 3631 | if (early_node_map[i].nid != nid) |
3631 | continue; | 3632 | continue; |
3632 | if (early_node_map[i].end_pfn) | 3633 | if (early_node_map[i].end_pfn) |
3633 | continue; | 3634 | continue; |
3634 | /* we found it, get rid of it */ | 3635 | /* we found it, get rid of it */ |
3635 | for (j = i; j < nr_nodemap_entries - 1; j++) | 3636 | for (j = i; j < nr_nodemap_entries - 1; j++) |
3636 | memcpy(&early_node_map[j], &early_node_map[j+1], | 3637 | memcpy(&early_node_map[j], &early_node_map[j+1], |
3637 | sizeof(early_node_map[j])); | 3638 | sizeof(early_node_map[j])); |
3638 | j = nr_nodemap_entries - 1; | 3639 | j = nr_nodemap_entries - 1; |
3639 | memset(&early_node_map[j], 0, sizeof(early_node_map[j])); | 3640 | memset(&early_node_map[j], 0, sizeof(early_node_map[j])); |
3640 | nr_nodemap_entries--; | 3641 | nr_nodemap_entries--; |
3641 | } | 3642 | } |
3642 | } | 3643 | } |
3643 | 3644 | ||
3644 | /** | 3645 | /** |
3645 | * remove_all_active_ranges - Remove all currently registered regions | 3646 | * remove_all_active_ranges - Remove all currently registered regions |
3646 | * | 3647 | * |
3647 | * During discovery, it may be found that a table like SRAT is invalid | 3648 | * During discovery, it may be found that a table like SRAT is invalid |
3648 | * and an alternative discovery method must be used. This function removes | 3649 | * and an alternative discovery method must be used. This function removes |
3649 | * all currently registered regions. | 3650 | * all currently registered regions. |
3650 | */ | 3651 | */ |
3651 | void __init remove_all_active_ranges(void) | 3652 | void __init remove_all_active_ranges(void) |
3652 | { | 3653 | { |
3653 | memset(early_node_map, 0, sizeof(early_node_map)); | 3654 | memset(early_node_map, 0, sizeof(early_node_map)); |
3654 | nr_nodemap_entries = 0; | 3655 | nr_nodemap_entries = 0; |
3655 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | 3656 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE |
3656 | memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); | 3657 | memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); |
3657 | memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); | 3658 | memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); |
3658 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | 3659 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ |
3659 | } | 3660 | } |
3660 | 3661 | ||
3661 | /* Compare two active node_active_regions */ | 3662 | /* Compare two active node_active_regions */ |
3662 | static int __init cmp_node_active_region(const void *a, const void *b) | 3663 | static int __init cmp_node_active_region(const void *a, const void *b) |
3663 | { | 3664 | { |
3664 | struct node_active_region *arange = (struct node_active_region *)a; | 3665 | struct node_active_region *arange = (struct node_active_region *)a; |
3665 | struct node_active_region *brange = (struct node_active_region *)b; | 3666 | struct node_active_region *brange = (struct node_active_region *)b; |
3666 | 3667 | ||
3667 | /* Done this way to avoid overflows */ | 3668 | /* Done this way to avoid overflows */ |
3668 | if (arange->start_pfn > brange->start_pfn) | 3669 | if (arange->start_pfn > brange->start_pfn) |
3669 | return 1; | 3670 | return 1; |
3670 | if (arange->start_pfn < brange->start_pfn) | 3671 | if (arange->start_pfn < brange->start_pfn) |
3671 | return -1; | 3672 | return -1; |
3672 | 3673 | ||
3673 | return 0; | 3674 | return 0; |
3674 | } | 3675 | } |
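
The explicit comparisons matter because the obvious one-liner truncates: returning the difference of two unsigned long PFNs as an int can come back with the wrong sign. A tiny demonstration, assuming a 64-bit unsigned long and made-up PFNs:

#include <stdio.h>

int main(void)
{
        unsigned long a = 0x100000000UL;        /* range starting at a high pfn (64-bit only) */
        unsigned long b = 0x1UL;

        int naive = (int)(b - a);               /* truncated difference: often positive here */
        int safe = (b > a) ? 1 : (b < a) ? -1 : 0;

        printf("naive=%d safe=%d\n", naive, safe);
        return 0;
}
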
3675 | 3676 | ||
3676 | /* sort the node_map by start_pfn */ | 3677 | /* sort the node_map by start_pfn */ |
3677 | static void __init sort_node_map(void) | 3678 | static void __init sort_node_map(void) |
3678 | { | 3679 | { |
3679 | sort(early_node_map, (size_t)nr_nodemap_entries, | 3680 | sort(early_node_map, (size_t)nr_nodemap_entries, |
3680 | sizeof(struct node_active_region), | 3681 | sizeof(struct node_active_region), |
3681 | cmp_node_active_region, NULL); | 3682 | cmp_node_active_region, NULL); |
3682 | } | 3683 | } |
3683 | 3684 | ||
3684 | /* Find the lowest pfn for a node */ | 3685 | /* Find the lowest pfn for a node */ |
3685 | unsigned long __init find_min_pfn_for_node(int nid) | 3686 | unsigned long __init find_min_pfn_for_node(int nid) |
3686 | { | 3687 | { |
3687 | int i; | 3688 | int i; |
3688 | unsigned long min_pfn = ULONG_MAX; | 3689 | unsigned long min_pfn = ULONG_MAX; |
3689 | 3690 | ||
3690 | /* Assuming a sorted map, the first range found has the starting pfn */ | 3691 | /* Assuming a sorted map, the first range found has the starting pfn */ |
3691 | for_each_active_range_index_in_nid(i, nid) | 3692 | for_each_active_range_index_in_nid(i, nid) |
3692 | min_pfn = min(min_pfn, early_node_map[i].start_pfn); | 3693 | min_pfn = min(min_pfn, early_node_map[i].start_pfn); |
3693 | 3694 | ||
3694 | if (min_pfn == ULONG_MAX) { | 3695 | if (min_pfn == ULONG_MAX) { |
3695 | printk(KERN_WARNING | 3696 | printk(KERN_WARNING |
3696 | "Could not find start_pfn for node %d\n", nid); | 3697 | "Could not find start_pfn for node %d\n", nid); |
3697 | return 0; | 3698 | return 0; |
3698 | } | 3699 | } |
3699 | 3700 | ||
3700 | return min_pfn; | 3701 | return min_pfn; |
3701 | } | 3702 | } |
3702 | 3703 | ||
3703 | /** | 3704 | /** |
3704 | * find_min_pfn_with_active_regions - Find the minimum PFN registered | 3705 | * find_min_pfn_with_active_regions - Find the minimum PFN registered |
3705 | * | 3706 | * |
3706 | * It returns the minimum PFN based on information provided via | 3707 | * It returns the minimum PFN based on information provided via |
3707 | * add_active_range(). | 3708 | * add_active_range(). |
3708 | */ | 3709 | */ |
3709 | unsigned long __init find_min_pfn_with_active_regions(void) | 3710 | unsigned long __init find_min_pfn_with_active_regions(void) |
3710 | { | 3711 | { |
3711 | return find_min_pfn_for_node(MAX_NUMNODES); | 3712 | return find_min_pfn_for_node(MAX_NUMNODES); |
3712 | } | 3713 | } |
3713 | 3714 | ||
3714 | /** | 3715 | /** |
3715 | * find_max_pfn_with_active_regions - Find the maximum PFN registered | 3716 | * find_max_pfn_with_active_regions - Find the maximum PFN registered |
3716 | * | 3717 | * |
3717 | * It returns the maximum PFN based on information provided via | 3718 | * It returns the maximum PFN based on information provided via |
3718 | * add_active_range(). | 3719 | * add_active_range(). |
3719 | */ | 3720 | */ |
3720 | unsigned long __init find_max_pfn_with_active_regions(void) | 3721 | unsigned long __init find_max_pfn_with_active_regions(void) |
3721 | { | 3722 | { |
3722 | int i; | 3723 | int i; |
3723 | unsigned long max_pfn = 0; | 3724 | unsigned long max_pfn = 0; |
3724 | 3725 | ||
3725 | for (i = 0; i < nr_nodemap_entries; i++) | 3726 | for (i = 0; i < nr_nodemap_entries; i++) |
3726 | max_pfn = max(max_pfn, early_node_map[i].end_pfn); | 3727 | max_pfn = max(max_pfn, early_node_map[i].end_pfn); |
3727 | 3728 | ||
3728 | return max_pfn; | 3729 | return max_pfn; |
3729 | } | 3730 | } |
3730 | 3731 | ||
3731 | /* | 3732 | /* |
3732 | * early_calculate_totalpages() | 3733 | * early_calculate_totalpages() |
3733 | * Sum pages in active regions for movable zone. | 3734 | * Sum pages in active regions for movable zone. |
3734 | * Populate N_HIGH_MEMORY for calculating usable_nodes. | 3735 | * Populate N_HIGH_MEMORY for calculating usable_nodes. |
3735 | */ | 3736 | */ |
3736 | static unsigned long __init early_calculate_totalpages(void) | 3737 | static unsigned long __init early_calculate_totalpages(void) |
3737 | { | 3738 | { |
3738 | int i; | 3739 | int i; |
3739 | unsigned long totalpages = 0; | 3740 | unsigned long totalpages = 0; |
3740 | 3741 | ||
3741 | for (i = 0; i < nr_nodemap_entries; i++) { | 3742 | for (i = 0; i < nr_nodemap_entries; i++) { |
3742 | unsigned long pages = early_node_map[i].end_pfn - | 3743 | unsigned long pages = early_node_map[i].end_pfn - |
3743 | early_node_map[i].start_pfn; | 3744 | early_node_map[i].start_pfn; |
3744 | totalpages += pages; | 3745 | totalpages += pages; |
3745 | if (pages) | 3746 | if (pages) |
3746 | node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); | 3747 | node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); |
3747 | } | 3748 | } |
3748 | return totalpages; | 3749 | return totalpages; |
3749 | } | 3750 | } |
3750 | 3751 | ||
3751 | /* | 3752 | /* |
3752 | * Find the PFN the Movable zone begins in each node. Kernel memory | 3753 | * Find the PFN the Movable zone begins in each node. Kernel memory |
3753 | * is spread evenly between nodes as long as the nodes have enough | 3754 | * is spread evenly between nodes as long as the nodes have enough |
3754 | * memory. When they don't, some nodes will have more kernelcore than | 3755 | * memory. When they don't, some nodes will have more kernelcore than |
3755 | * others | 3756 | * others |
3756 | */ | 3757 | */ |
3757 | void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | 3758 | void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) |
3758 | { | 3759 | { |
3759 | int i, nid; | 3760 | int i, nid; |
3760 | unsigned long usable_startpfn; | 3761 | unsigned long usable_startpfn; |
3761 | unsigned long kernelcore_node, kernelcore_remaining; | 3762 | unsigned long kernelcore_node, kernelcore_remaining; |
3762 | unsigned long totalpages = early_calculate_totalpages(); | 3763 | unsigned long totalpages = early_calculate_totalpages(); |
3763 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | 3764 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); |
3764 | 3765 | ||
3765 | /* | 3766 | /* |
3766 | * If movablecore was specified, calculate what size of | 3767 | * If movablecore was specified, calculate what size of |
3767 | * kernelcore that corresponds so that memory usable for | 3768 | * kernelcore that corresponds so that memory usable for |
3768 | * any allocation type is evenly spread. If both kernelcore | 3769 | * any allocation type is evenly spread. If both kernelcore |
3769 | * and movablecore are specified, then the value of kernelcore | 3770 | * and movablecore are specified, then the value of kernelcore |
3770 | * will be used for required_kernelcore if it's greater than | 3771 | * will be used for required_kernelcore if it's greater than |
3771 | * what movablecore would have allowed. | 3772 | * what movablecore would have allowed. |
3772 | */ | 3773 | */ |
3773 | if (required_movablecore) { | 3774 | if (required_movablecore) { |
3774 | unsigned long corepages; | 3775 | unsigned long corepages; |
3775 | 3776 | ||
3776 | /* | 3777 | /* |
3777 | * Round-up so that ZONE_MOVABLE is at least as large as what | 3778 | * Round-up so that ZONE_MOVABLE is at least as large as what |
3778 | * was requested by the user | 3779 | * was requested by the user |
3779 | */ | 3780 | */ |
3780 | required_movablecore = | 3781 | required_movablecore = |
3781 | roundup(required_movablecore, MAX_ORDER_NR_PAGES); | 3782 | roundup(required_movablecore, MAX_ORDER_NR_PAGES); |
3782 | corepages = totalpages - required_movablecore; | 3783 | corepages = totalpages - required_movablecore; |
3783 | 3784 | ||
3784 | required_kernelcore = max(required_kernelcore, corepages); | 3785 | required_kernelcore = max(required_kernelcore, corepages); |
3785 | } | 3786 | } |
3786 | 3787 | ||
3787 | /* If kernelcore was not specified, there is no ZONE_MOVABLE */ | 3788 | /* If kernelcore was not specified, there is no ZONE_MOVABLE */ |
3788 | if (!required_kernelcore) | 3789 | if (!required_kernelcore) |
3789 | return; | 3790 | return; |
3790 | 3791 | ||
3791 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ | 3792 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ |
3792 | find_usable_zone_for_movable(); | 3793 | find_usable_zone_for_movable(); |
3793 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; | 3794 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; |
3794 | 3795 | ||
3795 | restart: | 3796 | restart: |
3796 | /* Spread kernelcore memory as evenly as possible throughout nodes */ | 3797 | /* Spread kernelcore memory as evenly as possible throughout nodes */ |
3797 | kernelcore_node = required_kernelcore / usable_nodes; | 3798 | kernelcore_node = required_kernelcore / usable_nodes; |
3798 | for_each_node_state(nid, N_HIGH_MEMORY) { | 3799 | for_each_node_state(nid, N_HIGH_MEMORY) { |
3799 | /* | 3800 | /* |
3800 | * Recalculate kernelcore_node if the division per node | 3801 | * Recalculate kernelcore_node if the division per node |
3801 | * now exceeds what is necessary to satisfy the requested | 3802 | * now exceeds what is necessary to satisfy the requested |
3802 | * amount of memory for the kernel | 3803 | * amount of memory for the kernel |
3803 | */ | 3804 | */ |
3804 | if (required_kernelcore < kernelcore_node) | 3805 | if (required_kernelcore < kernelcore_node) |
3805 | kernelcore_node = required_kernelcore / usable_nodes; | 3806 | kernelcore_node = required_kernelcore / usable_nodes; |
3806 | 3807 | ||
3807 | /* | 3808 | /* |
3808 | * As the map is walked, we track how much memory is usable | 3809 | * As the map is walked, we track how much memory is usable |
3809 | * by the kernel using kernelcore_remaining. When it is | 3810 | * by the kernel using kernelcore_remaining. When it is |
3810 | * 0, the rest of the node is usable by ZONE_MOVABLE | 3811 | * 0, the rest of the node is usable by ZONE_MOVABLE |
3811 | */ | 3812 | */ |
3812 | kernelcore_remaining = kernelcore_node; | 3813 | kernelcore_remaining = kernelcore_node; |
3813 | 3814 | ||
3814 | /* Go through each range of PFNs within this node */ | 3815 | /* Go through each range of PFNs within this node */ |
3815 | for_each_active_range_index_in_nid(i, nid) { | 3816 | for_each_active_range_index_in_nid(i, nid) { |
3816 | unsigned long start_pfn, end_pfn; | 3817 | unsigned long start_pfn, end_pfn; |
3817 | unsigned long size_pages; | 3818 | unsigned long size_pages; |
3818 | 3819 | ||
3819 | start_pfn = max(early_node_map[i].start_pfn, | 3820 | start_pfn = max(early_node_map[i].start_pfn, |
3820 | zone_movable_pfn[nid]); | 3821 | zone_movable_pfn[nid]); |
3821 | end_pfn = early_node_map[i].end_pfn; | 3822 | end_pfn = early_node_map[i].end_pfn; |
3822 | if (start_pfn >= end_pfn) | 3823 | if (start_pfn >= end_pfn) |
3823 | continue; | 3824 | continue; |
3824 | 3825 | ||
3825 | /* Account for what is only usable for kernelcore */ | 3826 | /* Account for what is only usable for kernelcore */ |
3826 | if (start_pfn < usable_startpfn) { | 3827 | if (start_pfn < usable_startpfn) { |
3827 | unsigned long kernel_pages; | 3828 | unsigned long kernel_pages; |
3828 | kernel_pages = min(end_pfn, usable_startpfn) | 3829 | kernel_pages = min(end_pfn, usable_startpfn) |
3829 | - start_pfn; | 3830 | - start_pfn; |
3830 | 3831 | ||
3831 | kernelcore_remaining -= min(kernel_pages, | 3832 | kernelcore_remaining -= min(kernel_pages, |
3832 | kernelcore_remaining); | 3833 | kernelcore_remaining); |
3833 | required_kernelcore -= min(kernel_pages, | 3834 | required_kernelcore -= min(kernel_pages, |
3834 | required_kernelcore); | 3835 | required_kernelcore); |
3835 | 3836 | ||
3836 | /* Continue if range is now fully accounted */ | 3837 | /* Continue if range is now fully accounted */ |
3837 | if (end_pfn <= usable_startpfn) { | 3838 | if (end_pfn <= usable_startpfn) { |
3838 | 3839 | ||
3839 | /* | 3840 | /* |
3840 | * Push zone_movable_pfn to the end so | 3841 | * Push zone_movable_pfn to the end so |
3841 | * that if we have to rebalance | 3842 | * that if we have to rebalance |
3842 | * kernelcore across nodes, we will | 3843 | * kernelcore across nodes, we will |
3843 | * not double account here | 3844 | * not double account here |
3844 | */ | 3845 | */ |
3845 | zone_movable_pfn[nid] = end_pfn; | 3846 | zone_movable_pfn[nid] = end_pfn; |
3846 | continue; | 3847 | continue; |
3847 | } | 3848 | } |
3848 | start_pfn = usable_startpfn; | 3849 | start_pfn = usable_startpfn; |
3849 | } | 3850 | } |
3850 | 3851 | ||
3851 | /* | 3852 | /* |
3852 | * The usable PFN range for ZONE_MOVABLE is from | 3853 | * The usable PFN range for ZONE_MOVABLE is from |
3853 | * start_pfn->end_pfn. Calculate size_pages as the | 3854 | * start_pfn->end_pfn. Calculate size_pages as the |
3854 | * number of pages used as kernelcore | 3855 | * number of pages used as kernelcore |
3855 | */ | 3856 | */ |
3856 | size_pages = end_pfn - start_pfn; | 3857 | size_pages = end_pfn - start_pfn; |
3857 | if (size_pages > kernelcore_remaining) | 3858 | if (size_pages > kernelcore_remaining) |
3858 | size_pages = kernelcore_remaining; | 3859 | size_pages = kernelcore_remaining; |
3859 | zone_movable_pfn[nid] = start_pfn + size_pages; | 3860 | zone_movable_pfn[nid] = start_pfn + size_pages; |
3860 | 3861 | ||
3861 | /* | 3862 | /* |
3862 | * Some kernelcore has been met, update counts and | 3863 | * Some kernelcore has been met, update counts and |
3863 | * break if the kernelcore for this node has been | 3864 | * break if the kernelcore for this node has been |
3864 | * satisfied | 3865 | * satisfied |
3865 | */ | 3866 | */ |
3866 | required_kernelcore -= min(required_kernelcore, | 3867 | required_kernelcore -= min(required_kernelcore, |
3867 | size_pages); | 3868 | size_pages); |
3868 | kernelcore_remaining -= size_pages; | 3869 | kernelcore_remaining -= size_pages; |
3869 | if (!kernelcore_remaining) | 3870 | if (!kernelcore_remaining) |
3870 | break; | 3871 | break; |
3871 | } | 3872 | } |
3872 | } | 3873 | } |
3873 | 3874 | ||
3874 | /* | 3875 | /* |
3875 | * If there is still required_kernelcore, we do another pass with one | 3876 | * If there is still required_kernelcore, we do another pass with one |
3876 | * less node in the count. This will push zone_movable_pfn[nid] further | 3877 | * less node in the count. This will push zone_movable_pfn[nid] further |
3877 | * along on the nodes that still have memory until kernelcore is | 3878 | * along on the nodes that still have memory until kernelcore is |
3878 | * satisfied | 3879 | * satisfied |
3879 | */ | 3880 | */ |
3880 | usable_nodes--; | 3881 | usable_nodes--; |
3881 | if (usable_nodes && required_kernelcore > usable_nodes) | 3882 | if (usable_nodes && required_kernelcore > usable_nodes) |
3882 | goto restart; | 3883 | goto restart; |
3883 | 3884 | ||
3884 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ | 3885 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ |
3885 | for (nid = 0; nid < MAX_NUMNODES; nid++) | 3886 | for (nid = 0; nid < MAX_NUMNODES; nid++) |
3886 | zone_movable_pfn[nid] = | 3887 | zone_movable_pfn[nid] = |
3887 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); | 3888 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); |
3888 | } | 3889 | } |
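The function above converts any movablecore request into an equivalent kernelcore target and then spreads that target evenly over the nodes that have memory. The following user-space sketch walks through that first sizing step; MAX_ORDER_NR_PAGES, the totals and the node count are hypothetical values chosen for illustration, not values taken from this patch.

/* Illustrative sketch of the kernelcore/movablecore sizing above; the page
 * counts, node count and MAX_ORDER_NR_PAGES are assumptions. */
#include <stdio.h>

#define MAX_ORDER_NR_PAGES 1024UL	/* assumed: 4MB pageblocks of 4K pages */
#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned long totalpages = 524288;		/* 2GB of 4K pages, hypothetical */
	unsigned long required_movablecore = 131000;	/* ~512MB requested as movable */
	unsigned long required_kernelcore = 0;
	int usable_nodes = 2;				/* hypothetical */

	/* movablecore is rounded up, the remainder becomes kernelcore */
	required_movablecore = ROUNDUP(required_movablecore, MAX_ORDER_NR_PAGES);
	unsigned long corepages = totalpages - required_movablecore;
	if (corepages > required_kernelcore)
		required_kernelcore = corepages;

	/* kernelcore is then spread evenly across the nodes with memory */
	unsigned long kernelcore_node = required_kernelcore / usable_nodes;

	printf("movablecore=%lu kernelcore=%lu per-node=%lu\n",
	       required_movablecore, required_kernelcore, kernelcore_node);
	return 0;
}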
3889 | 3890 | ||
3890 | /* Any regular memory on that node? */ | 3891 | /* Any regular memory on that node? */ |
3891 | static void check_for_regular_memory(pg_data_t *pgdat) | 3892 | static void check_for_regular_memory(pg_data_t *pgdat) |
3892 | { | 3893 | { |
3893 | #ifdef CONFIG_HIGHMEM | 3894 | #ifdef CONFIG_HIGHMEM |
3894 | enum zone_type zone_type; | 3895 | enum zone_type zone_type; |
3895 | 3896 | ||
3896 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { | 3897 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { |
3897 | struct zone *zone = &pgdat->node_zones[zone_type]; | 3898 | struct zone *zone = &pgdat->node_zones[zone_type]; |
3898 | if (zone->present_pages) | 3899 | if (zone->present_pages) |
3899 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); | 3900 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); |
3900 | } | 3901 | } |
3901 | #endif | 3902 | #endif |
3902 | } | 3903 | } |
3903 | 3904 | ||
3904 | /** | 3905 | /** |
3905 | * free_area_init_nodes - Initialise all pg_data_t and zone data | 3906 | * free_area_init_nodes - Initialise all pg_data_t and zone data |
3906 | * @max_zone_pfn: an array of max PFNs for each zone | 3907 | * @max_zone_pfn: an array of max PFNs for each zone |
3907 | * | 3908 | * |
3908 | * This will call free_area_init_node() for each active node in the system. | 3909 | * This will call free_area_init_node() for each active node in the system. |
3909 | * Using the page ranges provided by add_active_range(), the size of each | 3910 | * Using the page ranges provided by add_active_range(), the size of each |
3910 | * zone in each node and their holes is calculated. If the maximum PFNs | 3911 | * zone in each node and their holes is calculated. If the maximum PFNs |
3911 | * of two adjacent zones match, it is assumed that the zone is empty. | 3912 | * of two adjacent zones match, it is assumed that the zone is empty. |
3912 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed | 3913 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed |
3913 | * that arch_max_dma32_pfn has no pages. It is also assumed that a zone | 3914 | * that arch_max_dma32_pfn has no pages. It is also assumed that a zone |
3914 | * starts where the previous one ended. For example, ZONE_DMA32 starts | 3915 | * starts where the previous one ended. For example, ZONE_DMA32 starts |
3915 | * at arch_max_dma_pfn. | 3916 | * at arch_max_dma_pfn. |
3916 | */ | 3917 | */ |
3917 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) | 3918 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) |
3918 | { | 3919 | { |
3919 | unsigned long nid; | 3920 | unsigned long nid; |
3920 | enum zone_type i; | 3921 | enum zone_type i; |
3921 | 3922 | ||
3922 | /* Sort early_node_map as initialisation assumes it is sorted */ | 3923 | /* Sort early_node_map as initialisation assumes it is sorted */ |
3923 | sort_node_map(); | 3924 | sort_node_map(); |
3924 | 3925 | ||
3925 | /* Record where the zone boundaries are */ | 3926 | /* Record where the zone boundaries are */ |
3926 | memset(arch_zone_lowest_possible_pfn, 0, | 3927 | memset(arch_zone_lowest_possible_pfn, 0, |
3927 | sizeof(arch_zone_lowest_possible_pfn)); | 3928 | sizeof(arch_zone_lowest_possible_pfn)); |
3928 | memset(arch_zone_highest_possible_pfn, 0, | 3929 | memset(arch_zone_highest_possible_pfn, 0, |
3929 | sizeof(arch_zone_highest_possible_pfn)); | 3930 | sizeof(arch_zone_highest_possible_pfn)); |
3930 | arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); | 3931 | arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); |
3931 | arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; | 3932 | arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; |
3932 | for (i = 1; i < MAX_NR_ZONES; i++) { | 3933 | for (i = 1; i < MAX_NR_ZONES; i++) { |
3933 | if (i == ZONE_MOVABLE) | 3934 | if (i == ZONE_MOVABLE) |
3934 | continue; | 3935 | continue; |
3935 | arch_zone_lowest_possible_pfn[i] = | 3936 | arch_zone_lowest_possible_pfn[i] = |
3936 | arch_zone_highest_possible_pfn[i-1]; | 3937 | arch_zone_highest_possible_pfn[i-1]; |
3937 | arch_zone_highest_possible_pfn[i] = | 3938 | arch_zone_highest_possible_pfn[i] = |
3938 | max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); | 3939 | max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); |
3939 | } | 3940 | } |
3940 | arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; | 3941 | arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; |
3941 | arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; | 3942 | arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; |
3942 | 3943 | ||
3943 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ | 3944 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ |
3944 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); | 3945 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); |
3945 | find_zone_movable_pfns_for_nodes(zone_movable_pfn); | 3946 | find_zone_movable_pfns_for_nodes(zone_movable_pfn); |
3946 | 3947 | ||
3947 | /* Print out the zone ranges */ | 3948 | /* Print out the zone ranges */ |
3948 | printk("Zone PFN ranges:\n"); | 3949 | printk("Zone PFN ranges:\n"); |
3949 | for (i = 0; i < MAX_NR_ZONES; i++) { | 3950 | for (i = 0; i < MAX_NR_ZONES; i++) { |
3950 | if (i == ZONE_MOVABLE) | 3951 | if (i == ZONE_MOVABLE) |
3951 | continue; | 3952 | continue; |
3952 | printk(" %-8s %0#10lx -> %0#10lx\n", | 3953 | printk(" %-8s %0#10lx -> %0#10lx\n", |
3953 | zone_names[i], | 3954 | zone_names[i], |
3954 | arch_zone_lowest_possible_pfn[i], | 3955 | arch_zone_lowest_possible_pfn[i], |
3955 | arch_zone_highest_possible_pfn[i]); | 3956 | arch_zone_highest_possible_pfn[i]); |
3956 | } | 3957 | } |
3957 | 3958 | ||
3958 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ | 3959 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ |
3959 | printk("Movable zone start PFN for each node\n"); | 3960 | printk("Movable zone start PFN for each node\n"); |
3960 | for (i = 0; i < MAX_NUMNODES; i++) { | 3961 | for (i = 0; i < MAX_NUMNODES; i++) { |
3961 | if (zone_movable_pfn[i]) | 3962 | if (zone_movable_pfn[i]) |
3962 | printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); | 3963 | printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); |
3963 | } | 3964 | } |
3964 | 3965 | ||
3965 | /* Print out the early_node_map[] */ | 3966 | /* Print out the early_node_map[] */ |
3966 | printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); | 3967 | printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); |
3967 | for (i = 0; i < nr_nodemap_entries; i++) | 3968 | for (i = 0; i < nr_nodemap_entries; i++) |
3968 | printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid, | 3969 | printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid, |
3969 | early_node_map[i].start_pfn, | 3970 | early_node_map[i].start_pfn, |
3970 | early_node_map[i].end_pfn); | 3971 | early_node_map[i].end_pfn); |
3971 | 3972 | ||
3972 | /* Initialise every node */ | 3973 | /* Initialise every node */ |
3973 | mminit_verify_pageflags_layout(); | 3974 | mminit_verify_pageflags_layout(); |
3974 | setup_nr_node_ids(); | 3975 | setup_nr_node_ids(); |
3975 | for_each_online_node(nid) { | 3976 | for_each_online_node(nid) { |
3976 | pg_data_t *pgdat = NODE_DATA(nid); | 3977 | pg_data_t *pgdat = NODE_DATA(nid); |
3977 | free_area_init_node(nid, pgdat, NULL, | 3978 | free_area_init_node(nid, pgdat, NULL, |
3978 | find_min_pfn_for_node(nid), NULL); | 3979 | find_min_pfn_for_node(nid), NULL); |
3979 | 3980 | ||
3980 | /* Any memory on that node */ | 3981 | /* Any memory on that node */ |
3981 | if (pgdat->node_present_pages) | 3982 | if (pgdat->node_present_pages) |
3982 | node_set_state(nid, N_HIGH_MEMORY); | 3983 | node_set_state(nid, N_HIGH_MEMORY); |
3983 | check_for_regular_memory(pgdat); | 3984 | check_for_regular_memory(pgdat); |
3984 | } | 3985 | } |
3985 | } | 3986 | } |
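The kernel-doc above states that each zone starts where the previous one ended and that an unchanged maximum PFN means an empty zone. A minimal user-space sketch of that boundary derivation, with hypothetical zone names and PFNs, could look like this:

/* Sketch of the zone-boundary derivation above: each zone's low PFN is the
 * previous zone's high PFN, and an unchanged max PFN yields an empty zone.
 * Zone names and PFN values are hypothetical. */
#include <stdio.h>

int main(void)
{
	const char *names[] = { "DMA", "DMA32", "Normal" };
	unsigned long max_zone_pfn[] = { 0x1000, 0x100000, 0x100000 }; /* Normal empty */
	unsigned long lo[3], hi[3];
	int i;

	lo[0] = 0;			/* find_min_pfn_with_active_regions() in the kernel */
	hi[0] = max_zone_pfn[0];
	for (i = 1; i < 3; i++) {
		lo[i] = hi[i - 1];
		hi[i] = max_zone_pfn[i] > lo[i] ? max_zone_pfn[i] : lo[i];
	}
	for (i = 0; i < 3; i++)
		printf("%-8s %0#10lx -> %0#10lx%s\n", names[i], lo[i], hi[i],
		       lo[i] == hi[i] ? " (empty)" : "");
	return 0;
}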
3986 | 3987 | ||
3987 | static int __init cmdline_parse_core(char *p, unsigned long *core) | 3988 | static int __init cmdline_parse_core(char *p, unsigned long *core) |
3988 | { | 3989 | { |
3989 | unsigned long long coremem; | 3990 | unsigned long long coremem; |
3990 | if (!p) | 3991 | if (!p) |
3991 | return -EINVAL; | 3992 | return -EINVAL; |
3992 | 3993 | ||
3993 | coremem = memparse(p, &p); | 3994 | coremem = memparse(p, &p); |
3994 | *core = coremem >> PAGE_SHIFT; | 3995 | *core = coremem >> PAGE_SHIFT; |
3995 | 3996 | ||
3996 | /* Paranoid check that UL is enough for the coremem value */ | 3997 | /* Paranoid check that UL is enough for the coremem value */ |
3997 | WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); | 3998 | WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); |
3998 | 3999 | ||
3999 | return 0; | 4000 | return 0; |
4000 | } | 4001 | } |
4001 | 4002 | ||
4002 | /* | 4003 | /* |
4003 | * kernelcore=size sets the amount of memory for use for allocations that | 4004 | * kernelcore=size sets the amount of memory for use for allocations that |
4004 | * cannot be reclaimed or migrated. | 4005 | * cannot be reclaimed or migrated. |
4005 | */ | 4006 | */ |
4006 | static int __init cmdline_parse_kernelcore(char *p) | 4007 | static int __init cmdline_parse_kernelcore(char *p) |
4007 | { | 4008 | { |
4008 | return cmdline_parse_core(p, &required_kernelcore); | 4009 | return cmdline_parse_core(p, &required_kernelcore); |
4009 | } | 4010 | } |
4010 | 4011 | ||
4011 | /* | 4012 | /* |
4012 | * movablecore=size sets the amount of memory for use for allocations that | 4013 | * movablecore=size sets the amount of memory for use for allocations that |
4013 | * can be reclaimed or migrated. | 4014 | * can be reclaimed or migrated. |
4014 | */ | 4015 | */ |
4015 | static int __init cmdline_parse_movablecore(char *p) | 4016 | static int __init cmdline_parse_movablecore(char *p) |
4016 | { | 4017 | { |
4017 | return cmdline_parse_core(p, &required_movablecore); | 4018 | return cmdline_parse_core(p, &required_movablecore); |
4018 | } | 4019 | } |
4019 | 4020 | ||
4020 | early_param("kernelcore", cmdline_parse_kernelcore); | 4021 | early_param("kernelcore", cmdline_parse_kernelcore); |
4021 | early_param("movablecore", cmdline_parse_movablecore); | 4022 | early_param("movablecore", cmdline_parse_movablecore); |
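Both boot parameters go through cmdline_parse_core(), which turns the parsed byte count into a page count by shifting right by PAGE_SHIFT. The sketch below mirrors that conversion in user space; it substitutes strtoull() plus a hand-rolled size suffix for the kernel's memparse() and assumes 4K pages.

/* Sketch of the byte->page conversion done by cmdline_parse_core() above;
 * PAGE_SHIFT is assumed to be 12 and the suffix handling stands in for
 * memparse(). */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12

static unsigned long parse_core(const char *p)
{
	char *end;
	unsigned long long bytes = strtoull(p, &end, 0);

	switch (*end) {
	case 'G': case 'g': bytes <<= 10;	/* fall through */
	case 'M': case 'm': bytes <<= 10;	/* fall through */
	case 'K': case 'k': bytes <<= 10;
	}
	return bytes >> PAGE_SHIFT;		/* pages, as stored in required_*core */
}

int main(void)
{
	printf("kernelcore=512M  -> %lu pages\n", parse_core("512M"));
	printf("movablecore=2G   -> %lu pages\n", parse_core("2G"));
	return 0;
}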
4022 | 4023 | ||
4023 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | 4024 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ |
4024 | 4025 | ||
4025 | /** | 4026 | /** |
4026 | * set_dma_reserve - set the specified number of pages reserved in the first zone | 4027 | * set_dma_reserve - set the specified number of pages reserved in the first zone |
4027 | * @new_dma_reserve: The number of pages to mark reserved | 4028 | * @new_dma_reserve: The number of pages to mark reserved |
4028 | * | 4029 | * |
4029 | * The per-cpu batchsize and zone watermarks are determined by present_pages. | 4030 | * The per-cpu batchsize and zone watermarks are determined by present_pages. |
4030 | * In the DMA zone, a significant percentage may be consumed by kernel image | 4031 | * In the DMA zone, a significant percentage may be consumed by kernel image |
4031 | * and other unfreeable allocations which can skew the watermarks badly. This | 4032 | * and other unfreeable allocations which can skew the watermarks badly. This |
4032 | * function may optionally be used to account for unfreeable pages in the | 4033 | * function may optionally be used to account for unfreeable pages in the |
4033 | * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and | 4034 | * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and |
4034 | * smaller per-cpu batchsize. | 4035 | * smaller per-cpu batchsize. |
4035 | */ | 4036 | */ |
4036 | void __init set_dma_reserve(unsigned long new_dma_reserve) | 4037 | void __init set_dma_reserve(unsigned long new_dma_reserve) |
4037 | { | 4038 | { |
4038 | dma_reserve = new_dma_reserve; | 4039 | dma_reserve = new_dma_reserve; |
4039 | } | 4040 | } |
4040 | 4041 | ||
4041 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4042 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
4042 | static bootmem_data_t contig_bootmem_data; | 4043 | static bootmem_data_t contig_bootmem_data; |
4043 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; | 4044 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; |
4044 | 4045 | ||
4045 | EXPORT_SYMBOL(contig_page_data); | 4046 | EXPORT_SYMBOL(contig_page_data); |
4046 | #endif | 4047 | #endif |
4047 | 4048 | ||
4048 | void __init free_area_init(unsigned long *zones_size) | 4049 | void __init free_area_init(unsigned long *zones_size) |
4049 | { | 4050 | { |
4050 | free_area_init_node(0, NODE_DATA(0), zones_size, | 4051 | free_area_init_node(0, NODE_DATA(0), zones_size, |
4051 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | 4052 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
4052 | } | 4053 | } |
4053 | 4054 | ||
4054 | static int page_alloc_cpu_notify(struct notifier_block *self, | 4055 | static int page_alloc_cpu_notify(struct notifier_block *self, |
4055 | unsigned long action, void *hcpu) | 4056 | unsigned long action, void *hcpu) |
4056 | { | 4057 | { |
4057 | int cpu = (unsigned long)hcpu; | 4058 | int cpu = (unsigned long)hcpu; |
4058 | 4059 | ||
4059 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | 4060 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { |
4060 | drain_pages(cpu); | 4061 | drain_pages(cpu); |
4061 | 4062 | ||
4062 | /* | 4063 | /* |
4063 | * Spill the event counters of the dead processor | 4064 | * Spill the event counters of the dead processor |
4064 | * into the current processors event counters. | 4065 | * into the current processors event counters. |
4065 | * This artificially elevates the count of the current | 4066 | * This artificially elevates the count of the current |
4066 | * processor. | 4067 | * processor. |
4067 | */ | 4068 | */ |
4068 | vm_events_fold_cpu(cpu); | 4069 | vm_events_fold_cpu(cpu); |
4069 | 4070 | ||
4070 | /* | 4071 | /* |
4071 | * Zero the differential counters of the dead processor | 4072 | * Zero the differential counters of the dead processor |
4072 | * so that the vm statistics are consistent. | 4073 | * so that the vm statistics are consistent. |
4073 | * | 4074 | * |
4074 | * This is only okay since the processor is dead and cannot | 4075 | * This is only okay since the processor is dead and cannot |
4075 | * race with what we are doing. | 4076 | * race with what we are doing. |
4076 | */ | 4077 | */ |
4077 | refresh_cpu_vm_stats(cpu); | 4078 | refresh_cpu_vm_stats(cpu); |
4078 | } | 4079 | } |
4079 | return NOTIFY_OK; | 4080 | return NOTIFY_OK; |
4080 | } | 4081 | } |
4081 | 4082 | ||
4082 | void __init page_alloc_init(void) | 4083 | void __init page_alloc_init(void) |
4083 | { | 4084 | { |
4084 | hotcpu_notifier(page_alloc_cpu_notify, 0); | 4085 | hotcpu_notifier(page_alloc_cpu_notify, 0); |
4085 | } | 4086 | } |
4086 | 4087 | ||
4087 | /* | 4088 | /* |
4088 | * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio | 4089 | * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio |
4089 | * or min_free_kbytes changes. | 4090 | * or min_free_kbytes changes. |
4090 | */ | 4091 | */ |
4091 | static void calculate_totalreserve_pages(void) | 4092 | static void calculate_totalreserve_pages(void) |
4092 | { | 4093 | { |
4093 | struct pglist_data *pgdat; | 4094 | struct pglist_data *pgdat; |
4094 | unsigned long reserve_pages = 0; | 4095 | unsigned long reserve_pages = 0; |
4095 | enum zone_type i, j; | 4096 | enum zone_type i, j; |
4096 | 4097 | ||
4097 | for_each_online_pgdat(pgdat) { | 4098 | for_each_online_pgdat(pgdat) { |
4098 | for (i = 0; i < MAX_NR_ZONES; i++) { | 4099 | for (i = 0; i < MAX_NR_ZONES; i++) { |
4099 | struct zone *zone = pgdat->node_zones + i; | 4100 | struct zone *zone = pgdat->node_zones + i; |
4100 | unsigned long max = 0; | 4101 | unsigned long max = 0; |
4101 | 4102 | ||
4102 | /* Find valid and maximum lowmem_reserve in the zone */ | 4103 | /* Find valid and maximum lowmem_reserve in the zone */ |
4103 | for (j = i; j < MAX_NR_ZONES; j++) { | 4104 | for (j = i; j < MAX_NR_ZONES; j++) { |
4104 | if (zone->lowmem_reserve[j] > max) | 4105 | if (zone->lowmem_reserve[j] > max) |
4105 | max = zone->lowmem_reserve[j]; | 4106 | max = zone->lowmem_reserve[j]; |
4106 | } | 4107 | } |
4107 | 4108 | ||
4108 | /* we treat pages_high as reserved pages. */ | 4109 | /* we treat pages_high as reserved pages. */ |
4109 | max += zone->pages_high; | 4110 | max += zone->pages_high; |
4110 | 4111 | ||
4111 | if (max > zone->present_pages) | 4112 | if (max > zone->present_pages) |
4112 | max = zone->present_pages; | 4113 | max = zone->present_pages; |
4113 | reserve_pages += max; | 4114 | reserve_pages += max; |
4114 | } | 4115 | } |
4115 | } | 4116 | } |
4116 | totalreserve_pages = reserve_pages; | 4117 | totalreserve_pages = reserve_pages; |
4117 | } | 4118 | } |
4118 | 4119 | ||
4119 | /* | 4120 | /* |
4120 | * setup_per_zone_lowmem_reserve - called whenever | 4121 | * setup_per_zone_lowmem_reserve - called whenever |
4121 | * sysctl_lowmem_reserve_ratio changes. Ensures that each zone | 4122 | * sysctl_lowmem_reserve_ratio changes. Ensures that each zone |
4122 | * has a correct pages reserved value, so an adequate number of | 4123 | * has a correct pages reserved value, so an adequate number of |
4123 | * pages are left in the zone after a successful __alloc_pages(). | 4124 | * pages are left in the zone after a successful __alloc_pages(). |
4124 | */ | 4125 | */ |
4125 | static void setup_per_zone_lowmem_reserve(void) | 4126 | static void setup_per_zone_lowmem_reserve(void) |
4126 | { | 4127 | { |
4127 | struct pglist_data *pgdat; | 4128 | struct pglist_data *pgdat; |
4128 | enum zone_type j, idx; | 4129 | enum zone_type j, idx; |
4129 | 4130 | ||
4130 | for_each_online_pgdat(pgdat) { | 4131 | for_each_online_pgdat(pgdat) { |
4131 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4132 | for (j = 0; j < MAX_NR_ZONES; j++) { |
4132 | struct zone *zone = pgdat->node_zones + j; | 4133 | struct zone *zone = pgdat->node_zones + j; |
4133 | unsigned long present_pages = zone->present_pages; | 4134 | unsigned long present_pages = zone->present_pages; |
4134 | 4135 | ||
4135 | zone->lowmem_reserve[j] = 0; | 4136 | zone->lowmem_reserve[j] = 0; |
4136 | 4137 | ||
4137 | idx = j; | 4138 | idx = j; |
4138 | while (idx) { | 4139 | while (idx) { |
4139 | struct zone *lower_zone; | 4140 | struct zone *lower_zone; |
4140 | 4141 | ||
4141 | idx--; | 4142 | idx--; |
4142 | 4143 | ||
4143 | if (sysctl_lowmem_reserve_ratio[idx] < 1) | 4144 | if (sysctl_lowmem_reserve_ratio[idx] < 1) |
4144 | sysctl_lowmem_reserve_ratio[idx] = 1; | 4145 | sysctl_lowmem_reserve_ratio[idx] = 1; |
4145 | 4146 | ||
4146 | lower_zone = pgdat->node_zones + idx; | 4147 | lower_zone = pgdat->node_zones + idx; |
4147 | lower_zone->lowmem_reserve[j] = present_pages / | 4148 | lower_zone->lowmem_reserve[j] = present_pages / |
4148 | sysctl_lowmem_reserve_ratio[idx]; | 4149 | sysctl_lowmem_reserve_ratio[idx]; |
4149 | present_pages += lower_zone->present_pages; | 4150 | present_pages += lower_zone->present_pages; |
4150 | } | 4151 | } |
4151 | } | 4152 | } |
4152 | } | 4153 | } |
4153 | 4154 | ||
4154 | /* update totalreserve_pages */ | 4155 | /* update totalreserve_pages */ |
4155 | calculate_totalreserve_pages(); | 4156 | calculate_totalreserve_pages(); |
4156 | } | 4157 | } |
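setup_per_zone_lowmem_reserve() walks downwards from every zone, accumulating present_pages and dividing by the per-zone ratio, so that a low zone keeps some pages in reserve against allocations that could have been satisfied by a higher zone. A rough user-space sketch of that calculation for a two-zone layout; the zone sizes and ratios are illustrative (the real defaults live in sysctl_lowmem_reserve_ratio[]):

/* Sketch of the lowmem_reserve[] calculation above for two zones. */
#include <stdio.h>

#define NR 2	/* DMA, Normal */

int main(void)
{
	unsigned long present[NR] = { 4096, 253952 };	/* ~16MB DMA, ~992MB Normal */
	long ratio[NR] = { 256, 32 };			/* illustrative ratios */
	unsigned long reserve[NR][NR] = { { 0 } };
	int j, idx;

	for (j = 0; j < NR; j++) {
		unsigned long pages = present[j];

		reserve[j][j] = 0;
		for (idx = j - 1; idx >= 0; idx--) {
			/* lower zone keeps pages/ratio in reserve against
			 * allocations that could have used zone j */
			reserve[idx][j] = pages / ratio[idx];
			pages += present[idx];
		}
	}
	printf("DMA reserve against Normal allocations: %lu pages\n",
	       reserve[0][1]);
	return 0;
}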
4157 | 4158 | ||
4158 | /** | 4159 | /** |
4159 | * setup_per_zone_pages_min - called when min_free_kbytes changes. | 4160 | * setup_per_zone_pages_min - called when min_free_kbytes changes. |
4160 | * | 4161 | * |
4161 | * Ensures that the pages_{min,low,high} values for each zone are set correctly | 4162 | * Ensures that the pages_{min,low,high} values for each zone are set correctly |
4162 | * with respect to min_free_kbytes. | 4163 | * with respect to min_free_kbytes. |
4163 | */ | 4164 | */ |
4164 | void setup_per_zone_pages_min(void) | 4165 | void setup_per_zone_pages_min(void) |
4165 | { | 4166 | { |
4166 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | 4167 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); |
4167 | unsigned long lowmem_pages = 0; | 4168 | unsigned long lowmem_pages = 0; |
4168 | struct zone *zone; | 4169 | struct zone *zone; |
4169 | unsigned long flags; | 4170 | unsigned long flags; |
4170 | 4171 | ||
4171 | /* Calculate total number of !ZONE_HIGHMEM pages */ | 4172 | /* Calculate total number of !ZONE_HIGHMEM pages */ |
4172 | for_each_zone(zone) { | 4173 | for_each_zone(zone) { |
4173 | if (!is_highmem(zone)) | 4174 | if (!is_highmem(zone)) |
4174 | lowmem_pages += zone->present_pages; | 4175 | lowmem_pages += zone->present_pages; |
4175 | } | 4176 | } |
4176 | 4177 | ||
4177 | for_each_zone(zone) { | 4178 | for_each_zone(zone) { |
4178 | u64 tmp; | 4179 | u64 tmp; |
4179 | 4180 | ||
4180 | spin_lock_irqsave(&zone->lru_lock, flags); | 4181 | spin_lock_irqsave(&zone->lru_lock, flags); |
4181 | tmp = (u64)pages_min * zone->present_pages; | 4182 | tmp = (u64)pages_min * zone->present_pages; |
4182 | do_div(tmp, lowmem_pages); | 4183 | do_div(tmp, lowmem_pages); |
4183 | if (is_highmem(zone)) { | 4184 | if (is_highmem(zone)) { |
4184 | /* | 4185 | /* |
4185 | * __GFP_HIGH and PF_MEMALLOC allocations usually don't | 4186 | * __GFP_HIGH and PF_MEMALLOC allocations usually don't |
4186 | * need highmem pages, so cap pages_min to a small | 4187 | * need highmem pages, so cap pages_min to a small |
4187 | * value here. | 4188 | * value here. |
4188 | * | 4189 | * |
4189 | * The (pages_high-pages_low) and (pages_low-pages_min) | 4190 | * The (pages_high-pages_low) and (pages_low-pages_min) |
4190 | * deltas control async page reclaim, and so should | 4191 | * deltas control async page reclaim, and so should |
4191 | * not be capped for highmem. | 4192 | * not be capped for highmem. |
4192 | */ | 4193 | */ |
4193 | int min_pages; | 4194 | int min_pages; |
4194 | 4195 | ||
4195 | min_pages = zone->present_pages / 1024; | 4196 | min_pages = zone->present_pages / 1024; |
4196 | if (min_pages < SWAP_CLUSTER_MAX) | 4197 | if (min_pages < SWAP_CLUSTER_MAX) |
4197 | min_pages = SWAP_CLUSTER_MAX; | 4198 | min_pages = SWAP_CLUSTER_MAX; |
4198 | if (min_pages > 128) | 4199 | if (min_pages > 128) |
4199 | min_pages = 128; | 4200 | min_pages = 128; |
4200 | zone->pages_min = min_pages; | 4201 | zone->pages_min = min_pages; |
4201 | } else { | 4202 | } else { |
4202 | /* | 4203 | /* |
4203 | * If it's a lowmem zone, reserve a number of pages | 4204 | * If it's a lowmem zone, reserve a number of pages |
4204 | * proportionate to the zone's size. | 4205 | * proportionate to the zone's size. |
4205 | */ | 4206 | */ |
4206 | zone->pages_min = tmp; | 4207 | zone->pages_min = tmp; |
4207 | } | 4208 | } |
4208 | 4209 | ||
4209 | zone->pages_low = zone->pages_min + (tmp >> 2); | 4210 | zone->pages_low = zone->pages_min + (tmp >> 2); |
4210 | zone->pages_high = zone->pages_min + (tmp >> 1); | 4211 | zone->pages_high = zone->pages_min + (tmp >> 1); |
4211 | setup_zone_migrate_reserve(zone); | 4212 | setup_zone_migrate_reserve(zone); |
4212 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 4213 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
4213 | } | 4214 | } |
4214 | 4215 | ||
4215 | /* update totalreserve_pages */ | 4216 | /* update totalreserve_pages */ |
4216 | calculate_totalreserve_pages(); | 4217 | calculate_totalreserve_pages(); |
4217 | } | 4218 | } |
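For a lowmem zone the code above gives pages_min the zone's proportional share of min_free_kbytes (expressed in pages), then places pages_low and pages_high at 25% and 50% above that share. A small sketch of the arithmetic, with a hypothetical min_free_kbytes and zone size and an assumed 4K page size:

/* Sketch of the per-zone watermark arithmetic in setup_per_zone_pages_min()
 * for a single lowmem zone; all numbers are hypothetical. */
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long min_free_kbytes = 4096;			/* from the sqrt heuristic */
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long lowmem_pages = 262144;			/* 1GB of 4K pages */
	unsigned long zone_present = 262144;			/* single zone holds it all */

	unsigned long long tmp = (unsigned long long)pages_min * zone_present;
	tmp /= lowmem_pages;					/* zone's share of pages_min */

	unsigned long zmin  = tmp;
	unsigned long zlow  = zmin + (tmp >> 2);		/* min + 25% */
	unsigned long zhigh = zmin + (tmp >> 1);		/* min + 50% */

	printf("pages_min=%lu pages_low=%lu pages_high=%lu\n", zmin, zlow, zhigh);
	return 0;
}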
4218 | 4219 | ||
4219 | /* | 4220 | /* |
4220 | * Initialise min_free_kbytes. | 4221 | * Initialise min_free_kbytes. |
4221 | * | 4222 | * |
4222 | * For small machines we want it small (128k min). For large machines | 4223 | * For small machines we want it small (128k min). For large machines |
4223 | * we want it large (64MB max). But it is not linear, because network | 4224 | * we want it large (64MB max). But it is not linear, because network |
4224 | * bandwidth does not increase linearly with machine size. We use | 4225 | * bandwidth does not increase linearly with machine size. We use |
4225 | * | 4226 | * |
4226 | * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: | 4227 | * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: |
4227 | * min_free_kbytes = sqrt(lowmem_kbytes * 16) | 4228 | * min_free_kbytes = sqrt(lowmem_kbytes * 16) |
4228 | * | 4229 | * |
4229 | * which yields | 4230 | * which yields |
4230 | * | 4231 | * |
4231 | * 16MB: 512k | 4232 | * 16MB: 512k |
4232 | * 32MB: 724k | 4233 | * 32MB: 724k |
4233 | * 64MB: 1024k | 4234 | * 64MB: 1024k |
4234 | * 128MB: 1448k | 4235 | * 128MB: 1448k |
4235 | * 256MB: 2048k | 4236 | * 256MB: 2048k |
4236 | * 512MB: 2896k | 4237 | * 512MB: 2896k |
4237 | * 1024MB: 4096k | 4238 | * 1024MB: 4096k |
4238 | * 2048MB: 5792k | 4239 | * 2048MB: 5792k |
4239 | * 4096MB: 8192k | 4240 | * 4096MB: 8192k |
4240 | * 8192MB: 11584k | 4241 | * 8192MB: 11584k |
4241 | * 16384MB: 16384k | 4242 | * 16384MB: 16384k |
4242 | */ | 4243 | */ |
4243 | static int __init init_per_zone_pages_min(void) | 4244 | static int __init init_per_zone_pages_min(void) |
4244 | { | 4245 | { |
4245 | unsigned long lowmem_kbytes; | 4246 | unsigned long lowmem_kbytes; |
4246 | 4247 | ||
4247 | lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); | 4248 | lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); |
4248 | 4249 | ||
4249 | min_free_kbytes = int_sqrt(lowmem_kbytes * 16); | 4250 | min_free_kbytes = int_sqrt(lowmem_kbytes * 16); |
4250 | if (min_free_kbytes < 128) | 4251 | if (min_free_kbytes < 128) |
4251 | min_free_kbytes = 128; | 4252 | min_free_kbytes = 128; |
4252 | if (min_free_kbytes > 65536) | 4253 | if (min_free_kbytes > 65536) |
4253 | min_free_kbytes = 65536; | 4254 | min_free_kbytes = 65536; |
4254 | setup_per_zone_pages_min(); | 4255 | setup_per_zone_pages_min(); |
4255 | setup_per_zone_lowmem_reserve(); | 4256 | setup_per_zone_lowmem_reserve(); |
4256 | return 0; | 4257 | return 0; |
4257 | } | 4258 | } |
4258 | module_init(init_per_zone_pages_min) | 4259 | module_init(init_per_zone_pages_min) |
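The heuristic documented above is min_free_kbytes = sqrt(lowmem_kbytes * 16), clamped to the 128..65536 range. The sketch below reproduces a few rows of the table in the comment; it uses a naive integer square root in place of the kernel's int_sqrt():

/* Sketch reproducing the min_free_kbytes heuristic documented above. */
#include <stdio.h>

static unsigned long int_sqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	unsigned long lowmem_mb[] = { 16, 1024, 16384 };
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long lowmem_kbytes = lowmem_mb[i] * 1024;
		unsigned long min_free_kbytes = int_sqrt(lowmem_kbytes * 16);

		if (min_free_kbytes < 128)
			min_free_kbytes = 128;
		if (min_free_kbytes > 65536)
			min_free_kbytes = 65536;
		printf("%6luMB lowmem -> min_free_kbytes = %lu\n",
		       lowmem_mb[i], min_free_kbytes);
	}
	return 0;
}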
4259 | 4260 | ||
4260 | /* | 4261 | /* |
4261 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so | 4262 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so |
4262 | * that we can call two helper functions whenever min_free_kbytes | 4263 | * that we can call two helper functions whenever min_free_kbytes |
4263 | * changes. | 4264 | * changes. |
4264 | */ | 4265 | */ |
4265 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | 4266 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, |
4266 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 4267 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) |
4267 | { | 4268 | { |
4268 | proc_dointvec(table, write, file, buffer, length, ppos); | 4269 | proc_dointvec(table, write, file, buffer, length, ppos); |
4269 | if (write) | 4270 | if (write) |
4270 | setup_per_zone_pages_min(); | 4271 | setup_per_zone_pages_min(); |
4271 | return 0; | 4272 | return 0; |
4272 | } | 4273 | } |
4273 | 4274 | ||
4274 | #ifdef CONFIG_NUMA | 4275 | #ifdef CONFIG_NUMA |
4275 | int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, | 4276 | int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, |
4276 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 4277 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) |
4277 | { | 4278 | { |
4278 | struct zone *zone; | 4279 | struct zone *zone; |
4279 | int rc; | 4280 | int rc; |
4280 | 4281 | ||
4281 | rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 4282 | rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); |
4282 | if (rc) | 4283 | if (rc) |
4283 | return rc; | 4284 | return rc; |
4284 | 4285 | ||
4285 | for_each_zone(zone) | 4286 | for_each_zone(zone) |
4286 | zone->min_unmapped_pages = (zone->present_pages * | 4287 | zone->min_unmapped_pages = (zone->present_pages * |
4287 | sysctl_min_unmapped_ratio) / 100; | 4288 | sysctl_min_unmapped_ratio) / 100; |
4288 | return 0; | 4289 | return 0; |
4289 | } | 4290 | } |
4290 | 4291 | ||
4291 | int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | 4292 | int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, |
4292 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 4293 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) |
4293 | { | 4294 | { |
4294 | struct zone *zone; | 4295 | struct zone *zone; |
4295 | int rc; | 4296 | int rc; |
4296 | 4297 | ||
4297 | rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 4298 | rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); |
4298 | if (rc) | 4299 | if (rc) |
4299 | return rc; | 4300 | return rc; |
4300 | 4301 | ||
4301 | for_each_zone(zone) | 4302 | for_each_zone(zone) |
4302 | zone->min_slab_pages = (zone->present_pages * | 4303 | zone->min_slab_pages = (zone->present_pages * |
4303 | sysctl_min_slab_ratio) / 100; | 4304 | sysctl_min_slab_ratio) / 100; |
4304 | return 0; | 4305 | return 0; |
4305 | } | 4306 | } |
4306 | #endif | 4307 | #endif |
4307 | 4308 | ||
4308 | /* | 4309 | /* |
4309 | * lowmem_reserve_ratio_sysctl_handler - just a wrapper around | 4310 | * lowmem_reserve_ratio_sysctl_handler - just a wrapper around |
4310 | * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() | 4311 | * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() |
4311 | * whenever sysctl_lowmem_reserve_ratio changes. | 4312 | * whenever sysctl_lowmem_reserve_ratio changes. |
4312 | * | 4313 | * |
4313 | * The reserve ratio obviously has absolutely no relation with the | 4314 | * The reserve ratio obviously has absolutely no relation with the |
4314 | * pages_min watermarks. The lowmem reserve ratio can only make sense | 4315 | * pages_min watermarks. The lowmem reserve ratio can only make sense |
4315 | * as a function of the boot-time zone sizes. | 4316 | * as a function of the boot-time zone sizes. |
4316 | */ | 4317 | */ |
4317 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | 4318 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, |
4318 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 4319 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) |
4319 | { | 4320 | { |
4320 | proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 4321 | proc_dointvec_minmax(table, write, file, buffer, length, ppos); |
4321 | setup_per_zone_lowmem_reserve(); | 4322 | setup_per_zone_lowmem_reserve(); |
4322 | return 0; | 4323 | return 0; |
4323 | } | 4324 | } |
4324 | 4325 | ||
4325 | /* | 4326 | /* |
4326 | * percpu_pagelist_fraction - changes the pcp->high for each zone on each | 4327 | * percpu_pagelist_fraction - changes the pcp->high for each zone on each |
4327 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist | 4328 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist |
4328 | * can have before it gets flushed back to buddy allocator. | 4329 | * can have before it gets flushed back to buddy allocator. |
4329 | */ | 4330 | */ |
4330 | 4331 | ||
4331 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | 4332 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, |
4332 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 4333 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) |
4333 | { | 4334 | { |
4334 | struct zone *zone; | 4335 | struct zone *zone; |
4335 | unsigned int cpu; | 4336 | unsigned int cpu; |
4336 | int ret; | 4337 | int ret; |
4337 | 4338 | ||
4338 | ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 4339 | ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); |
4339 | if (!write || (ret == -EINVAL)) | 4340 | if (!write || (ret == -EINVAL)) |
4340 | return ret; | 4341 | return ret; |
4341 | for_each_zone(zone) { | 4342 | for_each_zone(zone) { |
4342 | for_each_online_cpu(cpu) { | 4343 | for_each_online_cpu(cpu) { |
4343 | unsigned long high; | 4344 | unsigned long high; |
4344 | high = zone->present_pages / percpu_pagelist_fraction; | 4345 | high = zone->present_pages / percpu_pagelist_fraction; |
4345 | setup_pagelist_highmark(zone_pcp(zone, cpu), high); | 4346 | setup_pagelist_highmark(zone_pcp(zone, cpu), high); |
4346 | } | 4347 | } |
4347 | } | 4348 | } |
4348 | return 0; | 4349 | return 0; |
4349 | } | 4350 | } |
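The handler above recomputes pcp->high for every zone and CPU as present_pages divided by the new fraction. A one-line illustration of that arithmetic, with a hypothetical zone size and fraction:

/* Sketch of the pcp->high recalculation above: each zone's hot per-cpu
 * pagelist may hold present_pages/percpu_pagelist_fraction pages before
 * being flushed back to the buddy allocator. Numbers are hypothetical. */
#include <stdio.h>

int main(void)
{
	unsigned long present_pages = 262144;	/* 1GB zone of 4K pages */
	int fraction = 8;			/* new percpu_pagelist_fraction value */

	printf("pcp->high = %lu pages per cpu\n", present_pages / fraction);
	return 0;
}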
4350 | 4351 | ||
4351 | int hashdist = HASHDIST_DEFAULT; | 4352 | int hashdist = HASHDIST_DEFAULT; |
4352 | 4353 | ||
4353 | #ifdef CONFIG_NUMA | 4354 | #ifdef CONFIG_NUMA |
4354 | static int __init set_hashdist(char *str) | 4355 | static int __init set_hashdist(char *str) |
4355 | { | 4356 | { |
4356 | if (!str) | 4357 | if (!str) |
4357 | return 0; | 4358 | return 0; |
4358 | hashdist = simple_strtoul(str, &str, 0); | 4359 | hashdist = simple_strtoul(str, &str, 0); |
4359 | return 1; | 4360 | return 1; |
4360 | } | 4361 | } |
4361 | __setup("hashdist=", set_hashdist); | 4362 | __setup("hashdist=", set_hashdist); |
4362 | #endif | 4363 | #endif |
4363 | 4364 | ||
4364 | /* | 4365 | /* |
4365 | * allocate a large system hash table from bootmem | 4366 | * allocate a large system hash table from bootmem |
4366 | * - it is assumed that the hash table must contain an exact power-of-2 | 4367 | * - it is assumed that the hash table must contain an exact power-of-2 |
4367 | * quantity of entries | 4368 | * quantity of entries |
4368 | * - limit is the number of hash buckets, not the total allocation size | 4369 | * - limit is the number of hash buckets, not the total allocation size |
4369 | */ | 4370 | */ |
4370 | void *__init alloc_large_system_hash(const char *tablename, | 4371 | void *__init alloc_large_system_hash(const char *tablename, |
4371 | unsigned long bucketsize, | 4372 | unsigned long bucketsize, |
4372 | unsigned long numentries, | 4373 | unsigned long numentries, |
4373 | int scale, | 4374 | int scale, |
4374 | int flags, | 4375 | int flags, |
4375 | unsigned int *_hash_shift, | 4376 | unsigned int *_hash_shift, |
4376 | unsigned int *_hash_mask, | 4377 | unsigned int *_hash_mask, |
4377 | unsigned long limit) | 4378 | unsigned long limit) |
4378 | { | 4379 | { |
4379 | unsigned long long max = limit; | 4380 | unsigned long long max = limit; |
4380 | unsigned long log2qty, size; | 4381 | unsigned long log2qty, size; |
4381 | void *table = NULL; | 4382 | void *table = NULL; |
4382 | 4383 | ||
4383 | /* allow the kernel cmdline to have a say */ | 4384 | /* allow the kernel cmdline to have a say */ |
4384 | if (!numentries) { | 4385 | if (!numentries) { |
4385 | /* round applicable memory size up to nearest megabyte */ | 4386 | /* round applicable memory size up to nearest megabyte */ |
4386 | numentries = nr_kernel_pages; | 4387 | numentries = nr_kernel_pages; |
4387 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; | 4388 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; |
4388 | numentries >>= 20 - PAGE_SHIFT; | 4389 | numentries >>= 20 - PAGE_SHIFT; |
4389 | numentries <<= 20 - PAGE_SHIFT; | 4390 | numentries <<= 20 - PAGE_SHIFT; |
4390 | 4391 | ||
4391 | /* limit to 1 bucket per 2^scale bytes of low memory */ | 4392 | /* limit to 1 bucket per 2^scale bytes of low memory */ |
4392 | if (scale > PAGE_SHIFT) | 4393 | if (scale > PAGE_SHIFT) |
4393 | numentries >>= (scale - PAGE_SHIFT); | 4394 | numentries >>= (scale - PAGE_SHIFT); |
4394 | else | 4395 | else |
4395 | numentries <<= (PAGE_SHIFT - scale); | 4396 | numentries <<= (PAGE_SHIFT - scale); |
4396 | 4397 | ||
4397 | /* Make sure we've got at least a 0-order allocation.. */ | 4398 | /* Make sure we've got at least a 0-order allocation.. */ |
4398 | if (unlikely((numentries * bucketsize) < PAGE_SIZE)) | 4399 | if (unlikely((numentries * bucketsize) < PAGE_SIZE)) |
4399 | numentries = PAGE_SIZE / bucketsize; | 4400 | numentries = PAGE_SIZE / bucketsize; |
4400 | } | 4401 | } |
4401 | numentries = roundup_pow_of_two(numentries); | 4402 | numentries = roundup_pow_of_two(numentries); |
4402 | 4403 | ||
4403 | /* limit allocation size to 1/16 total memory by default */ | 4404 | /* limit allocation size to 1/16 total memory by default */ |
4404 | if (max == 0) { | 4405 | if (max == 0) { |
4405 | max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; | 4406 | max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; |
4406 | do_div(max, bucketsize); | 4407 | do_div(max, bucketsize); |
4407 | } | 4408 | } |
4408 | 4409 | ||
4409 | if (numentries > max) | 4410 | if (numentries > max) |
4410 | numentries = max; | 4411 | numentries = max; |
4411 | 4412 | ||
4412 | log2qty = ilog2(numentries); | 4413 | log2qty = ilog2(numentries); |
4413 | 4414 | ||
4414 | do { | 4415 | do { |
4415 | size = bucketsize << log2qty; | 4416 | size = bucketsize << log2qty; |
4416 | if (flags & HASH_EARLY) | 4417 | if (flags & HASH_EARLY) |
4417 | table = alloc_bootmem(size); | 4418 | table = alloc_bootmem(size); |
4418 | else if (hashdist) | 4419 | else if (hashdist) |
4419 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 4420 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
4420 | else { | 4421 | else { |
4421 | unsigned long order = get_order(size); | 4422 | unsigned long order = get_order(size); |
4422 | table = (void*) __get_free_pages(GFP_ATOMIC, order); | 4423 | table = (void*) __get_free_pages(GFP_ATOMIC, order); |
4423 | /* | 4424 | /* |
4424 | * If bucketsize is not a power-of-two, we may free | 4425 | * If bucketsize is not a power-of-two, we may free |
4425 | * some pages at the end of hash table. | 4426 | * some pages at the end of hash table. |
4426 | */ | 4427 | */ |
4427 | if (table) { | 4428 | if (table) { |
4428 | unsigned long alloc_end = (unsigned long)table + | 4429 | unsigned long alloc_end = (unsigned long)table + |
4429 | (PAGE_SIZE << order); | 4430 | (PAGE_SIZE << order); |
4430 | unsigned long used = (unsigned long)table + | 4431 | unsigned long used = (unsigned long)table + |
4431 | PAGE_ALIGN(size); | 4432 | PAGE_ALIGN(size); |
4432 | split_page(virt_to_page(table), order); | 4433 | split_page(virt_to_page(table), order); |
4433 | while (used < alloc_end) { | 4434 | while (used < alloc_end) { |
4434 | free_page(used); | 4435 | free_page(used); |
4435 | used += PAGE_SIZE; | 4436 | used += PAGE_SIZE; |
4436 | } | 4437 | } |
4437 | } | 4438 | } |
4438 | } | 4439 | } |
4439 | } while (!table && size > PAGE_SIZE && --log2qty); | 4440 | } while (!table && size > PAGE_SIZE && --log2qty); |
4440 | 4441 | ||
4441 | if (!table) | 4442 | if (!table) |
4442 | panic("Failed to allocate %s hash table\n", tablename); | 4443 | panic("Failed to allocate %s hash table\n", tablename); |
4443 | 4444 | ||
4444 | printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", | 4445 | printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", |
4445 | tablename, | 4446 | tablename, |
4446 | (1U << log2qty), | 4447 | (1U << log2qty), |
4447 | ilog2(size) - PAGE_SHIFT, | 4448 | ilog2(size) - PAGE_SHIFT, |
4448 | size); | 4449 | size); |
4449 | 4450 | ||
4450 | if (_hash_shift) | 4451 | if (_hash_shift) |
4451 | *_hash_shift = log2qty; | 4452 | *_hash_shift = log2qty; |
4452 | if (_hash_mask) | 4453 | if (_hash_mask) |
4453 | *_hash_mask = (1 << log2qty) - 1; | 4454 | *_hash_mask = (1 << log2qty) - 1; |
4454 | 4455 | ||
4455 | return table; | 4456 | return table; |
4456 | } | 4457 | } |
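When numentries is zero, the sizing above rounds the applicable memory up to a megabyte, scales it to one bucket per 2^scale bytes of lowmem, rounds the result up to a power of two and caps the allocation at 1/16 of total memory. The user-space sketch below replays only that sizing arithmetic; the page counts, bucket size and scale are hypothetical and no memory is allocated:

/* Sketch of the sizing arithmetic in alloc_large_system_hash() above,
 * assuming PAGE_SHIFT=12 and made-up memory figures. */
#include <stdio.h>

#define PAGE_SHIFT 12

static unsigned long roundup_pow_of_two(unsigned long x)
{
	unsigned long r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned long nr_kernel_pages = 229376;	/* ~896MB of lowmem, hypothetical */
	unsigned long nr_all_pages = 262144;	/* 1GB total, hypothetical */
	unsigned long bucketsize = 8;		/* e.g. one pointer per bucket */
	int scale = 14;				/* 1 bucket per 16KB of lowmem */

	/* round applicable memory size up to nearest megabyte */
	unsigned long numentries = nr_kernel_pages;
	numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
	numentries >>= 20 - PAGE_SHIFT;
	numentries <<= 20 - PAGE_SHIFT;

	/* limit to 1 bucket per 2^scale bytes of low memory */
	if (scale > PAGE_SHIFT)
		numentries >>= (scale - PAGE_SHIFT);
	else
		numentries <<= (PAGE_SHIFT - scale);

	numentries = roundup_pow_of_two(numentries);

	/* limit allocation size to 1/16 total memory by default */
	unsigned long long max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
	max /= bucketsize;
	if (numentries > max)
		numentries = max;

	printf("hash table: %lu buckets, %lu bytes\n",
	       numentries, numentries * bucketsize);
	return 0;
}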
4457 | 4458 | ||
4458 | #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE | 4459 | #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE |
4459 | struct page *pfn_to_page(unsigned long pfn) | 4460 | struct page *pfn_to_page(unsigned long pfn) |
4460 | { | 4461 | { |
4461 | return __pfn_to_page(pfn); | 4462 | return __pfn_to_page(pfn); |
4462 | } | 4463 | } |
4463 | unsigned long page_to_pfn(struct page *page) | 4464 | unsigned long page_to_pfn(struct page *page) |
4464 | { | 4465 | { |
4465 | return __page_to_pfn(page); | 4466 | return __page_to_pfn(page); |
4466 | } | 4467 | } |
4467 | EXPORT_SYMBOL(pfn_to_page); | 4468 | EXPORT_SYMBOL(pfn_to_page); |
4468 | EXPORT_SYMBOL(page_to_pfn); | 4469 | EXPORT_SYMBOL(page_to_pfn); |
4469 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ | 4470 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ |
4470 | 4471 | ||
4471 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ | 4472 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ |
4472 | static inline unsigned long *get_pageblock_bitmap(struct zone *zone, | 4473 | static inline unsigned long *get_pageblock_bitmap(struct zone *zone, |
4473 | unsigned long pfn) | 4474 | unsigned long pfn) |
4474 | { | 4475 | { |
4475 | #ifdef CONFIG_SPARSEMEM | 4476 | #ifdef CONFIG_SPARSEMEM |
4476 | return __pfn_to_section(pfn)->pageblock_flags; | 4477 | return __pfn_to_section(pfn)->pageblock_flags; |
4477 | #else | 4478 | #else |
4478 | return zone->pageblock_flags; | 4479 | return zone->pageblock_flags; |
4479 | #endif /* CONFIG_SPARSEMEM */ | 4480 | #endif /* CONFIG_SPARSEMEM */ |
4480 | } | 4481 | } |
4481 | 4482 | ||
4482 | static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) | 4483 | static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) |
4483 | { | 4484 | { |
4484 | #ifdef CONFIG_SPARSEMEM | 4485 | #ifdef CONFIG_SPARSEMEM |
4485 | pfn &= (PAGES_PER_SECTION-1); | 4486 | pfn &= (PAGES_PER_SECTION-1); |
4486 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | 4487 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
4487 | #else | 4488 | #else |
4488 | pfn = pfn - zone->zone_start_pfn; | 4489 | pfn = pfn - zone->zone_start_pfn; |
4489 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | 4490 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
4490 | #endif /* CONFIG_SPARSEMEM */ | 4491 | #endif /* CONFIG_SPARSEMEM */ |
4491 | } | 4492 | } |
4492 | 4493 | ||
4493 | /** | 4494 | /** |
4494 | * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages | 4495 | * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages |
4495 | * @page: The page within the block of interest | 4496 | * @page: The page within the block of interest |
4496 | * @start_bitidx: The first bit of interest to retrieve | 4497 | * @start_bitidx: The first bit of interest to retrieve |
4497 | * @end_bitidx: The last bit of interest | 4498 | * @end_bitidx: The last bit of interest |
4498 | * returns pageblock_bits flags | 4499 | * returns pageblock_bits flags |
4499 | */ | 4500 | */ |
4500 | unsigned long get_pageblock_flags_group(struct page *page, | 4501 | unsigned long get_pageblock_flags_group(struct page *page, |
4501 | int start_bitidx, int end_bitidx) | 4502 | int start_bitidx, int end_bitidx) |
4502 | { | 4503 | { |
4503 | struct zone *zone; | 4504 | struct zone *zone; |
4504 | unsigned long *bitmap; | 4505 | unsigned long *bitmap; |
4505 | unsigned long pfn, bitidx; | 4506 | unsigned long pfn, bitidx; |
4506 | unsigned long flags = 0; | 4507 | unsigned long flags = 0; |
4507 | unsigned long value = 1; | 4508 | unsigned long value = 1; |
4508 | 4509 | ||
4509 | zone = page_zone(page); | 4510 | zone = page_zone(page); |
4510 | pfn = page_to_pfn(page); | 4511 | pfn = page_to_pfn(page); |
4511 | bitmap = get_pageblock_bitmap(zone, pfn); | 4512 | bitmap = get_pageblock_bitmap(zone, pfn); |
4512 | bitidx = pfn_to_bitidx(zone, pfn); | 4513 | bitidx = pfn_to_bitidx(zone, pfn); |
4513 | 4514 | ||
4514 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | 4515 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) |
4515 | if (test_bit(bitidx + start_bitidx, bitmap)) | 4516 | if (test_bit(bitidx + start_bitidx, bitmap)) |
4516 | flags |= value; | 4517 | flags |= value; |
4517 | 4518 | ||
4518 | return flags; | 4519 | return flags; |
4519 | } | 4520 | } |
4520 | 4521 | ||
4521 | /** | 4522 | /** |
4522 | * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages | 4523 | * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages |
4523 | * @page: The page within the block of interest | 4524 | * @page: The page within the block of interest |
4524 | * @start_bitidx: The first bit of interest | 4525 | * @start_bitidx: The first bit of interest |
4525 | * @end_bitidx: The last bit of interest | 4526 | * @end_bitidx: The last bit of interest |
4526 | * @flags: The flags to set | 4527 | * @flags: The flags to set |
4527 | */ | 4528 | */ |
4528 | void set_pageblock_flags_group(struct page *page, unsigned long flags, | 4529 | void set_pageblock_flags_group(struct page *page, unsigned long flags, |
4529 | int start_bitidx, int end_bitidx) | 4530 | int start_bitidx, int end_bitidx) |
4530 | { | 4531 | { |
4531 | struct zone *zone; | 4532 | struct zone *zone; |
4532 | unsigned long *bitmap; | 4533 | unsigned long *bitmap; |
4533 | unsigned long pfn, bitidx; | 4534 | unsigned long pfn, bitidx; |
4534 | unsigned long value = 1; | 4535 | unsigned long value = 1; |
4535 | 4536 | ||
4536 | zone = page_zone(page); | 4537 | zone = page_zone(page); |
4537 | pfn = page_to_pfn(page); | 4538 | pfn = page_to_pfn(page); |
4538 | bitmap = get_pageblock_bitmap(zone, pfn); | 4539 | bitmap = get_pageblock_bitmap(zone, pfn); |
4539 | bitidx = pfn_to_bitidx(zone, pfn); | 4540 | bitidx = pfn_to_bitidx(zone, pfn); |
4540 | VM_BUG_ON(pfn < zone->zone_start_pfn); | 4541 | VM_BUG_ON(pfn < zone->zone_start_pfn); |
4541 | VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); | 4542 | VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); |
4542 | 4543 | ||
4543 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | 4544 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) |
4544 | if (flags & value) | 4545 | if (flags & value) |
4545 | __set_bit(bitidx + start_bitidx, bitmap); | 4546 | __set_bit(bitidx + start_bitidx, bitmap); |
4546 | else | 4547 | else |
4547 | __clear_bit(bitidx + start_bitidx, bitmap); | 4548 | __clear_bit(bitidx + start_bitidx, bitmap); |
4548 | } | 4549 | } |
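get_pageblock_flags_group() and set_pageblock_flags_group() pack a small group of flags into NR_PAGEBLOCK_BITS contiguous bits per pageblock within a shared bitmap, transferring one bit per loop iteration. A self-contained user-space sketch of that packing; the bit count, bitmap size and helper names are assumptions for the example:

/* Sketch of the pageblock flag packing used above. */
#include <stdio.h>

#define NR_PAGEBLOCK_BITS 4
#define BITS_PER_LONG (8 * sizeof(unsigned long))

static unsigned long bitmap[4];	/* room for a handful of pageblocks */

static void set_bit_(unsigned long nr)   { bitmap[nr / BITS_PER_LONG] |=  (1UL << (nr % BITS_PER_LONG)); }
static void clear_bit_(unsigned long nr) { bitmap[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG)); }
static int  test_bit_(unsigned long nr)  { return (bitmap[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1; }

static void set_flags(unsigned long block, unsigned long flags, int start, int end)
{
	unsigned long bitidx = block * NR_PAGEBLOCK_BITS, value = 1;

	for (; start <= end; start++, value <<= 1)
		if (flags & value)
			set_bit_(bitidx + start);
		else
			clear_bit_(bitidx + start);
}

static unsigned long get_flags(unsigned long block, int start, int end)
{
	unsigned long bitidx = block * NR_PAGEBLOCK_BITS, flags = 0, value = 1;

	for (; start <= end; start++, value <<= 1)
		if (test_bit_(bitidx + start))
			flags |= value;
	return flags;
}

int main(void)
{
	set_flags(3, 0x2, 0, 2);	/* e.g. store migratetype 2 in block 3 */
	printf("block 3 flags = %#lx\n", get_flags(3, 0, 2));
	return 0;
}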
4549 | 4550 | ||
4550 | /* | 4551 | /* |
4551 | * This is designed as a sub-function...please see page_isolation.c also. | 4552 | * This is designed as a sub-function...please see page_isolation.c also. |
4552 | * Set/clear a pageblock's type to be ISOLATE. | 4553 | * Set/clear a pageblock's type to be ISOLATE. |
4553 | * The page allocator never allocates memory from an ISOLATE block. | 4554 | * The page allocator never allocates memory from an ISOLATE block. |
4554 | */ | 4555 | */ |
4555 | 4556 | ||
4556 | int set_migratetype_isolate(struct page *page) | 4557 | int set_migratetype_isolate(struct page *page) |
4557 | { | 4558 | { |
4558 | struct zone *zone; | 4559 | struct zone *zone; |
4559 | unsigned long flags; | 4560 | unsigned long flags; |
4560 | int ret = -EBUSY; | 4561 | int ret = -EBUSY; |
4561 | 4562 | ||
4562 | zone = page_zone(page); | 4563 | zone = page_zone(page); |
4563 | spin_lock_irqsave(&zone->lock, flags); | 4564 | spin_lock_irqsave(&zone->lock, flags); |
4564 | /* | 4565 | /* |
4565 | * In future, more migrate types will be able to be isolation target. | 4566 | * In future, more migrate types will be able to be isolation target. |
4566 | */ | 4567 | */ |
4567 | if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) | 4568 | if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) |
4568 | goto out; | 4569 | goto out; |
4569 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | 4570 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); |
4570 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | 4571 | move_freepages_block(zone, page, MIGRATE_ISOLATE); |
4571 | ret = 0; | 4572 | ret = 0; |
4572 | out: | 4573 | out: |
4573 | spin_unlock_irqrestore(&zone->lock, flags); | 4574 | spin_unlock_irqrestore(&zone->lock, flags); |
4574 | if (!ret) | 4575 | if (!ret) |
4575 | drain_all_pages(); | 4576 | drain_all_pages(); |
4576 | return ret; | 4577 | return ret; |
4577 | } | 4578 | } |
4578 | 4579 | ||
4579 | void unset_migratetype_isolate(struct page *page) | 4580 | void unset_migratetype_isolate(struct page *page) |
4580 | { | 4581 | { |
4581 | struct zone *zone; | 4582 | struct zone *zone; |
4582 | unsigned long flags; | 4583 | unsigned long flags; |
4583 | zone = page_zone(page); | 4584 | zone = page_zone(page); |
4584 | spin_lock_irqsave(&zone->lock, flags); | 4585 | spin_lock_irqsave(&zone->lock, flags); |
4585 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | 4586 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) |
4586 | goto out; | 4587 | goto out; |
4587 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 4588 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
4588 | move_freepages_block(zone, page, MIGRATE_MOVABLE); | 4589 | move_freepages_block(zone, page, MIGRATE_MOVABLE); |
4589 | out: | 4590 | out: |
4590 | spin_unlock_irqrestore(&zone->lock, flags); | 4591 | spin_unlock_irqrestore(&zone->lock, flags); |
4591 | } | 4592 | } |
4592 | 4593 | ||
4593 | #ifdef CONFIG_MEMORY_HOTREMOVE | 4594 | #ifdef CONFIG_MEMORY_HOTREMOVE |
4594 | /* | 4595 | /* |
4595 | * All pages in the range must be isolated before calling this. | 4596 | * All pages in the range must be isolated before calling this. |
4596 | */ | 4597 | */ |
4597 | void | 4598 | void |
4598 | __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | 4599 | __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) |
4599 | { | 4600 | { |
4600 | struct page *page; | 4601 | struct page *page; |
4601 | struct zone *zone; | 4602 | struct zone *zone; |
4602 | int order, i; | 4603 | int order, i; |
4603 | unsigned long pfn; | 4604 | unsigned long pfn; |
4604 | unsigned long flags; | 4605 | unsigned long flags; |
4605 | /* find the first valid pfn */ | 4606 | /* find the first valid pfn */ |
4606 | for (pfn = start_pfn; pfn < end_pfn; pfn++) | 4607 | for (pfn = start_pfn; pfn < end_pfn; pfn++) |
4607 | if (pfn_valid(pfn)) | 4608 | if (pfn_valid(pfn)) |
4608 | break; | 4609 | break; |
4609 | if (pfn == end_pfn) | 4610 | if (pfn == end_pfn) |
4610 | return; | 4611 | return; |
4611 | zone = page_zone(pfn_to_page(pfn)); | 4612 | zone = page_zone(pfn_to_page(pfn)); |
4612 | spin_lock_irqsave(&zone->lock, flags); | 4613 | spin_lock_irqsave(&zone->lock, flags); |
4613 | pfn = start_pfn; | 4614 | pfn = start_pfn; |
4614 | while (pfn < end_pfn) { | 4615 | while (pfn < end_pfn) { |
4615 | if (!pfn_valid(pfn)) { | 4616 | if (!pfn_valid(pfn)) { |
4616 | pfn++; | 4617 | pfn++; |
4617 | continue; | 4618 | continue; |
4618 | } | 4619 | } |
4619 | page = pfn_to_page(pfn); | 4620 | page = pfn_to_page(pfn); |
4620 | BUG_ON(page_count(page)); | 4621 | BUG_ON(page_count(page)); |
4621 | BUG_ON(!PageBuddy(page)); | 4622 | BUG_ON(!PageBuddy(page)); |
4622 | order = page_order(page); | 4623 | order = page_order(page); |
4623 | #ifdef CONFIG_DEBUG_VM | 4624 | #ifdef CONFIG_DEBUG_VM |
4624 | printk(KERN_INFO "remove from free list %lx %d %lx\n", | 4625 | printk(KERN_INFO "remove from free list %lx %d %lx\n", |
4625 | pfn, 1 << order, end_pfn); | 4626 | pfn, 1 << order, end_pfn); |
4626 | #endif | 4627 | #endif |
4627 | list_del(&page->lru); | 4628 | list_del(&page->lru); |
4628 | rmv_page_order(page); | 4629 | rmv_page_order(page); |
4629 | zone->free_area[order].nr_free--; | 4630 | zone->free_area[order].nr_free--; |
4630 | __mod_zone_page_state(zone, NR_FREE_PAGES, | 4631 | __mod_zone_page_state(zone, NR_FREE_PAGES, |
4631 | - (1UL << order)); | 4632 | - (1UL << order)); |
4632 | for (i = 0; i < (1 << order); i++) | 4633 | for (i = 0; i < (1 << order); i++) |
4633 | SetPageReserved((page+i)); | 4634 | SetPageReserved((page+i)); |
4634 | pfn += (1 << order); | 4635 | pfn += (1 << order); |
4635 | } | 4636 | } |
4636 | spin_unlock_irqrestore(&zone->lock, flags); | 4637 | spin_unlock_irqrestore(&zone->lock, flags); |
4637 | } | 4638 | } |
4638 | #endif | 4639 | #endif |
4639 | 4640 |