Commit a41f24ea9fd6169b147c53c2392e2887cc1d9247

Authored by Nishanth Aravamudan
Committed by Linus Torvalds
1 parent ab857d0938

page allocator: smarter retry of costly-order allocations

Because of page order checks in __alloc_pages(), hugepage (and similarly
large order) allocations will not retry unless explicitly marked
__GFP_REPEAT. However, the current retry logic is nearly an infinite
loop (or until reclaim makes no progress whatsoever). For these costly
allocations, that seems like overkill and could potentially never
terminate. Mel observed that allowing the current __GFP_REPEAT semantics for
hugepage allocations essentially killed the system. I believe this is
because we may keep reclaiming small orders of pages all over memory, but
never have enough to satisfy the hugepage allocation request. This is
clearly only a problem for large order allocations, of which hugepages
are the most obvious (to me).

Modify try_to_free_pages() to indicate how many pages were reclaimed.
Use that information in __alloc_pages() to eventually fail a large
__GFP_REPEAT allocation once we've reclaimed at least as many pages as the
allocation requests (1<<order). This relies on lumpy reclaim
functioning as advertised. Due to fragmentation, lumpy reclaim may not
be able to free up the order needed in one invocation, so multiple
iterations may be required. In other words, the more fragmented memory
is, the more retry attempts __GFP_REPEAT will make (particularly for
higher order allocations).
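
The resulting retry decision reduces to a small predicate. The following is a
minimal, self-contained sketch of that logic in userspace C; the flag values,
the PAGE_ALLOC_COSTLY_ORDER value, and the name should_retry_alloc() are
illustrative assumptions used to model the behavior described above, not the
literal kernel hunk:

#include <stdbool.h>

/* Placeholder flag values and threshold for the sketch; the real
 * definitions live in include/linux/gfp.h and mm/internal.h. */
#define __GFP_NORETRY           0x1000u
#define __GFP_REPEAT            0x0400u
#define __GFP_NOFAIL            0x0800u
#define PAGE_ALLOC_COSTLY_ORDER 3

/*
 * Model of the retry decision:
 *  - orders <= PAGE_ALLOC_COSTLY_ORDER keep retrying as before;
 *  - costly __GFP_REPEAT orders retry only while the cumulative number of
 *    reclaimed pages is still below the size of the request (1 << order);
 *  - __GFP_NOFAIL always retries, __GFP_NORETRY never does.
 */
static bool should_retry_alloc(unsigned int gfp_mask, unsigned int order,
                               unsigned long pages_reclaimed)
{
        if (gfp_mask & __GFP_NORETRY)
                return false;
        if (order <= PAGE_ALLOC_COSTLY_ORDER)
                return true;
        if ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1UL << order))
                return true;
        if (gfp_mask & __GFP_NOFAIL)
                return true;
        return false;
}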

This changes the semantics of __GFP_REPEAT subtly, but *only* for
allocations > PAGE_ALLOC_COSTLY_ORDER. With this patch, for allocations of
that size, we will retry only until at least 1<<order pages have been
reclaimed, rather than forever (which remains the behavior for allocations
<= PAGE_ALLOC_COSTLY_ORDER).
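
As a concrete illustration, building on the sketch above and assuming a
hugepage is order 9 (512 base pages), the predicate asks for another reclaim
pass only until 512 pages have been reclaimed in total:

#include <stdio.h>

/* Hypothetical driver loop for the should_retry_alloc() sketch above.
 * Each simulated reclaim pass pretends to free 64 pages. */
int main(void)
{
        unsigned long reclaimed = 0;
        int passes = 0;

        while (should_retry_alloc(__GFP_REPEAT, 9, reclaimed)) {
                reclaimed += 64;
                passes++;
        }
        /* Prints: gave up after 8 passes, 512 pages reclaimed */
        printf("gave up after %d passes, %lu pages reclaimed\n",
               passes, reclaimed);
        return 0;
}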

Combined with a follow-on patch that makes hugepage pool allocations use
__GFP_REPEAT, this change improves the /proc/sys/vm/nr_hugepages interface.
Rather than administrators repeatedly echoing a particular value into the
sysctl and forcing reclaim into action manually, the sysctl can now make a
reasonable effort itself. Similarly, dynamic pool growth should be more
successful under load, as lumpy reclaim can try to free up pages rather than
failing right away.

Choosing to reclaim only up to the order of the requested allocation
strikes a balance between not failing hugepage allocations and returning
to the caller when it's unlikely to ever succeed. Because of lumpy
reclaim, if we have freed the order requested, hopefully it has been in
big chunks and those chunks will allow our allocation to succeed. If
that isn't the case after freeing up the current order, I don't think it
is likely to succeed in the future, although it is possible given a
particular fragmentation pattern.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Tested-by: Mel Gorman <mel@csn.ul.ie>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 22 additions and 7 deletions

mm/page_alloc.c

/*
 * linux/mm/page_alloc.c
 *
 * Manages the free list, the system allocates free pages here.
 * Note that kmalloc() lives in slab.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 * Swap reorganised 29.12.95, Stephen Tweedie
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/oom.h>
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/mempolicy.h>
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/memcontrol.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
	[N_CPU] = { { [0] = 1UL } },
#endif /* NUMA */
};
EXPORT_SYMBOL(node_states);

unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
long nr_swap_pages;
int percpu_pagelist_fraction;

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
int pageblock_order __read_mostly;
#endif

static void __free_pages_ok(struct page *page, unsigned int order);

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 * 1G machine -> (16M dma, 784M normal, 224M high)
 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
#ifdef CONFIG_ZONE_DMA
	256,
#endif
#ifdef CONFIG_ZONE_DMA32
	256,
#endif
#ifdef CONFIG_HIGHMEM
	32,
#endif
	32,
};

EXPORT_SYMBOL(totalram_pages);

static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	"DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	"DMA32",
#endif
	"Normal",
#ifdef CONFIG_HIGHMEM
	"HighMem",
#endif
	"Movable",
};

int min_free_kbytes = 1024;

unsigned long __meminitdata nr_kernel_pages;
unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;

#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
/*
 * MAX_ACTIVE_REGIONS determines the maximum number of distinct
 * ranges of memory (RAM) that may be registered with add_active_range().
 * Ranges passed to add_active_range() will be merged if possible
 * so the number of times add_active_range() can be called is
 * related to the number of nodes and the number of holes
 */
#ifdef CONFIG_MAX_ACTIVE_REGIONS
/* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
#define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
#else
#if MAX_NUMNODES >= 32
/* If there can be many nodes, allow up to 50 holes per node */
#define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
#else
/* By default, allow up to 256 distinct regions */
#define MAX_ACTIVE_REGIONS 256
#endif
#endif

static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
static int __meminitdata nr_nodemap_entries;
static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
unsigned long __initdata required_kernelcore;
static unsigned long __initdata required_movablecore;
unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */

#if MAX_NUMNODES > 1
int nr_node_ids __read_mostly = MAX_NUMNODES;
EXPORT_SYMBOL(nr_node_ids);
#endif

int page_group_by_mobility_disabled __read_mostly;

static void set_pageblock_migratetype(struct page *page, int migratetype)
{
	set_pageblock_flags_group(page, (unsigned long)migratetype,
					PB_migrate, PB_migrate_end);
}

#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
	int ret = 0;
	unsigned seq;
	unsigned long pfn = page_to_pfn(page);

	do {
		seq = zone_span_seqbegin(zone);
		if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
			ret = 1;
		else if (pfn < zone->zone_start_pfn)
			ret = 1;
	} while (zone_span_seqretry(zone, seq));

	return ret;
}

static int page_is_consistent(struct zone *zone, struct page *page)
{
	if (!pfn_valid_within(page_to_pfn(page)))
		return 0;
	if (zone != page_zone(page))
		return 0;

	return 1;
}
/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static int bad_range(struct zone *zone, struct page *page)
{
	if (page_outside_zone_boundaries(zone, page))
		return 1;
	if (!page_is_consistent(zone, page))
		return 1;

	return 0;
}
#else
static inline int bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
#endif

static void bad_page(struct page *page)
{
	void *pc = page_get_page_cgroup(page);

	printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
		"page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
		current->comm, page, (int)(2*sizeof(unsigned long)),
		(unsigned long)page->flags, page->mapping,
		page_mapcount(page), page_count(page));
	if (pc) {
		printk(KERN_EMERG "cgroup:%p\n", pc);
		page_reset_bad_cgroup(page);
	}
	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
		KERN_EMERG "Backtrace:\n");
	dump_stack();
	page->flags &= ~(1 << PG_lru |
			1 << PG_private |
			1 << PG_locked |
			1 << PG_active |
			1 << PG_dirty |
			1 << PG_reclaim |
			1 << PG_slab |
			1 << PG_swapcache |
			1 << PG_writeback |
			1 << PG_buddy );
	set_page_count(page, 0);
	reset_page_mapcount(page);
	page->mapping = NULL;
	add_taint(TAINT_BAD_PAGE);
}

/*
 * Higher-order pages are called "compound pages". They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page".
 *
 * The remaining PAGE_SIZE pages are called "tail pages".
 *
 * All pages have PG_compound set. All pages have their ->private pointing at
 * the head page (even the head page has this).
 *
 * The first tail page's ->lru.next holds the address of the compound page's
 * put_page() function. Its ->lru.prev holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */

static void free_compound_page(struct page *page)
{
	__free_pages_ok(page, compound_order(page));
}

static void prep_compound_page(struct page *page, unsigned long order)
{
	int i;
	int nr_pages = 1 << order;

	set_compound_page_dtor(page, free_compound_page);
	set_compound_order(page, order);
	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;

		__SetPageTail(p);
		p->first_page = page;
	}
}

static void destroy_compound_page(struct page *page, unsigned long order)
{
	int i;
	int nr_pages = 1 << order;

	if (unlikely(compound_order(page) != order))
		bad_page(page);

	if (unlikely(!PageHead(page)))
		bad_page(page);
	__ClearPageHead(page);
	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;

		if (unlikely(!PageTail(p) |
				(p->first_page != page)))
			bad_page(page);
		__ClearPageTail(p);
	}
}

static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
{
	int i;

	/*
	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
	 * and __GFP_HIGHMEM from hard or soft interrupt context.
	 */
	VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
	for (i = 0; i < (1 << order); i++)
		clear_highpage(page + i);
}

static inline void set_page_order(struct page *page, int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

static inline void rmv_page_order(struct page *page)
{
	__ClearPageBuddy(page);
	set_page_private(page, 0);
}

/*
 * Locate the struct page for both the matching buddy in our
 * pair (buddy1) and the combined O(n+1) page they form (page).
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (buddy2) is #8 its order
 * 1 buddy is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P which
 * satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
 */
static inline struct page *
__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
{
	unsigned long buddy_idx = page_idx ^ (1 << order);

	return page + (buddy_idx - page_idx);
}

static inline unsigned long
__find_combined_index(unsigned long page_idx, unsigned int order)
{
	return (page_idx & ~(1 << order));
}

/*
 * This function checks whether a page is free && is the buddy
 * we can do coalesce a page and its buddy if
 * (a) the buddy is not in a hole &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we use PG_buddy.
 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
 */
static inline int page_is_buddy(struct page *page, struct page *buddy,
								int order)
{
	if (!pfn_valid_within(page_to_pfn(buddy)))
		return 0;

	if (page_zone_id(page) != page_zone_id(buddy))
		return 0;

	if (PageBuddy(buddy) && page_order(buddy) == order) {
		BUG_ON(page_count(buddy) != 0);
		return 1;
	}
	return 0;
}

/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
 * free pages of length of (1 << order) and marked with PG_buddy. Page's
 * order is recorded in page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other. That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- wli
 */

static inline void __free_one_page(struct page *page,
		struct zone *zone, unsigned int order)
{
	unsigned long page_idx;
	int order_size = 1 << order;
	int migratetype = get_pageblock_migratetype(page);

	if (unlikely(PageCompound(page)))
		destroy_compound_page(page, order);

	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);

	VM_BUG_ON(page_idx & (order_size - 1));
	VM_BUG_ON(bad_range(zone, page));

	__mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
	while (order < MAX_ORDER-1) {
		unsigned long combined_idx;
		struct page *buddy;

		buddy = __page_find_buddy(page, page_idx, order);
		if (!page_is_buddy(page, buddy, order))
			break;		/* Move the buddy up one level. */

		list_del(&buddy->lru);
		zone->free_area[order].nr_free--;
		rmv_page_order(buddy);
		combined_idx = __find_combined_index(page_idx, order);
		page = page + (combined_idx - page_idx);
		page_idx = combined_idx;
		order++;
	}
	set_page_order(page, order);
	list_add(&page->lru,
		&zone->free_area[order].free_list[migratetype]);
	zone->free_area[order].nr_free++;
}

static inline int free_pages_check(struct page *page)
{
	if (unlikely(page_mapcount(page) |
		(page->mapping != NULL) |
		(page_get_page_cgroup(page) != NULL) |
		(page_count(page) != 0) |
		(page->flags & (
			1 << PG_lru |
			1 << PG_private |
			1 << PG_locked |
			1 << PG_active |
			1 << PG_slab |
			1 << PG_swapcache |
			1 << PG_writeback |
			1 << PG_reserved |
			1 << PG_buddy ))))
		bad_page(page);
	if (PageDirty(page))
		__ClearPageDirty(page);
	/*
	 * For now, we report if PG_reserved was found set, but do not
	 * clear it, and do not free the page. But we shall soon need
	 * to do more, for when the ZERO_PAGE count wraps negative.
	 */
	return PageReserved(page);
}

/*
 * Frees a list of pages.
 * Assumes all pages on list are in same zone, and of same order.
 * count is the number of pages to free.
 *
 * If the zone was previously in an "all pages pinned" state then look to
 * see if this freeing clears that state.
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
static void free_pages_bulk(struct zone *zone, int count,
					struct list_head *list, int order)
{
	spin_lock(&zone->lock);
	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
	zone->pages_scanned = 0;
	while (count--) {
		struct page *page;

		VM_BUG_ON(list_empty(list));
		page = list_entry(list->prev, struct page, lru);
		/* have to delete it as __free_one_page list manipulates */
		list_del(&page->lru);
		__free_one_page(page, zone, order);
	}
	spin_unlock(&zone->lock);
}

static void free_one_page(struct zone *zone, struct page *page, int order)
{
	spin_lock(&zone->lock);
	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
	zone->pages_scanned = 0;
	__free_one_page(page, zone, order);
	spin_unlock(&zone->lock);
}

static void __free_pages_ok(struct page *page, unsigned int order)
{
	unsigned long flags;
	int i;
	int reserved = 0;

	for (i = 0 ; i < (1 << order) ; ++i)
		reserved += free_pages_check(page + i);
	if (reserved)
		return;

	if (!PageHighMem(page))
		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
	arch_free_page(page, order);
	kernel_map_pages(page, 1 << order, 0);

	local_irq_save(flags);
	__count_vm_events(PGFREE, 1 << order);
	free_one_page(page_zone(page), page, order);
	local_irq_restore(flags);
}

/*
 * permit the bootmem allocator to evade page validation on high-order frees
 */
void __free_pages_bootmem(struct page *page, unsigned int order)
{
	if (order == 0) {
		__ClearPageReserved(page);
		set_page_count(page, 0);
		set_page_refcounted(page);
		__free_page(page);
	} else {
		int loop;

		prefetchw(page);
		for (loop = 0; loop < BITS_PER_LONG; loop++) {
			struct page *p = &page[loop];

			if (loop + 1 < BITS_PER_LONG)
				prefetchw(p + 1);
			__ClearPageReserved(p);
			set_page_count(p, 0);
		}

		set_page_refcounted(page);
		__free_pages(page, order);
	}
}


/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- wli
 */
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	unsigned long size = 1 << high;

	while (high > low) {
		area--;
		high--;
		size >>= 1;
		VM_BUG_ON(bad_range(zone, &page[size]));
		list_add(&page[size].lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
}

/*
 * This page is about to be returned from the page allocator
 */
static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
{
	if (unlikely(page_mapcount(page) |
		(page->mapping != NULL) |
		(page_get_page_cgroup(page) != NULL) |
		(page_count(page) != 0) |
		(page->flags & (
			1 << PG_lru |
			1 << PG_private |
			1 << PG_locked |
			1 << PG_active |
			1 << PG_dirty |
			1 << PG_slab |
			1 << PG_swapcache |
			1 << PG_writeback |
			1 << PG_reserved |
			1 << PG_buddy ))))
		bad_page(page);

	/*
	 * For now, we report if PG_reserved was found set, but do not
	 * clear it, and do not allocate the page: as a safety net.
	 */
	if (PageReserved(page))
		return 1;

	page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
			1 << PG_referenced | 1 << PG_arch_1 |
			1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
	set_page_private(page, 0);
	set_page_refcounted(page);

	arch_alloc_page(page, order);
	kernel_map_pages(page, 1 << order, 1);

	if (gfp_flags & __GFP_ZERO)
		prep_zero_page(page, order, gfp_flags);

	if (order && (gfp_flags & __GFP_COMP))
		prep_compound_page(page, order);

	return 0;
}

/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 */
static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area * area;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		if (list_empty(&area->free_list[migratetype]))
			continue;

		page = list_entry(area->free_list[migratetype].next,
							struct page, lru);
		list_del(&page->lru);
		rmv_page_order(page);
		area->nr_free--;
		__mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
		expand(zone, page, order, current_order, area, migratetype);
		return page;
	}

	return NULL;
}


/*
 * This array describes the order lists are fallen back to when
 * the free lists for the desirable migrate type are depleted
 */
static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
	[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
	[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
	[MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */
};

/*
 * Move the free pages in a range to the free lists of the requested type.
 * Note that start_page and end_pages are not aligned on a pageblock
 * boundary. If alignment is required, use move_freepages_block()
 */
int move_freepages(struct zone *zone,
			struct page *start_page, struct page *end_page,
			int migratetype)
{
	struct page *page;
	unsigned long order;
	int pages_moved = 0;

#ifndef CONFIG_HOLES_IN_ZONE
	/*
	 * page_zone is not safe to call in this context when
	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
	 * anyway as we check zone boundaries in move_freepages_block().
	 * Remove at a later date when no bug reports exist related to
	 * grouping pages by mobility
	 */
	BUG_ON(page_zone(start_page) != page_zone(end_page));
#endif

	for (page = start_page; page <= end_page;) {
		if (!pfn_valid_within(page_to_pfn(page))) {
			page++;
			continue;
		}

		if (!PageBuddy(page)) {
			page++;
			continue;
		}

		order = page_order(page);
		list_del(&page->lru);
		list_add(&page->lru,
			&zone->free_area[order].free_list[migratetype]);
		page += 1 << order;
		pages_moved += 1 << order;
	}

	return pages_moved;
}

int move_freepages_block(struct zone *zone, struct page *page, int migratetype)
{
	unsigned long start_pfn, end_pfn;
	struct page *start_page, *end_page;

	start_pfn = page_to_pfn(page);
	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
	start_page = pfn_to_page(start_pfn);
	end_page = start_page + pageblock_nr_pages - 1;
	end_pfn = start_pfn + pageblock_nr_pages - 1;

	/* Do not cross zone boundaries */
	if (start_pfn < zone->zone_start_pfn)
		start_page = page;
	if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
		return 0;

	return move_freepages(zone, start_page, end_page, migratetype);
}

/* Remove an element from the buddy allocator from the fallback list */
static struct page *__rmqueue_fallback(struct zone *zone, int order,
						int start_migratetype)
{
	struct free_area * area;
	int current_order;
	struct page *page;
	int migratetype, i;

	/* Find the largest possible block of pages in the other list */
	for (current_order = MAX_ORDER-1; current_order >= order;
						--current_order) {
		for (i = 0; i < MIGRATE_TYPES - 1; i++) {
			migratetype = fallbacks[start_migratetype][i];

			/* MIGRATE_RESERVE handled later if necessary */
			if (migratetype == MIGRATE_RESERVE)
				continue;

			area = &(zone->free_area[current_order]);
			if (list_empty(&area->free_list[migratetype]))
				continue;

			page = list_entry(area->free_list[migratetype].next,
					struct page, lru);
			area->nr_free--;

			/*
			 * If breaking a large block of pages, move all free
			 * pages to the preferred allocation list. If falling
			 * back for a reclaimable kernel allocation, be more
			 * agressive about taking ownership of free pages
			 */
			if (unlikely(current_order >= (pageblock_order >> 1)) ||
					start_migratetype == MIGRATE_RECLAIMABLE) {
				unsigned long pages;
				pages = move_freepages_block(zone, page,
							start_migratetype);

				/* Claim the whole block if over half of it is free */
				if (pages >= (1 << (pageblock_order-1)))
					set_pageblock_migratetype(page,
							start_migratetype);

				migratetype = start_migratetype;
			}

			/* Remove the page from the freelists */
			list_del(&page->lru);
			rmv_page_order(page);
			__mod_zone_page_state(zone, NR_FREE_PAGES,
							-(1UL << order));

			if (current_order == pageblock_order)
				set_pageblock_migratetype(page,
							start_migratetype);

			expand(zone, page, order, current_order, area, migratetype);
			return page;
		}
	}

	/* Use MIGRATE_RESERVE rather than fail an allocation */
	return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
}

/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
						int migratetype)
{
	struct page *page;

	page = __rmqueue_smallest(zone, order, migratetype);

	if (unlikely(!page))
		page = __rmqueue_fallback(zone, order, migratetype);

	return page;
}

/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency. Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
			unsigned long count, struct list_head *list,
			int migratetype)
{
	int i;

	spin_lock(&zone->lock);
	for (i = 0; i < count; ++i) {
		struct page *page = __rmqueue(zone, order, migratetype);
		if (unlikely(page == NULL))
			break;

		/*
		 * Split buddy pages returned by expand() are received here
		 * in physical page order. The page is added to the callers and
		 * list and the list head then moves forward. From the callers
		 * perspective, the linked list is ordered by page number in
		 * some conditions. This is useful for IO devices that can
		 * merge IO requests if the physical pages are ordered
		 * properly.
		 */
		list_add(&page->lru, list);
		set_page_private(page, migratetype);
		list = &page->lru;
	}
	spin_unlock(&zone->lock);
	return i;
}

#ifdef CONFIG_NUMA
/*
 * Called from the vmstat counter updater to drain pagesets of this
 * currently executing processor on remote nodes after they have
 * expired.
 *
 * Note that this function must be called with the thread pinned to
 * a single processor.
 */
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
	unsigned long flags;
	int to_drain;

	local_irq_save(flags);
	if (pcp->count >= pcp->batch)
		to_drain = pcp->batch;
	else
		to_drain = pcp->count;
	free_pages_bulk(zone, to_drain, &pcp->list, 0);
	pcp->count -= to_drain;
	local_irq_restore(flags);
}
#endif

/*
 * Drain pages of the indicated processor.
 *
 * The processor must either be the current processor and the
 * thread pinned to the current processor or a processor that
 * is not online.
 */
static void drain_pages(unsigned int cpu)
{
	unsigned long flags;
	struct zone *zone;

	for_each_zone(zone) {
		struct per_cpu_pageset *pset;
		struct per_cpu_pages *pcp;

		if (!populated_zone(zone))
			continue;

		pset = zone_pcp(zone, cpu);

		pcp = &pset->pcp;
		local_irq_save(flags);
		free_pages_bulk(zone, pcp->count, &pcp->list, 0);
		pcp->count = 0;
		local_irq_restore(flags);
	}
}

/*
 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
 */
void drain_local_pages(void *arg)
{
	drain_pages(smp_processor_id());
}

/*
 * Spill all the per-cpu pages from all CPUs back into the buddy allocator
 */
void drain_all_pages(void)
{
	on_each_cpu(drain_local_pages, NULL, 0, 1);
946 } 946 }
947 947
948 #ifdef CONFIG_HIBERNATION 948 #ifdef CONFIG_HIBERNATION
949 949
950 void mark_free_pages(struct zone *zone) 950 void mark_free_pages(struct zone *zone)
951 { 951 {
952 unsigned long pfn, max_zone_pfn; 952 unsigned long pfn, max_zone_pfn;
953 unsigned long flags; 953 unsigned long flags;
954 int order, t; 954 int order, t;
955 struct list_head *curr; 955 struct list_head *curr;
956 956
957 if (!zone->spanned_pages) 957 if (!zone->spanned_pages)
958 return; 958 return;
959 959
960 spin_lock_irqsave(&zone->lock, flags); 960 spin_lock_irqsave(&zone->lock, flags);
961 961
962 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 962 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
963 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 963 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
964 if (pfn_valid(pfn)) { 964 if (pfn_valid(pfn)) {
965 struct page *page = pfn_to_page(pfn); 965 struct page *page = pfn_to_page(pfn);
966 966
967 if (!swsusp_page_is_forbidden(page)) 967 if (!swsusp_page_is_forbidden(page))
968 swsusp_unset_page_free(page); 968 swsusp_unset_page_free(page);
969 } 969 }
970 970
971 for_each_migratetype_order(order, t) { 971 for_each_migratetype_order(order, t) {
972 list_for_each(curr, &zone->free_area[order].free_list[t]) { 972 list_for_each(curr, &zone->free_area[order].free_list[t]) {
973 unsigned long i; 973 unsigned long i;
974 974
975 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 975 pfn = page_to_pfn(list_entry(curr, struct page, lru));
976 for (i = 0; i < (1UL << order); i++) 976 for (i = 0; i < (1UL << order); i++)
977 swsusp_set_page_free(pfn_to_page(pfn + i)); 977 swsusp_set_page_free(pfn_to_page(pfn + i));
978 } 978 }
979 } 979 }
980 spin_unlock_irqrestore(&zone->lock, flags); 980 spin_unlock_irqrestore(&zone->lock, flags);
981 } 981 }
982 #endif /* CONFIG_HIBERNATION */ 982 #endif /* CONFIG_HIBERNATION */
983 983
984 /* 984 /*
985 * Free a 0-order page 985 * Free a 0-order page
986 */ 986 */
987 static void free_hot_cold_page(struct page *page, int cold) 987 static void free_hot_cold_page(struct page *page, int cold)
988 { 988 {
989 struct zone *zone = page_zone(page); 989 struct zone *zone = page_zone(page);
990 struct per_cpu_pages *pcp; 990 struct per_cpu_pages *pcp;
991 unsigned long flags; 991 unsigned long flags;
992 992
993 if (PageAnon(page)) 993 if (PageAnon(page))
994 page->mapping = NULL; 994 page->mapping = NULL;
995 if (free_pages_check(page)) 995 if (free_pages_check(page))
996 return; 996 return;
997 997
998 if (!PageHighMem(page)) 998 if (!PageHighMem(page))
999 debug_check_no_locks_freed(page_address(page), PAGE_SIZE); 999 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
1000 arch_free_page(page, 0); 1000 arch_free_page(page, 0);
1001 kernel_map_pages(page, 1, 0); 1001 kernel_map_pages(page, 1, 0);
1002 1002
1003 pcp = &zone_pcp(zone, get_cpu())->pcp; 1003 pcp = &zone_pcp(zone, get_cpu())->pcp;
1004 local_irq_save(flags); 1004 local_irq_save(flags);
1005 __count_vm_event(PGFREE); 1005 __count_vm_event(PGFREE);
1006 if (cold) 1006 if (cold)
1007 list_add_tail(&page->lru, &pcp->list); 1007 list_add_tail(&page->lru, &pcp->list);
1008 else 1008 else
1009 list_add(&page->lru, &pcp->list); 1009 list_add(&page->lru, &pcp->list);
1010 set_page_private(page, get_pageblock_migratetype(page)); 1010 set_page_private(page, get_pageblock_migratetype(page));
1011 pcp->count++; 1011 pcp->count++;
1012 if (pcp->count >= pcp->high) { 1012 if (pcp->count >= pcp->high) {
1013 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1013 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
1014 pcp->count -= pcp->batch; 1014 pcp->count -= pcp->batch;
1015 } 1015 }
1016 local_irq_restore(flags); 1016 local_irq_restore(flags);
1017 put_cpu(); 1017 put_cpu();
1018 } 1018 }
1019 1019
1020 void free_hot_page(struct page *page) 1020 void free_hot_page(struct page *page)
1021 { 1021 {
1022 free_hot_cold_page(page, 0); 1022 free_hot_cold_page(page, 0);
1023 } 1023 }
1024 1024
1025 void free_cold_page(struct page *page) 1025 void free_cold_page(struct page *page)
1026 { 1026 {
1027 free_hot_cold_page(page, 1); 1027 free_hot_cold_page(page, 1);
1028 } 1028 }
1029 1029
1030 /* 1030 /*
1031 * split_page takes a non-compound higher-order page, and splits it into 1031 * split_page takes a non-compound higher-order page, and splits it into
1032 * n (= 1<<order) sub-pages: page[0..n-1] 1032 * n (= 1<<order) sub-pages: page[0..n-1]
1033 * Each sub-page must be freed individually. 1033 * Each sub-page must be freed individually.
1034 * 1034 *
1035 * Note: this is probably too low level an operation for use in drivers. 1035 * Note: this is probably too low level an operation for use in drivers.
1036 * Please consult with lkml before using this in your driver. 1036 * Please consult with lkml before using this in your driver.
1037 */ 1037 */
1038 void split_page(struct page *page, unsigned int order) 1038 void split_page(struct page *page, unsigned int order)
1039 { 1039 {
1040 int i; 1040 int i;
1041 1041
1042 VM_BUG_ON(PageCompound(page)); 1042 VM_BUG_ON(PageCompound(page));
1043 VM_BUG_ON(!page_count(page)); 1043 VM_BUG_ON(!page_count(page));
1044 for (i = 1; i < (1 << order); i++) 1044 for (i = 1; i < (1 << order); i++)
1045 set_page_refcounted(page + i); 1045 set_page_refcounted(page + i);
1046 } 1046 }
1047 1047
1048 /* 1048 /*
1049 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1049 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1050 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1050 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1051 * or two. 1051 * or two.
1052 */ 1052 */
1053 static struct page *buffered_rmqueue(struct zone *preferred_zone, 1053 static struct page *buffered_rmqueue(struct zone *preferred_zone,
1054 struct zone *zone, int order, gfp_t gfp_flags) 1054 struct zone *zone, int order, gfp_t gfp_flags)
1055 { 1055 {
1056 unsigned long flags; 1056 unsigned long flags;
1057 struct page *page; 1057 struct page *page;
1058 int cold = !!(gfp_flags & __GFP_COLD); 1058 int cold = !!(gfp_flags & __GFP_COLD);
1059 int cpu; 1059 int cpu;
1060 int migratetype = allocflags_to_migratetype(gfp_flags); 1060 int migratetype = allocflags_to_migratetype(gfp_flags);
1061 1061
1062 again: 1062 again:
1063 cpu = get_cpu(); 1063 cpu = get_cpu();
1064 if (likely(order == 0)) { 1064 if (likely(order == 0)) {
1065 struct per_cpu_pages *pcp; 1065 struct per_cpu_pages *pcp;
1066 1066
1067 pcp = &zone_pcp(zone, cpu)->pcp; 1067 pcp = &zone_pcp(zone, cpu)->pcp;
1068 local_irq_save(flags); 1068 local_irq_save(flags);
1069 if (!pcp->count) { 1069 if (!pcp->count) {
1070 pcp->count = rmqueue_bulk(zone, 0, 1070 pcp->count = rmqueue_bulk(zone, 0,
1071 pcp->batch, &pcp->list, migratetype); 1071 pcp->batch, &pcp->list, migratetype);
1072 if (unlikely(!pcp->count)) 1072 if (unlikely(!pcp->count))
1073 goto failed; 1073 goto failed;
1074 } 1074 }
1075 1075
1076 /* Find a page of the appropriate migrate type */ 1076 /* Find a page of the appropriate migrate type */
1077 if (cold) { 1077 if (cold) {
1078 list_for_each_entry_reverse(page, &pcp->list, lru) 1078 list_for_each_entry_reverse(page, &pcp->list, lru)
1079 if (page_private(page) == migratetype) 1079 if (page_private(page) == migratetype)
1080 break; 1080 break;
1081 } else { 1081 } else {
1082 list_for_each_entry(page, &pcp->list, lru) 1082 list_for_each_entry(page, &pcp->list, lru)
1083 if (page_private(page) == migratetype) 1083 if (page_private(page) == migratetype)
1084 break; 1084 break;
1085 } 1085 }
1086 1086
1087 /* Allocate more to the pcp list if necessary */ 1087 /* Allocate more to the pcp list if necessary */
1088 if (unlikely(&page->lru == &pcp->list)) { 1088 if (unlikely(&page->lru == &pcp->list)) {
1089 pcp->count += rmqueue_bulk(zone, 0, 1089 pcp->count += rmqueue_bulk(zone, 0,
1090 pcp->batch, &pcp->list, migratetype); 1090 pcp->batch, &pcp->list, migratetype);
1091 page = list_entry(pcp->list.next, struct page, lru); 1091 page = list_entry(pcp->list.next, struct page, lru);
1092 } 1092 }
1093 1093
1094 list_del(&page->lru); 1094 list_del(&page->lru);
1095 pcp->count--; 1095 pcp->count--;
1096 } else { 1096 } else {
1097 spin_lock_irqsave(&zone->lock, flags); 1097 spin_lock_irqsave(&zone->lock, flags);
1098 page = __rmqueue(zone, order, migratetype); 1098 page = __rmqueue(zone, order, migratetype);
1099 spin_unlock(&zone->lock); 1099 spin_unlock(&zone->lock);
1100 if (!page) 1100 if (!page)
1101 goto failed; 1101 goto failed;
1102 } 1102 }
1103 1103
1104 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1104 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1105 zone_statistics(preferred_zone, zone); 1105 zone_statistics(preferred_zone, zone);
1106 local_irq_restore(flags); 1106 local_irq_restore(flags);
1107 put_cpu(); 1107 put_cpu();
1108 1108
1109 VM_BUG_ON(bad_range(zone, page)); 1109 VM_BUG_ON(bad_range(zone, page));
1110 if (prep_new_page(page, order, gfp_flags)) 1110 if (prep_new_page(page, order, gfp_flags))
1111 goto again; 1111 goto again;
1112 return page; 1112 return page;
1113 1113
1114 failed: 1114 failed:
1115 local_irq_restore(flags); 1115 local_irq_restore(flags);
1116 put_cpu(); 1116 put_cpu();
1117 return NULL; 1117 return NULL;
1118 } 1118 }
1119 1119
1120 #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 1120 #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
1121 #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ 1121 #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
1122 #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ 1122 #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
1123 #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ 1123 #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
1124 #define ALLOC_HARDER 0x10 /* try to alloc harder */ 1124 #define ALLOC_HARDER 0x10 /* try to alloc harder */
1125 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 1125 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1126 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 1126 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
1127 1127
1128 #ifdef CONFIG_FAIL_PAGE_ALLOC 1128 #ifdef CONFIG_FAIL_PAGE_ALLOC
1129 1129
1130 static struct fail_page_alloc_attr { 1130 static struct fail_page_alloc_attr {
1131 struct fault_attr attr; 1131 struct fault_attr attr;
1132 1132
1133 u32 ignore_gfp_highmem; 1133 u32 ignore_gfp_highmem;
1134 u32 ignore_gfp_wait; 1134 u32 ignore_gfp_wait;
1135 u32 min_order; 1135 u32 min_order;
1136 1136
1137 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1137 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1138 1138
1139 struct dentry *ignore_gfp_highmem_file; 1139 struct dentry *ignore_gfp_highmem_file;
1140 struct dentry *ignore_gfp_wait_file; 1140 struct dentry *ignore_gfp_wait_file;
1141 struct dentry *min_order_file; 1141 struct dentry *min_order_file;
1142 1142
1143 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1143 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1144 1144
1145 } fail_page_alloc = { 1145 } fail_page_alloc = {
1146 .attr = FAULT_ATTR_INITIALIZER, 1146 .attr = FAULT_ATTR_INITIALIZER,
1147 .ignore_gfp_wait = 1, 1147 .ignore_gfp_wait = 1,
1148 .ignore_gfp_highmem = 1, 1148 .ignore_gfp_highmem = 1,
1149 .min_order = 1, 1149 .min_order = 1,
1150 }; 1150 };
1151 1151
1152 static int __init setup_fail_page_alloc(char *str) 1152 static int __init setup_fail_page_alloc(char *str)
1153 { 1153 {
1154 return setup_fault_attr(&fail_page_alloc.attr, str); 1154 return setup_fault_attr(&fail_page_alloc.attr, str);
1155 } 1155 }
1156 __setup("fail_page_alloc=", setup_fail_page_alloc); 1156 __setup("fail_page_alloc=", setup_fail_page_alloc);
1157 1157
1158 static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1158 static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1159 { 1159 {
1160 if (order < fail_page_alloc.min_order) 1160 if (order < fail_page_alloc.min_order)
1161 return 0; 1161 return 0;
1162 if (gfp_mask & __GFP_NOFAIL) 1162 if (gfp_mask & __GFP_NOFAIL)
1163 return 0; 1163 return 0;
1164 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1164 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1165 return 0; 1165 return 0;
1166 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1166 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1167 return 0; 1167 return 0;
1168 1168
1169 return should_fail(&fail_page_alloc.attr, 1 << order); 1169 return should_fail(&fail_page_alloc.attr, 1 << order);
1170 } 1170 }
1171 1171
1172 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1172 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1173 1173
1174 static int __init fail_page_alloc_debugfs(void) 1174 static int __init fail_page_alloc_debugfs(void)
1175 { 1175 {
1176 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1176 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1177 struct dentry *dir; 1177 struct dentry *dir;
1178 int err; 1178 int err;
1179 1179
1180 err = init_fault_attr_dentries(&fail_page_alloc.attr, 1180 err = init_fault_attr_dentries(&fail_page_alloc.attr,
1181 "fail_page_alloc"); 1181 "fail_page_alloc");
1182 if (err) 1182 if (err)
1183 return err; 1183 return err;
1184 dir = fail_page_alloc.attr.dentries.dir; 1184 dir = fail_page_alloc.attr.dentries.dir;
1185 1185
1186 fail_page_alloc.ignore_gfp_wait_file = 1186 fail_page_alloc.ignore_gfp_wait_file =
1187 debugfs_create_bool("ignore-gfp-wait", mode, dir, 1187 debugfs_create_bool("ignore-gfp-wait", mode, dir,
1188 &fail_page_alloc.ignore_gfp_wait); 1188 &fail_page_alloc.ignore_gfp_wait);
1189 1189
1190 fail_page_alloc.ignore_gfp_highmem_file = 1190 fail_page_alloc.ignore_gfp_highmem_file =
1191 debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1191 debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1192 &fail_page_alloc.ignore_gfp_highmem); 1192 &fail_page_alloc.ignore_gfp_highmem);
1193 fail_page_alloc.min_order_file = 1193 fail_page_alloc.min_order_file =
1194 debugfs_create_u32("min-order", mode, dir, 1194 debugfs_create_u32("min-order", mode, dir,
1195 &fail_page_alloc.min_order); 1195 &fail_page_alloc.min_order);
1196 1196
1197 if (!fail_page_alloc.ignore_gfp_wait_file || 1197 if (!fail_page_alloc.ignore_gfp_wait_file ||
1198 !fail_page_alloc.ignore_gfp_highmem_file || 1198 !fail_page_alloc.ignore_gfp_highmem_file ||
1199 !fail_page_alloc.min_order_file) { 1199 !fail_page_alloc.min_order_file) {
1200 err = -ENOMEM; 1200 err = -ENOMEM;
1201 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); 1201 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
1202 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); 1202 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
1203 debugfs_remove(fail_page_alloc.min_order_file); 1203 debugfs_remove(fail_page_alloc.min_order_file);
1204 cleanup_fault_attr_dentries(&fail_page_alloc.attr); 1204 cleanup_fault_attr_dentries(&fail_page_alloc.attr);
1205 } 1205 }
1206 1206
1207 return err; 1207 return err;
1208 } 1208 }
1209 1209
1210 late_initcall(fail_page_alloc_debugfs); 1210 late_initcall(fail_page_alloc_debugfs);
1211 1211
1212 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1212 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1213 1213
1214 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1214 #else /* CONFIG_FAIL_PAGE_ALLOC */
1215 1215
1216 static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1216 static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1217 { 1217 {
1218 return 0; 1218 return 0;
1219 } 1219 }
1220 1220
1221 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1221 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1222 1222
1223 /* 1223 /*
1224 * Return 1 if free pages are above 'mark'. This takes into account the order 1224 * Return 1 if free pages are above 'mark'. This takes into account the order
1225 * of the allocation. 1225 * of the allocation.
1226 */ 1226 */
1227 int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1227 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1228 int classzone_idx, int alloc_flags) 1228 int classzone_idx, int alloc_flags)
1229 { 1229 {
1230 /* free_pages may go negative - that's OK */ 1230 /* free_pages may go negative - that's OK */
1231 long min = mark; 1231 long min = mark;
1232 long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; 1232 long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
1233 int o; 1233 int o;
1234 1234
1235 if (alloc_flags & ALLOC_HIGH) 1235 if (alloc_flags & ALLOC_HIGH)
1236 min -= min / 2; 1236 min -= min / 2;
1237 if (alloc_flags & ALLOC_HARDER) 1237 if (alloc_flags & ALLOC_HARDER)
1238 min -= min / 4; 1238 min -= min / 4;
1239 1239
1240 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1240 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1241 return 0; 1241 return 0;
1242 for (o = 0; o < order; o++) { 1242 for (o = 0; o < order; o++) {
1243 /* At the next order, this order's pages become unavailable */ 1243 /* At the next order, this order's pages become unavailable */
1244 free_pages -= z->free_area[o].nr_free << o; 1244 free_pages -= z->free_area[o].nr_free << o;
1245 1245
1246 /* Require fewer higher order pages to be free */ 1246 /* Require fewer higher order pages to be free */
1247 min >>= 1; 1247 min >>= 1;
1248 1248
1249 if (free_pages <= min) 1249 if (free_pages <= min)
1250 return 0; 1250 return 0;
1251 } 1251 }
1252 return 1; 1252 return 1;
1253 } 1253 }
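
As a rough illustration of the order loop above, the following toy (invented numbers; the nr_free array and the watermark value are hypothetical, not taken from any real zone) shows why a zone whose free memory sits entirely in order-0 pages can fail an order-2 check even though its total free memory is above the mark.

#include <stdio.h>

int main(void)
{
        long free_pages = 2000;            /* hypothetical total free pages in the zone */
        long nr_free[3] = { 2000, 0, 0 };  /* free pages per order: all of it is order-0 */
        long min = 1024;                   /* hypothetical watermark ("mark") */
        int order = 2, o;

        for (o = 0; o < order; o++) {
                /* at the next order, this order's pages can no longer help */
                free_pages -= nr_free[o] << o;
                /* require fewer higher-order pages to be free */
                min >>= 1;
                if (free_pages <= min) {
                        printf("order-%d request fails at o=%d\n", order, o);
                        return 0;
                }
        }
        printf("order-%d request passes\n", order);
        return 0;
}
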
1254 1254
1255 #ifdef CONFIG_NUMA 1255 #ifdef CONFIG_NUMA
1256 /* 1256 /*
1257 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1257 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1258 * skip over zones that are not allowed by the cpuset, or that have 1258 * skip over zones that are not allowed by the cpuset, or that have
1259 * been recently (in last second) found to be nearly full. See further 1259 * been recently (in last second) found to be nearly full. See further
1260 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1260 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1261 * that have to skip over a lot of full or unallowed zones. 1261 * that have to skip over a lot of full or unallowed zones.
1262 * 1262 *
1263 * If the zonelist cache is present in the passed in zonelist, then 1263 * If the zonelist cache is present in the passed in zonelist, then
1264 * returns a pointer to the allowed node mask (either the current 1264 * returns a pointer to the allowed node mask (either the current
1265 * task's mems_allowed, or node_states[N_HIGH_MEMORY].) 1265 * task's mems_allowed, or node_states[N_HIGH_MEMORY].)
1266 * 1266 *
1267 * If the zonelist cache is not available for this zonelist, does 1267 * If the zonelist cache is not available for this zonelist, does
1268 * nothing and returns NULL. 1268 * nothing and returns NULL.
1269 * 1269 *
1270 * If the fullzones BITMAP in the zonelist cache is stale (more than 1270 * If the fullzones BITMAP in the zonelist cache is stale (more than
1271 * a second since last zap'd) then we zap it out (clear its bits.) 1271 * a second since last zap'd) then we zap it out (clear its bits.)
1272 * 1272 *
1273 * We hold off even calling zlc_setup, until after we've checked the 1273 * We hold off even calling zlc_setup, until after we've checked the
1274 * first zone in the zonelist, on the theory that most allocations will 1274 * first zone in the zonelist, on the theory that most allocations will
1275 * be satisfied from that first zone, so best to examine that zone as 1275 * be satisfied from that first zone, so best to examine that zone as
1276 * quickly as we can. 1276 * quickly as we can.
1277 */ 1277 */
1278 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1278 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1279 { 1279 {
1280 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1280 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1281 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1281 nodemask_t *allowednodes; /* zonelist_cache approximation */
1282 1282
1283 zlc = zonelist->zlcache_ptr; 1283 zlc = zonelist->zlcache_ptr;
1284 if (!zlc) 1284 if (!zlc)
1285 return NULL; 1285 return NULL;
1286 1286
1287 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1287 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1288 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1288 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1289 zlc->last_full_zap = jiffies; 1289 zlc->last_full_zap = jiffies;
1290 } 1290 }
1291 1291
1292 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1292 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1293 &cpuset_current_mems_allowed : 1293 &cpuset_current_mems_allowed :
1294 &node_states[N_HIGH_MEMORY]; 1294 &node_states[N_HIGH_MEMORY];
1295 return allowednodes; 1295 return allowednodes;
1296 } 1296 }
1297 1297
1298 /* 1298 /*
1299 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1299 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1300 * if it is worth looking at further for free memory: 1300 * if it is worth looking at further for free memory:
1301 * 1) Check that the zone isn't thought to be full (doesn't have its 1301 * 1) Check that the zone isn't thought to be full (doesn't have its
1302 * bit set in the zonelist_cache fullzones BITMAP). 1302 * bit set in the zonelist_cache fullzones BITMAP).
1303 * 2) Check that the zone's node (obtained from the zonelist_cache 1303 * 2) Check that the zone's node (obtained from the zonelist_cache
1304 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1304 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1305 * Return true (non-zero) if zone is worth looking at further, or 1305 * Return true (non-zero) if zone is worth looking at further, or
1306 * else return false (zero) if it is not. 1306 * else return false (zero) if it is not.
1307 * 1307 *
1308 * This check -ignores- the distinction between various watermarks, 1308 * This check -ignores- the distinction between various watermarks,
1309 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1309 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1310 * found to be full for any variation of these watermarks, it will 1310 * found to be full for any variation of these watermarks, it will
1311 * be considered full for up to one second by all requests, unless 1311 * be considered full for up to one second by all requests, unless
1312 * we are so low on memory on all allowed nodes that we are forced 1312 * we are so low on memory on all allowed nodes that we are forced
1313 * into the second scan of the zonelist. 1313 * into the second scan of the zonelist.
1314 * 1314 *
1315 * In the second scan we ignore this zonelist cache and exactly 1315 * In the second scan we ignore this zonelist cache and exactly
1316 * apply the watermarks to all zones, even if it is slower to do so. 1316 * apply the watermarks to all zones, even if it is slower to do so.
1317 * We are low on memory in the second scan, and should leave no stone 1317 * We are low on memory in the second scan, and should leave no stone
1318 * unturned looking for a free page. 1318 * unturned looking for a free page.
1319 */ 1319 */
1320 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1320 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1321 nodemask_t *allowednodes) 1321 nodemask_t *allowednodes)
1322 { 1322 {
1323 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1323 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1324 int i; /* index of *z in zonelist zones */ 1324 int i; /* index of *z in zonelist zones */
1325 int n; /* node that zone *z is on */ 1325 int n; /* node that zone *z is on */
1326 1326
1327 zlc = zonelist->zlcache_ptr; 1327 zlc = zonelist->zlcache_ptr;
1328 if (!zlc) 1328 if (!zlc)
1329 return 1; 1329 return 1;
1330 1330
1331 i = z - zonelist->_zonerefs; 1331 i = z - zonelist->_zonerefs;
1332 n = zlc->z_to_n[i]; 1332 n = zlc->z_to_n[i];
1333 1333
1334 /* This zone is worth trying if it is allowed but not full */ 1334 /* This zone is worth trying if it is allowed but not full */
1335 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1335 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1336 } 1336 }
1337 1337
1338 /* 1338 /*
1339 * Given 'z' scanning a zonelist, set the corresponding bit in 1339 * Given 'z' scanning a zonelist, set the corresponding bit in
1340 * zlc->fullzones, so that subsequent attempts to allocate a page 1340 * zlc->fullzones, so that subsequent attempts to allocate a page
1341 * from that zone don't waste time re-examining it. 1341 * from that zone don't waste time re-examining it.
1342 */ 1342 */
1343 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1343 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1344 { 1344 {
1345 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1345 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1346 int i; /* index of *z in zonelist zones */ 1346 int i; /* index of *z in zonelist zones */
1347 1347
1348 zlc = zonelist->zlcache_ptr; 1348 zlc = zonelist->zlcache_ptr;
1349 if (!zlc) 1349 if (!zlc)
1350 return; 1350 return;
1351 1351
1352 i = z - zonelist->_zonerefs; 1352 i = z - zonelist->_zonerefs;
1353 1353
1354 set_bit(i, zlc->fullzones); 1354 set_bit(i, zlc->fullzones);
1355 } 1355 }
1356 1356
1357 #else /* CONFIG_NUMA */ 1357 #else /* CONFIG_NUMA */
1358 1358
1359 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1359 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1360 { 1360 {
1361 return NULL; 1361 return NULL;
1362 } 1362 }
1363 1363
1364 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1364 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1365 nodemask_t *allowednodes) 1365 nodemask_t *allowednodes)
1366 { 1366 {
1367 return 1; 1367 return 1;
1368 } 1368 }
1369 1369
1370 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1370 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1371 { 1371 {
1372 } 1372 }
1373 #endif /* CONFIG_NUMA */ 1373 #endif /* CONFIG_NUMA */
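
As a rough sketch of the zonelist-cache idea described in the comments above (a per-zone "full" bit that expires after roughly a second), the following standalone toy uses hypothetical names such as toy_zlc and wall-clock time in place of jiffies; it is an illustration of the caching scheme, not the kernel code.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#define MAX_ZONES 16

/* Stand-in for struct zonelist_cache: one "full" bit per zone plus a
 * timestamp of the last time the bits were cleared. */
struct toy_zlc {
        bool fullzones[MAX_ZONES];
        time_t last_full_zap;
};

/* Clear stale "full" bits once more than a second has passed, mirroring
 * the jiffies-based zap in zlc_setup(). */
static void toy_zlc_setup(struct toy_zlc *zlc)
{
        time_t now = time(NULL);

        if (now > zlc->last_full_zap + 1) {
                memset(zlc->fullzones, 0, sizeof(zlc->fullzones));
                zlc->last_full_zap = now;
        }
}

/* A zone is worth scanning only if it has not recently been seen full. */
static bool toy_zone_worth_trying(const struct toy_zlc *zlc, int zone_idx)
{
        return !zlc->fullzones[zone_idx];
}

/* Remember that a zone just failed its watermark check. */
static void toy_mark_zone_full(struct toy_zlc *zlc, int zone_idx)
{
        zlc->fullzones[zone_idx] = true;
}

int main(void)
{
        struct toy_zlc zlc = { .last_full_zap = 0 };

        toy_zlc_setup(&zlc);           /* stale bits (none yet) would be cleared here */
        toy_mark_zone_full(&zlc, 0);   /* zone 0 failed its watermark check */
        printf("zone 0 worth trying: %d\n", toy_zone_worth_trying(&zlc, 0));
        printf("zone 1 worth trying: %d\n", toy_zone_worth_trying(&zlc, 1));
        return 0;
}
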
1374 1374
1375 /* 1375 /*
1376 * get_page_from_freelist goes through the zonelist trying to allocate 1376 * get_page_from_freelist goes through the zonelist trying to allocate
1377 * a page. 1377 * a page.
1378 */ 1378 */
1379 static struct page * 1379 static struct page *
1380 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1380 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1381 struct zonelist *zonelist, int high_zoneidx, int alloc_flags) 1381 struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
1382 { 1382 {
1383 struct zoneref *z; 1383 struct zoneref *z;
1384 struct page *page = NULL; 1384 struct page *page = NULL;
1385 int classzone_idx; 1385 int classzone_idx;
1386 struct zone *zone, *preferred_zone; 1386 struct zone *zone, *preferred_zone;
1387 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1387 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1388 int zlc_active = 0; /* set if using zonelist_cache */ 1388 int zlc_active = 0; /* set if using zonelist_cache */
1389 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1389 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1390 1390
1391 (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, 1391 (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
1392 &preferred_zone); 1392 &preferred_zone);
1393 classzone_idx = zone_idx(preferred_zone); 1393 classzone_idx = zone_idx(preferred_zone);
1394 1394
1395 zonelist_scan: 1395 zonelist_scan:
1396 /* 1396 /*
1397 * Scan zonelist, looking for a zone with enough free. 1397 * Scan zonelist, looking for a zone with enough free.
1398 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1398 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1399 */ 1399 */
1400 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1400 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1401 high_zoneidx, nodemask) { 1401 high_zoneidx, nodemask) {
1402 if (NUMA_BUILD && zlc_active && 1402 if (NUMA_BUILD && zlc_active &&
1403 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1403 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1404 continue; 1404 continue;
1405 if ((alloc_flags & ALLOC_CPUSET) && 1405 if ((alloc_flags & ALLOC_CPUSET) &&
1406 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1406 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1407 goto try_next_zone; 1407 goto try_next_zone;
1408 1408
1409 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1409 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1410 unsigned long mark; 1410 unsigned long mark;
1411 if (alloc_flags & ALLOC_WMARK_MIN) 1411 if (alloc_flags & ALLOC_WMARK_MIN)
1412 mark = zone->pages_min; 1412 mark = zone->pages_min;
1413 else if (alloc_flags & ALLOC_WMARK_LOW) 1413 else if (alloc_flags & ALLOC_WMARK_LOW)
1414 mark = zone->pages_low; 1414 mark = zone->pages_low;
1415 else 1415 else
1416 mark = zone->pages_high; 1416 mark = zone->pages_high;
1417 if (!zone_watermark_ok(zone, order, mark, 1417 if (!zone_watermark_ok(zone, order, mark,
1418 classzone_idx, alloc_flags)) { 1418 classzone_idx, alloc_flags)) {
1419 if (!zone_reclaim_mode || 1419 if (!zone_reclaim_mode ||
1420 !zone_reclaim(zone, gfp_mask, order)) 1420 !zone_reclaim(zone, gfp_mask, order))
1421 goto this_zone_full; 1421 goto this_zone_full;
1422 } 1422 }
1423 } 1423 }
1424 1424
1425 page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); 1425 page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask);
1426 if (page) 1426 if (page)
1427 break; 1427 break;
1428 this_zone_full: 1428 this_zone_full:
1429 if (NUMA_BUILD) 1429 if (NUMA_BUILD)
1430 zlc_mark_zone_full(zonelist, z); 1430 zlc_mark_zone_full(zonelist, z);
1431 try_next_zone: 1431 try_next_zone:
1432 if (NUMA_BUILD && !did_zlc_setup) { 1432 if (NUMA_BUILD && !did_zlc_setup) {
1433 /* we do zlc_setup after the first zone is tried */ 1433 /* we do zlc_setup after the first zone is tried */
1434 allowednodes = zlc_setup(zonelist, alloc_flags); 1434 allowednodes = zlc_setup(zonelist, alloc_flags);
1435 zlc_active = 1; 1435 zlc_active = 1;
1436 did_zlc_setup = 1; 1436 did_zlc_setup = 1;
1437 } 1437 }
1438 } 1438 }
1439 1439
1440 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1440 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1441 /* Disable zlc cache for second zonelist scan */ 1441 /* Disable zlc cache for second zonelist scan */
1442 zlc_active = 0; 1442 zlc_active = 0;
1443 goto zonelist_scan; 1443 goto zonelist_scan;
1444 } 1444 }
1445 return page; 1445 return page;
1446 } 1446 }
1447 1447
1448 /* 1448 /*
1449 * This is the 'heart' of the zoned buddy allocator. 1449 * This is the 'heart' of the zoned buddy allocator.
1450 */ 1450 */
1451 static struct page * 1451 static struct page *
1452 __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1452 __alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
1453 struct zonelist *zonelist, nodemask_t *nodemask) 1453 struct zonelist *zonelist, nodemask_t *nodemask)
1454 { 1454 {
1455 const gfp_t wait = gfp_mask & __GFP_WAIT; 1455 const gfp_t wait = gfp_mask & __GFP_WAIT;
1456 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1456 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1457 struct zoneref *z; 1457 struct zoneref *z;
1458 struct zone *zone; 1458 struct zone *zone;
1459 struct page *page; 1459 struct page *page;
1460 struct reclaim_state reclaim_state; 1460 struct reclaim_state reclaim_state;
1461 struct task_struct *p = current; 1461 struct task_struct *p = current;
1462 int do_retry; 1462 int do_retry;
1463 int alloc_flags; 1463 int alloc_flags;
1464 int did_some_progress; 1464 unsigned long did_some_progress;
1465 unsigned long pages_reclaimed = 0;
1465 1466
1466 might_sleep_if(wait); 1467 might_sleep_if(wait);
1467 1468
1468 if (should_fail_alloc_page(gfp_mask, order)) 1469 if (should_fail_alloc_page(gfp_mask, order))
1469 return NULL; 1470 return NULL;
1470 1471
1471 restart: 1472 restart:
1472 z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ 1473 z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */
1473 1474
1474 if (unlikely(!z->zone)) { 1475 if (unlikely(!z->zone)) {
1475 /* 1476 /*
1476 * Happens if we have an empty zonelist as a result of 1477 * Happens if we have an empty zonelist as a result of
1477 * GFP_THISNODE being used on a memoryless node 1478 * GFP_THISNODE being used on a memoryless node
1478 */ 1479 */
1479 return NULL; 1480 return NULL;
1480 } 1481 }
1481 1482
1482 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 1483 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1483 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1484 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
1484 if (page) 1485 if (page)
1485 goto got_pg; 1486 goto got_pg;
1486 1487
1487 /* 1488 /*
1488 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1489 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
1489 * __GFP_NOWARN set) should not cause reclaim since the subsystem 1490 * __GFP_NOWARN set) should not cause reclaim since the subsystem
1490 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 1491 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
1491 * using a larger set of nodes after it has established that the 1492 * using a larger set of nodes after it has established that the
1492 * allowed per node queues are empty and that nodes are 1493 * allowed per node queues are empty and that nodes are
1493 * over allocated. 1494 * over allocated.
1494 */ 1495 */
1495 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1496 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1496 goto nopage; 1497 goto nopage;
1497 1498
1498 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1499 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1499 wakeup_kswapd(zone, order); 1500 wakeup_kswapd(zone, order);
1500 1501
1501 /* 1502 /*
1502 * OK, we're below the kswapd watermark and have kicked background 1503 * OK, we're below the kswapd watermark and have kicked background
1503 * reclaim. Now things get more complex, so set up alloc_flags according 1504 * reclaim. Now things get more complex, so set up alloc_flags according
1504 * to how we want to proceed. 1505 * to how we want to proceed.
1505 * 1506 *
1506 * The caller may dip into page reserves a bit more if the caller 1507 * The caller may dip into page reserves a bit more if the caller
1507 * cannot run direct reclaim, or if the caller has realtime scheduling 1508 * cannot run direct reclaim, or if the caller has realtime scheduling
1508 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 1509 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1509 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 1510 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1510 */ 1511 */
1511 alloc_flags = ALLOC_WMARK_MIN; 1512 alloc_flags = ALLOC_WMARK_MIN;
1512 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) 1513 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1513 alloc_flags |= ALLOC_HARDER; 1514 alloc_flags |= ALLOC_HARDER;
1514 if (gfp_mask & __GFP_HIGH) 1515 if (gfp_mask & __GFP_HIGH)
1515 alloc_flags |= ALLOC_HIGH; 1516 alloc_flags |= ALLOC_HIGH;
1516 if (wait) 1517 if (wait)
1517 alloc_flags |= ALLOC_CPUSET; 1518 alloc_flags |= ALLOC_CPUSET;
1518 1519
1519 /* 1520 /*
1520 * Go through the zonelist again. Let __GFP_HIGH and allocations 1521 * Go through the zonelist again. Let __GFP_HIGH and allocations
1521 * coming from realtime tasks go deeper into reserves. 1522 * coming from realtime tasks go deeper into reserves.
1522 * 1523 *
1523 * This is the last chance, in general, before the goto nopage. 1524 * This is the last chance, in general, before the goto nopage.
1524 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 1525 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1525 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1526 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1526 */ 1527 */
1527 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 1528 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1528 high_zoneidx, alloc_flags); 1529 high_zoneidx, alloc_flags);
1529 if (page) 1530 if (page)
1530 goto got_pg; 1531 goto got_pg;
1531 1532
1532 /* This allocation should allow future memory freeing. */ 1533 /* This allocation should allow future memory freeing. */
1533 1534
1534 rebalance: 1535 rebalance:
1535 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1536 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
1536 && !in_interrupt()) { 1537 && !in_interrupt()) {
1537 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1538 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
1538 nofail_alloc: 1539 nofail_alloc:
1539 /* go through the zonelist yet again, ignoring mins */ 1540 /* go through the zonelist yet again, ignoring mins */
1540 page = get_page_from_freelist(gfp_mask, nodemask, order, 1541 page = get_page_from_freelist(gfp_mask, nodemask, order,
1541 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); 1542 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
1542 if (page) 1543 if (page)
1543 goto got_pg; 1544 goto got_pg;
1544 if (gfp_mask & __GFP_NOFAIL) { 1545 if (gfp_mask & __GFP_NOFAIL) {
1545 congestion_wait(WRITE, HZ/50); 1546 congestion_wait(WRITE, HZ/50);
1546 goto nofail_alloc; 1547 goto nofail_alloc;
1547 } 1548 }
1548 } 1549 }
1549 goto nopage; 1550 goto nopage;
1550 } 1551 }
1551 1552
1552 /* Atomic allocations - we can't balance anything */ 1553 /* Atomic allocations - we can't balance anything */
1553 if (!wait) 1554 if (!wait)
1554 goto nopage; 1555 goto nopage;
1555 1556
1556 cond_resched(); 1557 cond_resched();
1557 1558
1558 /* We now go into synchronous reclaim */ 1559 /* We now go into synchronous reclaim */
1559 cpuset_memory_pressure_bump(); 1560 cpuset_memory_pressure_bump();
1560 p->flags |= PF_MEMALLOC; 1561 p->flags |= PF_MEMALLOC;
1561 reclaim_state.reclaimed_slab = 0; 1562 reclaim_state.reclaimed_slab = 0;
1562 p->reclaim_state = &reclaim_state; 1563 p->reclaim_state = &reclaim_state;
1563 1564
1564 did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); 1565 did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
1565 1566
1566 p->reclaim_state = NULL; 1567 p->reclaim_state = NULL;
1567 p->flags &= ~PF_MEMALLOC; 1568 p->flags &= ~PF_MEMALLOC;
1568 1569
1569 cond_resched(); 1570 cond_resched();
1570 1571
1571 if (order != 0) 1572 if (order != 0)
1572 drain_all_pages(); 1573 drain_all_pages();
1573 1574
1574 if (likely(did_some_progress)) { 1575 if (likely(did_some_progress)) {
1575 page = get_page_from_freelist(gfp_mask, nodemask, order, 1576 page = get_page_from_freelist(gfp_mask, nodemask, order,
1576 zonelist, high_zoneidx, alloc_flags); 1577 zonelist, high_zoneidx, alloc_flags);
1577 if (page) 1578 if (page)
1578 goto got_pg; 1579 goto got_pg;
1579 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 1580 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1580 if (!try_set_zone_oom(zonelist, gfp_mask)) { 1581 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1581 schedule_timeout_uninterruptible(1); 1582 schedule_timeout_uninterruptible(1);
1582 goto restart; 1583 goto restart;
1583 } 1584 }
1584 1585
1585 /* 1586 /*
1586 * Go through the zonelist yet one more time, keep 1587 * Go through the zonelist yet one more time, keep
1587 * very high watermark here, this is only to catch 1588 * very high watermark here, this is only to catch
1588 * a parallel oom killing, we must fail if we're still 1589 * a parallel oom killing, we must fail if we're still
1589 * under heavy pressure. 1590 * under heavy pressure.
1590 */ 1591 */
1591 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 1592 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1592 order, zonelist, high_zoneidx, 1593 order, zonelist, high_zoneidx,
1593 ALLOC_WMARK_HIGH|ALLOC_CPUSET); 1594 ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1594 if (page) { 1595 if (page) {
1595 clear_zonelist_oom(zonelist, gfp_mask); 1596 clear_zonelist_oom(zonelist, gfp_mask);
1596 goto got_pg; 1597 goto got_pg;
1597 } 1598 }
1598 1599
1599 /* The OOM killer will not help higher order allocs so fail */ 1600 /* The OOM killer will not help higher order allocs so fail */
1600 if (order > PAGE_ALLOC_COSTLY_ORDER) { 1601 if (order > PAGE_ALLOC_COSTLY_ORDER) {
1601 clear_zonelist_oom(zonelist, gfp_mask); 1602 clear_zonelist_oom(zonelist, gfp_mask);
1602 goto nopage; 1603 goto nopage;
1603 } 1604 }
1604 1605
1605 out_of_memory(zonelist, gfp_mask, order); 1606 out_of_memory(zonelist, gfp_mask, order);
1606 clear_zonelist_oom(zonelist, gfp_mask); 1607 clear_zonelist_oom(zonelist, gfp_mask);
1607 goto restart; 1608 goto restart;
1608 } 1609 }
1609 1610
1610 /* 1611 /*
1611 * Don't let big-order allocations loop unless the caller explicitly 1612 * Don't let big-order allocations loop unless the caller explicitly
1612 * requests that. Wait for some write requests to complete then retry. 1613 * requests that. Wait for some write requests to complete then retry.
1613 * 1614 *
1614 * In this implementation, either order <= PAGE_ALLOC_COSTLY_ORDER or 1615 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1615 * __GFP_REPEAT mean __GFP_NOFAIL, but that may not be true in other 1616 * means __GFP_NOFAIL, but that may not be true in other
1616 * implementations. 1617 * implementations.
1618 *
1619 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1620 * specified, then we retry until we no longer reclaim any pages
1621 * (above), or we've reclaimed an order of pages at least as
1622 * large as the allocation's order. In both cases, if the
1623 * allocation still fails, we stop retrying.
1617 */ 1624 */
1625 pages_reclaimed += did_some_progress;
1618 do_retry = 0; 1626 do_retry = 0;
1619 if (!(gfp_mask & __GFP_NORETRY)) { 1627 if (!(gfp_mask & __GFP_NORETRY)) {
1620 if ((order <= PAGE_ALLOC_COSTLY_ORDER) || 1628 if (order <= PAGE_ALLOC_COSTLY_ORDER) {
1621 (gfp_mask & __GFP_REPEAT))
1622 do_retry = 1; 1629 do_retry = 1;
1630 } else {
1631 if (gfp_mask & __GFP_REPEAT &&
1632 pages_reclaimed < (1 << order))
1633 do_retry = 1;
1634 }
1623 if (gfp_mask & __GFP_NOFAIL) 1635 if (gfp_mask & __GFP_NOFAIL)
1624 do_retry = 1; 1636 do_retry = 1;
1625 } 1637 }
1626 if (do_retry) { 1638 if (do_retry) {
1627 congestion_wait(WRITE, HZ/50); 1639 congestion_wait(WRITE, HZ/50);
1628 goto rebalance; 1640 goto rebalance;
1629 } 1641 }
1630 1642
1631 nopage: 1643 nopage:
1632 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 1644 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
1633 printk(KERN_WARNING "%s: page allocation failure." 1645 printk(KERN_WARNING "%s: page allocation failure."
1634 " order:%d, mode:0x%x\n", 1646 " order:%d, mode:0x%x\n",
1635 p->comm, order, gfp_mask); 1647 p->comm, order, gfp_mask);
1636 dump_stack(); 1648 dump_stack();
1637 show_mem(); 1649 show_mem();
1638 } 1650 }
1639 got_pg: 1651 got_pg:
1640 return page; 1652 return page;
1641 } 1653 }
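
As a rough, standalone sketch of the retry decision this patch introduces for costly orders, the following toy uses hypothetical names (should_retry_alloc, the TOY_GFP_* bits) and a simulated reclaim rate rather than the kernel's API; it shows how a __GFP_REPEAT request above PAGE_ALLOC_COSTLY_ORDER now stops retrying once roughly 1 << order pages have been reclaimed in total.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER 3   /* matches the kernel's value at the time */

/* Hypothetical flag bits standing in for the real __GFP_* masks */
#define TOY_GFP_NORETRY 0x1
#define TOY_GFP_REPEAT  0x2
#define TOY_GFP_NOFAIL  0x4

/*
 * After one pass of direct reclaim:
 *  - __GFP_NORETRY never retries;
 *  - __GFP_NOFAIL always retries;
 *  - order <= PAGE_ALLOC_COSTLY_ORDER keeps the old "retry forever" behaviour;
 *  - costly orders with __GFP_REPEAT retry only while the cumulative number
 *    of reclaimed pages is still below 1 << order.
 */
static bool should_retry_alloc(unsigned int gfp_mask, unsigned int order,
                               unsigned long pages_reclaimed)
{
        if (gfp_mask & TOY_GFP_NORETRY)
                return false;
        if (gfp_mask & TOY_GFP_NOFAIL)
                return true;
        if (order <= PAGE_ALLOC_COSTLY_ORDER)
                return true;
        return (gfp_mask & TOY_GFP_REPEAT) && pages_reclaimed < (1UL << order);
}

int main(void)
{
        /* An order-9 (hugepage-sized) __GFP_REPEAT request: it keeps retrying
         * until reclaim has cumulatively freed at least 512 pages, then fails. */
        unsigned long reclaimed = 0;
        int attempts = 0;

        while (should_retry_alloc(TOY_GFP_REPEAT, 9, reclaimed)) {
                reclaimed += 100;   /* pretend each pass reclaims 100 pages */
                attempts++;
        }
        printf("gave up after %d retries, %lu pages reclaimed\n",
               attempts, reclaimed);
        return 0;
}
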
1642 1654
1643 struct page * 1655 struct page *
1644 __alloc_pages(gfp_t gfp_mask, unsigned int order, 1656 __alloc_pages(gfp_t gfp_mask, unsigned int order,
1645 struct zonelist *zonelist) 1657 struct zonelist *zonelist)
1646 { 1658 {
1647 return __alloc_pages_internal(gfp_mask, order, zonelist, NULL); 1659 return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
1648 } 1660 }
1649 1661
1650 struct page * 1662 struct page *
1651 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 1663 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1652 struct zonelist *zonelist, nodemask_t *nodemask) 1664 struct zonelist *zonelist, nodemask_t *nodemask)
1653 { 1665 {
1654 return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask); 1666 return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
1655 } 1667 }
1656 1668
1657 EXPORT_SYMBOL(__alloc_pages); 1669 EXPORT_SYMBOL(__alloc_pages);
1658 1670
1659 /* 1671 /*
1660 * Common helper functions. 1672 * Common helper functions.
1661 */ 1673 */
1662 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 1674 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1663 { 1675 {
1664 struct page * page; 1676 struct page * page;
1665 page = alloc_pages(gfp_mask, order); 1677 page = alloc_pages(gfp_mask, order);
1666 if (!page) 1678 if (!page)
1667 return 0; 1679 return 0;
1668 return (unsigned long) page_address(page); 1680 return (unsigned long) page_address(page);
1669 } 1681 }
1670 1682
1671 EXPORT_SYMBOL(__get_free_pages); 1683 EXPORT_SYMBOL(__get_free_pages);
1672 1684
1673 unsigned long get_zeroed_page(gfp_t gfp_mask) 1685 unsigned long get_zeroed_page(gfp_t gfp_mask)
1674 { 1686 {
1675 struct page * page; 1687 struct page * page;
1676 1688
1677 /* 1689 /*
1678 * get_zeroed_page() returns a 32-bit address, which cannot represent 1690 * get_zeroed_page() returns a 32-bit address, which cannot represent
1679 * a highmem page 1691 * a highmem page
1680 */ 1692 */
1681 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 1693 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1682 1694
1683 page = alloc_pages(gfp_mask | __GFP_ZERO, 0); 1695 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1684 if (page) 1696 if (page)
1685 return (unsigned long) page_address(page); 1697 return (unsigned long) page_address(page);
1686 return 0; 1698 return 0;
1687 } 1699 }
1688 1700
1689 EXPORT_SYMBOL(get_zeroed_page); 1701 EXPORT_SYMBOL(get_zeroed_page);
1690 1702
1691 void __pagevec_free(struct pagevec *pvec) 1703 void __pagevec_free(struct pagevec *pvec)
1692 { 1704 {
1693 int i = pagevec_count(pvec); 1705 int i = pagevec_count(pvec);
1694 1706
1695 while (--i >= 0) 1707 while (--i >= 0)
1696 free_hot_cold_page(pvec->pages[i], pvec->cold); 1708 free_hot_cold_page(pvec->pages[i], pvec->cold);
1697 } 1709 }
1698 1710
1699 void __free_pages(struct page *page, unsigned int order) 1711 void __free_pages(struct page *page, unsigned int order)
1700 { 1712 {
1701 if (put_page_testzero(page)) { 1713 if (put_page_testzero(page)) {
1702 if (order == 0) 1714 if (order == 0)
1703 free_hot_page(page); 1715 free_hot_page(page);
1704 else 1716 else
1705 __free_pages_ok(page, order); 1717 __free_pages_ok(page, order);
1706 } 1718 }
1707 } 1719 }
1708 1720
1709 EXPORT_SYMBOL(__free_pages); 1721 EXPORT_SYMBOL(__free_pages);
1710 1722
1711 void free_pages(unsigned long addr, unsigned int order) 1723 void free_pages(unsigned long addr, unsigned int order)
1712 { 1724 {
1713 if (addr != 0) { 1725 if (addr != 0) {
1714 VM_BUG_ON(!virt_addr_valid((void *)addr)); 1726 VM_BUG_ON(!virt_addr_valid((void *)addr));
1715 __free_pages(virt_to_page((void *)addr), order); 1727 __free_pages(virt_to_page((void *)addr), order);
1716 } 1728 }
1717 } 1729 }
1718 1730
1719 EXPORT_SYMBOL(free_pages); 1731 EXPORT_SYMBOL(free_pages);
1720 1732
1721 static unsigned int nr_free_zone_pages(int offset) 1733 static unsigned int nr_free_zone_pages(int offset)
1722 { 1734 {
1723 struct zoneref *z; 1735 struct zoneref *z;
1724 struct zone *zone; 1736 struct zone *zone;
1725 1737
1726 /* Just pick one node, since fallback list is circular */ 1738 /* Just pick one node, since fallback list is circular */
1727 unsigned int sum = 0; 1739 unsigned int sum = 0;
1728 1740
1729 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 1741 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
1730 1742
1731 for_each_zone_zonelist(zone, z, zonelist, offset) { 1743 for_each_zone_zonelist(zone, z, zonelist, offset) {
1732 unsigned long size = zone->present_pages; 1744 unsigned long size = zone->present_pages;
1733 unsigned long high = zone->pages_high; 1745 unsigned long high = zone->pages_high;
1734 if (size > high) 1746 if (size > high)
1735 sum += size - high; 1747 sum += size - high;
1736 } 1748 }
1737 1749
1738 return sum; 1750 return sum;
1739 } 1751 }
1740 1752
1741 /* 1753 /*
1742 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 1754 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
1743 */ 1755 */
1744 unsigned int nr_free_buffer_pages(void) 1756 unsigned int nr_free_buffer_pages(void)
1745 { 1757 {
1746 return nr_free_zone_pages(gfp_zone(GFP_USER)); 1758 return nr_free_zone_pages(gfp_zone(GFP_USER));
1747 } 1759 }
1748 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 1760 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
1749 1761
1750 /* 1762 /*
1751 * Amount of free RAM allocatable within all zones 1763 * Amount of free RAM allocatable within all zones
1752 */ 1764 */
1753 unsigned int nr_free_pagecache_pages(void) 1765 unsigned int nr_free_pagecache_pages(void)
1754 { 1766 {
1755 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 1767 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
1756 } 1768 }
1757 1769
1758 static inline void show_node(struct zone *zone) 1770 static inline void show_node(struct zone *zone)
1759 { 1771 {
1760 if (NUMA_BUILD) 1772 if (NUMA_BUILD)
1761 printk("Node %d ", zone_to_nid(zone)); 1773 printk("Node %d ", zone_to_nid(zone));
1762 } 1774 }
1763 1775
1764 void si_meminfo(struct sysinfo *val) 1776 void si_meminfo(struct sysinfo *val)
1765 { 1777 {
1766 val->totalram = totalram_pages; 1778 val->totalram = totalram_pages;
1767 val->sharedram = 0; 1779 val->sharedram = 0;
1768 val->freeram = global_page_state(NR_FREE_PAGES); 1780 val->freeram = global_page_state(NR_FREE_PAGES);
1769 val->bufferram = nr_blockdev_pages(); 1781 val->bufferram = nr_blockdev_pages();
1770 val->totalhigh = totalhigh_pages; 1782 val->totalhigh = totalhigh_pages;
1771 val->freehigh = nr_free_highpages(); 1783 val->freehigh = nr_free_highpages();
1772 val->mem_unit = PAGE_SIZE; 1784 val->mem_unit = PAGE_SIZE;
1773 } 1785 }
1774 1786
1775 EXPORT_SYMBOL(si_meminfo); 1787 EXPORT_SYMBOL(si_meminfo);
1776 1788
1777 #ifdef CONFIG_NUMA 1789 #ifdef CONFIG_NUMA
1778 void si_meminfo_node(struct sysinfo *val, int nid) 1790 void si_meminfo_node(struct sysinfo *val, int nid)
1779 { 1791 {
1780 pg_data_t *pgdat = NODE_DATA(nid); 1792 pg_data_t *pgdat = NODE_DATA(nid);
1781 1793
1782 val->totalram = pgdat->node_present_pages; 1794 val->totalram = pgdat->node_present_pages;
1783 val->freeram = node_page_state(nid, NR_FREE_PAGES); 1795 val->freeram = node_page_state(nid, NR_FREE_PAGES);
1784 #ifdef CONFIG_HIGHMEM 1796 #ifdef CONFIG_HIGHMEM
1785 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 1797 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1786 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 1798 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
1787 NR_FREE_PAGES); 1799 NR_FREE_PAGES);
1788 #else 1800 #else
1789 val->totalhigh = 0; 1801 val->totalhigh = 0;
1790 val->freehigh = 0; 1802 val->freehigh = 0;
1791 #endif 1803 #endif
1792 val->mem_unit = PAGE_SIZE; 1804 val->mem_unit = PAGE_SIZE;
1793 } 1805 }
1794 #endif 1806 #endif
1795 1807
1796 #define K(x) ((x) << (PAGE_SHIFT-10)) 1808 #define K(x) ((x) << (PAGE_SHIFT-10))
1797 1809
1798 /* 1810 /*
1799 * Show free area list (used inside shift_scroll-lock stuff) 1811 * Show free area list (used inside shift_scroll-lock stuff)
1800 * We also calculate the percentage fragmentation. We do this by counting the 1812 * We also calculate the percentage fragmentation. We do this by counting the
1801 * memory on each free list with the exception of the first item on the list. 1813 * memory on each free list with the exception of the first item on the list.
1802 */ 1814 */
1803 void show_free_areas(void) 1815 void show_free_areas(void)
1804 { 1816 {
1805 int cpu; 1817 int cpu;
1806 struct zone *zone; 1818 struct zone *zone;
1807 1819
1808 for_each_zone(zone) { 1820 for_each_zone(zone) {
1809 if (!populated_zone(zone)) 1821 if (!populated_zone(zone))
1810 continue; 1822 continue;
1811 1823
1812 show_node(zone); 1824 show_node(zone);
1813 printk("%s per-cpu:\n", zone->name); 1825 printk("%s per-cpu:\n", zone->name);
1814 1826
1815 for_each_online_cpu(cpu) { 1827 for_each_online_cpu(cpu) {
1816 struct per_cpu_pageset *pageset; 1828 struct per_cpu_pageset *pageset;
1817 1829
1818 pageset = zone_pcp(zone, cpu); 1830 pageset = zone_pcp(zone, cpu);
1819 1831
1820 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 1832 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
1821 cpu, pageset->pcp.high, 1833 cpu, pageset->pcp.high,
1822 pageset->pcp.batch, pageset->pcp.count); 1834 pageset->pcp.batch, pageset->pcp.count);
1823 } 1835 }
1824 } 1836 }
1825 1837
1826 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" 1838 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
1827 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 1839 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1828 global_page_state(NR_ACTIVE), 1840 global_page_state(NR_ACTIVE),
1829 global_page_state(NR_INACTIVE), 1841 global_page_state(NR_INACTIVE),
1830 global_page_state(NR_FILE_DIRTY), 1842 global_page_state(NR_FILE_DIRTY),
1831 global_page_state(NR_WRITEBACK), 1843 global_page_state(NR_WRITEBACK),
1832 global_page_state(NR_UNSTABLE_NFS), 1844 global_page_state(NR_UNSTABLE_NFS),
1833 global_page_state(NR_FREE_PAGES), 1845 global_page_state(NR_FREE_PAGES),
1834 global_page_state(NR_SLAB_RECLAIMABLE) + 1846 global_page_state(NR_SLAB_RECLAIMABLE) +
1835 global_page_state(NR_SLAB_UNRECLAIMABLE), 1847 global_page_state(NR_SLAB_UNRECLAIMABLE),
1836 global_page_state(NR_FILE_MAPPED), 1848 global_page_state(NR_FILE_MAPPED),
1837 global_page_state(NR_PAGETABLE), 1849 global_page_state(NR_PAGETABLE),
1838 global_page_state(NR_BOUNCE)); 1850 global_page_state(NR_BOUNCE));
1839 1851
1840 for_each_zone(zone) { 1852 for_each_zone(zone) {
1841 int i; 1853 int i;
1842 1854
1843 if (!populated_zone(zone)) 1855 if (!populated_zone(zone))
1844 continue; 1856 continue;
1845 1857
1846 show_node(zone); 1858 show_node(zone);
1847 printk("%s" 1859 printk("%s"
1848 " free:%lukB" 1860 " free:%lukB"
1849 " min:%lukB" 1861 " min:%lukB"
1850 " low:%lukB" 1862 " low:%lukB"
1851 " high:%lukB" 1863 " high:%lukB"
1852 " active:%lukB" 1864 " active:%lukB"
1853 " inactive:%lukB" 1865 " inactive:%lukB"
1854 " present:%lukB" 1866 " present:%lukB"
1855 " pages_scanned:%lu" 1867 " pages_scanned:%lu"
1856 " all_unreclaimable? %s" 1868 " all_unreclaimable? %s"
1857 "\n", 1869 "\n",
1858 zone->name, 1870 zone->name,
1859 K(zone_page_state(zone, NR_FREE_PAGES)), 1871 K(zone_page_state(zone, NR_FREE_PAGES)),
1860 K(zone->pages_min), 1872 K(zone->pages_min),
1861 K(zone->pages_low), 1873 K(zone->pages_low),
1862 K(zone->pages_high), 1874 K(zone->pages_high),
1863 K(zone_page_state(zone, NR_ACTIVE)), 1875 K(zone_page_state(zone, NR_ACTIVE)),
1864 K(zone_page_state(zone, NR_INACTIVE)), 1876 K(zone_page_state(zone, NR_INACTIVE)),
1865 K(zone->present_pages), 1877 K(zone->present_pages),
1866 zone->pages_scanned, 1878 zone->pages_scanned,
1867 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 1879 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
1868 ); 1880 );
1869 printk("lowmem_reserve[]:"); 1881 printk("lowmem_reserve[]:");
1870 for (i = 0; i < MAX_NR_ZONES; i++) 1882 for (i = 0; i < MAX_NR_ZONES; i++)
1871 printk(" %lu", zone->lowmem_reserve[i]); 1883 printk(" %lu", zone->lowmem_reserve[i]);
1872 printk("\n"); 1884 printk("\n");
1873 } 1885 }
1874 1886
1875 for_each_zone(zone) { 1887 for_each_zone(zone) {
1876 unsigned long nr[MAX_ORDER], flags, order, total = 0; 1888 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1877 1889
1878 if (!populated_zone(zone)) 1890 if (!populated_zone(zone))
1879 continue; 1891 continue;
1880 1892
1881 show_node(zone); 1893 show_node(zone);
1882 printk("%s: ", zone->name); 1894 printk("%s: ", zone->name);
1883 1895
1884 spin_lock_irqsave(&zone->lock, flags); 1896 spin_lock_irqsave(&zone->lock, flags);
1885 for (order = 0; order < MAX_ORDER; order++) { 1897 for (order = 0; order < MAX_ORDER; order++) {
1886 nr[order] = zone->free_area[order].nr_free; 1898 nr[order] = zone->free_area[order].nr_free;
1887 total += nr[order] << order; 1899 total += nr[order] << order;
1888 } 1900 }
1889 spin_unlock_irqrestore(&zone->lock, flags); 1901 spin_unlock_irqrestore(&zone->lock, flags);
1890 for (order = 0; order < MAX_ORDER; order++) 1902 for (order = 0; order < MAX_ORDER; order++)
1891 printk("%lu*%lukB ", nr[order], K(1UL) << order); 1903 printk("%lu*%lukB ", nr[order], K(1UL) << order);
1892 printk("= %lukB\n", K(total)); 1904 printk("= %lukB\n", K(total));
1893 } 1905 }
1894 1906
1895 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 1907 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
1896 1908
1897 show_swap_cache_info(); 1909 show_swap_cache_info();
1898 } 1910 }
1899 1911
1900 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 1912 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
1901 { 1913 {
1902 zoneref->zone = zone; 1914 zoneref->zone = zone;
1903 zoneref->zone_idx = zone_idx(zone); 1915 zoneref->zone_idx = zone_idx(zone);
1904 } 1916 }
1905 1917
1906 /* 1918 /*
1907 * Builds allocation fallback zone lists. 1919 * Builds allocation fallback zone lists.
1908 * 1920 *
1909 * Add all populated zones of a node to the zonelist. 1921 * Add all populated zones of a node to the zonelist.
1910 */ 1922 */
1911 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 1923 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
1912 int nr_zones, enum zone_type zone_type) 1924 int nr_zones, enum zone_type zone_type)
1913 { 1925 {
1914 struct zone *zone; 1926 struct zone *zone;
1915 1927
1916 BUG_ON(zone_type >= MAX_NR_ZONES); 1928 BUG_ON(zone_type >= MAX_NR_ZONES);
1917 zone_type++; 1929 zone_type++;
1918 1930
1919 do { 1931 do {
1920 zone_type--; 1932 zone_type--;
1921 zone = pgdat->node_zones + zone_type; 1933 zone = pgdat->node_zones + zone_type;
1922 if (populated_zone(zone)) { 1934 if (populated_zone(zone)) {
1923 zoneref_set_zone(zone, 1935 zoneref_set_zone(zone,
1924 &zonelist->_zonerefs[nr_zones++]); 1936 &zonelist->_zonerefs[nr_zones++]);
1925 check_highest_zone(zone_type); 1937 check_highest_zone(zone_type);
1926 } 1938 }
1927 1939
1928 } while (zone_type); 1940 } while (zone_type);
1929 return nr_zones; 1941 return nr_zones;
1930 } 1942 }
1931 1943
1932 1944
1933 /* 1945 /*
1934 * zonelist_order: 1946 * zonelist_order:
1935 * 0 = automatic detection of better ordering. 1947 * 0 = automatic detection of better ordering.
1936 * 1 = order by ([node] distance, -zonetype) 1948 * 1 = order by ([node] distance, -zonetype)
1937 * 2 = order by (-zonetype, [node] distance) 1949 * 2 = order by (-zonetype, [node] distance)
1938 * 1950 *
1939 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 1951 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
1940 * the same zonelist. So only NUMA can configure this param. 1952 * the same zonelist. So only NUMA can configure this param.
1941 */ 1953 */
1942 #define ZONELIST_ORDER_DEFAULT 0 1954 #define ZONELIST_ORDER_DEFAULT 0
1943 #define ZONELIST_ORDER_NODE 1 1955 #define ZONELIST_ORDER_NODE 1
1944 #define ZONELIST_ORDER_ZONE 2 1956 #define ZONELIST_ORDER_ZONE 2
1945 1957
1946 /* zonelist order in the kernel. 1958 /* zonelist order in the kernel.
1947 * set_zonelist_order() will set this to NODE or ZONE. 1959 * set_zonelist_order() will set this to NODE or ZONE.
1948 */ 1960 */
1949 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 1961 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
1950 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 1962 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
1951 1963
1952 1964
1953 #ifdef CONFIG_NUMA 1965 #ifdef CONFIG_NUMA
1954 /* The value the user specified, possibly changed by config */ 1966 /* The value the user specified, possibly changed by config */
1955 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 1967 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
1956 /* string for sysctl */ 1968 /* string for sysctl */
1957 #define NUMA_ZONELIST_ORDER_LEN 16 1969 #define NUMA_ZONELIST_ORDER_LEN 16
1958 char numa_zonelist_order[16] = "default"; 1970 char numa_zonelist_order[16] = "default";
1959 1971
1960 /* 1972 /*
1961 * interface for configuring zonelist ordering. 1973 * interface for configuring zonelist ordering.
1962 * command line option "numa_zonelist_order" 1974 * command line option "numa_zonelist_order"
1963 * = "[dD]efault" - default, automatic configuration. 1975 * = "[dD]efault" - default, automatic configuration.
1964 * = "[nN]ode" - order by node locality, then by zone within node 1976 * = "[nN]ode" - order by node locality, then by zone within node
1965 * = "[zZ]one" - order by zone, then by locality within zone 1977 * = "[zZ]one" - order by zone, then by locality within zone
1966 */ 1978 */
1967 1979
1968 static int __parse_numa_zonelist_order(char *s) 1980 static int __parse_numa_zonelist_order(char *s)
1969 { 1981 {
1970 if (*s == 'd' || *s == 'D') { 1982 if (*s == 'd' || *s == 'D') {
1971 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 1983 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
1972 } else if (*s == 'n' || *s == 'N') { 1984 } else if (*s == 'n' || *s == 'N') {
1973 user_zonelist_order = ZONELIST_ORDER_NODE; 1985 user_zonelist_order = ZONELIST_ORDER_NODE;
1974 } else if (*s == 'z' || *s == 'Z') { 1986 } else if (*s == 'z' || *s == 'Z') {
1975 user_zonelist_order = ZONELIST_ORDER_ZONE; 1987 user_zonelist_order = ZONELIST_ORDER_ZONE;
1976 } else { 1988 } else {
1977 printk(KERN_WARNING 1989 printk(KERN_WARNING
1978 "Ignoring invalid numa_zonelist_order value: " 1990 "Ignoring invalid numa_zonelist_order value: "
1979 "%s\n", s); 1991 "%s\n", s);
1980 return -EINVAL; 1992 return -EINVAL;
1981 } 1993 }
1982 return 0; 1994 return 0;
1983 } 1995 }
1984 1996
1985 static __init int setup_numa_zonelist_order(char *s) 1997 static __init int setup_numa_zonelist_order(char *s)
1986 { 1998 {
1987 if (s) 1999 if (s)
1988 return __parse_numa_zonelist_order(s); 2000 return __parse_numa_zonelist_order(s);
1989 return 0; 2001 return 0;
1990 } 2002 }
1991 early_param("numa_zonelist_order", setup_numa_zonelist_order); 2003 early_param("numa_zonelist_order", setup_numa_zonelist_order);
1992 2004
1993 /* 2005 /*
1994 * sysctl handler for numa_zonelist_order 2006 * sysctl handler for numa_zonelist_order
1995 */ 2007 */
1996 int numa_zonelist_order_handler(ctl_table *table, int write, 2008 int numa_zonelist_order_handler(ctl_table *table, int write,
1997 struct file *file, void __user *buffer, size_t *length, 2009 struct file *file, void __user *buffer, size_t *length,
1998 loff_t *ppos) 2010 loff_t *ppos)
1999 { 2011 {
2000 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 2012 char saved_string[NUMA_ZONELIST_ORDER_LEN];
2001 int ret; 2013 int ret;
2002 2014
2003 if (write) 2015 if (write)
2004 strncpy(saved_string, (char*)table->data, 2016 strncpy(saved_string, (char*)table->data,
2005 NUMA_ZONELIST_ORDER_LEN); 2017 NUMA_ZONELIST_ORDER_LEN);
2006 ret = proc_dostring(table, write, file, buffer, length, ppos); 2018 ret = proc_dostring(table, write, file, buffer, length, ppos);
2007 if (ret) 2019 if (ret)
2008 return ret; 2020 return ret;
2009 if (write) { 2021 if (write) {
2010 int oldval = user_zonelist_order; 2022 int oldval = user_zonelist_order;
2011 if (__parse_numa_zonelist_order((char*)table->data)) { 2023 if (__parse_numa_zonelist_order((char*)table->data)) {
2012 /* 2024 /*
2013 * bogus value. restore saved string 2025 * bogus value. restore saved string
2014 */ 2026 */
2015 strncpy((char*)table->data, saved_string, 2027 strncpy((char*)table->data, saved_string,
2016 NUMA_ZONELIST_ORDER_LEN); 2028 NUMA_ZONELIST_ORDER_LEN);
2017 user_zonelist_order = oldval; 2029 user_zonelist_order = oldval;
2018 } else if (oldval != user_zonelist_order) 2030 } else if (oldval != user_zonelist_order)
2019 build_all_zonelists(); 2031 build_all_zonelists();
2020 } 2032 }
2021 return 0; 2033 return 0;
2022 } 2034 }
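The parsing above keys only on the first character of the supplied string. As a quick illustration, here is a hedged, userspace-only sketch of that matching (it returns the chosen order directly instead of setting user_zonelist_order and returning 0/-EINVAL, and the numeric values simply mirror ZONELIST_ORDER_DEFAULT/NODE/ZONE):

#include <stdio.h>

/* Simplified stand-in for __parse_numa_zonelist_order(); not kernel code. */
static int parse_order(const char *s)
{
	switch (*s) {
	case 'd': case 'D': return 0;	/* default: auto-detect at boot */
	case 'n': case 'N': return 1;	/* order by node distance */
	case 'z': case 'Z': return 2;	/* order by zone type */
	default:            return -1;	/* rejected; old value kept */
	}
}

int main(void)
{
	const char *inputs[] = { "default", "Node", "zone", "bogus" };

	for (int i = 0; i < 4; i++)
		printf("%-8s -> %d\n", inputs[i], parse_order(inputs[i]));
	return 0;
}

In practice the string reaches the real parser either from the numa_zonelist_order= boot parameter registered by the early_param() above, or through the sysctl handler; the proc path is conventionally /proc/sys/vm/numa_zonelist_order, although that mapping lives in the sysctl table rather than in this file, so treat the exact path as an assumption here.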
2023 2035
2024 2036
2025 #define MAX_NODE_LOAD (num_online_nodes()) 2037 #define MAX_NODE_LOAD (num_online_nodes())
2026 static int node_load[MAX_NUMNODES]; 2038 static int node_load[MAX_NUMNODES];
2027 2039
2028 /** 2040 /**
2029 * find_next_best_node - find the next node that should appear in a given node's fallback list 2041 * find_next_best_node - find the next node that should appear in a given node's fallback list
2030 * @node: node whose fallback list we're appending 2042 * @node: node whose fallback list we're appending
2031 * @used_node_mask: nodemask_t of already used nodes 2043 * @used_node_mask: nodemask_t of already used nodes
2032 * 2044 *
2033 * We use a number of factors to determine which is the next node that should 2045 * We use a number of factors to determine which is the next node that should
2034 * appear on a given node's fallback list. The node should not have appeared 2046 * appear on a given node's fallback list. The node should not have appeared
2035 * already in @node's fallback list, and it should be the next closest node 2047 * already in @node's fallback list, and it should be the next closest node
2036 * according to the distance array (which contains arbitrary distance values 2048 * according to the distance array (which contains arbitrary distance values
2037 * from each node to each node in the system), and we should also prefer nodes 2049 * from each node to each node in the system), and we should also prefer nodes
2038 * with no CPUs, since presumably they'll have very little allocation pressure 2050 * with no CPUs, since presumably they'll have very little allocation pressure
2039 * on them otherwise. 2051 * on them otherwise.
2040 * It returns -1 if no node is found. 2052 * It returns -1 if no node is found.
2041 */ 2053 */
2042 static int find_next_best_node(int node, nodemask_t *used_node_mask) 2054 static int find_next_best_node(int node, nodemask_t *used_node_mask)
2043 { 2055 {
2044 int n, val; 2056 int n, val;
2045 int min_val = INT_MAX; 2057 int min_val = INT_MAX;
2046 int best_node = -1; 2058 int best_node = -1;
2047 node_to_cpumask_ptr(tmp, 0); 2059 node_to_cpumask_ptr(tmp, 0);
2048 2060
2049 /* Use the local node if we haven't already */ 2061 /* Use the local node if we haven't already */
2050 if (!node_isset(node, *used_node_mask)) { 2062 if (!node_isset(node, *used_node_mask)) {
2051 node_set(node, *used_node_mask); 2063 node_set(node, *used_node_mask);
2052 return node; 2064 return node;
2053 } 2065 }
2054 2066
2055 for_each_node_state(n, N_HIGH_MEMORY) { 2067 for_each_node_state(n, N_HIGH_MEMORY) {
2056 2068
2057 /* Don't want a node to appear more than once */ 2069 /* Don't want a node to appear more than once */
2058 if (node_isset(n, *used_node_mask)) 2070 if (node_isset(n, *used_node_mask))
2059 continue; 2071 continue;
2060 2072
2061 /* Use the distance array to find the distance */ 2073 /* Use the distance array to find the distance */
2062 val = node_distance(node, n); 2074 val = node_distance(node, n);
2063 2075
2064 /* Penalize nodes under us ("prefer the next node") */ 2076 /* Penalize nodes under us ("prefer the next node") */
2065 val += (n < node); 2077 val += (n < node);
2066 2078
2067 /* Give preference to headless and unused nodes */ 2079 /* Give preference to headless and unused nodes */
2068 node_to_cpumask_ptr_next(tmp, n); 2080 node_to_cpumask_ptr_next(tmp, n);
2069 if (!cpus_empty(*tmp)) 2081 if (!cpus_empty(*tmp))
2070 val += PENALTY_FOR_NODE_WITH_CPUS; 2082 val += PENALTY_FOR_NODE_WITH_CPUS;
2071 2083
2072 /* Slight preference for less loaded node */ 2084 /* Slight preference for less loaded node */
2073 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 2085 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
2074 val += node_load[n]; 2086 val += node_load[n];
2075 2087
2076 if (val < min_val) { 2088 if (val < min_val) {
2077 min_val = val; 2089 min_val = val;
2078 best_node = n; 2090 best_node = n;
2079 } 2091 }
2080 } 2092 }
2081 2093
2082 if (best_node >= 0) 2094 if (best_node >= 0)
2083 node_set(best_node, *used_node_mask); 2095 node_set(best_node, *used_node_mask);
2084 2096
2085 return best_node; 2097 return best_node;
2086 } 2098 }
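To make the scoring above concrete, the following is a hedged, userspace-only sketch with an invented four-node distance table, CPU placement, and loads (the real routine first hands back the local node if it is unused, walks only N_HIGH_MEMORY nodes, and uses the kernel's PENALTY_FOR_NODE_WITH_CPUS and MAX_NODE_LOAD; the constants here are stand-ins). It keeps the same order of operations: raw distance, the "prefer the next node" nudge, the CPU penalty, then scaling so node_load only breaks ties.

#include <stdio.h>
#include <limits.h>

#define NODES		4
#define CPU_PENALTY	1	/* stand-in for PENALTY_FOR_NODE_WITH_CPUS */
#define MAX_LOAD	NODES	/* stand-in for MAX_NODE_LOAD */

static const int distance[NODES][NODES] = {	/* hypothetical SLIT-style table */
	{ 10, 20, 20, 40 },
	{ 20, 10, 40, 20 },
	{ 20, 40, 10, 20 },
	{ 40, 20, 20, 10 },
};
static const int has_cpus[NODES] = { 1, 1, 0, 0 };	/* nodes 2 and 3 are headless */
static const int load[NODES]     = { 0, 0, 0, 0 };

/* Score each unused candidate the way the kernel routine does; pick the minimum. */
static int next_best(int node, const int *used)
{
	int best = -1, min_val = INT_MAX;

	for (int n = 0; n < NODES; n++) {
		if (used[n])
			continue;
		int val = distance[node][n];
		val += (n < node);			/* prefer the next node */
		if (has_cpus[n])
			val += CPU_PENALTY;		/* prefer headless nodes */
		val *= MAX_LOAD * NODES;		/* make load a tie-break only */
		val += load[n];
		if (val < min_val) {
			min_val = val;
			best = n;
		}
	}
	return best;
}

int main(void)
{
	int used[NODES] = { 0 };

	used[1] = 1;	/* pretend node 1 already appears in its own list */
	printf("fallback after node 1: node %d\n", next_best(1, used));
	return 0;
}

With these numbers, node 3 (distance 20, headless) wins over node 0 (distance 20, but with CPUs), which is exactly the "prefer headless and unused nodes" behaviour the comment describes.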
2087 2099
2088 2100
2089 /* 2101 /*
2090 * Build zonelists ordered by node and zones within node. 2102 * Build zonelists ordered by node and zones within node.
2091 * This results in maximum locality--normal zone overflows into local 2103 * This results in maximum locality--normal zone overflows into local
2092 * DMA zone, if any--but risks exhausting DMA zone. 2104 * DMA zone, if any--but risks exhausting DMA zone.
2093 */ 2105 */
2094 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 2106 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
2095 { 2107 {
2096 int j; 2108 int j;
2097 struct zonelist *zonelist; 2109 struct zonelist *zonelist;
2098 2110
2099 zonelist = &pgdat->node_zonelists[0]; 2111 zonelist = &pgdat->node_zonelists[0];
2100 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 2112 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
2101 ; 2113 ;
2102 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 2114 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2103 MAX_NR_ZONES - 1); 2115 MAX_NR_ZONES - 1);
2104 zonelist->_zonerefs[j].zone = NULL; 2116 zonelist->_zonerefs[j].zone = NULL;
2105 zonelist->_zonerefs[j].zone_idx = 0; 2117 zonelist->_zonerefs[j].zone_idx = 0;
2106 } 2118 }
2107 2119
2108 /* 2120 /*
2109 * Build gfp_thisnode zonelists 2121 * Build gfp_thisnode zonelists
2110 */ 2122 */
2111 static void build_thisnode_zonelists(pg_data_t *pgdat) 2123 static void build_thisnode_zonelists(pg_data_t *pgdat)
2112 { 2124 {
2113 int j; 2125 int j;
2114 struct zonelist *zonelist; 2126 struct zonelist *zonelist;
2115 2127
2116 zonelist = &pgdat->node_zonelists[1]; 2128 zonelist = &pgdat->node_zonelists[1];
2117 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 2129 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
2118 zonelist->_zonerefs[j].zone = NULL; 2130 zonelist->_zonerefs[j].zone = NULL;
2119 zonelist->_zonerefs[j].zone_idx = 0; 2131 zonelist->_zonerefs[j].zone_idx = 0;
2120 } 2132 }
2121 2133
2122 /* 2134 /*
2123 * Build zonelists ordered by zone and nodes within zones. 2135 * Build zonelists ordered by zone and nodes within zones.
2124 * This results in conserving DMA zone[s] until all Normal memory is 2136 * This results in conserving DMA zone[s] until all Normal memory is
2125 * exhausted, but may overflow to a remote node while memory 2137 * exhausted, but may overflow to a remote node while memory
2126 * may still exist in local DMA zone. 2138 * may still exist in local DMA zone.
2127 */ 2139 */
2128 static int node_order[MAX_NUMNODES]; 2140 static int node_order[MAX_NUMNODES];
2129 2141
2130 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 2142 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
2131 { 2143 {
2132 int pos, j, node; 2144 int pos, j, node;
2133 int zone_type; /* needs to be signed */ 2145 int zone_type; /* needs to be signed */
2134 struct zone *z; 2146 struct zone *z;
2135 struct zonelist *zonelist; 2147 struct zonelist *zonelist;
2136 2148
2137 zonelist = &pgdat->node_zonelists[0]; 2149 zonelist = &pgdat->node_zonelists[0];
2138 pos = 0; 2150 pos = 0;
2139 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 2151 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
2140 for (j = 0; j < nr_nodes; j++) { 2152 for (j = 0; j < nr_nodes; j++) {
2141 node = node_order[j]; 2153 node = node_order[j];
2142 z = &NODE_DATA(node)->node_zones[zone_type]; 2154 z = &NODE_DATA(node)->node_zones[zone_type];
2143 if (populated_zone(z)) { 2155 if (populated_zone(z)) {
2144 zoneref_set_zone(z, 2156 zoneref_set_zone(z,
2145 &zonelist->_zonerefs[pos++]); 2157 &zonelist->_zonerefs[pos++]);
2146 check_highest_zone(zone_type); 2158 check_highest_zone(zone_type);
2147 } 2159 }
2148 } 2160 }
2149 } 2161 }
2150 zonelist->_zonerefs[pos].zone = NULL; 2162 zonelist->_zonerefs[pos].zone = NULL;
2151 zonelist->_zonerefs[pos].zone_idx = 0; 2163 zonelist->_zonerefs[pos].zone_idx = 0;
2152 } 2164 }
2153 2165
2154 static int default_zonelist_order(void) 2166 static int default_zonelist_order(void)
2155 { 2167 {
2156 int nid, zone_type; 2168 int nid, zone_type;
2157 unsigned long low_kmem_size, total_size; 2169 unsigned long low_kmem_size, total_size;
2158 struct zone *z; 2170 struct zone *z;
2159 int average_size; 2171 int average_size;
2160 /* 2172 /*
2161 * ZONE_DMA and ZONE_DMA32 can be a very small area in the system. 2173 * ZONE_DMA and ZONE_DMA32 can be a very small area in the system.
2162 * If they are really small and used heavily, the system can fall 2174 * If they are really small and used heavily, the system can fall
2163 * into OOM very easily. 2175 * into OOM very easily.
2164 * This function detects ZONE_DMA/DMA32 size and configures zone order. 2176 * This function detects ZONE_DMA/DMA32 size and configures zone order.
2165 */ 2177 */
2166 /* Is there ZONE_NORMAL? (e.g. ppc has only the DMA zone.) */ 2178 /* Is there ZONE_NORMAL? (e.g. ppc has only the DMA zone.) */
2167 low_kmem_size = 0; 2179 low_kmem_size = 0;
2168 total_size = 0; 2180 total_size = 0;
2169 for_each_online_node(nid) { 2181 for_each_online_node(nid) {
2170 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 2182 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
2171 z = &NODE_DATA(nid)->node_zones[zone_type]; 2183 z = &NODE_DATA(nid)->node_zones[zone_type];
2172 if (populated_zone(z)) { 2184 if (populated_zone(z)) {
2173 if (zone_type < ZONE_NORMAL) 2185 if (zone_type < ZONE_NORMAL)
2174 low_kmem_size += z->present_pages; 2186 low_kmem_size += z->present_pages;
2175 total_size += z->present_pages; 2187 total_size += z->present_pages;
2176 } 2188 }
2177 } 2189 }
2178 } 2190 }
2179 if (!low_kmem_size || /* there is no DMA area. */ 2191 if (!low_kmem_size || /* there is no DMA area. */
2180 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 2192 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
2181 return ZONELIST_ORDER_NODE; 2193 return ZONELIST_ORDER_NODE;
2182 /* 2194 /*
2183 * look into each node's config. 2195 * look into each node's config.
2184 * If there is a node whose DMA/DMA32 memory makes up a very large part 2196 * If there is a node whose DMA/DMA32 memory makes up a very large part
2185 * of its local memory, NODE_ORDER may be suitable. 2197 * of its local memory, NODE_ORDER may be suitable.
2186 */ 2198 */
2187 average_size = total_size / 2199 average_size = total_size /
2188 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); 2200 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
2189 for_each_online_node(nid) { 2201 for_each_online_node(nid) {
2190 low_kmem_size = 0; 2202 low_kmem_size = 0;
2191 total_size = 0; 2203 total_size = 0;
2192 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 2204 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
2193 z = &NODE_DATA(nid)->node_zones[zone_type]; 2205 z = &NODE_DATA(nid)->node_zones[zone_type];
2194 if (populated_zone(z)) { 2206 if (populated_zone(z)) {
2195 if (zone_type < ZONE_NORMAL) 2207 if (zone_type < ZONE_NORMAL)
2196 low_kmem_size += z->present_pages; 2208 low_kmem_size += z->present_pages;
2197 total_size += z->present_pages; 2209 total_size += z->present_pages;
2198 } 2210 }
2199 } 2211 }
2200 if (low_kmem_size && 2212 if (low_kmem_size &&
2201 total_size > average_size && /* ignore small node */ 2213 total_size > average_size && /* ignore small node */
2202 low_kmem_size > total_size * 70/100) 2214 low_kmem_size > total_size * 70/100)
2203 return ZONELIST_ORDER_NODE; 2215 return ZONELIST_ORDER_NODE;
2204 } 2216 }
2205 return ZONELIST_ORDER_ZONE; 2217 return ZONELIST_ORDER_ZONE;
2206 } 2218 }
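A hedged numeric walk-through of the two checks above, with invented sizes: on a two-node machine where node 0 has 256 MB of DMA32 plus 3.8 GB of Normal memory and node 1 has 4 GB of Normal memory only, the global pass finds low_kmem_size nonzero but far below half of total_size, so node order is not forced there. The per-node pass then sees that node 0's DMA/DMA32 share is roughly 6%, well under the 70% threshold, so the function falls through and returns ZONELIST_ORDER_ZONE. Node order would be picked instead if the machine had no DMA/DMA32 memory at all, if DMA/DMA32 dominated globally, or if some larger-than-average node were made up of more than 70% DMA/DMA32 memory.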
2207 2219
2208 static void set_zonelist_order(void) 2220 static void set_zonelist_order(void)
2209 { 2221 {
2210 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 2222 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
2211 current_zonelist_order = default_zonelist_order(); 2223 current_zonelist_order = default_zonelist_order();
2212 else 2224 else
2213 current_zonelist_order = user_zonelist_order; 2225 current_zonelist_order = user_zonelist_order;
2214 } 2226 }
2215 2227
2216 static void build_zonelists(pg_data_t *pgdat) 2228 static void build_zonelists(pg_data_t *pgdat)
2217 { 2229 {
2218 int j, node, load; 2230 int j, node, load;
2219 enum zone_type i; 2231 enum zone_type i;
2220 nodemask_t used_mask; 2232 nodemask_t used_mask;
2221 int local_node, prev_node; 2233 int local_node, prev_node;
2222 struct zonelist *zonelist; 2234 struct zonelist *zonelist;
2223 int order = current_zonelist_order; 2235 int order = current_zonelist_order;
2224 2236
2225 /* initialize zonelists */ 2237 /* initialize zonelists */
2226 for (i = 0; i < MAX_ZONELISTS; i++) { 2238 for (i = 0; i < MAX_ZONELISTS; i++) {
2227 zonelist = pgdat->node_zonelists + i; 2239 zonelist = pgdat->node_zonelists + i;
2228 zonelist->_zonerefs[0].zone = NULL; 2240 zonelist->_zonerefs[0].zone = NULL;
2229 zonelist->_zonerefs[0].zone_idx = 0; 2241 zonelist->_zonerefs[0].zone_idx = 0;
2230 } 2242 }
2231 2243
2232 /* NUMA-aware ordering of nodes */ 2244 /* NUMA-aware ordering of nodes */
2233 local_node = pgdat->node_id; 2245 local_node = pgdat->node_id;
2234 load = num_online_nodes(); 2246 load = num_online_nodes();
2235 prev_node = local_node; 2247 prev_node = local_node;
2236 nodes_clear(used_mask); 2248 nodes_clear(used_mask);
2237 2249
2238 memset(node_load, 0, sizeof(node_load)); 2250 memset(node_load, 0, sizeof(node_load));
2239 memset(node_order, 0, sizeof(node_order)); 2251 memset(node_order, 0, sizeof(node_order));
2240 j = 0; 2252 j = 0;
2241 2253
2242 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 2254 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
2243 int distance = node_distance(local_node, node); 2255 int distance = node_distance(local_node, node);
2244 2256
2245 /* 2257 /*
2246 * If another node is sufficiently far away then it is better 2258 * If another node is sufficiently far away then it is better
2247 * to reclaim pages in a zone before going off node. 2259 * to reclaim pages in a zone before going off node.
2248 */ 2260 */
2249 if (distance > RECLAIM_DISTANCE) 2261 if (distance > RECLAIM_DISTANCE)
2250 zone_reclaim_mode = 1; 2262 zone_reclaim_mode = 1;
2251 2263
2252 /* 2264 /*
2253 * We don't want to pressure a particular node. 2265 * We don't want to pressure a particular node.
2254 * So we add a penalty to the first node in the same 2266 * So we add a penalty to the first node in the same
2255 * distance group to make it round-robin. 2267 * distance group to make it round-robin.
2256 */ 2268 */
2257 if (distance != node_distance(local_node, prev_node)) 2269 if (distance != node_distance(local_node, prev_node))
2258 node_load[node] = load; 2270 node_load[node] = load;
2259 2271
2260 prev_node = node; 2272 prev_node = node;
2261 load--; 2273 load--;
2262 if (order == ZONELIST_ORDER_NODE) 2274 if (order == ZONELIST_ORDER_NODE)
2263 build_zonelists_in_node_order(pgdat, node); 2275 build_zonelists_in_node_order(pgdat, node);
2264 else 2276 else
2265 node_order[j++] = node; /* remember order */ 2277 node_order[j++] = node; /* remember order */
2266 } 2278 }
2267 2279
2268 if (order == ZONELIST_ORDER_ZONE) { 2280 if (order == ZONELIST_ORDER_ZONE) {
2269 /* calculate node order -- i.e., DMA last! */ 2281 /* calculate node order -- i.e., DMA last! */
2270 build_zonelists_in_zone_order(pgdat, j); 2282 build_zonelists_in_zone_order(pgdat, j);
2271 } 2283 }
2272 2284
2273 build_thisnode_zonelists(pgdat); 2285 build_thisnode_zonelists(pgdat);
2274 } 2286 }
2275 2287
2276 /* Construct the zonelist performance cache - see further mmzone.h */ 2288 /* Construct the zonelist performance cache - see further mmzone.h */
2277 static void build_zonelist_cache(pg_data_t *pgdat) 2289 static void build_zonelist_cache(pg_data_t *pgdat)
2278 { 2290 {
2279 struct zonelist *zonelist; 2291 struct zonelist *zonelist;
2280 struct zonelist_cache *zlc; 2292 struct zonelist_cache *zlc;
2281 struct zoneref *z; 2293 struct zoneref *z;
2282 2294
2283 zonelist = &pgdat->node_zonelists[0]; 2295 zonelist = &pgdat->node_zonelists[0];
2284 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 2296 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
2285 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 2297 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
2286 for (z = zonelist->_zonerefs; z->zone; z++) 2298 for (z = zonelist->_zonerefs; z->zone; z++)
2287 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 2299 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
2288 } 2300 }
2289 2301
2290 2302
2291 #else /* CONFIG_NUMA */ 2303 #else /* CONFIG_NUMA */
2292 2304
2293 static void set_zonelist_order(void) 2305 static void set_zonelist_order(void)
2294 { 2306 {
2295 current_zonelist_order = ZONELIST_ORDER_ZONE; 2307 current_zonelist_order = ZONELIST_ORDER_ZONE;
2296 } 2308 }
2297 2309
2298 static void build_zonelists(pg_data_t *pgdat) 2310 static void build_zonelists(pg_data_t *pgdat)
2299 { 2311 {
2300 int node, local_node; 2312 int node, local_node;
2301 enum zone_type j; 2313 enum zone_type j;
2302 struct zonelist *zonelist; 2314 struct zonelist *zonelist;
2303 2315
2304 local_node = pgdat->node_id; 2316 local_node = pgdat->node_id;
2305 2317
2306 zonelist = &pgdat->node_zonelists[0]; 2318 zonelist = &pgdat->node_zonelists[0];
2307 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 2319 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
2308 2320
2309 /* 2321 /*
2310 * Now we build the zonelist so that it contains the zones 2322 * Now we build the zonelist so that it contains the zones
2311 * of all the other nodes. 2323 * of all the other nodes.
2312 * We don't want to pressure a particular node, so when 2324 * We don't want to pressure a particular node, so when
2313 * building the zones for node N, we make sure that the 2325 * building the zones for node N, we make sure that the
2314 * zones coming right after the local ones are those from 2326 * zones coming right after the local ones are those from
2315 * node N+1 (modulo N) 2327 * node N+1 (modulo N)
2316 */ 2328 */
2317 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 2329 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
2318 if (!node_online(node)) 2330 if (!node_online(node))
2319 continue; 2331 continue;
2320 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 2332 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2321 MAX_NR_ZONES - 1); 2333 MAX_NR_ZONES - 1);
2322 } 2334 }
2323 for (node = 0; node < local_node; node++) { 2335 for (node = 0; node < local_node; node++) {
2324 if (!node_online(node)) 2336 if (!node_online(node))
2325 continue; 2337 continue;
2326 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 2338 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2327 MAX_NR_ZONES - 1); 2339 MAX_NR_ZONES - 1);
2328 } 2340 }
2329 2341
2330 zonelist->_zonerefs[j].zone = NULL; 2342 zonelist->_zonerefs[j].zone = NULL;
2331 zonelist->_zonerefs[j].zone_idx = 0; 2343 zonelist->_zonerefs[j].zone_idx = 0;
2332 } 2344 }
2333 2345
2334 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 2346 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
2335 static void build_zonelist_cache(pg_data_t *pgdat) 2347 static void build_zonelist_cache(pg_data_t *pgdat)
2336 { 2348 {
2337 pgdat->node_zonelists[0].zlcache_ptr = NULL; 2349 pgdat->node_zonelists[0].zlcache_ptr = NULL;
2338 pgdat->node_zonelists[1].zlcache_ptr = NULL; 2350 pgdat->node_zonelists[1].zlcache_ptr = NULL;
2339 } 2351 }
2340 2352
2341 #endif /* CONFIG_NUMA */ 2353 #endif /* CONFIG_NUMA */
2342 2354
2343 /* return value is int just for stop_machine_run() */ 2355 /* return value is int just for stop_machine_run() */
2344 static int __build_all_zonelists(void *dummy) 2356 static int __build_all_zonelists(void *dummy)
2345 { 2357 {
2346 int nid; 2358 int nid;
2347 2359
2348 for_each_online_node(nid) { 2360 for_each_online_node(nid) {
2349 pg_data_t *pgdat = NODE_DATA(nid); 2361 pg_data_t *pgdat = NODE_DATA(nid);
2350 2362
2351 build_zonelists(pgdat); 2363 build_zonelists(pgdat);
2352 build_zonelist_cache(pgdat); 2364 build_zonelist_cache(pgdat);
2353 } 2365 }
2354 return 0; 2366 return 0;
2355 } 2367 }
2356 2368
2357 void build_all_zonelists(void) 2369 void build_all_zonelists(void)
2358 { 2370 {
2359 set_zonelist_order(); 2371 set_zonelist_order();
2360 2372
2361 if (system_state == SYSTEM_BOOTING) { 2373 if (system_state == SYSTEM_BOOTING) {
2362 __build_all_zonelists(NULL); 2374 __build_all_zonelists(NULL);
2363 cpuset_init_current_mems_allowed(); 2375 cpuset_init_current_mems_allowed();
2364 } else { 2376 } else {
2365 /* we have to stop all cpus to guarantee there is no user 2377 /* we have to stop all cpus to guarantee there is no user
2366 of zonelist */ 2378 of zonelist */
2367 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); 2379 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
2368 /* cpuset refresh routine should be here */ 2380 /* cpuset refresh routine should be here */
2369 } 2381 }
2370 vm_total_pages = nr_free_pagecache_pages(); 2382 vm_total_pages = nr_free_pagecache_pages();
2371 /* 2383 /*
2372 * Disable grouping by mobility if the number of pages in the 2384 * Disable grouping by mobility if the number of pages in the
2373 * system is too low to allow the mechanism to work. It would be 2385 * system is too low to allow the mechanism to work. It would be
2374 * more accurate, but expensive to check per-zone. This check is 2386 * more accurate, but expensive to check per-zone. This check is
2375 * made on memory-hotadd so a system can start with mobility 2387 * made on memory-hotadd so a system can start with mobility
2376 * disabled and enable it later 2388 * disabled and enable it later
2377 */ 2389 */
2378 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 2390 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
2379 page_group_by_mobility_disabled = 1; 2391 page_group_by_mobility_disabled = 1;
2380 else 2392 else
2381 page_group_by_mobility_disabled = 0; 2393 page_group_by_mobility_disabled = 0;
2382 2394
2383 printk("Built %i zonelists in %s order, mobility grouping %s. " 2395 printk("Built %i zonelists in %s order, mobility grouping %s. "
2384 "Total pages: %ld\n", 2396 "Total pages: %ld\n",
2385 num_online_nodes(), 2397 num_online_nodes(),
2386 zonelist_order_name[current_zonelist_order], 2398 zonelist_order_name[current_zonelist_order],
2387 page_group_by_mobility_disabled ? "off" : "on", 2399 page_group_by_mobility_disabled ? "off" : "on",
2388 vm_total_pages); 2400 vm_total_pages);
2389 #ifdef CONFIG_NUMA 2401 #ifdef CONFIG_NUMA
2390 printk("Policy zone: %s\n", zone_names[policy_zone]); 2402 printk("Policy zone: %s\n", zone_names[policy_zone]);
2391 #endif 2403 #endif
2392 } 2404 }
2393 2405
2394 /* 2406 /*
2395 * Helper functions to size the waitqueue hash table. 2407 * Helper functions to size the waitqueue hash table.
2396 * Essentially these want to choose hash table sizes sufficiently 2408 * Essentially these want to choose hash table sizes sufficiently
2397 * large so that collisions trying to wait on pages are rare. 2409 * large so that collisions trying to wait on pages are rare.
2398 * But in fact, the number of active page waitqueues on typical 2410 * But in fact, the number of active page waitqueues on typical
2399 * systems is ridiculously low, less than 200. So this is even 2411 * systems is ridiculously low, less than 200. So this is even
2400 * conservative, even though it seems large. 2412 * conservative, even though it seems large.
2401 * 2413 *
2402 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 2414 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
2403 * waitqueues, i.e. the size of the waitq table given the number of pages. 2415 * waitqueues, i.e. the size of the waitq table given the number of pages.
2404 */ 2416 */
2405 #define PAGES_PER_WAITQUEUE 256 2417 #define PAGES_PER_WAITQUEUE 256
2406 2418
2407 #ifndef CONFIG_MEMORY_HOTPLUG 2419 #ifndef CONFIG_MEMORY_HOTPLUG
2408 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 2420 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
2409 { 2421 {
2410 unsigned long size = 1; 2422 unsigned long size = 1;
2411 2423
2412 pages /= PAGES_PER_WAITQUEUE; 2424 pages /= PAGES_PER_WAITQUEUE;
2413 2425
2414 while (size < pages) 2426 while (size < pages)
2415 size <<= 1; 2427 size <<= 1;
2416 2428
2417 /* 2429 /*
2418 * Once we have dozens or even hundreds of threads sleeping 2430 * Once we have dozens or even hundreds of threads sleeping
2419 * on IO we've got bigger problems than wait queue collision. 2431 * on IO we've got bigger problems than wait queue collision.
2420 * Limit the size of the wait table to a reasonable size. 2432 * Limit the size of the wait table to a reasonable size.
2421 */ 2433 */
2422 size = min(size, 4096UL); 2434 size = min(size, 4096UL);
2423 2435
2424 return max(size, 4UL); 2436 return max(size, 4UL);
2425 } 2437 }
2426 #else 2438 #else
2427 /* 2439 /*
2428 * A zone's size might be changed by hot-add, so it is not possible to determine 2440 * A zone's size might be changed by hot-add, so it is not possible to determine
2429 * a suitable size for its wait_table. So we use the maximum size now. 2441 * a suitable size for its wait_table. So we use the maximum size now.
2430 * 2442 *
2431 * The max wait table size = 4096 x sizeof(wait_queue_head_t), i.e.: 2443 * The max wait table size = 4096 x sizeof(wait_queue_head_t), i.e.:
2432 * 2444 *
2433 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 2445 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
2434 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 2446 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
2435 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 2447 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
2436 * 2448 *
2437 * The maximum number of entries is used when a zone's memory is (512K + 256) 2449 * The maximum number of entries is used when a zone's memory is (512K + 256)
2438 * pages or more, computed the traditional way (see above). That equals: 2450 * pages or more, computed the traditional way (see above). That equals:
2439 * 2451 *
2440 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 2452 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
2441 * ia64(16K page size) : = ( 8G + 4M)byte. 2453 * ia64(16K page size) : = ( 8G + 4M)byte.
2442 * powerpc (64K page size) : = (32G +16M)byte. 2454 * powerpc (64K page size) : = (32G +16M)byte.
2443 */ 2455 */
2444 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 2456 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
2445 { 2457 {
2446 return 4096UL; 2458 return 4096UL;
2447 } 2459 }
2448 #endif 2460 #endif
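To put numbers on the sizing comments above, here is a small, hedged userspace sketch of the non-hotplug computation (the page counts below assume 4 KiB pages and are purely illustrative; PAGES_PER_WAITQUEUE and the 4/4096 clamps are copied from the code above):

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

/* Mirror of the boot-time sizing: scale by PAGES_PER_WAITQUEUE,
 * round up to a power of two, clamp to the range [4, 4096]. */
static unsigned long wait_table_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096UL)
		size = 4096UL;
	return size < 4UL ? 4UL : size;
}

int main(void)
{
	/* 1 GiB, 8 GiB and a tiny 1 MiB zone, assuming 4 KiB pages */
	unsigned long zones[] = { 262144UL, 2097152UL, 256UL };

	for (int i = 0; i < 3; i++)
		printf("%8lu pages -> %4lu waitqueue entries\n",
		       zones[i], wait_table_entries(zones[i]));
	return 0;
}

For a 1 GiB zone this gives 262144 / 256 = 1024 entries (already a power of two), an 8 GiB zone hits the 4096 cap, and a tiny zone is floored at 4; the cap is first reached at the (512K + 256)-page point mentioned in the hotplug comment, since 512K pages alone still rounds to only 2048.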
2449 2461
2450 /* 2462 /*
2451 * This is an integer logarithm so that shifts can be used later 2463 * This is an integer logarithm so that shifts can be used later
2452 * to extract the more random high bits from the multiplicative 2464 * to extract the more random high bits from the multiplicative
2453 * hash function before the remainder is taken. 2465 * hash function before the remainder is taken.
2454 */ 2466 */
2455 static inline unsigned long wait_table_bits(unsigned long size) 2467 static inline unsigned long wait_table_bits(unsigned long size)
2456 { 2468 {
2457 return ffz(~size); 2469 return ffz(~size);
2458 } 2470 }
2459 2471
2460 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 2472 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
2461 2473
2462 /* 2474 /*
2463 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 2475 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
2464 * of blocks reserved is based on zone->pages_min. The memory within the 2476 * of blocks reserved is based on zone->pages_min. The memory within the
2465 * reserve will tend to store contiguous free pages. Setting min_free_kbytes 2477 * reserve will tend to store contiguous free pages. Setting min_free_kbytes
2466 * higher will lead to a bigger reserve which will get freed as contiguous 2478 * higher will lead to a bigger reserve which will get freed as contiguous
2467 * blocks as reclaim kicks in 2479 * blocks as reclaim kicks in
2468 */ 2480 */
2469 static void setup_zone_migrate_reserve(struct zone *zone) 2481 static void setup_zone_migrate_reserve(struct zone *zone)
2470 { 2482 {
2471 unsigned long start_pfn, pfn, end_pfn; 2483 unsigned long start_pfn, pfn, end_pfn;
2472 struct page *page; 2484 struct page *page;
2473 unsigned long reserve, block_migratetype; 2485 unsigned long reserve, block_migratetype;
2474 2486
2475 /* Get the start pfn, end pfn and the number of blocks to reserve */ 2487 /* Get the start pfn, end pfn and the number of blocks to reserve */
2476 start_pfn = zone->zone_start_pfn; 2488 start_pfn = zone->zone_start_pfn;
2477 end_pfn = start_pfn + zone->spanned_pages; 2489 end_pfn = start_pfn + zone->spanned_pages;
2478 reserve = roundup(zone->pages_min, pageblock_nr_pages) >> 2490 reserve = roundup(zone->pages_min, pageblock_nr_pages) >>
2479 pageblock_order; 2491 pageblock_order;
2480 2492
2481 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 2493 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
2482 if (!pfn_valid(pfn)) 2494 if (!pfn_valid(pfn))
2483 continue; 2495 continue;
2484 page = pfn_to_page(pfn); 2496 page = pfn_to_page(pfn);
2485 2497
2486 /* Blocks with reserved pages will never become free, skip them. */ 2498 /* Blocks with reserved pages will never become free, skip them. */
2487 if (PageReserved(page)) 2499 if (PageReserved(page))
2488 continue; 2500 continue;
2489 2501
2490 block_migratetype = get_pageblock_migratetype(page); 2502 block_migratetype = get_pageblock_migratetype(page);
2491 2503
2492 /* If this block is reserved, account for it */ 2504 /* If this block is reserved, account for it */
2493 if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { 2505 if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
2494 reserve--; 2506 reserve--;
2495 continue; 2507 continue;
2496 } 2508 }
2497 2509
2498 /* Suitable for reserving if this block is movable */ 2510 /* Suitable for reserving if this block is movable */
2499 if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { 2511 if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
2500 set_pageblock_migratetype(page, MIGRATE_RESERVE); 2512 set_pageblock_migratetype(page, MIGRATE_RESERVE);
2501 move_freepages_block(zone, page, MIGRATE_RESERVE); 2513 move_freepages_block(zone, page, MIGRATE_RESERVE);
2502 reserve--; 2514 reserve--;
2503 continue; 2515 continue;
2504 } 2516 }
2505 2517
2506 /* 2518 /*
2507 * If the reserve is met and this is a previously reserved block, 2519 * If the reserve is met and this is a previously reserved block,
2508 * take it back 2520 * take it back
2509 */ 2521 */
2510 if (block_migratetype == MIGRATE_RESERVE) { 2522 if (block_migratetype == MIGRATE_RESERVE) {
2511 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 2523 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2512 move_freepages_block(zone, page, MIGRATE_MOVABLE); 2524 move_freepages_block(zone, page, MIGRATE_MOVABLE);
2513 } 2525 }
2514 } 2526 }
2515 } 2527 }
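A hedged example of the reserve sizing above (the pageblock_order of 10, i.e. 1024 pages per block, and the pages_min values are invented; the real values depend on the architecture and on min_free_kbytes):

#include <stdio.h>

#define PAGEBLOCK_ORDER		10			/* hypothetical */
#define PAGEBLOCK_NR_PAGES	(1UL << PAGEBLOCK_ORDER)
#define ROUNDUP(x, y)		((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned long pages_min[] = { 1000UL, 5000UL, 100UL };

	for (int i = 0; i < 3; i++) {
		unsigned long reserve =
			ROUNDUP(pages_min[i], PAGEBLOCK_NR_PAGES) >> PAGEBLOCK_ORDER;
		printf("pages_min=%5lu -> %lu reserved pageblock(s)\n",
		       pages_min[i], reserve);
	}
	return 0;
}

So a zone with pages_min around 1000 pages sets aside a single MIGRATE_RESERVE pageblock, and raising min_free_kbytes grows the reserve one whole block at a time.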
2516 2528
2517 /* 2529 /*
2518 * Initially all pages are reserved - free ones are freed 2530 * Initially all pages are reserved - free ones are freed
2519 * up by free_all_bootmem() once the early boot process is 2531 * up by free_all_bootmem() once the early boot process is
2520 * done. Non-atomic initialization, single-pass. 2532 * done. Non-atomic initialization, single-pass.
2521 */ 2533 */
2522 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 2534 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2523 unsigned long start_pfn, enum memmap_context context) 2535 unsigned long start_pfn, enum memmap_context context)
2524 { 2536 {
2525 struct page *page; 2537 struct page *page;
2526 unsigned long end_pfn = start_pfn + size; 2538 unsigned long end_pfn = start_pfn + size;
2527 unsigned long pfn; 2539 unsigned long pfn;
2528 struct zone *z; 2540 struct zone *z;
2529 2541
2530 z = &NODE_DATA(nid)->node_zones[zone]; 2542 z = &NODE_DATA(nid)->node_zones[zone];
2531 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 2543 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
2532 /* 2544 /*
2533 * There can be holes in boot-time mem_map[]s 2545 * There can be holes in boot-time mem_map[]s
2534 * handed to this function. They do not 2546 * handed to this function. They do not
2535 * exist on hotplugged memory. 2547 * exist on hotplugged memory.
2536 */ 2548 */
2537 if (context == MEMMAP_EARLY) { 2549 if (context == MEMMAP_EARLY) {
2538 if (!early_pfn_valid(pfn)) 2550 if (!early_pfn_valid(pfn))
2539 continue; 2551 continue;
2540 if (!early_pfn_in_nid(pfn, nid)) 2552 if (!early_pfn_in_nid(pfn, nid))
2541 continue; 2553 continue;
2542 } 2554 }
2543 page = pfn_to_page(pfn); 2555 page = pfn_to_page(pfn);
2544 set_page_links(page, zone, nid, pfn); 2556 set_page_links(page, zone, nid, pfn);
2545 init_page_count(page); 2557 init_page_count(page);
2546 reset_page_mapcount(page); 2558 reset_page_mapcount(page);
2547 SetPageReserved(page); 2559 SetPageReserved(page);
2548 /* 2560 /*
2549 * Mark the block movable so that blocks are reserved for 2561 * Mark the block movable so that blocks are reserved for
2550 * movable at startup. This will force kernel allocations 2562 * movable at startup. This will force kernel allocations
2551 * to reserve their blocks rather than leaking throughout 2563 * to reserve their blocks rather than leaking throughout
2552 * the address space during boot when many long-lived 2564 * the address space during boot when many long-lived
2553 * kernel allocations are made. Later some blocks near 2565 * kernel allocations are made. Later some blocks near
2554 * the start are marked MIGRATE_RESERVE by 2566 * the start are marked MIGRATE_RESERVE by
2555 * setup_zone_migrate_reserve() 2567 * setup_zone_migrate_reserve()
2556 * 2568 *
2557 * The bitmap is created for the zone's valid pfn range, but the memmap 2569 * The bitmap is created for the zone's valid pfn range, but the memmap
2558 * can be created for invalid pages (for alignment). 2570 * can be created for invalid pages (for alignment).
2559 * Check here so we do not call set_pageblock_migratetype() against 2571 * Check here so we do not call set_pageblock_migratetype() against
2560 * a pfn outside the zone. 2572 * a pfn outside the zone.
2561 */ 2573 */
2562 if ((z->zone_start_pfn <= pfn) 2574 if ((z->zone_start_pfn <= pfn)
2563 && (pfn < z->zone_start_pfn + z->spanned_pages) 2575 && (pfn < z->zone_start_pfn + z->spanned_pages)
2564 && !(pfn & (pageblock_nr_pages - 1))) 2576 && !(pfn & (pageblock_nr_pages - 1)))
2565 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 2577 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2566 2578
2567 INIT_LIST_HEAD(&page->lru); 2579 INIT_LIST_HEAD(&page->lru);
2568 #ifdef WANT_PAGE_VIRTUAL 2580 #ifdef WANT_PAGE_VIRTUAL
2569 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 2581 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
2570 if (!is_highmem_idx(zone)) 2582 if (!is_highmem_idx(zone))
2571 set_page_address(page, __va(pfn << PAGE_SHIFT)); 2583 set_page_address(page, __va(pfn << PAGE_SHIFT));
2572 #endif 2584 #endif
2573 } 2585 }
2574 } 2586 }
2575 2587
2576 static void __meminit zone_init_free_lists(struct zone *zone) 2588 static void __meminit zone_init_free_lists(struct zone *zone)
2577 { 2589 {
2578 int order, t; 2590 int order, t;
2579 for_each_migratetype_order(order, t) { 2591 for_each_migratetype_order(order, t) {
2580 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 2592 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
2581 zone->free_area[order].nr_free = 0; 2593 zone->free_area[order].nr_free = 0;
2582 } 2594 }
2583 } 2595 }
2584 2596
2585 #ifndef __HAVE_ARCH_MEMMAP_INIT 2597 #ifndef __HAVE_ARCH_MEMMAP_INIT
2586 #define memmap_init(size, nid, zone, start_pfn) \ 2598 #define memmap_init(size, nid, zone, start_pfn) \
2587 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 2599 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
2588 #endif 2600 #endif
2589 2601
2590 static int zone_batchsize(struct zone *zone) 2602 static int zone_batchsize(struct zone *zone)
2591 { 2603 {
2592 int batch; 2604 int batch;
2593 2605
2594 /* 2606 /*
2595 * The per-cpu-pages pools are set to around 1/1000th of the 2607 * The per-cpu-pages pools are set to around 1/1000th of the
2596 * size of the zone. But no more than 1/2 of a meg. 2608 * size of the zone. But no more than 1/2 of a meg.
2597 * 2609 *
2598 * OK, so we don't know how big the cache is. So guess. 2610 * OK, so we don't know how big the cache is. So guess.
2599 */ 2611 */
2600 batch = zone->present_pages / 1024; 2612 batch = zone->present_pages / 1024;
2601 if (batch * PAGE_SIZE > 512 * 1024) 2613 if (batch * PAGE_SIZE > 512 * 1024)
2602 batch = (512 * 1024) / PAGE_SIZE; 2614 batch = (512 * 1024) / PAGE_SIZE;
2603 batch /= 4; /* We effectively *= 4 below */ 2615 batch /= 4; /* We effectively *= 4 below */
2604 if (batch < 1) 2616 if (batch < 1)
2605 batch = 1; 2617 batch = 1;
2606 2618
2607 /* 2619 /*
2608 * Clamp the batch to a 2^n - 1 value. Having a power 2620 * Clamp the batch to a 2^n - 1 value. Having a power
2609 * of 2 value was found to be more likely to have 2621 * of 2 value was found to be more likely to have
2610 * suboptimal cache aliasing properties in some cases. 2622 * suboptimal cache aliasing properties in some cases.
2611 * 2623 *
2612 * For example if 2 tasks are alternately allocating 2624 * For example if 2 tasks are alternately allocating
2613 * batches of pages, one task can end up with a lot 2625 * batches of pages, one task can end up with a lot
2614 * of pages of one half of the possible page colors 2626 * of pages of one half of the possible page colors
2615 * and the other with pages of the other colors. 2627 * and the other with pages of the other colors.
2616 */ 2628 */
2617 batch = (1 << (fls(batch + batch/2)-1)) - 1; 2629 batch = (1 << (fls(batch + batch/2)-1)) - 1;
2618 2630
2619 return batch; 2631 return batch;
2620 } 2632 }
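The batch computation above is easiest to follow with numbers. Below is a hedged, self-contained rendering of it (PAGE_SIZE and the zone sizes are assumptions for the example, and fls() is open-coded since this runs outside the kernel): for a 1 GiB zone of 4 KiB pages, 262144 / 1024 = 256 is capped by the 512 KiB rule to 128, quartered to 32, then clamped to the 2^n - 1 value 31, so setup_pageset() above would set pcp->high to 6 * 31 = 186.

#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumption for the example */

static int fls_open_coded(unsigned long x)	/* highest set bit, 1-based */
{
	int bit = 0;

	while (x) {
		bit++;
		x >>= 1;
	}
	return bit;
}

/* Mirror of zone_batchsize(): ~1/1024 of the zone, at most 512 KiB worth,
 * quartered, then clamped to a 2^n - 1 value. */
static int batch_for(unsigned long present_pages)
{
	int batch = present_pages / 1024;

	if (batch * PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / PAGE_SIZE;
	batch /= 4;
	if (batch < 1)
		batch = 1;
	return (1 << (fls_open_coded(batch + batch / 2) - 1)) - 1;
}

int main(void)
{
	unsigned long zones[] = { 262144UL, 32768UL, 16384UL };	/* 1 GiB, 128 MiB, 64 MiB */

	for (int i = 0; i < 3; i++) {
		int batch = batch_for(zones[i]);

		printf("%8lu pages -> batch %2d, pcp high %3d\n",
		       zones[i], batch, 6 * batch);
	}
	return 0;
}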
2621 2633
2622 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 2634 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2623 { 2635 {
2624 struct per_cpu_pages *pcp; 2636 struct per_cpu_pages *pcp;
2625 2637
2626 memset(p, 0, sizeof(*p)); 2638 memset(p, 0, sizeof(*p));
2627 2639
2628 pcp = &p->pcp; 2640 pcp = &p->pcp;
2629 pcp->count = 0; 2641 pcp->count = 0;
2630 pcp->high = 6 * batch; 2642 pcp->high = 6 * batch;
2631 pcp->batch = max(1UL, 1 * batch); 2643 pcp->batch = max(1UL, 1 * batch);
2632 INIT_LIST_HEAD(&pcp->list); 2644 INIT_LIST_HEAD(&pcp->list);
2633 } 2645 }
2634 2646
2635 /* 2647 /*
2636 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 2648 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
2637 * to the value high for the pageset p. 2649 * to the value high for the pageset p.
2638 */ 2650 */
2639 2651
2640 static void setup_pagelist_highmark(struct per_cpu_pageset *p, 2652 static void setup_pagelist_highmark(struct per_cpu_pageset *p,
2641 unsigned long high) 2653 unsigned long high)
2642 { 2654 {
2643 struct per_cpu_pages *pcp; 2655 struct per_cpu_pages *pcp;
2644 2656
2645 pcp = &p->pcp; 2657 pcp = &p->pcp;
2646 pcp->high = high; 2658 pcp->high = high;
2647 pcp->batch = max(1UL, high/4); 2659 pcp->batch = max(1UL, high/4);
2648 if ((high/4) > (PAGE_SHIFT * 8)) 2660 if ((high/4) > (PAGE_SHIFT * 8))
2649 pcp->batch = PAGE_SHIFT * 8; 2661 pcp->batch = PAGE_SHIFT * 8;
2650 } 2662 }
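For illustration, assuming PAGE_SHIFT = 12, a 262144-page zone, and percpu_pagelist_fraction set to 8 (all invented numbers): process_zones() below would call this with high = 262144 / 8 = 32768, so pcp->high becomes 32768 while pcp->batch, initially 32768 / 4 = 8192, is capped to PAGE_SHIFT * 8 = 96.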
2651 2663
2652 2664
2653 #ifdef CONFIG_NUMA 2665 #ifdef CONFIG_NUMA
2654 /* 2666 /*
2655 * Boot pageset table. One per cpu which is going to be used for all 2667 * Boot pageset table. One per cpu which is going to be used for all
2656 * zones and all nodes. The parameters will be set in such a way 2668 * zones and all nodes. The parameters will be set in such a way
2657 * that an item put on a list will immediately be handed over to 2669 * that an item put on a list will immediately be handed over to
2658 * the buddy list. This is safe since pageset manipulation is done 2670 * the buddy list. This is safe since pageset manipulation is done
2659 * with interrupts disabled. 2671 * with interrupts disabled.
2660 * 2672 *
2661 * Some NUMA counter updates may also be caught by the boot pagesets. 2673 * Some NUMA counter updates may also be caught by the boot pagesets.
2662 * 2674 *
2663 * The boot_pagesets must be kept even after bootup is complete for 2675 * The boot_pagesets must be kept even after bootup is complete for
2664 * unused processors and/or zones. They do play a role for bootstrapping 2676 * unused processors and/or zones. They do play a role for bootstrapping
2665 * hotplugged processors. 2677 * hotplugged processors.
2666 * 2678 *
2667 * zoneinfo_show() and maybe other functions do 2679 * zoneinfo_show() and maybe other functions do
2668 * not check if the processor is online before following the pageset pointer. 2680 * not check if the processor is online before following the pageset pointer.
2669 * Other parts of the kernel may not check if the zone is available. 2681 * Other parts of the kernel may not check if the zone is available.
2670 */ 2682 */
2671 static struct per_cpu_pageset boot_pageset[NR_CPUS]; 2683 static struct per_cpu_pageset boot_pageset[NR_CPUS];
2672 2684
2673 /* 2685 /*
2674 * Dynamically allocate memory for the 2686 * Dynamically allocate memory for the
2675 * per cpu pageset array in struct zone. 2687 * per cpu pageset array in struct zone.
2676 */ 2688 */
2677 static int __cpuinit process_zones(int cpu) 2689 static int __cpuinit process_zones(int cpu)
2678 { 2690 {
2679 struct zone *zone, *dzone; 2691 struct zone *zone, *dzone;
2680 int node = cpu_to_node(cpu); 2692 int node = cpu_to_node(cpu);
2681 2693
2682 node_set_state(node, N_CPU); /* this node has a cpu */ 2694 node_set_state(node, N_CPU); /* this node has a cpu */
2683 2695
2684 for_each_zone(zone) { 2696 for_each_zone(zone) {
2685 2697
2686 if (!populated_zone(zone)) 2698 if (!populated_zone(zone))
2687 continue; 2699 continue;
2688 2700
2689 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 2701 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
2690 GFP_KERNEL, node); 2702 GFP_KERNEL, node);
2691 if (!zone_pcp(zone, cpu)) 2703 if (!zone_pcp(zone, cpu))
2692 goto bad; 2704 goto bad;
2693 2705
2694 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); 2706 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
2695 2707
2696 if (percpu_pagelist_fraction) 2708 if (percpu_pagelist_fraction)
2697 setup_pagelist_highmark(zone_pcp(zone, cpu), 2709 setup_pagelist_highmark(zone_pcp(zone, cpu),
2698 (zone->present_pages / percpu_pagelist_fraction)); 2710 (zone->present_pages / percpu_pagelist_fraction));
2699 } 2711 }
2700 2712
2701 return 0; 2713 return 0;
2702 bad: 2714 bad:
2703 for_each_zone(dzone) { 2715 for_each_zone(dzone) {
2704 if (!populated_zone(dzone)) 2716 if (!populated_zone(dzone))
2705 continue; 2717 continue;
2706 if (dzone == zone) 2718 if (dzone == zone)
2707 break; 2719 break;
2708 kfree(zone_pcp(dzone, cpu)); 2720 kfree(zone_pcp(dzone, cpu));
2709 zone_pcp(dzone, cpu) = NULL; 2721 zone_pcp(dzone, cpu) = NULL;
2710 } 2722 }
2711 return -ENOMEM; 2723 return -ENOMEM;
2712 } 2724 }
2713 2725
2714 static inline void free_zone_pagesets(int cpu) 2726 static inline void free_zone_pagesets(int cpu)
2715 { 2727 {
2716 struct zone *zone; 2728 struct zone *zone;
2717 2729
2718 for_each_zone(zone) { 2730 for_each_zone(zone) {
2719 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 2731 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
2720 2732
2721 /* Free per_cpu_pageset if it is slab allocated */ 2733 /* Free per_cpu_pageset if it is slab allocated */
2722 if (pset != &boot_pageset[cpu]) 2734 if (pset != &boot_pageset[cpu])
2723 kfree(pset); 2735 kfree(pset);
2724 zone_pcp(zone, cpu) = NULL; 2736 zone_pcp(zone, cpu) = NULL;
2725 } 2737 }
2726 } 2738 }
2727 2739
2728 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, 2740 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
2729 unsigned long action, 2741 unsigned long action,
2730 void *hcpu) 2742 void *hcpu)
2731 { 2743 {
2732 int cpu = (long)hcpu; 2744 int cpu = (long)hcpu;
2733 int ret = NOTIFY_OK; 2745 int ret = NOTIFY_OK;
2734 2746
2735 switch (action) { 2747 switch (action) {
2736 case CPU_UP_PREPARE: 2748 case CPU_UP_PREPARE:
2737 case CPU_UP_PREPARE_FROZEN: 2749 case CPU_UP_PREPARE_FROZEN:
2738 if (process_zones(cpu)) 2750 if (process_zones(cpu))
2739 ret = NOTIFY_BAD; 2751 ret = NOTIFY_BAD;
2740 break; 2752 break;
2741 case CPU_UP_CANCELED: 2753 case CPU_UP_CANCELED:
2742 case CPU_UP_CANCELED_FROZEN: 2754 case CPU_UP_CANCELED_FROZEN:
2743 case CPU_DEAD: 2755 case CPU_DEAD:
2744 case CPU_DEAD_FROZEN: 2756 case CPU_DEAD_FROZEN:
2745 free_zone_pagesets(cpu); 2757 free_zone_pagesets(cpu);
2746 break; 2758 break;
2747 default: 2759 default:
2748 break; 2760 break;
2749 } 2761 }
2750 return ret; 2762 return ret;
2751 } 2763 }
2752 2764
2753 static struct notifier_block __cpuinitdata pageset_notifier = 2765 static struct notifier_block __cpuinitdata pageset_notifier =
2754 { &pageset_cpuup_callback, NULL, 0 }; 2766 { &pageset_cpuup_callback, NULL, 0 };
2755 2767
2756 void __init setup_per_cpu_pageset(void) 2768 void __init setup_per_cpu_pageset(void)
2757 { 2769 {
2758 int err; 2770 int err;
2759 2771
2760 /* Initialize per_cpu_pageset for cpu 0. 2772 /* Initialize per_cpu_pageset for cpu 0.
2761 * A cpuup callback will do this for every cpu 2773 * A cpuup callback will do this for every cpu
2762 * as it comes online 2774 * as it comes online
2763 */ 2775 */
2764 err = process_zones(smp_processor_id()); 2776 err = process_zones(smp_processor_id());
2765 BUG_ON(err); 2777 BUG_ON(err);
2766 register_cpu_notifier(&pageset_notifier); 2778 register_cpu_notifier(&pageset_notifier);
2767 } 2779 }
2768 2780
2769 #endif 2781 #endif
2770 2782
2771 static noinline __init_refok 2783 static noinline __init_refok
2772 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 2784 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
2773 { 2785 {
2774 int i; 2786 int i;
2775 struct pglist_data *pgdat = zone->zone_pgdat; 2787 struct pglist_data *pgdat = zone->zone_pgdat;
2776 size_t alloc_size; 2788 size_t alloc_size;
2777 2789
2778 /* 2790 /*
2779 * The per-page waitqueue mechanism uses hashed waitqueues 2791 * The per-page waitqueue mechanism uses hashed waitqueues
2780 * per zone. 2792 * per zone.
2781 */ 2793 */
2782 zone->wait_table_hash_nr_entries = 2794 zone->wait_table_hash_nr_entries =
2783 wait_table_hash_nr_entries(zone_size_pages); 2795 wait_table_hash_nr_entries(zone_size_pages);
2784 zone->wait_table_bits = 2796 zone->wait_table_bits =
2785 wait_table_bits(zone->wait_table_hash_nr_entries); 2797 wait_table_bits(zone->wait_table_hash_nr_entries);
2786 alloc_size = zone->wait_table_hash_nr_entries 2798 alloc_size = zone->wait_table_hash_nr_entries
2787 * sizeof(wait_queue_head_t); 2799 * sizeof(wait_queue_head_t);
2788 2800
2789 if (system_state == SYSTEM_BOOTING) { 2801 if (system_state == SYSTEM_BOOTING) {
2790 zone->wait_table = (wait_queue_head_t *) 2802 zone->wait_table = (wait_queue_head_t *)
2791 alloc_bootmem_node(pgdat, alloc_size); 2803 alloc_bootmem_node(pgdat, alloc_size);
2792 } else { 2804 } else {
2793 /* 2805 /*
2794 * This case means that a zone whose size was 0 gets new memory 2806 * This case means that a zone whose size was 0 gets new memory
2795 * via memory hot-add. 2807 * via memory hot-add.
2796 * But it may be the case that a new node was hot-added. In 2808 * But it may be the case that a new node was hot-added. In
2797 * this case vmalloc() will not be able to use this new node's 2809 * this case vmalloc() will not be able to use this new node's
2798 * memory - this wait_table must be initialized to use this new 2810 * memory - this wait_table must be initialized to use this new
2799 * node itself as well. 2811 * node itself as well.
2800 * To use this new node's memory, further consideration will be 2812 * To use this new node's memory, further consideration will be
2801 * necessary. 2813 * necessary.
2802 */ 2814 */
2803 zone->wait_table = vmalloc(alloc_size); 2815 zone->wait_table = vmalloc(alloc_size);
2804 } 2816 }
2805 if (!zone->wait_table) 2817 if (!zone->wait_table)
2806 return -ENOMEM; 2818 return -ENOMEM;
2807 2819
2808 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 2820 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
2809 init_waitqueue_head(zone->wait_table + i); 2821 init_waitqueue_head(zone->wait_table + i);
2810 2822
2811 return 0; 2823 return 0;
2812 } 2824 }
2813 2825
2814 static __meminit void zone_pcp_init(struct zone *zone) 2826 static __meminit void zone_pcp_init(struct zone *zone)
2815 { 2827 {
2816 int cpu; 2828 int cpu;
2817 unsigned long batch = zone_batchsize(zone); 2829 unsigned long batch = zone_batchsize(zone);
2818 2830
2819 for (cpu = 0; cpu < NR_CPUS; cpu++) { 2831 for (cpu = 0; cpu < NR_CPUS; cpu++) {
2820 #ifdef CONFIG_NUMA 2832 #ifdef CONFIG_NUMA
2821 /* Early boot. Slab allocator not functional yet */ 2833 /* Early boot. Slab allocator not functional yet */
2822 zone_pcp(zone, cpu) = &boot_pageset[cpu]; 2834 zone_pcp(zone, cpu) = &boot_pageset[cpu];
2823 setup_pageset(&boot_pageset[cpu],0); 2835 setup_pageset(&boot_pageset[cpu],0);
2824 #else 2836 #else
2825 setup_pageset(zone_pcp(zone,cpu), batch); 2837 setup_pageset(zone_pcp(zone,cpu), batch);
2826 #endif 2838 #endif
2827 } 2839 }
2828 if (zone->present_pages) 2840 if (zone->present_pages)
2829 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 2841 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
2830 zone->name, zone->present_pages, batch); 2842 zone->name, zone->present_pages, batch);
2831 } 2843 }
2832 2844
2833 __meminit int init_currently_empty_zone(struct zone *zone, 2845 __meminit int init_currently_empty_zone(struct zone *zone,
2834 unsigned long zone_start_pfn, 2846 unsigned long zone_start_pfn,
2835 unsigned long size, 2847 unsigned long size,
2836 enum memmap_context context) 2848 enum memmap_context context)
2837 { 2849 {
2838 struct pglist_data *pgdat = zone->zone_pgdat; 2850 struct pglist_data *pgdat = zone->zone_pgdat;
2839 int ret; 2851 int ret;
2840 ret = zone_wait_table_init(zone, size); 2852 ret = zone_wait_table_init(zone, size);
2841 if (ret) 2853 if (ret)
2842 return ret; 2854 return ret;
2843 pgdat->nr_zones = zone_idx(zone) + 1; 2855 pgdat->nr_zones = zone_idx(zone) + 1;
2844 2856
2845 zone->zone_start_pfn = zone_start_pfn; 2857 zone->zone_start_pfn = zone_start_pfn;
2846 2858
2847 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 2859 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
2848 2860
2849 zone_init_free_lists(zone); 2861 zone_init_free_lists(zone);
2850 2862
2851 return 0; 2863 return 0;
2852 } 2864 }
2853 2865
2854 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2866 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2855 /* 2867 /*
2856 * Basic iterator support. Return the first range of PFNs for a node 2868 * Basic iterator support. Return the first range of PFNs for a node
2857 * Note: nid == MAX_NUMNODES returns first region regardless of node 2869 * Note: nid == MAX_NUMNODES returns first region regardless of node
2858 */ 2870 */
2859 static int __meminit first_active_region_index_in_nid(int nid) 2871 static int __meminit first_active_region_index_in_nid(int nid)
2860 { 2872 {
2861 int i; 2873 int i;
2862 2874
2863 for (i = 0; i < nr_nodemap_entries; i++) 2875 for (i = 0; i < nr_nodemap_entries; i++)
2864 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) 2876 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
2865 return i; 2877 return i;
2866 2878
2867 return -1; 2879 return -1;
2868 } 2880 }
2869 2881
2870 /* 2882 /*
2871 * Basic iterator support. Return the next active range of PFNs for a node 2883 * Basic iterator support. Return the next active range of PFNs for a node
2872 * Note: nid == MAX_NUMNODES returns next region regardless of node 2884 * Note: nid == MAX_NUMNODES returns next region regardless of node
2873 */ 2885 */
2874 static int __meminit next_active_region_index_in_nid(int index, int nid) 2886 static int __meminit next_active_region_index_in_nid(int index, int nid)
2875 { 2887 {
2876 for (index = index + 1; index < nr_nodemap_entries; index++) 2888 for (index = index + 1; index < nr_nodemap_entries; index++)
2877 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) 2889 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
2878 return index; 2890 return index;
2879 2891
2880 return -1; 2892 return -1;
2881 } 2893 }
2882 2894
2883 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 2895 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
2884 /* 2896 /*
2885 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 2897 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
2886 * Architectures may implement their own version but if add_active_range() 2898 * Architectures may implement their own version but if add_active_range()
2887 * was used and there are no special requirements, this is a convenient 2899 * was used and there are no special requirements, this is a convenient
2888 * alternative 2900 * alternative
2889 */ 2901 */
2890 int __meminit early_pfn_to_nid(unsigned long pfn) 2902 int __meminit early_pfn_to_nid(unsigned long pfn)
2891 { 2903 {
2892 int i; 2904 int i;
2893 2905
2894 for (i = 0; i < nr_nodemap_entries; i++) { 2906 for (i = 0; i < nr_nodemap_entries; i++) {
2895 unsigned long start_pfn = early_node_map[i].start_pfn; 2907 unsigned long start_pfn = early_node_map[i].start_pfn;
2896 unsigned long end_pfn = early_node_map[i].end_pfn; 2908 unsigned long end_pfn = early_node_map[i].end_pfn;
2897 2909
2898 if (start_pfn <= pfn && pfn < end_pfn) 2910 if (start_pfn <= pfn && pfn < end_pfn)
2899 return early_node_map[i].nid; 2911 return early_node_map[i].nid;
2900 } 2912 }
2901 2913
2902 return 0; 2914 return 0;
2903 } 2915 }
2904 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 2916 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
2905 2917
2906 /* Basic iterator support to walk early_node_map[] */ 2918 /* Basic iterator support to walk early_node_map[] */
2907 #define for_each_active_range_index_in_nid(i, nid) \ 2919 #define for_each_active_range_index_in_nid(i, nid) \
2908 for (i = first_active_region_index_in_nid(nid); i != -1; \ 2920 for (i = first_active_region_index_in_nid(nid); i != -1; \
2909 i = next_active_region_index_in_nid(i, nid)) 2921 i = next_active_region_index_in_nid(i, nid))
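The iterator above is nothing more than a linear scan of early_node_map[] filtered by nid (or unfiltered when nid == MAX_NUMNODES). A minimal standalone C sketch of the same pattern, with an invented three-entry map, shows how the macro visits only one node's ranges; the helpers are reimplemented here for illustration, not taken from any kernel header:

#include <stdio.h>

#define MAX_NUMNODES 4

struct node_active_region {
        unsigned long start_pfn;
        unsigned long end_pfn;
        int nid;
};

/* invented early_node_map covering two nodes */
static struct node_active_region early_node_map[] = {
        { 0x000, 0x100, 0 },
        { 0x200, 0x300, 1 },
        { 0x400, 0x500, 0 },
};
static int nr_nodemap_entries = 3;

static int first_active_region_index_in_nid(int nid)
{
        int i;

        for (i = 0; i < nr_nodemap_entries; i++)
                if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
                        return i;
        return -1;
}

static int next_active_region_index_in_nid(int index, int nid)
{
        for (index = index + 1; index < nr_nodemap_entries; index++)
                if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
                        return index;
        return -1;
}

#define for_each_active_range_index_in_nid(i, nid) \
        for (i = first_active_region_index_in_nid(nid); i != -1; \
             i = next_active_region_index_in_nid(i, nid))

int main(void)
{
        int i;

        /* visits entries 0 and 2, the two ranges that belong to node 0 */
        for_each_active_range_index_in_nid(i, 0)
                printf("node 0 range %d: %lx-%lx\n", i,
                       early_node_map[i].start_pfn, early_node_map[i].end_pfn);
        return 0;
}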
2910 2922
2911 /** 2923 /**
2912 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 2924 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
2913 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 2925 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
2914 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 2926 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
2915 * 2927 *
2916 * If an architecture guarantees that all ranges registered with 2928 * If an architecture guarantees that all ranges registered with
2917 * add_active_ranges() contain no holes and may be freed, this 2929 * add_active_ranges() contain no holes and may be freed, this
2918 * function may be used instead of calling free_bootmem() manually. 2930 * function may be used instead of calling free_bootmem() manually.
2919 */ 2931 */
2920 void __init free_bootmem_with_active_regions(int nid, 2932 void __init free_bootmem_with_active_regions(int nid,
2921 unsigned long max_low_pfn) 2933 unsigned long max_low_pfn)
2922 { 2934 {
2923 int i; 2935 int i;
2924 2936
2925 for_each_active_range_index_in_nid(i, nid) { 2937 for_each_active_range_index_in_nid(i, nid) {
2926 unsigned long size_pages = 0; 2938 unsigned long size_pages = 0;
2927 unsigned long end_pfn = early_node_map[i].end_pfn; 2939 unsigned long end_pfn = early_node_map[i].end_pfn;
2928 2940
2929 if (early_node_map[i].start_pfn >= max_low_pfn) 2941 if (early_node_map[i].start_pfn >= max_low_pfn)
2930 continue; 2942 continue;
2931 2943
2932 if (end_pfn > max_low_pfn) 2944 if (end_pfn > max_low_pfn)
2933 end_pfn = max_low_pfn; 2945 end_pfn = max_low_pfn;
2934 2946
2935 size_pages = end_pfn - early_node_map[i].start_pfn; 2947 size_pages = end_pfn - early_node_map[i].start_pfn;
2936 free_bootmem_node(NODE_DATA(early_node_map[i].nid), 2948 free_bootmem_node(NODE_DATA(early_node_map[i].nid),
2937 PFN_PHYS(early_node_map[i].start_pfn), 2949 PFN_PHYS(early_node_map[i].start_pfn),
2938 size_pages << PAGE_SHIFT); 2950 size_pages << PAGE_SHIFT);
2939 } 2951 }
2940 } 2952 }
2941 2953
2942 /** 2954 /**
2943 * sparse_memory_present_with_active_regions - Call memory_present for each active range 2955 * sparse_memory_present_with_active_regions - Call memory_present for each active range
2944 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 2956 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
2945 * 2957 *
2946 * If an architecture guarantees that all ranges registered with 2958 * If an architecture guarantees that all ranges registered with
2947 * add_active_ranges() contain no holes and may be freed, this 2959 * add_active_ranges() contain no holes and may be freed, this
2948 * function may be used instead of calling memory_present() manually. 2960 * function may be used instead of calling memory_present() manually.
2949 */ 2961 */
2950 void __init sparse_memory_present_with_active_regions(int nid) 2962 void __init sparse_memory_present_with_active_regions(int nid)
2951 { 2963 {
2952 int i; 2964 int i;
2953 2965
2954 for_each_active_range_index_in_nid(i, nid) 2966 for_each_active_range_index_in_nid(i, nid)
2955 memory_present(early_node_map[i].nid, 2967 memory_present(early_node_map[i].nid,
2956 early_node_map[i].start_pfn, 2968 early_node_map[i].start_pfn,
2957 early_node_map[i].end_pfn); 2969 early_node_map[i].end_pfn);
2958 } 2970 }
2959 2971
2960 /** 2972 /**
2961 * push_node_boundaries - Push node boundaries to at least the requested boundary 2973 * push_node_boundaries - Push node boundaries to at least the requested boundary
2962 * @nid: The nid of the node to push the boundary for 2974 * @nid: The nid of the node to push the boundary for
2963 * @start_pfn: The start pfn of the node 2975 * @start_pfn: The start pfn of the node
2964 * @end_pfn: The end pfn of the node 2976 * @end_pfn: The end pfn of the node
2965 * 2977 *
2966 * In reserve-based hot-add, a mem_map is allocated that remains unused until hotadd 2978 * In reserve-based hot-add, a mem_map is allocated that remains unused until hotadd
2967 * time. Specifically, on x86_64, SRAT will report ranges that can potentially 2979 * time. Specifically, on x86_64, SRAT will report ranges that can potentially
2968 * be hotplugged even though no physical memory exists. This function allows 2980 * be hotplugged even though no physical memory exists. This function allows
2969 * an arch to push out the node boundaries so that a mem_map is allocated that can 2981 * an arch to push out the node boundaries so that a mem_map is allocated that can
2970 * be used later. 2982 * be used later.
2971 */ 2983 */
2972 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 2984 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2973 void __init push_node_boundaries(unsigned int nid, 2985 void __init push_node_boundaries(unsigned int nid,
2974 unsigned long start_pfn, unsigned long end_pfn) 2986 unsigned long start_pfn, unsigned long end_pfn)
2975 { 2987 {
2976 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", 2988 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
2977 nid, start_pfn, end_pfn); 2989 nid, start_pfn, end_pfn);
2978 2990
2979 /* Initialise the boundary for this node if necessary */ 2991 /* Initialise the boundary for this node if necessary */
2980 if (node_boundary_end_pfn[nid] == 0) 2992 if (node_boundary_end_pfn[nid] == 0)
2981 node_boundary_start_pfn[nid] = -1UL; 2993 node_boundary_start_pfn[nid] = -1UL;
2982 2994
2983 /* Update the boundaries */ 2995 /* Update the boundaries */
2984 if (node_boundary_start_pfn[nid] > start_pfn) 2996 if (node_boundary_start_pfn[nid] > start_pfn)
2985 node_boundary_start_pfn[nid] = start_pfn; 2997 node_boundary_start_pfn[nid] = start_pfn;
2986 if (node_boundary_end_pfn[nid] < end_pfn) 2998 if (node_boundary_end_pfn[nid] < end_pfn)
2987 node_boundary_end_pfn[nid] = end_pfn; 2999 node_boundary_end_pfn[nid] = end_pfn;
2988 } 3000 }
2989 3001
2990 /* If necessary, push the node boundary out for reserve hotadd */ 3002 /* If necessary, push the node boundary out for reserve hotadd */
2991 static void __meminit account_node_boundary(unsigned int nid, 3003 static void __meminit account_node_boundary(unsigned int nid,
2992 unsigned long *start_pfn, unsigned long *end_pfn) 3004 unsigned long *start_pfn, unsigned long *end_pfn)
2993 { 3005 {
2994 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", 3006 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
2995 nid, *start_pfn, *end_pfn); 3007 nid, *start_pfn, *end_pfn);
2996 3008
2997 /* Return if boundary information has not been provided */ 3009 /* Return if boundary information has not been provided */
2998 if (node_boundary_end_pfn[nid] == 0) 3010 if (node_boundary_end_pfn[nid] == 0)
2999 return; 3011 return;
3000 3012
3001 /* Check the boundaries and update if necessary */ 3013 /* Check the boundaries and update if necessary */
3002 if (node_boundary_start_pfn[nid] < *start_pfn) 3014 if (node_boundary_start_pfn[nid] < *start_pfn)
3003 *start_pfn = node_boundary_start_pfn[nid]; 3015 *start_pfn = node_boundary_start_pfn[nid];
3004 if (node_boundary_end_pfn[nid] > *end_pfn) 3016 if (node_boundary_end_pfn[nid] > *end_pfn)
3005 *end_pfn = node_boundary_end_pfn[nid]; 3017 *end_pfn = node_boundary_end_pfn[nid];
3006 } 3018 }
3007 #else 3019 #else
3008 void __init push_node_boundaries(unsigned int nid, 3020 void __init push_node_boundaries(unsigned int nid,
3009 unsigned long start_pfn, unsigned long end_pfn) {} 3021 unsigned long start_pfn, unsigned long end_pfn) {}
3010 3022
3011 static void __meminit account_node_boundary(unsigned int nid, 3023 static void __meminit account_node_boundary(unsigned int nid,
3012 unsigned long *start_pfn, unsigned long *end_pfn) {} 3024 unsigned long *start_pfn, unsigned long *end_pfn) {}
3013 #endif 3025 #endif
3014 3026
3015 3027
3016 /** 3028 /**
3017 * get_pfn_range_for_nid - Return the start and end page frames for a node 3029 * get_pfn_range_for_nid - Return the start and end page frames for a node
3018 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 3030 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
3019 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 3031 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
3020 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 3032 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
3021 * 3033 *
3022 * It returns the start and end page frame of a node based on information 3034 * It returns the start and end page frame of a node based on information
3023 * provided by an arch calling add_active_range(). If called for a node 3035 * provided by an arch calling add_active_range(). If called for a node
3024 * with no available memory, a warning is printed and the start and end 3036 * with no available memory, a warning is printed and the start and end
3025 * PFNs will be 0. 3037 * PFNs will be 0.
3026 */ 3038 */
3027 void __meminit get_pfn_range_for_nid(unsigned int nid, 3039 void __meminit get_pfn_range_for_nid(unsigned int nid,
3028 unsigned long *start_pfn, unsigned long *end_pfn) 3040 unsigned long *start_pfn, unsigned long *end_pfn)
3029 { 3041 {
3030 int i; 3042 int i;
3031 *start_pfn = -1UL; 3043 *start_pfn = -1UL;
3032 *end_pfn = 0; 3044 *end_pfn = 0;
3033 3045
3034 for_each_active_range_index_in_nid(i, nid) { 3046 for_each_active_range_index_in_nid(i, nid) {
3035 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); 3047 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
3036 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); 3048 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
3037 } 3049 }
3038 3050
3039 if (*start_pfn == -1UL) 3051 if (*start_pfn == -1UL)
3040 *start_pfn = 0; 3052 *start_pfn = 0;
3041 3053
3042 /* Push the node boundaries out if requested */ 3054 /* Push the node boundaries out if requested */
3043 account_node_boundary(nid, start_pfn, end_pfn); 3055 account_node_boundary(nid, start_pfn, end_pfn);
3044 } 3056 }
3045 3057
3046 /* 3058 /*
3047 * This finds a zone that can be used for ZONE_MOVABLE pages. The 3059 * This finds a zone that can be used for ZONE_MOVABLE pages. The
3048 * assumption is made that zones within a node are ordered by monotonically 3060 * assumption is made that zones within a node are ordered by monotonically
3049 * increasing memory addresses so that the "highest" populated zone is used 3061 * increasing memory addresses so that the "highest" populated zone is used
3050 */ 3062 */
3051 void __init find_usable_zone_for_movable(void) 3063 void __init find_usable_zone_for_movable(void)
3052 { 3064 {
3053 int zone_index; 3065 int zone_index;
3054 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 3066 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
3055 if (zone_index == ZONE_MOVABLE) 3067 if (zone_index == ZONE_MOVABLE)
3056 continue; 3068 continue;
3057 3069
3058 if (arch_zone_highest_possible_pfn[zone_index] > 3070 if (arch_zone_highest_possible_pfn[zone_index] >
3059 arch_zone_lowest_possible_pfn[zone_index]) 3071 arch_zone_lowest_possible_pfn[zone_index])
3060 break; 3072 break;
3061 } 3073 }
3062 3074
3063 VM_BUG_ON(zone_index == -1); 3075 VM_BUG_ON(zone_index == -1);
3064 movable_zone = zone_index; 3076 movable_zone = zone_index;
3065 } 3077 }
3066 3078
3067 /* 3079 /*
3068 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 3080 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
3069 * because it is sized independently of the architecture. Unlike the other zones, 3081 * because it is sized independently of the architecture. Unlike the other zones,
3070 * the starting point for ZONE_MOVABLE is not fixed. It may be different 3082 * the starting point for ZONE_MOVABLE is not fixed. It may be different
3071 * in each node depending on the size of each node and how evenly kernelcore 3083 * in each node depending on the size of each node and how evenly kernelcore
3072 * is distributed. This helper function adjusts the zone ranges 3084 * is distributed. This helper function adjusts the zone ranges
3073 * provided by the architecture for a given node by using the end of the 3085 * provided by the architecture for a given node by using the end of the
3074 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 3086 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
3075 * zones within a node are in order of monotonically increasing memory addresses 3087 * zones within a node are in order of monotonically increasing memory addresses
3076 */ 3088 */
3077 void __meminit adjust_zone_range_for_zone_movable(int nid, 3089 void __meminit adjust_zone_range_for_zone_movable(int nid,
3078 unsigned long zone_type, 3090 unsigned long zone_type,
3079 unsigned long node_start_pfn, 3091 unsigned long node_start_pfn,
3080 unsigned long node_end_pfn, 3092 unsigned long node_end_pfn,
3081 unsigned long *zone_start_pfn, 3093 unsigned long *zone_start_pfn,
3082 unsigned long *zone_end_pfn) 3094 unsigned long *zone_end_pfn)
3083 { 3095 {
3084 /* Only adjust if ZONE_MOVABLE is on this node */ 3096 /* Only adjust if ZONE_MOVABLE is on this node */
3085 if (zone_movable_pfn[nid]) { 3097 if (zone_movable_pfn[nid]) {
3086 /* Size ZONE_MOVABLE */ 3098 /* Size ZONE_MOVABLE */
3087 if (zone_type == ZONE_MOVABLE) { 3099 if (zone_type == ZONE_MOVABLE) {
3088 *zone_start_pfn = zone_movable_pfn[nid]; 3100 *zone_start_pfn = zone_movable_pfn[nid];
3089 *zone_end_pfn = min(node_end_pfn, 3101 *zone_end_pfn = min(node_end_pfn,
3090 arch_zone_highest_possible_pfn[movable_zone]); 3102 arch_zone_highest_possible_pfn[movable_zone]);
3091 3103
3092 /* Adjust for ZONE_MOVABLE starting within this range */ 3104 /* Adjust for ZONE_MOVABLE starting within this range */
3093 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 3105 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
3094 *zone_end_pfn > zone_movable_pfn[nid]) { 3106 *zone_end_pfn > zone_movable_pfn[nid]) {
3095 *zone_end_pfn = zone_movable_pfn[nid]; 3107 *zone_end_pfn = zone_movable_pfn[nid];
3096 3108
3097 /* Check if this whole range is within ZONE_MOVABLE */ 3109 /* Check if this whole range is within ZONE_MOVABLE */
3098 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 3110 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
3099 *zone_start_pfn = *zone_end_pfn; 3111 *zone_start_pfn = *zone_end_pfn;
3100 } 3112 }
3101 } 3113 }
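A short standalone sketch of just these three branches may help. The single-node arrays, the sample PFNs and the movable_zone value below are invented for illustration; only the branch logic mirrors the function above:

#include <stdio.h>

#define MAX_NR_ZONES 4
#define ZONE_MOVABLE 3

static unsigned long zone_movable_pfn[1] = { 0x40000 };        /* invented split point */
static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] = {
        0x1000, 0x10000, 0x80000, 0
};
static int movable_zone = 2;    /* pretend the highest usable zone is index 2 */

static void adjust(unsigned long zone_type, unsigned long node_end_pfn,
                   unsigned long *zone_start_pfn, unsigned long *zone_end_pfn)
{
        int nid = 0;

        if (!zone_movable_pfn[nid])
                return;
        if (zone_type == ZONE_MOVABLE) {
                /* case 1: size ZONE_MOVABLE itself, from the split point up */
                *zone_start_pfn = zone_movable_pfn[nid];
                *zone_end_pfn = node_end_pfn < arch_zone_highest_possible_pfn[movable_zone]
                                ? node_end_pfn
                                : arch_zone_highest_possible_pfn[movable_zone];
        } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
                   *zone_end_pfn > zone_movable_pfn[nid]) {
                /* case 2: the split point lands inside this zone, so truncate it */
                *zone_end_pfn = zone_movable_pfn[nid];
        } else if (*zone_start_pfn >= zone_movable_pfn[nid]) {
                /* case 3: the whole zone lies above the split point, so empty it */
                *zone_start_pfn = *zone_end_pfn;
        }
}

int main(void)
{
        unsigned long start = 0x10000, end = 0x80000;

        adjust(2, 0x80000, &start, &end);               /* zone straddles the split */
        printf("donor zone:   %lx-%lx\n", start, end);  /* 10000-40000 */

        start = 0;
        end = 0;
        adjust(ZONE_MOVABLE, 0x80000, &start, &end);
        printf("ZONE_MOVABLE: %lx-%lx\n", start, end);  /* 40000-80000 */
        return 0;
}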
3102 3114
3103 /* 3115 /*
3104 * Return the number of pages a zone spans in a node, including holes 3116 * Return the number of pages a zone spans in a node, including holes
3105 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 3117 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
3106 */ 3118 */
3107 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 3119 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3108 unsigned long zone_type, 3120 unsigned long zone_type,
3109 unsigned long *ignored) 3121 unsigned long *ignored)
3110 { 3122 {
3111 unsigned long node_start_pfn, node_end_pfn; 3123 unsigned long node_start_pfn, node_end_pfn;
3112 unsigned long zone_start_pfn, zone_end_pfn; 3124 unsigned long zone_start_pfn, zone_end_pfn;
3113 3125
3114 /* Get the start and end of the node and zone */ 3126 /* Get the start and end of the node and zone */
3115 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 3127 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
3116 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 3128 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
3117 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 3129 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
3118 adjust_zone_range_for_zone_movable(nid, zone_type, 3130 adjust_zone_range_for_zone_movable(nid, zone_type,
3119 node_start_pfn, node_end_pfn, 3131 node_start_pfn, node_end_pfn,
3120 &zone_start_pfn, &zone_end_pfn); 3132 &zone_start_pfn, &zone_end_pfn);
3121 3133
3122 /* Check that this node has pages within the zone's required range */ 3134 /* Check that this node has pages within the zone's required range */
3123 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 3135 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
3124 return 0; 3136 return 0;
3125 3137
3126 /* Move the zone boundaries inside the node if necessary */ 3138 /* Move the zone boundaries inside the node if necessary */
3127 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 3139 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
3128 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 3140 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
3129 3141
3130 /* Return the spanned pages */ 3142 /* Return the spanned pages */
3131 return zone_end_pfn - zone_start_pfn; 3143 return zone_end_pfn - zone_start_pfn;
3132 } 3144 }
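Ignoring the ZONE_MOVABLE adjustment, the calculation above reduces to clamping the architectural zone range into the node's PFN range and subtracting. A tiny standalone sketch with invented PFNs:

#include <stdio.h>

static unsigned long spanned(unsigned long zone_start, unsigned long zone_end,
                             unsigned long node_start, unsigned long node_end)
{
        /* no overlap at all: the zone has no pages on this node */
        if (zone_end < node_start || zone_start > node_end)
                return 0;

        /* move the zone boundaries inside the node if necessary */
        if (zone_end > node_end)
                zone_end = node_end;
        if (zone_start < node_start)
                zone_start = node_start;

        return zone_end - zone_start;
}

int main(void)
{
        /* a zone covering pfns 0-0x100000 on a node spanning 0x80000-0x180000 */
        printf("%lu spanned pages\n", spanned(0x0, 0x100000, 0x80000, 0x180000));
        return 0;
}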
3133 3145
3134 /* 3146 /*
3135 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3147 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3136 * then all holes in the requested range will be accounted for. 3148 * then all holes in the requested range will be accounted for.
3137 */ 3149 */
3138 unsigned long __meminit __absent_pages_in_range(int nid, 3150 unsigned long __meminit __absent_pages_in_range(int nid,
3139 unsigned long range_start_pfn, 3151 unsigned long range_start_pfn,
3140 unsigned long range_end_pfn) 3152 unsigned long range_end_pfn)
3141 { 3153 {
3142 int i = 0; 3154 int i = 0;
3143 unsigned long prev_end_pfn = 0, hole_pages = 0; 3155 unsigned long prev_end_pfn = 0, hole_pages = 0;
3144 unsigned long start_pfn; 3156 unsigned long start_pfn;
3145 3157
3146 /* Find the end_pfn of the first active range of pfns in the node */ 3158 /* Find the end_pfn of the first active range of pfns in the node */
3147 i = first_active_region_index_in_nid(nid); 3159 i = first_active_region_index_in_nid(nid);
3148 if (i == -1) 3160 if (i == -1)
3149 return 0; 3161 return 0;
3150 3162
3151 prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn); 3163 prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
3152 3164
3153 /* Account for ranges before physical memory on this node */ 3165 /* Account for ranges before physical memory on this node */
3154 if (early_node_map[i].start_pfn > range_start_pfn) 3166 if (early_node_map[i].start_pfn > range_start_pfn)
3155 hole_pages = prev_end_pfn - range_start_pfn; 3167 hole_pages = prev_end_pfn - range_start_pfn;
3156 3168
3157 /* Find all holes for the zone within the node */ 3169 /* Find all holes for the zone within the node */
3158 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { 3170 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
3159 3171
3160 /* No need to continue if prev_end_pfn is outside the zone */ 3172 /* No need to continue if prev_end_pfn is outside the zone */
3161 if (prev_end_pfn >= range_end_pfn) 3173 if (prev_end_pfn >= range_end_pfn)
3162 break; 3174 break;
3163 3175
3164 /* Make sure the end of the zone is not within the hole */ 3176 /* Make sure the end of the zone is not within the hole */
3165 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); 3177 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
3166 prev_end_pfn = max(prev_end_pfn, range_start_pfn); 3178 prev_end_pfn = max(prev_end_pfn, range_start_pfn);
3167 3179
3168 /* Update the hole size count and move on */ 3180 /* Update the hole size count and move on */
3169 if (start_pfn > range_start_pfn) { 3181 if (start_pfn > range_start_pfn) {
3170 BUG_ON(prev_end_pfn > start_pfn); 3182 BUG_ON(prev_end_pfn > start_pfn);
3171 hole_pages += start_pfn - prev_end_pfn; 3183 hole_pages += start_pfn - prev_end_pfn;
3172 } 3184 }
3173 prev_end_pfn = early_node_map[i].end_pfn; 3185 prev_end_pfn = early_node_map[i].end_pfn;
3174 } 3186 }
3175 3187
3176 /* Account for ranges past physical memory on this node */ 3188 /* Account for ranges past physical memory on this node */
3177 if (range_end_pfn > prev_end_pfn) 3189 if (range_end_pfn > prev_end_pfn)
3178 hole_pages += range_end_pfn - 3190 hole_pages += range_end_pfn -
3179 max(range_start_pfn, prev_end_pfn); 3191 max(range_start_pfn, prev_end_pfn);
3180 3192
3181 return hole_pages; 3193 return hole_pages;
3182 } 3194 }
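A simplified standalone model of the same hole accounting may make the bookkeeping clearer. The sample map is invented, and the sketch assumes a single node with sorted, non-overlapping ranges; the kernel version above additionally clips against range_start and handles the nid == MAX_NUMNODES case:

#include <stdio.h>

struct range { unsigned long start_pfn, end_pfn; };

/* invented active ranges: pfns 100-200 and 300-400 are backed by memory */
static struct range map[] = { { 100, 200 }, { 300, 400 } };

static unsigned long absent_pages(unsigned long range_start, unsigned long range_end)
{
        unsigned long prev_end = range_start;
        unsigned long holes = 0;
        unsigned int i;

        for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
                unsigned long start = map[i].start_pfn;

                if (prev_end >= range_end)
                        break;
                if (start > range_end)
                        start = range_end;
                if (start > prev_end)
                        holes += start - prev_end;      /* gap before this range */
                prev_end = map[i].end_pfn;
        }
        if (range_end > prev_end)
                holes += range_end - prev_end;          /* gap after the last range */

        return holes;
}

int main(void)
{
        /* 100 pages before, 100 between and 100 after the two ranges */
        printf("%lu absent pages\n", absent_pages(0, 500));
        return 0;
}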
3183 3195
3184 /** 3196 /**
3185 * absent_pages_in_range - Return number of page frames in holes within a range 3197 * absent_pages_in_range - Return number of page frames in holes within a range
3186 * @start_pfn: The start PFN to start searching for holes 3198 * @start_pfn: The start PFN to start searching for holes
3187 * @end_pfn: The end PFN to stop searching for holes 3199 * @end_pfn: The end PFN to stop searching for holes
3188 * 3200 *
3189 * It returns the number of page frames in memory holes within a range. 3201 * It returns the number of page frames in memory holes within a range.
3190 */ 3202 */
3191 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 3203 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
3192 unsigned long end_pfn) 3204 unsigned long end_pfn)
3193 { 3205 {
3194 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 3206 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
3195 } 3207 }
3196 3208
3197 /* Return the number of page frames in holes in a zone on a node */ 3209 /* Return the number of page frames in holes in a zone on a node */
3198 static unsigned long __meminit zone_absent_pages_in_node(int nid, 3210 static unsigned long __meminit zone_absent_pages_in_node(int nid,
3199 unsigned long zone_type, 3211 unsigned long zone_type,
3200 unsigned long *ignored) 3212 unsigned long *ignored)
3201 { 3213 {
3202 unsigned long node_start_pfn, node_end_pfn; 3214 unsigned long node_start_pfn, node_end_pfn;
3203 unsigned long zone_start_pfn, zone_end_pfn; 3215 unsigned long zone_start_pfn, zone_end_pfn;
3204 3216
3205 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 3217 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
3206 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], 3218 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
3207 node_start_pfn); 3219 node_start_pfn);
3208 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], 3220 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
3209 node_end_pfn); 3221 node_end_pfn);
3210 3222
3211 adjust_zone_range_for_zone_movable(nid, zone_type, 3223 adjust_zone_range_for_zone_movable(nid, zone_type,
3212 node_start_pfn, node_end_pfn, 3224 node_start_pfn, node_end_pfn,
3213 &zone_start_pfn, &zone_end_pfn); 3225 &zone_start_pfn, &zone_end_pfn);
3214 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 3226 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
3215 } 3227 }
3216 3228
3217 #else 3229 #else
3218 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 3230 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
3219 unsigned long zone_type, 3231 unsigned long zone_type,
3220 unsigned long *zones_size) 3232 unsigned long *zones_size)
3221 { 3233 {
3222 return zones_size[zone_type]; 3234 return zones_size[zone_type];
3223 } 3235 }
3224 3236
3225 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 3237 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
3226 unsigned long zone_type, 3238 unsigned long zone_type,
3227 unsigned long *zholes_size) 3239 unsigned long *zholes_size)
3228 { 3240 {
3229 if (!zholes_size) 3241 if (!zholes_size)
3230 return 0; 3242 return 0;
3231 3243
3232 return zholes_size[zone_type]; 3244 return zholes_size[zone_type];
3233 } 3245 }
3234 3246
3235 #endif 3247 #endif
3236 3248
3237 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 3249 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
3238 unsigned long *zones_size, unsigned long *zholes_size) 3250 unsigned long *zones_size, unsigned long *zholes_size)
3239 { 3251 {
3240 unsigned long realtotalpages, totalpages = 0; 3252 unsigned long realtotalpages, totalpages = 0;
3241 enum zone_type i; 3253 enum zone_type i;
3242 3254
3243 for (i = 0; i < MAX_NR_ZONES; i++) 3255 for (i = 0; i < MAX_NR_ZONES; i++)
3244 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 3256 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
3245 zones_size); 3257 zones_size);
3246 pgdat->node_spanned_pages = totalpages; 3258 pgdat->node_spanned_pages = totalpages;
3247 3259
3248 realtotalpages = totalpages; 3260 realtotalpages = totalpages;
3249 for (i = 0; i < MAX_NR_ZONES; i++) 3261 for (i = 0; i < MAX_NR_ZONES; i++)
3250 realtotalpages -= 3262 realtotalpages -=
3251 zone_absent_pages_in_node(pgdat->node_id, i, 3263 zone_absent_pages_in_node(pgdat->node_id, i,
3252 zholes_size); 3264 zholes_size);
3253 pgdat->node_present_pages = realtotalpages; 3265 pgdat->node_present_pages = realtotalpages;
3254 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 3266 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
3255 realtotalpages); 3267 realtotalpages);
3256 } 3268 }
3257 3269
3258 #ifndef CONFIG_SPARSEMEM 3270 #ifndef CONFIG_SPARSEMEM
3259 /* 3271 /*
3260 * Calculate the size of the zone->blockflags rounded to an unsigned long 3272 * Calculate the size of the zone->blockflags rounded to an unsigned long
3261 * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding 3273 * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding
3262 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, and finally 3274 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, and finally
3263 * round what is now in bits up to the nearest long, then return the result in 3275 * round what is now in bits up to the nearest long, then return the result in
3264 * bytes. 3276 * bytes.
3265 */ 3277 */
3266 static unsigned long __init usemap_size(unsigned long zonesize) 3278 static unsigned long __init usemap_size(unsigned long zonesize)
3267 { 3279 {
3268 unsigned long usemapsize; 3280 unsigned long usemapsize;
3269 3281
3270 usemapsize = roundup(zonesize, pageblock_nr_pages); 3282 usemapsize = roundup(zonesize, pageblock_nr_pages);
3271 usemapsize = usemapsize >> pageblock_order; 3283 usemapsize = usemapsize >> pageblock_order;
3272 usemapsize *= NR_PAGEBLOCK_BITS; 3284 usemapsize *= NR_PAGEBLOCK_BITS;
3273 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 3285 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
3274 3286
3275 return usemapsize / 8; 3287 return usemapsize / 8;
3276 } 3288 }
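To put numbers on this, here is a standalone sketch of the same arithmetic. The values pageblock_order = 9 (512-page pageblocks with 4 KB pages) and NR_PAGEBLOCK_BITS = 4 are typical but are assumptions of this sketch, not read from the configuration above:

#include <stdio.h>

#define PAGEBLOCK_ORDER         9                       /* assumed */
#define PAGEBLOCK_NR_PAGES      (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS       4                       /* assumed */
#define BITS_PER_LONG           (8 * sizeof(unsigned long))

static unsigned long roundup_ul(unsigned long x, unsigned long y)
{
        return ((x + y - 1) / y) * y;
}

static unsigned long usemap_size(unsigned long zonesize)
{
        unsigned long usemapsize;

        usemapsize = roundup_ul(zonesize, PAGEBLOCK_NR_PAGES); /* whole pageblocks */
        usemapsize >>= PAGEBLOCK_ORDER;                        /* number of pageblocks */
        usemapsize *= NR_PAGEBLOCK_BITS;                       /* bits of flags */
        usemapsize = roundup_ul(usemapsize, BITS_PER_LONG);    /* whole longs */

        return usemapsize / 8;                                 /* bytes */
}

int main(void)
{
        /* a 1048576-page (4 GB) zone: 2048 pageblocks, 4 bits each -> 1024 bytes */
        printf("%lu bytes of pageblock flags\n", usemap_size(1048576));
        return 0;
}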
3277 3289
3278 static void __init setup_usemap(struct pglist_data *pgdat, 3290 static void __init setup_usemap(struct pglist_data *pgdat,
3279 struct zone *zone, unsigned long zonesize) 3291 struct zone *zone, unsigned long zonesize)
3280 { 3292 {
3281 unsigned long usemapsize = usemap_size(zonesize); 3293 unsigned long usemapsize = usemap_size(zonesize);
3282 zone->pageblock_flags = NULL; 3294 zone->pageblock_flags = NULL;
3283 if (usemapsize) { 3295 if (usemapsize) {
3284 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); 3296 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
3285 memset(zone->pageblock_flags, 0, usemapsize); 3297 memset(zone->pageblock_flags, 0, usemapsize);
3286 } 3298 }
3287 } 3299 }
3288 #else 3300 #else
3289 static void inline setup_usemap(struct pglist_data *pgdat, 3301 static void inline setup_usemap(struct pglist_data *pgdat,
3290 struct zone *zone, unsigned long zonesize) {} 3302 struct zone *zone, unsigned long zonesize) {}
3291 #endif /* CONFIG_SPARSEMEM */ 3303 #endif /* CONFIG_SPARSEMEM */
3292 3304
3293 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 3305 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
3294 3306
3295 /* Return a sensible default order for the pageblock size. */ 3307 /* Return a sensible default order for the pageblock size. */
3296 static inline int pageblock_default_order(void) 3308 static inline int pageblock_default_order(void)
3297 { 3309 {
3298 if (HPAGE_SHIFT > PAGE_SHIFT) 3310 if (HPAGE_SHIFT > PAGE_SHIFT)
3299 return HUGETLB_PAGE_ORDER; 3311 return HUGETLB_PAGE_ORDER;
3300 3312
3301 return MAX_ORDER-1; 3313 return MAX_ORDER-1;
3302 } 3314 }
3303 3315
3304 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 3316 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
3305 static inline void __init set_pageblock_order(unsigned int order) 3317 static inline void __init set_pageblock_order(unsigned int order)
3306 { 3318 {
3307 /* Check that pageblock_nr_pages has not already been setup */ 3319 /* Check that pageblock_nr_pages has not already been setup */
3308 if (pageblock_order) 3320 if (pageblock_order)
3309 return; 3321 return;
3310 3322
3311 /* 3323 /*
3312 * Assume the largest contiguous order of interest is a huge page. 3324 * Assume the largest contiguous order of interest is a huge page.
3313 * This value may be variable depending on boot parameters on IA64 3325 * This value may be variable depending on boot parameters on IA64
3314 */ 3326 */
3315 pageblock_order = order; 3327 pageblock_order = order;
3316 } 3328 }
3317 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 3329 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
3318 3330
3319 /* 3331 /*
3320 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 3332 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
3321 * and pageblock_default_order() are unused as pageblock_order is set 3333 * and pageblock_default_order() are unused as pageblock_order is set
3322 * at compile-time. See include/linux/pageblock-flags.h for the values of 3334 * at compile-time. See include/linux/pageblock-flags.h for the values of
3323 * pageblock_order based on the kernel config 3335 * pageblock_order based on the kernel config
3324 */ 3336 */
3325 static inline int pageblock_default_order(unsigned int order) 3337 static inline int pageblock_default_order(unsigned int order)
3326 { 3338 {
3327 return MAX_ORDER-1; 3339 return MAX_ORDER-1;
3328 } 3340 }
3329 #define set_pageblock_order(x) do {} while (0) 3341 #define set_pageblock_order(x) do {} while (0)
3330 3342
3331 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 3343 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
3332 3344
3333 /* 3345 /*
3334 * Set up the zone data structures: 3346 * Set up the zone data structures:
3335 * - mark all pages reserved 3347 * - mark all pages reserved
3336 * - mark all memory queues empty 3348 * - mark all memory queues empty
3337 * - clear the memory bitmaps 3349 * - clear the memory bitmaps
3338 */ 3350 */
3339 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 3351 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3340 unsigned long *zones_size, unsigned long *zholes_size) 3352 unsigned long *zones_size, unsigned long *zholes_size)
3341 { 3353 {
3342 enum zone_type j; 3354 enum zone_type j;
3343 int nid = pgdat->node_id; 3355 int nid = pgdat->node_id;
3344 unsigned long zone_start_pfn = pgdat->node_start_pfn; 3356 unsigned long zone_start_pfn = pgdat->node_start_pfn;
3345 int ret; 3357 int ret;
3346 3358
3347 pgdat_resize_init(pgdat); 3359 pgdat_resize_init(pgdat);
3348 pgdat->nr_zones = 0; 3360 pgdat->nr_zones = 0;
3349 init_waitqueue_head(&pgdat->kswapd_wait); 3361 init_waitqueue_head(&pgdat->kswapd_wait);
3350 pgdat->kswapd_max_order = 0; 3362 pgdat->kswapd_max_order = 0;
3351 3363
3352 for (j = 0; j < MAX_NR_ZONES; j++) { 3364 for (j = 0; j < MAX_NR_ZONES; j++) {
3353 struct zone *zone = pgdat->node_zones + j; 3365 struct zone *zone = pgdat->node_zones + j;
3354 unsigned long size, realsize, memmap_pages; 3366 unsigned long size, realsize, memmap_pages;
3355 3367
3356 size = zone_spanned_pages_in_node(nid, j, zones_size); 3368 size = zone_spanned_pages_in_node(nid, j, zones_size);
3357 realsize = size - zone_absent_pages_in_node(nid, j, 3369 realsize = size - zone_absent_pages_in_node(nid, j,
3358 zholes_size); 3370 zholes_size);
3359 3371
3360 /* 3372 /*
3361 * Adjust realsize so that it accounts for how much memory 3373 * Adjust realsize so that it accounts for how much memory
3362 * is used by this zone for memmap. This affects the watermark 3374 * is used by this zone for memmap. This affects the watermark
3363 * and per-cpu initialisations 3375 * and per-cpu initialisations
3364 */ 3376 */
3365 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; 3377 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
3366 if (realsize >= memmap_pages) { 3378 if (realsize >= memmap_pages) {
3367 realsize -= memmap_pages; 3379 realsize -= memmap_pages;
3368 printk(KERN_DEBUG 3380 printk(KERN_DEBUG
3369 " %s zone: %lu pages used for memmap\n", 3381 " %s zone: %lu pages used for memmap\n",
3370 zone_names[j], memmap_pages); 3382 zone_names[j], memmap_pages);
3371 } else 3383 } else
3372 printk(KERN_WARNING 3384 printk(KERN_WARNING
3373 " %s zone: %lu pages exceeds realsize %lu\n", 3385 " %s zone: %lu pages exceeds realsize %lu\n",
3374 zone_names[j], memmap_pages, realsize); 3386 zone_names[j], memmap_pages, realsize);
3375 3387
3376 /* Account for reserved pages */ 3388 /* Account for reserved pages */
3377 if (j == 0 && realsize > dma_reserve) { 3389 if (j == 0 && realsize > dma_reserve) {
3378 realsize -= dma_reserve; 3390 realsize -= dma_reserve;
3379 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 3391 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
3380 zone_names[0], dma_reserve); 3392 zone_names[0], dma_reserve);
3381 } 3393 }
3382 3394
3383 if (!is_highmem_idx(j)) 3395 if (!is_highmem_idx(j))
3384 nr_kernel_pages += realsize; 3396 nr_kernel_pages += realsize;
3385 nr_all_pages += realsize; 3397 nr_all_pages += realsize;
3386 3398
3387 zone->spanned_pages = size; 3399 zone->spanned_pages = size;
3388 zone->present_pages = realsize; 3400 zone->present_pages = realsize;
3389 #ifdef CONFIG_NUMA 3401 #ifdef CONFIG_NUMA
3390 zone->node = nid; 3402 zone->node = nid;
3391 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 3403 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
3392 / 100; 3404 / 100;
3393 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 3405 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
3394 #endif 3406 #endif
3395 zone->name = zone_names[j]; 3407 zone->name = zone_names[j];
3396 spin_lock_init(&zone->lock); 3408 spin_lock_init(&zone->lock);
3397 spin_lock_init(&zone->lru_lock); 3409 spin_lock_init(&zone->lru_lock);
3398 zone_seqlock_init(zone); 3410 zone_seqlock_init(zone);
3399 zone->zone_pgdat = pgdat; 3411 zone->zone_pgdat = pgdat;
3400 3412
3401 zone->prev_priority = DEF_PRIORITY; 3413 zone->prev_priority = DEF_PRIORITY;
3402 3414
3403 zone_pcp_init(zone); 3415 zone_pcp_init(zone);
3404 INIT_LIST_HEAD(&zone->active_list); 3416 INIT_LIST_HEAD(&zone->active_list);
3405 INIT_LIST_HEAD(&zone->inactive_list); 3417 INIT_LIST_HEAD(&zone->inactive_list);
3406 zone->nr_scan_active = 0; 3418 zone->nr_scan_active = 0;
3407 zone->nr_scan_inactive = 0; 3419 zone->nr_scan_inactive = 0;
3408 zap_zone_vm_stats(zone); 3420 zap_zone_vm_stats(zone);
3409 zone->flags = 0; 3421 zone->flags = 0;
3410 if (!size) 3422 if (!size)
3411 continue; 3423 continue;
3412 3424
3413 set_pageblock_order(pageblock_default_order()); 3425 set_pageblock_order(pageblock_default_order());
3414 setup_usemap(pgdat, zone, size); 3426 setup_usemap(pgdat, zone, size);
3415 ret = init_currently_empty_zone(zone, zone_start_pfn, 3427 ret = init_currently_empty_zone(zone, zone_start_pfn,
3416 size, MEMMAP_EARLY); 3428 size, MEMMAP_EARLY);
3417 BUG_ON(ret); 3429 BUG_ON(ret);
3418 zone_start_pfn += size; 3430 zone_start_pfn += size;
3419 } 3431 }
3420 } 3432 }
3421 3433
3422 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 3434 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
3423 { 3435 {
3424 /* Skip empty nodes */ 3436 /* Skip empty nodes */
3425 if (!pgdat->node_spanned_pages) 3437 if (!pgdat->node_spanned_pages)
3426 return; 3438 return;
3427 3439
3428 #ifdef CONFIG_FLAT_NODE_MEM_MAP 3440 #ifdef CONFIG_FLAT_NODE_MEM_MAP
3429 /* ia64 gets its own node_mem_map, before this, without bootmem */ 3441 /* ia64 gets its own node_mem_map, before this, without bootmem */
3430 if (!pgdat->node_mem_map) { 3442 if (!pgdat->node_mem_map) {
3431 unsigned long size, start, end; 3443 unsigned long size, start, end;
3432 struct page *map; 3444 struct page *map;
3433 3445
3434 /* 3446 /*
3435 * The zone's endpoints aren't required to be MAX_ORDER 3447 * The zone's endpoints aren't required to be MAX_ORDER
3436 * aligned, but the node_mem_map endpoints must be MAX_ORDER aligned 3448 * aligned, but the node_mem_map endpoints must be MAX_ORDER aligned
3437 * for the buddy allocator to function correctly. 3449 * for the buddy allocator to function correctly.
3438 */ 3450 */
3439 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 3451 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
3440 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 3452 end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
3441 end = ALIGN(end, MAX_ORDER_NR_PAGES); 3453 end = ALIGN(end, MAX_ORDER_NR_PAGES);
3442 size = (end - start) * sizeof(struct page); 3454 size = (end - start) * sizeof(struct page);
3443 map = alloc_remap(pgdat->node_id, size); 3455 map = alloc_remap(pgdat->node_id, size);
3444 if (!map) 3456 if (!map)
3445 map = alloc_bootmem_node(pgdat, size); 3457 map = alloc_bootmem_node(pgdat, size);
3446 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 3458 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
3447 } 3459 }
3448 #ifndef CONFIG_NEED_MULTIPLE_NODES 3460 #ifndef CONFIG_NEED_MULTIPLE_NODES
3449 /* 3461 /*
3450 * With no DISCONTIG, the global mem_map is just set as node 0's 3462 * With no DISCONTIG, the global mem_map is just set as node 0's
3451 */ 3463 */
3452 if (pgdat == NODE_DATA(0)) { 3464 if (pgdat == NODE_DATA(0)) {
3453 mem_map = NODE_DATA(0)->node_mem_map; 3465 mem_map = NODE_DATA(0)->node_mem_map;
3454 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 3466 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
3455 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 3467 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
3456 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 3468 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
3457 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 3469 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
3458 } 3470 }
3459 #endif 3471 #endif
3460 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 3472 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
3461 } 3473 }
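The alignment arithmetic at the top of the CONFIG_FLAT_NODE_MEM_MAP branch can be illustrated in isolation. MAX_ORDER = 11, giving MAX_ORDER_NR_PAGES = 1024, is the common default but an assumption of this sketch, as are the sample node PFNs:

#include <stdio.h>

#define MAX_ORDER               11                              /* assumed default */
#define MAX_ORDER_NR_PAGES      (1UL << (MAX_ORDER - 1))        /* 1024 pages */
#define ALIGN_UP(x, a)          (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long node_start_pfn = 0x10203;                 /* deliberately unaligned */
        unsigned long node_spanned_pages = 0x7f00;
        unsigned long start, end;

        start = node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);     /* round start down */
        end = ALIGN_UP(node_start_pfn + node_spanned_pages,     /* round end up */
                       MAX_ORDER_NR_PAGES);

        printf("node_mem_map covers pfns %lx-%lx (%lu struct pages)\n",
               start, end, end - start);
        return 0;
}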
3462 3474
3463 void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, 3475 void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat,
3464 unsigned long *zones_size, unsigned long node_start_pfn, 3476 unsigned long *zones_size, unsigned long node_start_pfn,
3465 unsigned long *zholes_size) 3477 unsigned long *zholes_size)
3466 { 3478 {
3467 pgdat->node_id = nid; 3479 pgdat->node_id = nid;
3468 pgdat->node_start_pfn = node_start_pfn; 3480 pgdat->node_start_pfn = node_start_pfn;
3469 calculate_node_totalpages(pgdat, zones_size, zholes_size); 3481 calculate_node_totalpages(pgdat, zones_size, zholes_size);
3470 3482
3471 alloc_node_mem_map(pgdat); 3483 alloc_node_mem_map(pgdat);
3472 3484
3473 free_area_init_core(pgdat, zones_size, zholes_size); 3485 free_area_init_core(pgdat, zones_size, zholes_size);
3474 } 3486 }
3475 3487
3476 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 3488 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
3477 3489
3478 #if MAX_NUMNODES > 1 3490 #if MAX_NUMNODES > 1
3479 /* 3491 /*
3480 * Figure out the number of possible node ids. 3492 * Figure out the number of possible node ids.
3481 */ 3493 */
3482 static void __init setup_nr_node_ids(void) 3494 static void __init setup_nr_node_ids(void)
3483 { 3495 {
3484 unsigned int node; 3496 unsigned int node;
3485 unsigned int highest = 0; 3497 unsigned int highest = 0;
3486 3498
3487 for_each_node_mask(node, node_possible_map) 3499 for_each_node_mask(node, node_possible_map)
3488 highest = node; 3500 highest = node;
3489 nr_node_ids = highest + 1; 3501 nr_node_ids = highest + 1;
3490 } 3502 }
3491 #else 3503 #else
3492 static inline void setup_nr_node_ids(void) 3504 static inline void setup_nr_node_ids(void)
3493 { 3505 {
3494 } 3506 }
3495 #endif 3507 #endif
3496 3508
3497 /** 3509 /**
3498 * add_active_range - Register a range of PFNs backed by physical memory 3510 * add_active_range - Register a range of PFNs backed by physical memory
3499 * @nid: The node ID the range resides on 3511 * @nid: The node ID the range resides on
3500 * @start_pfn: The start PFN of the available physical memory 3512 * @start_pfn: The start PFN of the available physical memory
3501 * @end_pfn: The end PFN of the available physical memory 3513 * @end_pfn: The end PFN of the available physical memory
3502 * 3514 *
3503 * These ranges are stored in an early_node_map[] and later used by 3515 * These ranges are stored in an early_node_map[] and later used by
3504 * free_area_init_nodes() to calculate zone sizes and holes. If the 3516 * free_area_init_nodes() to calculate zone sizes and holes. If the
3505 * range spans a memory hole, it is up to the architecture to ensure 3517 * range spans a memory hole, it is up to the architecture to ensure
3506 * the memory is not freed by the bootmem allocator. If possible 3518 * the memory is not freed by the bootmem allocator. If possible
3507 * the range being registered will be merged with existing ranges. 3519 * the range being registered will be merged with existing ranges.
3508 */ 3520 */
3509 void __init add_active_range(unsigned int nid, unsigned long start_pfn, 3521 void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3510 unsigned long end_pfn) 3522 unsigned long end_pfn)
3511 { 3523 {
3512 int i; 3524 int i;
3513 3525
3514 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " 3526 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
3515 "%d entries of %d used\n", 3527 "%d entries of %d used\n",
3516 nid, start_pfn, end_pfn, 3528 nid, start_pfn, end_pfn,
3517 nr_nodemap_entries, MAX_ACTIVE_REGIONS); 3529 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
3518 3530
3519 /* Merge with existing active regions if possible */ 3531 /* Merge with existing active regions if possible */
3520 for (i = 0; i < nr_nodemap_entries; i++) { 3532 for (i = 0; i < nr_nodemap_entries; i++) {
3521 if (early_node_map[i].nid != nid) 3533 if (early_node_map[i].nid != nid)
3522 continue; 3534 continue;
3523 3535
3524 /* Skip if an existing region covers this new one */ 3536 /* Skip if an existing region covers this new one */
3525 if (start_pfn >= early_node_map[i].start_pfn && 3537 if (start_pfn >= early_node_map[i].start_pfn &&
3526 end_pfn <= early_node_map[i].end_pfn) 3538 end_pfn <= early_node_map[i].end_pfn)
3527 return; 3539 return;
3528 3540
3529 /* Merge forward if suitable */ 3541 /* Merge forward if suitable */
3530 if (start_pfn <= early_node_map[i].end_pfn && 3542 if (start_pfn <= early_node_map[i].end_pfn &&
3531 end_pfn > early_node_map[i].end_pfn) { 3543 end_pfn > early_node_map[i].end_pfn) {
3532 early_node_map[i].end_pfn = end_pfn; 3544 early_node_map[i].end_pfn = end_pfn;
3533 return; 3545 return;
3534 } 3546 }
3535 3547
3536 /* Merge backward if suitable */ 3548 /* Merge backward if suitable */
3537 if (start_pfn < early_node_map[i].end_pfn && 3549 if (start_pfn < early_node_map[i].end_pfn &&
3538 end_pfn >= early_node_map[i].start_pfn) { 3550 end_pfn >= early_node_map[i].start_pfn) {
3539 early_node_map[i].start_pfn = start_pfn; 3551 early_node_map[i].start_pfn = start_pfn;
3540 return; 3552 return;
3541 } 3553 }
3542 } 3554 }
3543 3555
3544 /* Check that early_node_map is large enough */ 3556 /* Check that early_node_map is large enough */
3545 if (i >= MAX_ACTIVE_REGIONS) { 3557 if (i >= MAX_ACTIVE_REGIONS) {
3546 printk(KERN_CRIT "More than %d memory regions, truncating\n", 3558 printk(KERN_CRIT "More than %d memory regions, truncating\n",
3547 MAX_ACTIVE_REGIONS); 3559 MAX_ACTIVE_REGIONS);
3548 return; 3560 return;
3549 } 3561 }
3550 3562
3551 early_node_map[i].nid = nid; 3563 early_node_map[i].nid = nid;
3552 early_node_map[i].start_pfn = start_pfn; 3564 early_node_map[i].start_pfn = start_pfn;
3553 early_node_map[i].end_pfn = end_pfn; 3565 early_node_map[i].end_pfn = end_pfn;
3554 nr_nodemap_entries = i + 1; 3566 nr_nodemap_entries = i + 1;
3555 } 3567 }
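The merge rules are easiest to see on a toy map. The sketch below mirrors the checks above in a standalone program; the fixed-size map and the sample PFNs are invented for the example:

#include <stdio.h>

#define MAX_ACTIVE_REGIONS 8

struct region { unsigned long start_pfn, end_pfn; int nid; };

static struct region map[MAX_ACTIVE_REGIONS];
static int nr_entries;

static void add_active_range(int nid, unsigned long start_pfn, unsigned long end_pfn)
{
        int i;

        for (i = 0; i < nr_entries; i++) {
                if (map[i].nid != nid)
                        continue;
                /* skip if an existing region covers this new one */
                if (start_pfn >= map[i].start_pfn && end_pfn <= map[i].end_pfn)
                        return;
                /* merge forward */
                if (start_pfn <= map[i].end_pfn && end_pfn > map[i].end_pfn) {
                        map[i].end_pfn = end_pfn;
                        return;
                }
                /* merge backward */
                if (start_pfn < map[i].end_pfn && end_pfn >= map[i].start_pfn) {
                        map[i].start_pfn = start_pfn;
                        return;
                }
        }

        if (i >= MAX_ACTIVE_REGIONS)
                return;                 /* map full: new range is dropped */

        map[i].nid = nid;
        map[i].start_pfn = start_pfn;
        map[i].end_pfn = end_pfn;
        nr_entries = i + 1;
}

int main(void)
{
        int i;

        add_active_range(0, 0x000, 0x100);
        add_active_range(0, 0x100, 0x200);      /* merges forward into 0x000-0x200 */
        add_active_range(0, 0x300, 0x400);      /* disjoint, becomes a second entry */

        for (i = 0; i < nr_entries; i++)
                printf("nid %d: %lx-%lx\n", map[i].nid,
                       map[i].start_pfn, map[i].end_pfn);
        return 0;
}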
3556 3568
3557 /** 3569 /**
3558 * shrink_active_range - Shrink an existing registered range of PFNs 3570 * shrink_active_range - Shrink an existing registered range of PFNs
3559 * @nid: The node id the range is on that should be shrunk 3571 * @nid: The node id the range is on that should be shrunk
3560 * @old_end_pfn: The old end PFN of the range 3572 * @old_end_pfn: The old end PFN of the range
3561 * @new_end_pfn: The new end PFN of the range 3573 * @new_end_pfn: The new end PFN of the range
3562 * 3574 *
3563 * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node. 3575 * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node.
3564 * The map is kept at the end of the physical page range that has already been 3576 * The map is kept at the end of the physical page range that has already been
3565 * registered with add_active_range(). This function allows an arch to shrink 3577 * registered with add_active_range(). This function allows an arch to shrink
3566 * an existing registered range. 3578 * an existing registered range.
3567 */ 3579 */
3568 void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, 3580 void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
3569 unsigned long new_end_pfn) 3581 unsigned long new_end_pfn)
3570 { 3582 {
3571 int i; 3583 int i;
3572 3584
3573 /* Find the old active region end and shrink */ 3585 /* Find the old active region end and shrink */
3574 for_each_active_range_index_in_nid(i, nid) 3586 for_each_active_range_index_in_nid(i, nid)
3575 if (early_node_map[i].end_pfn == old_end_pfn) { 3587 if (early_node_map[i].end_pfn == old_end_pfn) {
3576 early_node_map[i].end_pfn = new_end_pfn; 3588 early_node_map[i].end_pfn = new_end_pfn;
3577 break; 3589 break;
3578 } 3590 }
3579 } 3591 }
3580 3592
3581 /** 3593 /**
3582 * remove_all_active_ranges - Remove all currently registered regions 3594 * remove_all_active_ranges - Remove all currently registered regions
3583 * 3595 *
3584 * During discovery, it may be found that a table like SRAT is invalid 3596 * During discovery, it may be found that a table like SRAT is invalid
3585 * and an alternative discovery method must be used. This function removes 3597 * and an alternative discovery method must be used. This function removes
3586 * all currently registered regions. 3598 * all currently registered regions.
3587 */ 3599 */
3588 void __init remove_all_active_ranges(void) 3600 void __init remove_all_active_ranges(void)
3589 { 3601 {
3590 memset(early_node_map, 0, sizeof(early_node_map)); 3602 memset(early_node_map, 0, sizeof(early_node_map));
3591 nr_nodemap_entries = 0; 3603 nr_nodemap_entries = 0;
3592 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 3604 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
3593 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); 3605 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
3594 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); 3606 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
3595 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 3607 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
3596 } 3608 }
3597 3609
3598 /* Compare two active node_active_regions */ 3610 /* Compare two active node_active_regions */
3599 static int __init cmp_node_active_region(const void *a, const void *b) 3611 static int __init cmp_node_active_region(const void *a, const void *b)
3600 { 3612 {
3601 struct node_active_region *arange = (struct node_active_region *)a; 3613 struct node_active_region *arange = (struct node_active_region *)a;
3602 struct node_active_region *brange = (struct node_active_region *)b; 3614 struct node_active_region *brange = (struct node_active_region *)b;
3603 3615
3604 /* Done this way to avoid overflows */ 3616 /* Done this way to avoid overflows */
3605 if (arange->start_pfn > brange->start_pfn) 3617 if (arange->start_pfn > brange->start_pfn)
3606 return 1; 3618 return 1;
3607 if (arange->start_pfn < brange->start_pfn) 3619 if (arange->start_pfn < brange->start_pfn)
3608 return -1; 3620 return -1;
3609 3621
3610 return 0; 3622 return 0;
3611 } 3623 }
3612 3624
3613 /* sort the node_map by start_pfn */ 3625 /* sort the node_map by start_pfn */
3614 static void __init sort_node_map(void) 3626 static void __init sort_node_map(void)
3615 { 3627 {
3616 sort(early_node_map, (size_t)nr_nodemap_entries, 3628 sort(early_node_map, (size_t)nr_nodemap_entries,
3617 sizeof(struct node_active_region), 3629 sizeof(struct node_active_region),
3618 cmp_node_active_region, NULL); 3630 cmp_node_active_region, NULL);
3619 } 3631 }
3620 3632
3621 /* Find the lowest pfn for a node */ 3633 /* Find the lowest pfn for a node */
3622 unsigned long __init find_min_pfn_for_node(unsigned long nid) 3634 unsigned long __init find_min_pfn_for_node(unsigned long nid)
3623 { 3635 {
3624 int i; 3636 int i;
3625 unsigned long min_pfn = ULONG_MAX; 3637 unsigned long min_pfn = ULONG_MAX;
3626 3638
3627 /* Assuming a sorted map, the first range found has the starting pfn */ 3639 /* Assuming a sorted map, the first range found has the starting pfn */
3628 for_each_active_range_index_in_nid(i, nid) 3640 for_each_active_range_index_in_nid(i, nid)
3629 min_pfn = min(min_pfn, early_node_map[i].start_pfn); 3641 min_pfn = min(min_pfn, early_node_map[i].start_pfn);
3630 3642
3631 if (min_pfn == ULONG_MAX) { 3643 if (min_pfn == ULONG_MAX) {
3632 printk(KERN_WARNING 3644 printk(KERN_WARNING
3633 "Could not find start_pfn for node %lu\n", nid); 3645 "Could not find start_pfn for node %lu\n", nid);
3634 return 0; 3646 return 0;
3635 } 3647 }
3636 3648
3637 return min_pfn; 3649 return min_pfn;
3638 } 3650 }
3639 3651
3640 /** 3652 /**
3641 * find_min_pfn_with_active_regions - Find the minimum PFN registered 3653 * find_min_pfn_with_active_regions - Find the minimum PFN registered
3642 * 3654 *
3643 * It returns the minimum PFN based on information provided via 3655 * It returns the minimum PFN based on information provided via
3644 * add_active_range(). 3656 * add_active_range().
3645 */ 3657 */
3646 unsigned long __init find_min_pfn_with_active_regions(void) 3658 unsigned long __init find_min_pfn_with_active_regions(void)
3647 { 3659 {
3648 return find_min_pfn_for_node(MAX_NUMNODES); 3660 return find_min_pfn_for_node(MAX_NUMNODES);
3649 } 3661 }
3650 3662
3651 /** 3663 /**
3652 * find_max_pfn_with_active_regions - Find the maximum PFN registered 3664 * find_max_pfn_with_active_regions - Find the maximum PFN registered
3653 * 3665 *
3654 * It returns the maximum PFN based on information provided via 3666 * It returns the maximum PFN based on information provided via
3655 * add_active_range(). 3667 * add_active_range().
3656 */ 3668 */
3657 unsigned long __init find_max_pfn_with_active_regions(void) 3669 unsigned long __init find_max_pfn_with_active_regions(void)
3658 { 3670 {
3659 int i; 3671 int i;
3660 unsigned long max_pfn = 0; 3672 unsigned long max_pfn = 0;
3661 3673
3662 for (i = 0; i < nr_nodemap_entries; i++) 3674 for (i = 0; i < nr_nodemap_entries; i++)
3663 max_pfn = max(max_pfn, early_node_map[i].end_pfn); 3675 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
3664 3676
3665 return max_pfn; 3677 return max_pfn;
3666 } 3678 }
3667 3679
3668 /* 3680 /*
3669 * early_calculate_totalpages() 3681 * early_calculate_totalpages()
3670 * Sum pages in active regions for movable zone. 3682 * Sum pages in active regions for movable zone.
3671 * Populate N_HIGH_MEMORY for calculating usable_nodes. 3683 * Populate N_HIGH_MEMORY for calculating usable_nodes.
3672 */ 3684 */
3673 static unsigned long __init early_calculate_totalpages(void) 3685 static unsigned long __init early_calculate_totalpages(void)
3674 { 3686 {
3675 int i; 3687 int i;
3676 unsigned long totalpages = 0; 3688 unsigned long totalpages = 0;
3677 3689
3678 for (i = 0; i < nr_nodemap_entries; i++) { 3690 for (i = 0; i < nr_nodemap_entries; i++) {
3679 unsigned long pages = early_node_map[i].end_pfn - 3691 unsigned long pages = early_node_map[i].end_pfn -
3680 early_node_map[i].start_pfn; 3692 early_node_map[i].start_pfn;
3681 totalpages += pages; 3693 totalpages += pages;
3682 if (pages) 3694 if (pages)
3683 node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); 3695 node_set_state(early_node_map[i].nid, N_HIGH_MEMORY);
3684 } 3696 }
3685 return totalpages; 3697 return totalpages;
3686 } 3698 }
3687 3699
3688 /* 3700 /*
3689 * Find the PFN the Movable zone begins in each node. Kernel memory 3701 * Find the PFN the Movable zone begins in each node. Kernel memory
3690 * is spread evenly between nodes as long as the nodes have enough 3702 * is spread evenly between nodes as long as the nodes have enough
3691 * memory. When they don't, some nodes will have more kernelcore than 3703 * memory. When they don't, some nodes will have more kernelcore than
3692 * others 3704 * others
3693 */ 3705 */
3694 void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 3706 void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3695 { 3707 {
3696 int i, nid; 3708 int i, nid;
3697 unsigned long usable_startpfn; 3709 unsigned long usable_startpfn;
3698 unsigned long kernelcore_node, kernelcore_remaining; 3710 unsigned long kernelcore_node, kernelcore_remaining;
3699 unsigned long totalpages = early_calculate_totalpages(); 3711 unsigned long totalpages = early_calculate_totalpages();
3700 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 3712 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
3701 3713
3702 /* 3714 /*
3703 * If movablecore was specified, calculate the corresponding size of 3715 * If movablecore was specified, calculate the corresponding size of
3704 * kernelcore so that memory usable for 3716 * kernelcore so that memory usable for
3705 * any allocation type is evenly spread. If both kernelcore 3717 * any allocation type is evenly spread. If both kernelcore
3706 * and movablecore are specified, then the value of kernelcore 3718 * and movablecore are specified, then the value of kernelcore
3707 * will be used for required_kernelcore if it's greater than 3719 * will be used for required_kernelcore if it's greater than
3708 * what movablecore would have allowed. 3720 * what movablecore would have allowed.
3709 */ 3721 */
3710 if (required_movablecore) { 3722 if (required_movablecore) {
3711 unsigned long corepages; 3723 unsigned long corepages;
3712 3724
3713 /* 3725 /*
3714 * Round-up so that ZONE_MOVABLE is at least as large as what 3726 * Round-up so that ZONE_MOVABLE is at least as large as what
3715 * was requested by the user 3727 * was requested by the user
3716 */ 3728 */
3717 required_movablecore = 3729 required_movablecore =
3718 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 3730 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
3719 corepages = totalpages - required_movablecore; 3731 corepages = totalpages - required_movablecore;
3720 3732
3721 required_kernelcore = max(required_kernelcore, corepages); 3733 required_kernelcore = max(required_kernelcore, corepages);
3722 } 3734 }
3723 3735
3724 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 3736 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
3725 if (!required_kernelcore) 3737 if (!required_kernelcore)
3726 return; 3738 return;
3727 3739
3728 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 3740 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
3729 find_usable_zone_for_movable(); 3741 find_usable_zone_for_movable();
3730 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 3742 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
3731 3743
3732 restart: 3744 restart:
3733 /* Spread kernelcore memory as evenly as possible throughout nodes */ 3745 /* Spread kernelcore memory as evenly as possible throughout nodes */
3734 kernelcore_node = required_kernelcore / usable_nodes; 3746 kernelcore_node = required_kernelcore / usable_nodes;
3735 for_each_node_state(nid, N_HIGH_MEMORY) { 3747 for_each_node_state(nid, N_HIGH_MEMORY) {
3736 /* 3748 /*
3737 * Recalculate kernelcore_node if the division per node 3749 * Recalculate kernelcore_node if the division per node
3738 * now exceeds what is necessary to satisfy the requested 3750 * now exceeds what is necessary to satisfy the requested
3739 * amount of memory for the kernel 3751 * amount of memory for the kernel
3740 */ 3752 */
3741 if (required_kernelcore < kernelcore_node) 3753 if (required_kernelcore < kernelcore_node)
3742 kernelcore_node = required_kernelcore / usable_nodes; 3754 kernelcore_node = required_kernelcore / usable_nodes;
3743 3755
3744 /* 3756 /*
3745 * As the map is walked, we track how much memory is usable 3757 * As the map is walked, we track how much memory is usable
3746 * by the kernel using kernelcore_remaining. When it is 3758 * by the kernel using kernelcore_remaining. When it is
3747 * 0, the rest of the node is usable by ZONE_MOVABLE 3759 * 0, the rest of the node is usable by ZONE_MOVABLE
3748 */ 3760 */
3749 kernelcore_remaining = kernelcore_node; 3761 kernelcore_remaining = kernelcore_node;
3750 3762
3751 /* Go through each range of PFNs within this node */ 3763 /* Go through each range of PFNs within this node */
3752 for_each_active_range_index_in_nid(i, nid) { 3764 for_each_active_range_index_in_nid(i, nid) {
3753 unsigned long start_pfn, end_pfn; 3765 unsigned long start_pfn, end_pfn;
3754 unsigned long size_pages; 3766 unsigned long size_pages;
3755 3767
3756 start_pfn = max(early_node_map[i].start_pfn, 3768 start_pfn = max(early_node_map[i].start_pfn,
3757 zone_movable_pfn[nid]); 3769 zone_movable_pfn[nid]);
3758 end_pfn = early_node_map[i].end_pfn; 3770 end_pfn = early_node_map[i].end_pfn;
3759 if (start_pfn >= end_pfn) 3771 if (start_pfn >= end_pfn)
3760 continue; 3772 continue;
3761 3773
3762 /* Account for what is only usable for kernelcore */ 3774 /* Account for what is only usable for kernelcore */
3763 if (start_pfn < usable_startpfn) { 3775 if (start_pfn < usable_startpfn) {
3764 unsigned long kernel_pages; 3776 unsigned long kernel_pages;
3765 kernel_pages = min(end_pfn, usable_startpfn) 3777 kernel_pages = min(end_pfn, usable_startpfn)
3766 - start_pfn; 3778 - start_pfn;
3767 3779
3768 kernelcore_remaining -= min(kernel_pages, 3780 kernelcore_remaining -= min(kernel_pages,
3769 kernelcore_remaining); 3781 kernelcore_remaining);
3770 required_kernelcore -= min(kernel_pages, 3782 required_kernelcore -= min(kernel_pages,
3771 required_kernelcore); 3783 required_kernelcore);
3772 3784
3773 /* Continue if range is now fully accounted */ 3785 /* Continue if range is now fully accounted */
3774 if (end_pfn <= usable_startpfn) { 3786 if (end_pfn <= usable_startpfn) {
3775 3787
3776 /* 3788 /*
3777 * Push zone_movable_pfn to the end so 3789 * Push zone_movable_pfn to the end so
3778 * that if we have to rebalance 3790 * that if we have to rebalance
3779 * kernelcore across nodes, we will 3791 * kernelcore across nodes, we will
3780 * not double account here 3792 * not double account here
3781 */ 3793 */
3782 zone_movable_pfn[nid] = end_pfn; 3794 zone_movable_pfn[nid] = end_pfn;
3783 continue; 3795 continue;
3784 } 3796 }
3785 start_pfn = usable_startpfn; 3797 start_pfn = usable_startpfn;
3786 } 3798 }
3787 3799
3788 /* 3800 /*
3789 * The usable PFN range for ZONE_MOVABLE is from 3801 * The usable PFN range for ZONE_MOVABLE is from
3790 * start_pfn->end_pfn. Calculate size_pages as the 3802 * start_pfn->end_pfn. Calculate size_pages as the
3791 * number of pages used as kernelcore 3803 * number of pages used as kernelcore
3792 */ 3804 */
3793 size_pages = end_pfn - start_pfn; 3805 size_pages = end_pfn - start_pfn;
3794 if (size_pages > kernelcore_remaining) 3806 if (size_pages > kernelcore_remaining)
3795 size_pages = kernelcore_remaining; 3807 size_pages = kernelcore_remaining;
3796 zone_movable_pfn[nid] = start_pfn + size_pages; 3808 zone_movable_pfn[nid] = start_pfn + size_pages;
3797 3809
3798 /* 3810 /*
3799 * Some kernelcore has been met, update counts and 3811 * Some kernelcore has been met, update counts and
3800 * break if the kernelcore for this node has been 3812 * break if the kernelcore for this node has been
3801 * satisfied 3813 * satisfied
3802 */ 3814 */
3803 required_kernelcore -= min(required_kernelcore, 3815 required_kernelcore -= min(required_kernelcore,
3804 size_pages); 3816 size_pages);
3805 kernelcore_remaining -= size_pages; 3817 kernelcore_remaining -= size_pages;
3806 if (!kernelcore_remaining) 3818 if (!kernelcore_remaining)
3807 break; 3819 break;
3808 } 3820 }
3809 } 3821 }
3810 3822
3811 /* 3823 /*
3812 * If there is still required_kernelcore, we do another pass with one 3824 * If there is still required_kernelcore, we do another pass with one
3813 * less node in the count. This will push zone_movable_pfn[nid] further 3825 * less node in the count. This will push zone_movable_pfn[nid] further
3814 * along on the nodes that still have memory until kernelcore is 3826 * along on the nodes that still have memory until kernelcore is
3815 * satisfied 3827 * satisfied
3816 */ 3828 */
3817 usable_nodes--; 3829 usable_nodes--;
3818 if (usable_nodes && required_kernelcore > usable_nodes) 3830 if (usable_nodes && required_kernelcore > usable_nodes)
3819 goto restart; 3831 goto restart;
3820 3832
3821 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 3833 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
3822 for (nid = 0; nid < MAX_NUMNODES; nid++) 3834 for (nid = 0; nid < MAX_NUMNODES; nid++)
3823 zone_movable_pfn[nid] = 3835 zone_movable_pfn[nid] =
3824 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 3836 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
3825 } 3837 }
3826 3838
3827 /* Any regular memory on that node ? */ 3839 /* Any regular memory on that node ? */
3828 static void check_for_regular_memory(pg_data_t *pgdat) 3840 static void check_for_regular_memory(pg_data_t *pgdat)
3829 { 3841 {
3830 #ifdef CONFIG_HIGHMEM 3842 #ifdef CONFIG_HIGHMEM
3831 enum zone_type zone_type; 3843 enum zone_type zone_type;
3832 3844
3833 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { 3845 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
3834 struct zone *zone = &pgdat->node_zones[zone_type]; 3846 struct zone *zone = &pgdat->node_zones[zone_type];
3835 if (zone->present_pages) 3847 if (zone->present_pages)
3836 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); 3848 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
3837 } 3849 }
3838 #endif 3850 #endif
3839 } 3851 }
3840 3852
3841 /** 3853 /**
3842 * free_area_init_nodes - Initialise all pg_data_t and zone data 3854 * free_area_init_nodes - Initialise all pg_data_t and zone data
3843 * @max_zone_pfn: an array of max PFNs for each zone 3855 * @max_zone_pfn: an array of max PFNs for each zone
3844 * 3856 *
3845 * This will call free_area_init_node() for each active node in the system. 3857 * This will call free_area_init_node() for each active node in the system.
3846 * Using the page ranges provided by add_active_range(), the size of each 3858 * Using the page ranges provided by add_active_range(), the size of each
3847 * zone in each node and their holes is calculated. If the maximum PFNs 3859 * zone in each node and their holes is calculated. If the maximum PFNs
3848 * of two adjacent zones match, it is assumed that the higher zone is empty. 3860 * of two adjacent zones match, it is assumed that the higher zone is empty.
3849 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 3861 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
3850 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 3862 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
3851 * starts where the previous one ended. For example, ZONE_DMA32 starts 3863 * starts where the previous one ended. For example, ZONE_DMA32 starts
3852 * at arch_max_dma_pfn. 3864 * at arch_max_dma_pfn.
3853 */ 3865 */
3854 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 3866 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3855 { 3867 {
3856 unsigned long nid; 3868 unsigned long nid;
3857 enum zone_type i; 3869 enum zone_type i;
3858 3870
3859 /* Sort early_node_map as initialisation assumes it is sorted */ 3871 /* Sort early_node_map as initialisation assumes it is sorted */
3860 sort_node_map(); 3872 sort_node_map();
3861 3873
3862 /* Record where the zone boundaries are */ 3874 /* Record where the zone boundaries are */
3863 memset(arch_zone_lowest_possible_pfn, 0, 3875 memset(arch_zone_lowest_possible_pfn, 0,
3864 sizeof(arch_zone_lowest_possible_pfn)); 3876 sizeof(arch_zone_lowest_possible_pfn));
3865 memset(arch_zone_highest_possible_pfn, 0, 3877 memset(arch_zone_highest_possible_pfn, 0,
3866 sizeof(arch_zone_highest_possible_pfn)); 3878 sizeof(arch_zone_highest_possible_pfn));
3867 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 3879 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
3868 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 3880 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
3869 for (i = 1; i < MAX_NR_ZONES; i++) { 3881 for (i = 1; i < MAX_NR_ZONES; i++) {
3870 if (i == ZONE_MOVABLE) 3882 if (i == ZONE_MOVABLE)
3871 continue; 3883 continue;
3872 arch_zone_lowest_possible_pfn[i] = 3884 arch_zone_lowest_possible_pfn[i] =
3873 arch_zone_highest_possible_pfn[i-1]; 3885 arch_zone_highest_possible_pfn[i-1];
3874 arch_zone_highest_possible_pfn[i] = 3886 arch_zone_highest_possible_pfn[i] =
3875 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 3887 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
3876 } 3888 }
3877 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 3889 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
3878 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 3890 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
3879 3891
3880 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 3892 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
3881 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 3893 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
3882 find_zone_movable_pfns_for_nodes(zone_movable_pfn); 3894 find_zone_movable_pfns_for_nodes(zone_movable_pfn);
3883 3895
3884 /* Print out the zone ranges */ 3896 /* Print out the zone ranges */
3885 printk("Zone PFN ranges:\n"); 3897 printk("Zone PFN ranges:\n");
3886 for (i = 0; i < MAX_NR_ZONES; i++) { 3898 for (i = 0; i < MAX_NR_ZONES; i++) {
3887 if (i == ZONE_MOVABLE) 3899 if (i == ZONE_MOVABLE)
3888 continue; 3900 continue;
3889 printk(" %-8s %8lu -> %8lu\n", 3901 printk(" %-8s %8lu -> %8lu\n",
3890 zone_names[i], 3902 zone_names[i],
3891 arch_zone_lowest_possible_pfn[i], 3903 arch_zone_lowest_possible_pfn[i],
3892 arch_zone_highest_possible_pfn[i]); 3904 arch_zone_highest_possible_pfn[i]);
3893 } 3905 }
3894 3906
3895 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 3907 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
3896 printk("Movable zone start PFN for each node\n"); 3908 printk("Movable zone start PFN for each node\n");
3897 for (i = 0; i < MAX_NUMNODES; i++) { 3909 for (i = 0; i < MAX_NUMNODES; i++) {
3898 if (zone_movable_pfn[i]) 3910 if (zone_movable_pfn[i])
3899 printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); 3911 printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
3900 } 3912 }
3901 3913
3902 /* Print out the early_node_map[] */ 3914 /* Print out the early_node_map[] */
3903 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); 3915 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
3904 for (i = 0; i < nr_nodemap_entries; i++) 3916 for (i = 0; i < nr_nodemap_entries; i++)
3905 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, 3917 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
3906 early_node_map[i].start_pfn, 3918 early_node_map[i].start_pfn,
3907 early_node_map[i].end_pfn); 3919 early_node_map[i].end_pfn);
3908 3920
3909 /* Initialise every node */ 3921 /* Initialise every node */
3910 setup_nr_node_ids(); 3922 setup_nr_node_ids();
3911 for_each_online_node(nid) { 3923 for_each_online_node(nid) {
3912 pg_data_t *pgdat = NODE_DATA(nid); 3924 pg_data_t *pgdat = NODE_DATA(nid);
3913 free_area_init_node(nid, pgdat, NULL, 3925 free_area_init_node(nid, pgdat, NULL,
3914 find_min_pfn_for_node(nid), NULL); 3926 find_min_pfn_for_node(nid), NULL);
3915 3927
3916 /* Any memory on that node */ 3928 /* Any memory on that node */
3917 if (pgdat->node_present_pages) 3929 if (pgdat->node_present_pages)
3918 node_set_state(nid, N_HIGH_MEMORY); 3930 node_set_state(nid, N_HIGH_MEMORY);
3919 check_for_regular_memory(pgdat); 3931 check_for_regular_memory(pgdat);
3920 } 3932 }
3921 } 3933 }
3922 3934
3923 static int __init cmdline_parse_core(char *p, unsigned long *core) 3935 static int __init cmdline_parse_core(char *p, unsigned long *core)
3924 { 3936 {
3925 unsigned long long coremem; 3937 unsigned long long coremem;
3926 if (!p) 3938 if (!p)
3927 return -EINVAL; 3939 return -EINVAL;
3928 3940
3929 coremem = memparse(p, &p); 3941 coremem = memparse(p, &p);
3930 *core = coremem >> PAGE_SHIFT; 3942 *core = coremem >> PAGE_SHIFT;
3931 3943
3932 /* Paranoid check that UL is enough for the coremem value */ 3944 /* Paranoid check that UL is enough for the coremem value */
3933 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 3945 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
3934 3946
3935 return 0; 3947 return 0;
3936 } 3948 }
3937 3949
3938 /* 3950 /*
3939 * kernelcore=size sets the amount of memory to be used for allocations that 3951 * kernelcore=size sets the amount of memory to be used for allocations that
3940 * cannot be reclaimed or migrated. 3952 * cannot be reclaimed or migrated.
3941 */ 3953 */
3942 static int __init cmdline_parse_kernelcore(char *p) 3954 static int __init cmdline_parse_kernelcore(char *p)
3943 { 3955 {
3944 return cmdline_parse_core(p, &required_kernelcore); 3956 return cmdline_parse_core(p, &required_kernelcore);
3945 } 3957 }
3946 3958
3947 /* 3959 /*
3948 * movablecore=size sets the amount of memory to be used for allocations that 3960 * movablecore=size sets the amount of memory to be used for allocations that
3949 * can be reclaimed or migrated. 3961 * can be reclaimed or migrated.
3950 */ 3962 */
3951 static int __init cmdline_parse_movablecore(char *p) 3963 static int __init cmdline_parse_movablecore(char *p)
3952 { 3964 {
3953 return cmdline_parse_core(p, &required_movablecore); 3965 return cmdline_parse_core(p, &required_movablecore);
3954 } 3966 }
3955 3967
3956 early_param("kernelcore", cmdline_parse_kernelcore); 3968 early_param("kernelcore", cmdline_parse_kernelcore);
3957 early_param("movablecore", cmdline_parse_movablecore); 3969 early_param("movablecore", cmdline_parse_movablecore);
3958 3970
3959 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 3971 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
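For illustration only (the values below are hypothetical, not taken from this commit): booting with

    kernelcore=512M

makes cmdline_parse_kernelcore() set required_kernelcore to 512M >> PAGE_SHIFT pages (memparse() accepts K/M/G suffixes), while

    movablecore=2G

sets required_movablecore, which find_zone_movable_pfns_for_nodes() above rounds up to MAX_ORDER_NR_PAGES and converts into an equivalent required_kernelcore before spreading kernel memory evenly across the nodes that have memory.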
3960 3972
3961 /** 3973 /**
3962 * set_dma_reserve - set the specified number of pages reserved in the first zone 3974 * set_dma_reserve - set the specified number of pages reserved in the first zone
3963 * @new_dma_reserve: The number of pages to mark reserved 3975 * @new_dma_reserve: The number of pages to mark reserved
3964 * 3976 *
3965 * The per-cpu batchsize and zone watermarks are determined by present_pages. 3977 * The per-cpu batchsize and zone watermarks are determined by present_pages.
3966 * In the DMA zone, a significant percentage may be consumed by kernel image 3978 * In the DMA zone, a significant percentage may be consumed by kernel image
3967 * and other unfreeable allocations which can skew the watermarks badly. This 3979 * and other unfreeable allocations which can skew the watermarks badly. This
3968 * function may optionally be used to account for unfreeable pages in the 3980 * function may optionally be used to account for unfreeable pages in the
3969 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 3981 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
3970 * smaller per-cpu batchsize. 3982 * smaller per-cpu batchsize.
3971 */ 3983 */
3972 void __init set_dma_reserve(unsigned long new_dma_reserve) 3984 void __init set_dma_reserve(unsigned long new_dma_reserve)
3973 { 3985 {
3974 dma_reserve = new_dma_reserve; 3986 dma_reserve = new_dma_reserve;
3975 } 3987 }
3976 3988
3977 #ifndef CONFIG_NEED_MULTIPLE_NODES 3989 #ifndef CONFIG_NEED_MULTIPLE_NODES
3978 static bootmem_data_t contig_bootmem_data; 3990 static bootmem_data_t contig_bootmem_data;
3979 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 3991 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
3980 3992
3981 EXPORT_SYMBOL(contig_page_data); 3993 EXPORT_SYMBOL(contig_page_data);
3982 #endif 3994 #endif
3983 3995
3984 void __init free_area_init(unsigned long *zones_size) 3996 void __init free_area_init(unsigned long *zones_size)
3985 { 3997 {
3986 free_area_init_node(0, NODE_DATA(0), zones_size, 3998 free_area_init_node(0, NODE_DATA(0), zones_size,
3987 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 3999 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
3988 } 4000 }
3989 4001
3990 static int page_alloc_cpu_notify(struct notifier_block *self, 4002 static int page_alloc_cpu_notify(struct notifier_block *self,
3991 unsigned long action, void *hcpu) 4003 unsigned long action, void *hcpu)
3992 { 4004 {
3993 int cpu = (unsigned long)hcpu; 4005 int cpu = (unsigned long)hcpu;
3994 4006
3995 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 4007 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
3996 drain_pages(cpu); 4008 drain_pages(cpu);
3997 4009
3998 /* 4010 /*
3999 * Spill the event counters of the dead processor 4011 * Spill the event counters of the dead processor
4000 * into the current processors event counters. 4012 * into the current processors event counters.
4001 * This artificially elevates the count of the current 4013 * This artificially elevates the count of the current
4002 * processor. 4014 * processor.
4003 */ 4015 */
4004 vm_events_fold_cpu(cpu); 4016 vm_events_fold_cpu(cpu);
4005 4017
4006 /* 4018 /*
4007 * Zero the differential counters of the dead processor 4019 * Zero the differential counters of the dead processor
4008 * so that the vm statistics are consistent. 4020 * so that the vm statistics are consistent.
4009 * 4021 *
4010 * This is only okay since the processor is dead and cannot 4022 * This is only okay since the processor is dead and cannot
4011 * race with what we are doing. 4023 * race with what we are doing.
4012 */ 4024 */
4013 refresh_cpu_vm_stats(cpu); 4025 refresh_cpu_vm_stats(cpu);
4014 } 4026 }
4015 return NOTIFY_OK; 4027 return NOTIFY_OK;
4016 } 4028 }
4017 4029
4018 void __init page_alloc_init(void) 4030 void __init page_alloc_init(void)
4019 { 4031 {
4020 hotcpu_notifier(page_alloc_cpu_notify, 0); 4032 hotcpu_notifier(page_alloc_cpu_notify, 0);
4021 } 4033 }
4022 4034
4023 /* 4035 /*
4024 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 4036 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
4025 * or min_free_kbytes changes. 4037 * or min_free_kbytes changes.
4026 */ 4038 */
4027 static void calculate_totalreserve_pages(void) 4039 static void calculate_totalreserve_pages(void)
4028 { 4040 {
4029 struct pglist_data *pgdat; 4041 struct pglist_data *pgdat;
4030 unsigned long reserve_pages = 0; 4042 unsigned long reserve_pages = 0;
4031 enum zone_type i, j; 4043 enum zone_type i, j;
4032 4044
4033 for_each_online_pgdat(pgdat) { 4045 for_each_online_pgdat(pgdat) {
4034 for (i = 0; i < MAX_NR_ZONES; i++) { 4046 for (i = 0; i < MAX_NR_ZONES; i++) {
4035 struct zone *zone = pgdat->node_zones + i; 4047 struct zone *zone = pgdat->node_zones + i;
4036 unsigned long max = 0; 4048 unsigned long max = 0;
4037 4049
4038 /* Find valid and maximum lowmem_reserve in the zone */ 4050 /* Find valid and maximum lowmem_reserve in the zone */
4039 for (j = i; j < MAX_NR_ZONES; j++) { 4051 for (j = i; j < MAX_NR_ZONES; j++) {
4040 if (zone->lowmem_reserve[j] > max) 4052 if (zone->lowmem_reserve[j] > max)
4041 max = zone->lowmem_reserve[j]; 4053 max = zone->lowmem_reserve[j];
4042 } 4054 }
4043 4055
4044 /* we treat pages_high as reserved pages. */ 4056 /* we treat pages_high as reserved pages. */
4045 max += zone->pages_high; 4057 max += zone->pages_high;
4046 4058
4047 if (max > zone->present_pages) 4059 if (max > zone->present_pages)
4048 max = zone->present_pages; 4060 max = zone->present_pages;
4049 reserve_pages += max; 4061 reserve_pages += max;
4050 } 4062 }
4051 } 4063 }
4052 totalreserve_pages = reserve_pages; 4064 totalreserve_pages = reserve_pages;
4053 } 4065 }
4054 4066
4055 /* 4067 /*
4056 * setup_per_zone_lowmem_reserve - called whenever 4068 * setup_per_zone_lowmem_reserve - called whenever
4057 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 4069 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
4058 * has a correct pages reserved value, so an adequate number of 4070 * has a correct pages reserved value, so an adequate number of
4059 * pages are left in the zone after a successful __alloc_pages(). 4071 * pages are left in the zone after a successful __alloc_pages().
4060 */ 4072 */
4061 static void setup_per_zone_lowmem_reserve(void) 4073 static void setup_per_zone_lowmem_reserve(void)
4062 { 4074 {
4063 struct pglist_data *pgdat; 4075 struct pglist_data *pgdat;
4064 enum zone_type j, idx; 4076 enum zone_type j, idx;
4065 4077
4066 for_each_online_pgdat(pgdat) { 4078 for_each_online_pgdat(pgdat) {
4067 for (j = 0; j < MAX_NR_ZONES; j++) { 4079 for (j = 0; j < MAX_NR_ZONES; j++) {
4068 struct zone *zone = pgdat->node_zones + j; 4080 struct zone *zone = pgdat->node_zones + j;
4069 unsigned long present_pages = zone->present_pages; 4081 unsigned long present_pages = zone->present_pages;
4070 4082
4071 zone->lowmem_reserve[j] = 0; 4083 zone->lowmem_reserve[j] = 0;
4072 4084
4073 idx = j; 4085 idx = j;
4074 while (idx) { 4086 while (idx) {
4075 struct zone *lower_zone; 4087 struct zone *lower_zone;
4076 4088
4077 idx--; 4089 idx--;
4078 4090
4079 if (sysctl_lowmem_reserve_ratio[idx] < 1) 4091 if (sysctl_lowmem_reserve_ratio[idx] < 1)
4080 sysctl_lowmem_reserve_ratio[idx] = 1; 4092 sysctl_lowmem_reserve_ratio[idx] = 1;
4081 4093
4082 lower_zone = pgdat->node_zones + idx; 4094 lower_zone = pgdat->node_zones + idx;
4083 lower_zone->lowmem_reserve[j] = present_pages / 4095 lower_zone->lowmem_reserve[j] = present_pages /
4084 sysctl_lowmem_reserve_ratio[idx]; 4096 sysctl_lowmem_reserve_ratio[idx];
4085 present_pages += lower_zone->present_pages; 4097 present_pages += lower_zone->present_pages;
4086 } 4098 }
4087 } 4099 }
4088 } 4100 }
4089 4101
4090 /* update totalreserve_pages */ 4102 /* update totalreserve_pages */
4091 calculate_totalreserve_pages(); 4103 calculate_totalreserve_pages();
4092 } 4104 }
4093 4105
4094 /** 4106 /**
4095 * setup_per_zone_pages_min - called when min_free_kbytes changes. 4107 * setup_per_zone_pages_min - called when min_free_kbytes changes.
4096 * 4108 *
4097 * Ensures that the pages_{min,low,high} values for each zone are set correctly 4109 * Ensures that the pages_{min,low,high} values for each zone are set correctly
4098 * with respect to min_free_kbytes. 4110 * with respect to min_free_kbytes.
4099 */ 4111 */
4100 void setup_per_zone_pages_min(void) 4112 void setup_per_zone_pages_min(void)
4101 { 4113 {
4102 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 4114 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
4103 unsigned long lowmem_pages = 0; 4115 unsigned long lowmem_pages = 0;
4104 struct zone *zone; 4116 struct zone *zone;
4105 unsigned long flags; 4117 unsigned long flags;
4106 4118
4107 /* Calculate total number of !ZONE_HIGHMEM pages */ 4119 /* Calculate total number of !ZONE_HIGHMEM pages */
4108 for_each_zone(zone) { 4120 for_each_zone(zone) {
4109 if (!is_highmem(zone)) 4121 if (!is_highmem(zone))
4110 lowmem_pages += zone->present_pages; 4122 lowmem_pages += zone->present_pages;
4111 } 4123 }
4112 4124
4113 for_each_zone(zone) { 4125 for_each_zone(zone) {
4114 u64 tmp; 4126 u64 tmp;
4115 4127
4116 spin_lock_irqsave(&zone->lru_lock, flags); 4128 spin_lock_irqsave(&zone->lru_lock, flags);
4117 tmp = (u64)pages_min * zone->present_pages; 4129 tmp = (u64)pages_min * zone->present_pages;
4118 do_div(tmp, lowmem_pages); 4130 do_div(tmp, lowmem_pages);
4119 if (is_highmem(zone)) { 4131 if (is_highmem(zone)) {
4120 /* 4132 /*
4121 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 4133 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
4122 * need highmem pages, so cap pages_min to a small 4134 * need highmem pages, so cap pages_min to a small
4123 * value here. 4135 * value here.
4124 * 4136 *
4125 * The (pages_high-pages_low) and (pages_low-pages_min) 4137 * The (pages_high-pages_low) and (pages_low-pages_min)
4126 * deltas control asynchronous page reclaim, and so should 4138 * deltas control asynchronous page reclaim, and so should
4127 * not be capped for highmem. 4139 * not be capped for highmem.
4128 */ 4140 */
4129 int min_pages; 4141 int min_pages;
4130 4142
4131 min_pages = zone->present_pages / 1024; 4143 min_pages = zone->present_pages / 1024;
4132 if (min_pages < SWAP_CLUSTER_MAX) 4144 if (min_pages < SWAP_CLUSTER_MAX)
4133 min_pages = SWAP_CLUSTER_MAX; 4145 min_pages = SWAP_CLUSTER_MAX;
4134 if (min_pages > 128) 4146 if (min_pages > 128)
4135 min_pages = 128; 4147 min_pages = 128;
4136 zone->pages_min = min_pages; 4148 zone->pages_min = min_pages;
4137 } else { 4149 } else {
4138 /* 4150 /*
4139 * If it's a lowmem zone, reserve a number of pages 4151 * If it's a lowmem zone, reserve a number of pages
4140 * proportionate to the zone's size. 4152 * proportionate to the zone's size.
4141 */ 4153 */
4142 zone->pages_min = tmp; 4154 zone->pages_min = tmp;
4143 } 4155 }
4144 4156
4145 zone->pages_low = zone->pages_min + (tmp >> 2); 4157 zone->pages_low = zone->pages_min + (tmp >> 2);
4146 zone->pages_high = zone->pages_min + (tmp >> 1); 4158 zone->pages_high = zone->pages_min + (tmp >> 1);
4147 setup_zone_migrate_reserve(zone); 4159 setup_zone_migrate_reserve(zone);
4148 spin_unlock_irqrestore(&zone->lru_lock, flags); 4160 spin_unlock_irqrestore(&zone->lru_lock, flags);
4149 } 4161 }
4150 4162
4151 /* update totalreserve_pages */ 4163 /* update totalreserve_pages */
4152 calculate_totalreserve_pages(); 4164 calculate_totalreserve_pages();
4153 } 4165 }
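Worked example with hypothetical numbers: with min_free_kbytes = 4096k and 4KB pages, pages_min above is 4096 >> 2 = 1024 pages system-wide. A lowmem zone holding half of all lowmem gets tmp = 1024 * (lowmem/2) / lowmem = 512, so that zone ends up with pages_min = 512, pages_low = 512 + (512 >> 2) = 640 and pages_high = 512 + (512 >> 1) = 768.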
4154 4166
4155 /* 4167 /*
4156 * Initialise min_free_kbytes. 4168 * Initialise min_free_kbytes.
4157 * 4169 *
4158 * For small machines we want it small (128k min). For large machines 4170 * For small machines we want it small (128k min). For large machines
4159 * we want it large (64MB max). But it is not linear, because network 4171 * we want it large (64MB max). But it is not linear, because network
4160 * bandwidth does not increase linearly with machine size. We use 4172 * bandwidth does not increase linearly with machine size. We use
4161 * 4173 *
4162 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 4174 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
4163 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 4175 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
4164 * 4176 *
4165 * which yields 4177 * which yields
4166 * 4178 *
4167 * 16MB: 512k 4179 * 16MB: 512k
4168 * 32MB: 724k 4180 * 32MB: 724k
4169 * 64MB: 1024k 4181 * 64MB: 1024k
4170 * 128MB: 1448k 4182 * 128MB: 1448k
4171 * 256MB: 2048k 4183 * 256MB: 2048k
4172 * 512MB: 2896k 4184 * 512MB: 2896k
4173 * 1024MB: 4096k 4185 * 1024MB: 4096k
4174 * 2048MB: 5792k 4186 * 2048MB: 5792k
4175 * 4096MB: 8192k 4187 * 4096MB: 8192k
4176 * 8192MB: 11584k 4188 * 8192MB: 11584k
4177 * 16384MB: 16384k 4189 * 16384MB: 16384k
4178 */ 4190 */
4179 static int __init init_per_zone_pages_min(void) 4191 static int __init init_per_zone_pages_min(void)
4180 { 4192 {
4181 unsigned long lowmem_kbytes; 4193 unsigned long lowmem_kbytes;
4182 4194
4183 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 4195 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
4184 4196
4185 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 4197 min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
4186 if (min_free_kbytes < 128) 4198 if (min_free_kbytes < 128)
4187 min_free_kbytes = 128; 4199 min_free_kbytes = 128;
4188 if (min_free_kbytes > 65536) 4200 if (min_free_kbytes > 65536)
4189 min_free_kbytes = 65536; 4201 min_free_kbytes = 65536;
4190 setup_per_zone_pages_min(); 4202 setup_per_zone_pages_min();
4191 setup_per_zone_lowmem_reserve(); 4203 setup_per_zone_lowmem_reserve();
4192 return 0; 4204 return 0;
4193 } 4205 }
4194 module_init(init_per_zone_pages_min) 4206 module_init(init_per_zone_pages_min)
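The table in the comment above can be reproduced with a small userspace sketch of the same formula (illustrative only; the kernel uses int_sqrt() on nr_free_buffer_pages(), and the helper name below is made up):

#include <math.h>
#include <stdio.h>

/* Same clamps as init_per_zone_pages_min(): 128k minimum, 64MB maximum */
static unsigned long min_free_kbytes_for(unsigned long lowmem_kbytes)
{
	unsigned long v = (unsigned long)sqrt((double)lowmem_kbytes * 16);

	if (v < 128)
		v = 128;
	if (v > 65536)
		v = 65536;
	return v;
}

int main(void)
{
	/* 16MB, 1024MB and 16384MB of lowmem, as in the table above */
	unsigned long mb[] = { 16, 1024, 16384 };
	int i;

	for (i = 0; i < 3; i++)	/* prints 512k, 4096k, 16384k */
		printf("%6luMB: %luk\n", mb[i],
		       min_free_kbytes_for(mb[i] * 1024));
	return 0;
}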
4195 4207
4196 /* 4208 /*
4197 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 4209 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
4198 * that we can call two helper functions whenever min_free_kbytes 4210 * that we can call two helper functions whenever min_free_kbytes
4199 * changes. 4211 * changes.
4200 */ 4212 */
4201 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 4213 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4202 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4214 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
4203 { 4215 {
4204 proc_dointvec(table, write, file, buffer, length, ppos); 4216 proc_dointvec(table, write, file, buffer, length, ppos);
4205 if (write) 4217 if (write)
4206 setup_per_zone_pages_min(); 4218 setup_per_zone_pages_min();
4207 return 0; 4219 return 0;
4208 } 4220 }
4209 4221
4210 #ifdef CONFIG_NUMA 4222 #ifdef CONFIG_NUMA
4211 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 4223 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
4212 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4224 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
4213 { 4225 {
4214 struct zone *zone; 4226 struct zone *zone;
4215 int rc; 4227 int rc;
4216 4228
4217 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4229 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
4218 if (rc) 4230 if (rc)
4219 return rc; 4231 return rc;
4220 4232
4221 for_each_zone(zone) 4233 for_each_zone(zone)
4222 zone->min_unmapped_pages = (zone->present_pages * 4234 zone->min_unmapped_pages = (zone->present_pages *
4223 sysctl_min_unmapped_ratio) / 100; 4235 sysctl_min_unmapped_ratio) / 100;
4224 return 0; 4236 return 0;
4225 } 4237 }
4226 4238
4227 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 4239 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4228 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4240 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
4229 { 4241 {
4230 struct zone *zone; 4242 struct zone *zone;
4231 int rc; 4243 int rc;
4232 4244
4233 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4245 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
4234 if (rc) 4246 if (rc)
4235 return rc; 4247 return rc;
4236 4248
4237 for_each_zone(zone) 4249 for_each_zone(zone)
4238 zone->min_slab_pages = (zone->present_pages * 4250 zone->min_slab_pages = (zone->present_pages *
4239 sysctl_min_slab_ratio) / 100; 4251 sysctl_min_slab_ratio) / 100;
4240 return 0; 4252 return 0;
4241 } 4253 }
4242 #endif 4254 #endif
4243 4255
4244 /* 4256 /*
4245 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 4257 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
4246 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 4258 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
4247 * whenever sysctl_lowmem_reserve_ratio changes. 4259 * whenever sysctl_lowmem_reserve_ratio changes.
4248 * 4260 *
4249 * The reserve ratio obviously has absolutely no relation with the 4261 * The reserve ratio obviously has absolutely no relation with the
4250 * pages_min watermarks. The lowmem reserve ratio can only make sense 4262 * pages_min watermarks. The lowmem reserve ratio can only make sense
4251 * as a function of the boot-time zone sizes. 4263 * as a function of the boot-time zone sizes.
4252 */ 4264 */
4253 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 4265 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
4254 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4266 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
4255 { 4267 {
4256 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4268 proc_dointvec_minmax(table, write, file, buffer, length, ppos);
4257 setup_per_zone_lowmem_reserve(); 4269 setup_per_zone_lowmem_reserve();
4258 return 0; 4270 return 0;
4259 } 4271 }
4260 4272
4261 /* 4273 /*
4262 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 4274 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
4263 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist 4275 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
4264 * can have before it gets flushed back to the buddy allocator. 4276 * can have before it gets flushed back to the buddy allocator.
4265 */ 4277 */
4266 4278
4267 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 4279 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4268 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4280 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
4269 { 4281 {
4270 struct zone *zone; 4282 struct zone *zone;
4271 unsigned int cpu; 4283 unsigned int cpu;
4272 int ret; 4284 int ret;
4273 4285
4274 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4286 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
4275 if (!write || (ret == -EINVAL)) 4287 if (!write || (ret == -EINVAL))
4276 return ret; 4288 return ret;
4277 for_each_zone(zone) { 4289 for_each_zone(zone) {
4278 for_each_online_cpu(cpu) { 4290 for_each_online_cpu(cpu) {
4279 unsigned long high; 4291 unsigned long high;
4280 high = zone->present_pages / percpu_pagelist_fraction; 4292 high = zone->present_pages / percpu_pagelist_fraction;
4281 setup_pagelist_highmark(zone_pcp(zone, cpu), high); 4293 setup_pagelist_highmark(zone_pcp(zone, cpu), high);
4282 } 4294 }
4283 } 4295 }
4284 return 0; 4296 return 0;
4285 } 4297 }
4286 4298
4287 int hashdist = HASHDIST_DEFAULT; 4299 int hashdist = HASHDIST_DEFAULT;
4288 4300
4289 #ifdef CONFIG_NUMA 4301 #ifdef CONFIG_NUMA
4290 static int __init set_hashdist(char *str) 4302 static int __init set_hashdist(char *str)
4291 { 4303 {
4292 if (!str) 4304 if (!str)
4293 return 0; 4305 return 0;
4294 hashdist = simple_strtoul(str, &str, 0); 4306 hashdist = simple_strtoul(str, &str, 0);
4295 return 1; 4307 return 1;
4296 } 4308 }
4297 __setup("hashdist=", set_hashdist); 4309 __setup("hashdist=", set_hashdist);
4298 #endif 4310 #endif
4299 4311
4300 /* 4312 /*
4301 * allocate a large system hash table from bootmem 4313 * allocate a large system hash table from bootmem
4302 * - it is assumed that the hash table must contain an exact power-of-2 4314 * - it is assumed that the hash table must contain an exact power-of-2
4303 * quantity of entries 4315 * quantity of entries
4304 * - limit is the number of hash buckets, not the total allocation size 4316 * - limit is the number of hash buckets, not the total allocation size
4305 */ 4317 */
4306 void *__init alloc_large_system_hash(const char *tablename, 4318 void *__init alloc_large_system_hash(const char *tablename,
4307 unsigned long bucketsize, 4319 unsigned long bucketsize,
4308 unsigned long numentries, 4320 unsigned long numentries,
4309 int scale, 4321 int scale,
4310 int flags, 4322 int flags,
4311 unsigned int *_hash_shift, 4323 unsigned int *_hash_shift,
4312 unsigned int *_hash_mask, 4324 unsigned int *_hash_mask,
4313 unsigned long limit) 4325 unsigned long limit)
4314 { 4326 {
4315 unsigned long long max = limit; 4327 unsigned long long max = limit;
4316 unsigned long log2qty, size; 4328 unsigned long log2qty, size;
4317 void *table = NULL; 4329 void *table = NULL;
4318 4330
4319 /* allow the kernel cmdline to have a say */ 4331 /* allow the kernel cmdline to have a say */
4320 if (!numentries) { 4332 if (!numentries) {
4321 /* round applicable memory size up to nearest megabyte */ 4333 /* round applicable memory size up to nearest megabyte */
4322 numentries = nr_kernel_pages; 4334 numentries = nr_kernel_pages;
4323 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 4335 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
4324 numentries >>= 20 - PAGE_SHIFT; 4336 numentries >>= 20 - PAGE_SHIFT;
4325 numentries <<= 20 - PAGE_SHIFT; 4337 numentries <<= 20 - PAGE_SHIFT;
4326 4338
4327 /* limit to 1 bucket per 2^scale bytes of low memory */ 4339 /* limit to 1 bucket per 2^scale bytes of low memory */
4328 if (scale > PAGE_SHIFT) 4340 if (scale > PAGE_SHIFT)
4329 numentries >>= (scale - PAGE_SHIFT); 4341 numentries >>= (scale - PAGE_SHIFT);
4330 else 4342 else
4331 numentries <<= (PAGE_SHIFT - scale); 4343 numentries <<= (PAGE_SHIFT - scale);
4332 4344
4333 /* Make sure we've got at least a 0-order allocation.. */ 4345 /* Make sure we've got at least a 0-order allocation.. */
4334 if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 4346 if (unlikely((numentries * bucketsize) < PAGE_SIZE))
4335 numentries = PAGE_SIZE / bucketsize; 4347 numentries = PAGE_SIZE / bucketsize;
4336 } 4348 }
4337 numentries = roundup_pow_of_two(numentries); 4349 numentries = roundup_pow_of_two(numentries);
4338 4350
4339 /* limit allocation size to 1/16 total memory by default */ 4351 /* limit allocation size to 1/16 total memory by default */
4340 if (max == 0) { 4352 if (max == 0) {
4341 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 4353 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
4342 do_div(max, bucketsize); 4354 do_div(max, bucketsize);
4343 } 4355 }
4344 4356
4345 if (numentries > max) 4357 if (numentries > max)
4346 numentries = max; 4358 numentries = max;
4347 4359
4348 log2qty = ilog2(numentries); 4360 log2qty = ilog2(numentries);
4349 4361
4350 do { 4362 do {
4351 size = bucketsize << log2qty; 4363 size = bucketsize << log2qty;
4352 if (flags & HASH_EARLY) 4364 if (flags & HASH_EARLY)
4353 table = alloc_bootmem(size); 4365 table = alloc_bootmem(size);
4354 else if (hashdist) 4366 else if (hashdist)
4355 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4367 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4356 else { 4368 else {
4357 unsigned long order = get_order(size); 4369 unsigned long order = get_order(size);
4358 table = (void*) __get_free_pages(GFP_ATOMIC, order); 4370 table = (void*) __get_free_pages(GFP_ATOMIC, order);
4359 /* 4371 /*
4360 * If bucketsize is not a power-of-two, we may free 4372 * If bucketsize is not a power-of-two, we may free
4361 * some pages at the end of hash table. 4373 * some pages at the end of hash table.
4362 */ 4374 */
4363 if (table) { 4375 if (table) {
4364 unsigned long alloc_end = (unsigned long)table + 4376 unsigned long alloc_end = (unsigned long)table +
4365 (PAGE_SIZE << order); 4377 (PAGE_SIZE << order);
4366 unsigned long used = (unsigned long)table + 4378 unsigned long used = (unsigned long)table +
4367 PAGE_ALIGN(size); 4379 PAGE_ALIGN(size);
4368 split_page(virt_to_page(table), order); 4380 split_page(virt_to_page(table), order);
4369 while (used < alloc_end) { 4381 while (used < alloc_end) {
4370 free_page(used); 4382 free_page(used);
4371 used += PAGE_SIZE; 4383 used += PAGE_SIZE;
4372 } 4384 }
4373 } 4385 }
4374 } 4386 }
4375 } while (!table && size > PAGE_SIZE && --log2qty); 4387 } while (!table && size > PAGE_SIZE && --log2qty);
4376 4388
4377 if (!table) 4389 if (!table)
4378 panic("Failed to allocate %s hash table\n", tablename); 4390 panic("Failed to allocate %s hash table\n", tablename);
4379 4391
4380 printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", 4392 printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
4381 tablename, 4393 tablename,
4382 (1U << log2qty), 4394 (1U << log2qty),
4383 ilog2(size) - PAGE_SHIFT, 4395 ilog2(size) - PAGE_SHIFT,
4384 size); 4396 size);
4385 4397
4386 if (_hash_shift) 4398 if (_hash_shift)
4387 *_hash_shift = log2qty; 4399 *_hash_shift = log2qty;
4388 if (_hash_mask) 4400 if (_hash_mask)
4389 *_hash_mask = (1 << log2qty) - 1; 4401 *_hash_mask = (1 << log2qty) - 1;
4390 4402
4391 return table; 4403 return table;
4392 } 4404 }
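For context, a typical caller looks roughly like the inode-cache setup in fs/inode.c (a sketch only, not part of this diff; the wrapper function name is made up, the other identifiers are modelled on that caller):

static struct hlist_head *inode_hashtable;
static unsigned int i_hash_shift;
static unsigned int i_hash_mask;

static void __init example_inode_hash_init(unsigned long ihash_entries)
{
	inode_hashtable =
		alloc_large_system_hash("Inode-cache",
					sizeof(struct hlist_head),
					ihash_entries,	/* 0 means auto-size from memory */
					14,		/* scale: one bucket per 2^14 bytes of lowmem */
					HASH_EARLY,	/* allocate from bootmem */
					&i_hash_shift,
					&i_hash_mask,
					0);		/* limit: keep the default 1/16-of-memory cap */
}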
4393 4405
4394 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE 4406 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
4395 struct page *pfn_to_page(unsigned long pfn) 4407 struct page *pfn_to_page(unsigned long pfn)
4396 { 4408 {
4397 return __pfn_to_page(pfn); 4409 return __pfn_to_page(pfn);
4398 } 4410 }
4399 unsigned long page_to_pfn(struct page *page) 4411 unsigned long page_to_pfn(struct page *page)
4400 { 4412 {
4401 return __page_to_pfn(page); 4413 return __page_to_pfn(page);
4402 } 4414 }
4403 EXPORT_SYMBOL(pfn_to_page); 4415 EXPORT_SYMBOL(pfn_to_page);
4404 EXPORT_SYMBOL(page_to_pfn); 4416 EXPORT_SYMBOL(page_to_pfn);
4405 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 4417 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
4406 4418
4407 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 4419 /* Return a pointer to the bitmap storing bits affecting a block of pages */
4408 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 4420 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
4409 unsigned long pfn) 4421 unsigned long pfn)
4410 { 4422 {
4411 #ifdef CONFIG_SPARSEMEM 4423 #ifdef CONFIG_SPARSEMEM
4412 return __pfn_to_section(pfn)->pageblock_flags; 4424 return __pfn_to_section(pfn)->pageblock_flags;
4413 #else 4425 #else
4414 return zone->pageblock_flags; 4426 return zone->pageblock_flags;
4415 #endif /* CONFIG_SPARSEMEM */ 4427 #endif /* CONFIG_SPARSEMEM */
4416 } 4428 }
4417 4429
4418 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 4430 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
4419 { 4431 {
4420 #ifdef CONFIG_SPARSEMEM 4432 #ifdef CONFIG_SPARSEMEM
4421 pfn &= (PAGES_PER_SECTION-1); 4433 pfn &= (PAGES_PER_SECTION-1);
4422 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 4434 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
4423 #else 4435 #else
4424 pfn = pfn - zone->zone_start_pfn; 4436 pfn = pfn - zone->zone_start_pfn;
4425 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 4437 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
4426 #endif /* CONFIG_SPARSEMEM */ 4438 #endif /* CONFIG_SPARSEMEM */
4427 } 4439 }
4428 4440
4429 /** 4441 /**
4430 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages 4442 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
4431 * @page: The page within the block of interest 4443 * @page: The page within the block of interest
4432 * @start_bitidx: The first bit of interest to retrieve 4444 * @start_bitidx: The first bit of interest to retrieve
4433 * @end_bitidx: The last bit of interest 4445 * @end_bitidx: The last bit of interest
4434 * returns pageblock_bits flags 4446 * returns pageblock_bits flags
4435 */ 4447 */
4436 unsigned long get_pageblock_flags_group(struct page *page, 4448 unsigned long get_pageblock_flags_group(struct page *page,
4437 int start_bitidx, int end_bitidx) 4449 int start_bitidx, int end_bitidx)
4438 { 4450 {
4439 struct zone *zone; 4451 struct zone *zone;
4440 unsigned long *bitmap; 4452 unsigned long *bitmap;
4441 unsigned long pfn, bitidx; 4453 unsigned long pfn, bitidx;
4442 unsigned long flags = 0; 4454 unsigned long flags = 0;
4443 unsigned long value = 1; 4455 unsigned long value = 1;
4444 4456
4445 zone = page_zone(page); 4457 zone = page_zone(page);
4446 pfn = page_to_pfn(page); 4458 pfn = page_to_pfn(page);
4447 bitmap = get_pageblock_bitmap(zone, pfn); 4459 bitmap = get_pageblock_bitmap(zone, pfn);
4448 bitidx = pfn_to_bitidx(zone, pfn); 4460 bitidx = pfn_to_bitidx(zone, pfn);
4449 4461
4450 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 4462 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
4451 if (test_bit(bitidx + start_bitidx, bitmap)) 4463 if (test_bit(bitidx + start_bitidx, bitmap))
4452 flags |= value; 4464 flags |= value;
4453 4465
4454 return flags; 4466 return flags;
4455 } 4467 }
4456 4468
4457 /** 4469 /**
4458 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages 4470 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
4459 * @page: The page within the block of interest 4471 * @page: The page within the block of interest
4460 * @start_bitidx: The first bit of interest 4472 * @start_bitidx: The first bit of interest
4461 * @end_bitidx: The last bit of interest 4473 * @end_bitidx: The last bit of interest
4462 * @flags: The flags to set 4474 * @flags: The flags to set
4463 */ 4475 */
4464 void set_pageblock_flags_group(struct page *page, unsigned long flags, 4476 void set_pageblock_flags_group(struct page *page, unsigned long flags,
4465 int start_bitidx, int end_bitidx) 4477 int start_bitidx, int end_bitidx)
4466 { 4478 {
4467 struct zone *zone; 4479 struct zone *zone;
4468 unsigned long *bitmap; 4480 unsigned long *bitmap;
4469 unsigned long pfn, bitidx; 4481 unsigned long pfn, bitidx;
4470 unsigned long value = 1; 4482 unsigned long value = 1;
4471 4483
4472 zone = page_zone(page); 4484 zone = page_zone(page);
4473 pfn = page_to_pfn(page); 4485 pfn = page_to_pfn(page);
4474 bitmap = get_pageblock_bitmap(zone, pfn); 4486 bitmap = get_pageblock_bitmap(zone, pfn);
4475 bitidx = pfn_to_bitidx(zone, pfn); 4487 bitidx = pfn_to_bitidx(zone, pfn);
4476 VM_BUG_ON(pfn < zone->zone_start_pfn); 4488 VM_BUG_ON(pfn < zone->zone_start_pfn);
4477 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); 4489 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
4478 4490
4479 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 4491 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
4480 if (flags & value) 4492 if (flags & value)
4481 __set_bit(bitidx + start_bitidx, bitmap); 4493 __set_bit(bitidx + start_bitidx, bitmap);
4482 else 4494 else
4483 __clear_bit(bitidx + start_bitidx, bitmap); 4495 __clear_bit(bitidx + start_bitidx, bitmap);
4484 } 4496 }
4485 4497
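For illustration, a minimal user-space sketch of the LSB-first bit-group encoding that get_pageblock_flags_group()/set_pageblock_flags_group() implement above; a plain unsigned long stands in for the kernel's pageblock bitmap, and the helper names are made up for the example.

#include <stdio.h>

/* Read and write bits [start, end] of a word, building the group value
 * LSB-first, mirroring the loops above.  Plain shifts stand in for
 * test_bit()/__set_bit()/__clear_bit(). */
static unsigned long get_flags_group(unsigned long bitmap, int start, int end)
{
	unsigned long flags = 0, value = 1;

	for (; start <= end; start++, value <<= 1)
		if (bitmap & (1UL << start))
			flags |= value;
	return flags;
}

static unsigned long set_flags_group(unsigned long bitmap, unsigned long flags,
				     int start, int end)
{
	unsigned long value = 1;

	for (; start <= end; start++, value <<= 1) {
		if (flags & value)
			bitmap |= 1UL << start;
		else
			bitmap &= ~(1UL << start);
	}
	return bitmap;
}

int main(void)
{
	/* Store the 3-bit group 0b101 at bits 4..6, then read it back. */
	unsigned long word = set_flags_group(0, 0x5, 4, 6);

	printf("word=%#lx group=%#lx\n", word, get_flags_group(word, 4, 6));
	return 0;
}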
4486 /* 4498 /*
4487 * This is designed as a helper function; please see page_isolation.c as well. 4499 * This is designed as a helper function; please see page_isolation.c as well.
4488 * It sets/clears a page block's type to ISOLATE. 4500 * It sets/clears a page block's type to ISOLATE.
4489 * The page allocator never allocates memory from an ISOLATE block. 4501 * The page allocator never allocates memory from an ISOLATE block.
4490 */ 4502 */
4491 4503
4492 int set_migratetype_isolate(struct page *page) 4504 int set_migratetype_isolate(struct page *page)
4493 { 4505 {
4494 struct zone *zone; 4506 struct zone *zone;
4495 unsigned long flags; 4507 unsigned long flags;
4496 int ret = -EBUSY; 4508 int ret = -EBUSY;
4497 4509
4498 zone = page_zone(page); 4510 zone = page_zone(page);
4499 spin_lock_irqsave(&zone->lock, flags); 4511 spin_lock_irqsave(&zone->lock, flags);
4500 /* 4512 /*
4501 * In the future, more migrate types will be able to be isolation targets. 4513 * In the future, more migrate types will be able to be isolation targets.
4502 */ 4514 */
4503 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) 4515 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
4504 goto out; 4516 goto out;
4505 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 4517 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
4506 move_freepages_block(zone, page, MIGRATE_ISOLATE); 4518 move_freepages_block(zone, page, MIGRATE_ISOLATE);
4507 ret = 0; 4519 ret = 0;
4508 out: 4520 out:
4509 spin_unlock_irqrestore(&zone->lock, flags); 4521 spin_unlock_irqrestore(&zone->lock, flags);
4510 if (!ret) 4522 if (!ret)
4511 drain_all_pages(); 4523 drain_all_pages();
4512 return ret; 4524 return ret;
4513 } 4525 }
4514 4526
4515 void unset_migratetype_isolate(struct page *page) 4527 void unset_migratetype_isolate(struct page *page)
4516 { 4528 {
4517 struct zone *zone; 4529 struct zone *zone;
4518 unsigned long flags; 4530 unsigned long flags;
4519 zone = page_zone(page); 4531 zone = page_zone(page);
4520 spin_lock_irqsave(&zone->lock, flags); 4532 spin_lock_irqsave(&zone->lock, flags);
4521 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 4533 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
4522 goto out; 4534 goto out;
4523 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4535 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4524 move_freepages_block(zone, page, MIGRATE_MOVABLE); 4536 move_freepages_block(zone, page, MIGRATE_MOVABLE);
4525 out: 4537 out:
4526 spin_unlock_irqrestore(&zone->lock, flags); 4538 spin_unlock_irqrestore(&zone->lock, flags);
4527 } 4539 }
4528 4540
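The pair above is intended to bracket work on a single pageblock; a hedged sketch of a hypothetical caller follows (the real users live in page_isolation.c and the memory-hotremove path, and do_something_with_block() is a made-up placeholder, not a kernel API).

/* Hypothetical caller sketch: isolate one MIGRATE_MOVABLE pageblock,
 * attempt some operation on it, and restore the migratetype if that
 * operation fails. */
static int operate_on_pageblock(struct page *page)
{
	int ret;

	if (set_migratetype_isolate(page))
		return -EBUSY;		/* block was not MIGRATE_MOVABLE */

	ret = do_something_with_block(page);	/* placeholder */
	if (ret)
		unset_migratetype_isolate(page);
	return ret;
}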
4529 #ifdef CONFIG_MEMORY_HOTREMOVE 4541 #ifdef CONFIG_MEMORY_HOTREMOVE
4530 /* 4542 /*
4531 * All pages in the range must be isolated before calling this. 4543 * All pages in the range must be isolated before calling this.
4532 */ 4544 */
4533 void 4545 void
4534 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 4546 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
4535 { 4547 {
4536 struct page *page; 4548 struct page *page;
4537 struct zone *zone; 4549 struct zone *zone;
4538 int order, i; 4550 int order, i;
4539 unsigned long pfn; 4551 unsigned long pfn;
4540 unsigned long flags; 4552 unsigned long flags;
4541 /* find the first valid pfn */ 4553 /* find the first valid pfn */
4542 for (pfn = start_pfn; pfn < end_pfn; pfn++) 4554 for (pfn = start_pfn; pfn < end_pfn; pfn++)
4543 if (pfn_valid(pfn)) 4555 if (pfn_valid(pfn))
4544 break; 4556 break;
4545 if (pfn == end_pfn) 4557 if (pfn == end_pfn)
4546 return; 4558 return;
4547 zone = page_zone(pfn_to_page(pfn)); 4559 zone = page_zone(pfn_to_page(pfn));
4548 spin_lock_irqsave(&zone->lock, flags); 4560 spin_lock_irqsave(&zone->lock, flags);
4549 pfn = start_pfn; 4561 pfn = start_pfn;
4550 while (pfn < end_pfn) { 4562 while (pfn < end_pfn) {
4551 if (!pfn_valid(pfn)) { 4563 if (!pfn_valid(pfn)) {
4552 pfn++; 4564 pfn++;
4553 continue; 4565 continue;
4554 } 4566 }
4555 page = pfn_to_page(pfn); 4567 page = pfn_to_page(pfn);
4556 BUG_ON(page_count(page)); 4568 BUG_ON(page_count(page));
4557 BUG_ON(!PageBuddy(page)); 4569 BUG_ON(!PageBuddy(page));
4558 order = page_order(page); 4570 order = page_order(page);
4559 #ifdef CONFIG_DEBUG_VM 4571 #ifdef CONFIG_DEBUG_VM
4560 printk(KERN_INFO "remove from free list %lx %d %lx\n", 4572 printk(KERN_INFO "remove from free list %lx %d %lx\n",
4561 pfn, 1 << order, end_pfn); 4573 pfn, 1 << order, end_pfn);
4562 #endif 4574 #endif
4563 list_del(&page->lru); 4575 list_del(&page->lru);
4564 rmv_page_order(page); 4576 rmv_page_order(page);
4565 zone->free_area[order].nr_free--; 4577 zone->free_area[order].nr_free--;
4566 __mod_zone_page_state(zone, NR_FREE_PAGES, 4578 __mod_zone_page_state(zone, NR_FREE_PAGES,
4567 - (1UL << order)); 4579 - (1UL << order));
4568 for (i = 0; i < (1 << order); i++) 4580 for (i = 0; i < (1 << order); i++)
4569 SetPageReserved((page+i)); 4581 SetPageReserved((page+i));
4570 pfn += (1 << order); 4582 pfn += (1 << order);
4571 } 4583 }
4572 spin_unlock_irqrestore(&zone->lock, flags); 4584 spin_unlock_irqrestore(&zone->lock, flags);
4573 } 4585 }
4574 #endif 4586 #endif
1 /* 1 /*
2 * linux/mm/vmscan.c 2 * linux/mm/vmscan.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * 5 *
6 * Swap reorganised 29.12.95, Stephen Tweedie. 6 * Swap reorganised 29.12.95, Stephen Tweedie.
7 * kswapd added: 7.1.96 sct 7 * kswapd added: 7.1.96 sct
8 * Removed kswapd_ctl limits, and swap out as many pages as needed 8 * Removed kswapd_ctl limits, and swap out as many pages as needed
9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel. 9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
10 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). 10 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
11 * Multiqueue VM started 5.8.00, Rik van Riel. 11 * Multiqueue VM started 5.8.00, Rik van Riel.
12 */ 12 */
13 13
14 #include <linux/mm.h> 14 #include <linux/mm.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/slab.h> 16 #include <linux/slab.h>
17 #include <linux/kernel_stat.h> 17 #include <linux/kernel_stat.h>
18 #include <linux/swap.h> 18 #include <linux/swap.h>
19 #include <linux/pagemap.h> 19 #include <linux/pagemap.h>
20 #include <linux/init.h> 20 #include <linux/init.h>
21 #include <linux/highmem.h> 21 #include <linux/highmem.h>
22 #include <linux/vmstat.h> 22 #include <linux/vmstat.h>
23 #include <linux/file.h> 23 #include <linux/file.h>
24 #include <linux/writeback.h> 24 #include <linux/writeback.h>
25 #include <linux/blkdev.h> 25 #include <linux/blkdev.h>
26 #include <linux/buffer_head.h> /* for try_to_release_page(), 26 #include <linux/buffer_head.h> /* for try_to_release_page(),
27 buffer_heads_over_limit */ 27 buffer_heads_over_limit */
28 #include <linux/mm_inline.h> 28 #include <linux/mm_inline.h>
29 #include <linux/pagevec.h> 29 #include <linux/pagevec.h>
30 #include <linux/backing-dev.h> 30 #include <linux/backing-dev.h>
31 #include <linux/rmap.h> 31 #include <linux/rmap.h>
32 #include <linux/topology.h> 32 #include <linux/topology.h>
33 #include <linux/cpu.h> 33 #include <linux/cpu.h>
34 #include <linux/cpuset.h> 34 #include <linux/cpuset.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/rwsem.h> 36 #include <linux/rwsem.h>
37 #include <linux/delay.h> 37 #include <linux/delay.h>
38 #include <linux/kthread.h> 38 #include <linux/kthread.h>
39 #include <linux/freezer.h> 39 #include <linux/freezer.h>
40 #include <linux/memcontrol.h> 40 #include <linux/memcontrol.h>
41 41
42 #include <asm/tlbflush.h> 42 #include <asm/tlbflush.h>
43 #include <asm/div64.h> 43 #include <asm/div64.h>
44 44
45 #include <linux/swapops.h> 45 #include <linux/swapops.h>
46 46
47 #include "internal.h" 47 #include "internal.h"
48 48
49 struct scan_control { 49 struct scan_control {
50 /* Incremented by the number of inactive pages that were scanned */ 50 /* Incremented by the number of inactive pages that were scanned */
51 unsigned long nr_scanned; 51 unsigned long nr_scanned;
52 52
53 /* This context's GFP mask */ 53 /* This context's GFP mask */
54 gfp_t gfp_mask; 54 gfp_t gfp_mask;
55 55
56 int may_writepage; 56 int may_writepage;
57 57
58 /* Can pages be swapped as part of reclaim? */ 58 /* Can pages be swapped as part of reclaim? */
59 int may_swap; 59 int may_swap;
60 60
61 /* This context's SWAP_CLUSTER_MAX. If freeing memory for 61 /* This context's SWAP_CLUSTER_MAX. If freeing memory for
62 * suspend, we effectively ignore SWAP_CLUSTER_MAX. 62 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
63 * In this context, it doesn't matter that we scan the 63 * In this context, it doesn't matter that we scan the
64 * whole list at once. */ 64 * whole list at once. */
65 int swap_cluster_max; 65 int swap_cluster_max;
66 66
67 int swappiness; 67 int swappiness;
68 68
69 int all_unreclaimable; 69 int all_unreclaimable;
70 70
71 int order; 71 int order;
72 72
73 /* Which cgroup do we reclaim from */ 73 /* Which cgroup do we reclaim from */
74 struct mem_cgroup *mem_cgroup; 74 struct mem_cgroup *mem_cgroup;
75 75
76 /* Pluggable isolate pages callback */ 76 /* Pluggable isolate pages callback */
77 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, 77 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
78 unsigned long *scanned, int order, int mode, 78 unsigned long *scanned, int order, int mode,
79 struct zone *z, struct mem_cgroup *mem_cont, 79 struct zone *z, struct mem_cgroup *mem_cont,
80 int active); 80 int active);
81 }; 81 };
82 82
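A hedged sketch of how a direct-reclaim entry point might populate this structure; the values are illustrative, not a copy of the real initialization elsewhere in this file (isolate_pages_global is the global-LRU callback defined further down).

/* Illustrative scan_control setup for a hypothetical direct reclaim of a
 * high-order allocation; the exact values used by the real entry points
 * may differ. */
struct scan_control sc = {
	.gfp_mask         = GFP_KERNEL,
	.may_writepage    = 1,
	.may_swap         = 1,
	.swap_cluster_max = SWAP_CLUSTER_MAX,
	.swappiness       = vm_swappiness,
	.order            = 4,			/* > PAGE_ALLOC_COSTLY_ORDER */
	.mem_cgroup       = NULL,		/* global LRU scan */
	.isolate_pages    = isolate_pages_global,
};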
83 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 83 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
84 84
85 #ifdef ARCH_HAS_PREFETCH 85 #ifdef ARCH_HAS_PREFETCH
86 #define prefetch_prev_lru_page(_page, _base, _field) \ 86 #define prefetch_prev_lru_page(_page, _base, _field) \
87 do { \ 87 do { \
88 if ((_page)->lru.prev != _base) { \ 88 if ((_page)->lru.prev != _base) { \
89 struct page *prev; \ 89 struct page *prev; \
90 \ 90 \
91 prev = lru_to_page(&(_page->lru)); \ 91 prev = lru_to_page(&(_page->lru)); \
92 prefetch(&prev->_field); \ 92 prefetch(&prev->_field); \
93 } \ 93 } \
94 } while (0) 94 } while (0)
95 #else 95 #else
96 #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) 96 #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
97 #endif 97 #endif
98 98
99 #ifdef ARCH_HAS_PREFETCHW 99 #ifdef ARCH_HAS_PREFETCHW
100 #define prefetchw_prev_lru_page(_page, _base, _field) \ 100 #define prefetchw_prev_lru_page(_page, _base, _field) \
101 do { \ 101 do { \
102 if ((_page)->lru.prev != _base) { \ 102 if ((_page)->lru.prev != _base) { \
103 struct page *prev; \ 103 struct page *prev; \
104 \ 104 \
105 prev = lru_to_page(&(_page->lru)); \ 105 prev = lru_to_page(&(_page->lru)); \
106 prefetchw(&prev->_field); \ 106 prefetchw(&prev->_field); \
107 } \ 107 } \
108 } while (0) 108 } while (0)
109 #else 109 #else
110 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) 110 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
111 #endif 111 #endif
112 112
113 /* 113 /*
114 * From 0 .. 100. Higher means more swappy. 114 * From 0 .. 100. Higher means more swappy.
115 */ 115 */
116 int vm_swappiness = 60; 116 int vm_swappiness = 60;
117 long vm_total_pages; /* The total number of pages which the VM controls */ 117 long vm_total_pages; /* The total number of pages which the VM controls */
118 118
119 static LIST_HEAD(shrinker_list); 119 static LIST_HEAD(shrinker_list);
120 static DECLARE_RWSEM(shrinker_rwsem); 120 static DECLARE_RWSEM(shrinker_rwsem);
121 121
122 #ifdef CONFIG_CGROUP_MEM_RES_CTLR 122 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
123 #define scan_global_lru(sc) (!(sc)->mem_cgroup) 123 #define scan_global_lru(sc) (!(sc)->mem_cgroup)
124 #else 124 #else
125 #define scan_global_lru(sc) (1) 125 #define scan_global_lru(sc) (1)
126 #endif 126 #endif
127 127
128 /* 128 /*
129 * Add a shrinker callback to be called from the vm 129 * Add a shrinker callback to be called from the vm
130 */ 130 */
131 void register_shrinker(struct shrinker *shrinker) 131 void register_shrinker(struct shrinker *shrinker)
132 { 132 {
133 shrinker->nr = 0; 133 shrinker->nr = 0;
134 down_write(&shrinker_rwsem); 134 down_write(&shrinker_rwsem);
135 list_add_tail(&shrinker->list, &shrinker_list); 135 list_add_tail(&shrinker->list, &shrinker_list);
136 up_write(&shrinker_rwsem); 136 up_write(&shrinker_rwsem);
137 } 137 }
138 EXPORT_SYMBOL(register_shrinker); 138 EXPORT_SYMBOL(register_shrinker);
139 139
140 /* 140 /*
141 * Remove one 141 * Remove one
142 */ 142 */
143 void unregister_shrinker(struct shrinker *shrinker) 143 void unregister_shrinker(struct shrinker *shrinker)
144 { 144 {
145 down_write(&shrinker_rwsem); 145 down_write(&shrinker_rwsem);
146 list_del(&shrinker->list); 146 list_del(&shrinker->list);
147 up_write(&shrinker_rwsem); 147 up_write(&shrinker_rwsem);
148 } 148 }
149 EXPORT_SYMBOL(unregister_shrinker); 149 EXPORT_SYMBOL(unregister_shrinker);
150 150
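A hedged sketch of a shrinker that fits the (*shrink)(nr_to_scan, gfp_mask) convention shrink_slab() relies on below; my_cache_prune() and my_cache_count() are hypothetical helpers, and DEFAULT_SEEKS is assumed to be the stock seeks value from linux/mm.h.

/* With nr_to_scan == 0 the callback only reports how many objects could
 * be freed; otherwise it prunes up to nr_to_scan objects and reports what
 * remains, or -1 if it cannot make progress right now. */
static int my_cache_shrink(int nr_to_scan, gfp_t gfp_mask)
{
	if (nr_to_scan) {
		if (!(gfp_mask & __GFP_FS))
			return -1;
		my_cache_prune(nr_to_scan);	/* hypothetical */
	}
	return my_cache_count();		/* hypothetical */
}

static struct shrinker my_cache_shrinker = {
	.shrink = my_cache_shrink,
	.seeks	= DEFAULT_SEEKS,	/* assumed stock value */
};

/* Registered once at init time: register_shrinker(&my_cache_shrinker); */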
151 #define SHRINK_BATCH 128 151 #define SHRINK_BATCH 128
152 /* 152 /*
153 * Call the shrink functions to age shrinkable caches 153 * Call the shrink functions to age shrinkable caches
154 * 154 *
155 * Here we assume it costs one seek to replace a lru page and that it also 155 * Here we assume it costs one seek to replace a lru page and that it also
156 * takes a seek to recreate a cache object. With this in mind we age equal 156 * takes a seek to recreate a cache object. With this in mind we age equal
157 * percentages of the lru and ageable caches. This should balance the seeks 157 * percentages of the lru and ageable caches. This should balance the seeks
158 * generated by these structures. 158 * generated by these structures.
159 * 159 *
160 * If the vm encountered mapped pages on the LRU it increases the pressure on 160 * If the vm encountered mapped pages on the LRU it increases the pressure on
161 * slab to avoid swapping. 161 * slab to avoid swapping.
162 * 162 *
163 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. 163 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
164 * 164 *
165 * `lru_pages' represents the number of on-LRU pages in all the zones which 165 * `lru_pages' represents the number of on-LRU pages in all the zones which
166 * are eligible for the caller's allocation attempt. It is used for balancing 166 * are eligible for the caller's allocation attempt. It is used for balancing
167 * slab reclaim versus page reclaim. 167 * slab reclaim versus page reclaim.
168 * 168 *
169 * Returns the number of slab objects which we shrunk. 169 * Returns the number of slab objects which we shrunk.
170 */ 170 */
171 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, 171 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
172 unsigned long lru_pages) 172 unsigned long lru_pages)
173 { 173 {
174 struct shrinker *shrinker; 174 struct shrinker *shrinker;
175 unsigned long ret = 0; 175 unsigned long ret = 0;
176 176
177 if (scanned == 0) 177 if (scanned == 0)
178 scanned = SWAP_CLUSTER_MAX; 178 scanned = SWAP_CLUSTER_MAX;
179 179
180 if (!down_read_trylock(&shrinker_rwsem)) 180 if (!down_read_trylock(&shrinker_rwsem))
181 return 1; /* Assume we'll be able to shrink next time */ 181 return 1; /* Assume we'll be able to shrink next time */
182 182
183 list_for_each_entry(shrinker, &shrinker_list, list) { 183 list_for_each_entry(shrinker, &shrinker_list, list) {
184 unsigned long long delta; 184 unsigned long long delta;
185 unsigned long total_scan; 185 unsigned long total_scan;
186 unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask); 186 unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);
187 187
188 delta = (4 * scanned) / shrinker->seeks; 188 delta = (4 * scanned) / shrinker->seeks;
189 delta *= max_pass; 189 delta *= max_pass;
190 do_div(delta, lru_pages + 1); 190 do_div(delta, lru_pages + 1);
191 shrinker->nr += delta; 191 shrinker->nr += delta;
192 if (shrinker->nr < 0) { 192 if (shrinker->nr < 0) {
193 printk(KERN_ERR "%s: nr=%ld\n", 193 printk(KERN_ERR "%s: nr=%ld\n",
194 __FUNCTION__, shrinker->nr); 194 __FUNCTION__, shrinker->nr);
195 shrinker->nr = max_pass; 195 shrinker->nr = max_pass;
196 } 196 }
197 197
198 /* 198 /*
199 * Avoid the risk of looping forever due to a too-large nr value: 199 * Avoid the risk of looping forever due to a too-large nr value:
200 * never try to free more than twice the estimated number of 200 * never try to free more than twice the estimated number of
201 * freeable entries. 201 * freeable entries.
202 */ 202 */
203 if (shrinker->nr > max_pass * 2) 203 if (shrinker->nr > max_pass * 2)
204 shrinker->nr = max_pass * 2; 204 shrinker->nr = max_pass * 2;
205 205
206 total_scan = shrinker->nr; 206 total_scan = shrinker->nr;
207 shrinker->nr = 0; 207 shrinker->nr = 0;
208 208
209 while (total_scan >= SHRINK_BATCH) { 209 while (total_scan >= SHRINK_BATCH) {
210 long this_scan = SHRINK_BATCH; 210 long this_scan = SHRINK_BATCH;
211 int shrink_ret; 211 int shrink_ret;
212 int nr_before; 212 int nr_before;
213 213
214 nr_before = (*shrinker->shrink)(0, gfp_mask); 214 nr_before = (*shrinker->shrink)(0, gfp_mask);
215 shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask); 215 shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
216 if (shrink_ret == -1) 216 if (shrink_ret == -1)
217 break; 217 break;
218 if (shrink_ret < nr_before) 218 if (shrink_ret < nr_before)
219 ret += nr_before - shrink_ret; 219 ret += nr_before - shrink_ret;
220 count_vm_events(SLABS_SCANNED, this_scan); 220 count_vm_events(SLABS_SCANNED, this_scan);
221 total_scan -= this_scan; 221 total_scan -= this_scan;
222 222
223 cond_resched(); 223 cond_resched();
224 } 224 }
225 225
226 shrinker->nr += total_scan; 226 shrinker->nr += total_scan;
227 } 227 }
228 up_read(&shrinker_rwsem); 228 up_read(&shrinker_rwsem);
229 return ret; 229 return ret;
230 } 230 }
231 231
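A worked example of the pressure calculation above, with illustrative numbers rather than measured ones.

#include <stdio.h>

int main(void)
{
	unsigned long scanned   = 128;		/* LRU pages scanned this round */
	unsigned long seeks     = 2;		/* shrinker->seeks */
	unsigned long max_pass  = 10000;	/* freeable objects reported */
	unsigned long lru_pages = 100000;

	unsigned long long delta = (4 * scanned) / seeks;	/* 256 */
	delta *= max_pass;					/* 2,560,000 */
	delta /= lru_pages + 1;					/* ~25 objects */

	/* ~25 is below SHRINK_BATCH (128), so it simply accumulates in
	 * shrinker->nr until enough pressure builds to scan a batch. */
	printf("delta = %llu\n", delta);
	return 0;
}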
232 /* Called without lock on whether page is mapped, so answer is unstable */ 232 /* Called without lock on whether page is mapped, so answer is unstable */
233 static inline int page_mapping_inuse(struct page *page) 233 static inline int page_mapping_inuse(struct page *page)
234 { 234 {
235 struct address_space *mapping; 235 struct address_space *mapping;
236 236
237 /* Page is in somebody's page tables. */ 237 /* Page is in somebody's page tables. */
238 if (page_mapped(page)) 238 if (page_mapped(page))
239 return 1; 239 return 1;
240 240
241 /* Be more reluctant to reclaim swapcache than pagecache */ 241 /* Be more reluctant to reclaim swapcache than pagecache */
242 if (PageSwapCache(page)) 242 if (PageSwapCache(page))
243 return 1; 243 return 1;
244 244
245 mapping = page_mapping(page); 245 mapping = page_mapping(page);
246 if (!mapping) 246 if (!mapping)
247 return 0; 247 return 0;
248 248
249 /* File is mmap'd by somebody? */ 249 /* File is mmap'd by somebody? */
250 return mapping_mapped(mapping); 250 return mapping_mapped(mapping);
251 } 251 }
252 252
253 static inline int is_page_cache_freeable(struct page *page) 253 static inline int is_page_cache_freeable(struct page *page)
254 { 254 {
255 return page_count(page) - !!PagePrivate(page) == 2; 255 return page_count(page) - !!PagePrivate(page) == 2;
256 } 256 }
257 257
258 static int may_write_to_queue(struct backing_dev_info *bdi) 258 static int may_write_to_queue(struct backing_dev_info *bdi)
259 { 259 {
260 if (current->flags & PF_SWAPWRITE) 260 if (current->flags & PF_SWAPWRITE)
261 return 1; 261 return 1;
262 if (!bdi_write_congested(bdi)) 262 if (!bdi_write_congested(bdi))
263 return 1; 263 return 1;
264 if (bdi == current->backing_dev_info) 264 if (bdi == current->backing_dev_info)
265 return 1; 265 return 1;
266 return 0; 266 return 0;
267 } 267 }
268 268
269 /* 269 /*
270 * We detected a synchronous write error writing a page out. Probably 270 * We detected a synchronous write error writing a page out. Probably
271 * -ENOSPC. We need to propagate that into the address_space for a subsequent 271 * -ENOSPC. We need to propagate that into the address_space for a subsequent
272 * fsync(), msync() or close(). 272 * fsync(), msync() or close().
273 * 273 *
274 * The tricky part is that after writepage we cannot touch the mapping: nothing 274 * The tricky part is that after writepage we cannot touch the mapping: nothing
275 * prevents it from being freed up. But we have a ref on the page and once 275 * prevents it from being freed up. But we have a ref on the page and once
276 * that page is locked, the mapping is pinned. 276 * that page is locked, the mapping is pinned.
277 * 277 *
278 * We're allowed to run sleeping lock_page() here because we know the caller has 278 * We're allowed to run sleeping lock_page() here because we know the caller has
279 * __GFP_FS. 279 * __GFP_FS.
280 */ 280 */
281 static void handle_write_error(struct address_space *mapping, 281 static void handle_write_error(struct address_space *mapping,
282 struct page *page, int error) 282 struct page *page, int error)
283 { 283 {
284 lock_page(page); 284 lock_page(page);
285 if (page_mapping(page) == mapping) 285 if (page_mapping(page) == mapping)
286 mapping_set_error(mapping, error); 286 mapping_set_error(mapping, error);
287 unlock_page(page); 287 unlock_page(page);
288 } 288 }
289 289
290 /* Request for sync pageout. */ 290 /* Request for sync pageout. */
291 enum pageout_io { 291 enum pageout_io {
292 PAGEOUT_IO_ASYNC, 292 PAGEOUT_IO_ASYNC,
293 PAGEOUT_IO_SYNC, 293 PAGEOUT_IO_SYNC,
294 }; 294 };
295 295
296 /* possible outcome of pageout() */ 296 /* possible outcome of pageout() */
297 typedef enum { 297 typedef enum {
298 /* failed to write page out, page is locked */ 298 /* failed to write page out, page is locked */
299 PAGE_KEEP, 299 PAGE_KEEP,
300 /* move page to the active list, page is locked */ 300 /* move page to the active list, page is locked */
301 PAGE_ACTIVATE, 301 PAGE_ACTIVATE,
302 /* page has been sent to the disk successfully, page is unlocked */ 302 /* page has been sent to the disk successfully, page is unlocked */
303 PAGE_SUCCESS, 303 PAGE_SUCCESS,
304 /* page is clean and locked */ 304 /* page is clean and locked */
305 PAGE_CLEAN, 305 PAGE_CLEAN,
306 } pageout_t; 306 } pageout_t;
307 307
308 /* 308 /*
309 * pageout is called by shrink_page_list() for each dirty page. 309 * pageout is called by shrink_page_list() for each dirty page.
310 * Calls ->writepage(). 310 * Calls ->writepage().
311 */ 311 */
312 static pageout_t pageout(struct page *page, struct address_space *mapping, 312 static pageout_t pageout(struct page *page, struct address_space *mapping,
313 enum pageout_io sync_writeback) 313 enum pageout_io sync_writeback)
314 { 314 {
315 /* 315 /*
316 * If the page is dirty, only perform writeback if that write 316 * If the page is dirty, only perform writeback if that write
317 * will be non-blocking. To prevent this allocation from being 317 * will be non-blocking. To prevent this allocation from being
318 * stalled by pagecache activity. But note that there may be 318 * stalled by pagecache activity. But note that there may be
319 * stalls if we need to run get_block(). We could test 319 * stalls if we need to run get_block(). We could test
320 * PagePrivate for that. 320 * PagePrivate for that.
321 * 321 *
322 * If this process is currently in generic_file_write() against 322 * If this process is currently in generic_file_write() against
323 * this page's queue, we can perform writeback even if that 323 * this page's queue, we can perform writeback even if that
324 * will block. 324 * will block.
325 * 325 *
326 * If the page is swapcache, write it back even if that would 326 * If the page is swapcache, write it back even if that would
327 * block, for some throttling. This happens by accident, because 327 * block, for some throttling. This happens by accident, because
328 * swap_backing_dev_info is bust: it doesn't reflect the 328 * swap_backing_dev_info is bust: it doesn't reflect the
329 * congestion state of the swapdevs. Easy to fix, if needed. 329 * congestion state of the swapdevs. Easy to fix, if needed.
330 * See swapfile.c:page_queue_congested(). 330 * See swapfile.c:page_queue_congested().
331 */ 331 */
332 if (!is_page_cache_freeable(page)) 332 if (!is_page_cache_freeable(page))
333 return PAGE_KEEP; 333 return PAGE_KEEP;
334 if (!mapping) { 334 if (!mapping) {
335 /* 335 /*
336 * Some data journaling orphaned pages can have 336 * Some data journaling orphaned pages can have
337 * page->mapping == NULL while being dirty with clean buffers. 337 * page->mapping == NULL while being dirty with clean buffers.
338 */ 338 */
339 if (PagePrivate(page)) { 339 if (PagePrivate(page)) {
340 if (try_to_free_buffers(page)) { 340 if (try_to_free_buffers(page)) {
341 ClearPageDirty(page); 341 ClearPageDirty(page);
342 printk("%s: orphaned page\n", __FUNCTION__); 342 printk("%s: orphaned page\n", __FUNCTION__);
343 return PAGE_CLEAN; 343 return PAGE_CLEAN;
344 } 344 }
345 } 345 }
346 return PAGE_KEEP; 346 return PAGE_KEEP;
347 } 347 }
348 if (mapping->a_ops->writepage == NULL) 348 if (mapping->a_ops->writepage == NULL)
349 return PAGE_ACTIVATE; 349 return PAGE_ACTIVATE;
350 if (!may_write_to_queue(mapping->backing_dev_info)) 350 if (!may_write_to_queue(mapping->backing_dev_info))
351 return PAGE_KEEP; 351 return PAGE_KEEP;
352 352
353 if (clear_page_dirty_for_io(page)) { 353 if (clear_page_dirty_for_io(page)) {
354 int res; 354 int res;
355 struct writeback_control wbc = { 355 struct writeback_control wbc = {
356 .sync_mode = WB_SYNC_NONE, 356 .sync_mode = WB_SYNC_NONE,
357 .nr_to_write = SWAP_CLUSTER_MAX, 357 .nr_to_write = SWAP_CLUSTER_MAX,
358 .range_start = 0, 358 .range_start = 0,
359 .range_end = LLONG_MAX, 359 .range_end = LLONG_MAX,
360 .nonblocking = 1, 360 .nonblocking = 1,
361 .for_reclaim = 1, 361 .for_reclaim = 1,
362 }; 362 };
363 363
364 SetPageReclaim(page); 364 SetPageReclaim(page);
365 res = mapping->a_ops->writepage(page, &wbc); 365 res = mapping->a_ops->writepage(page, &wbc);
366 if (res < 0) 366 if (res < 0)
367 handle_write_error(mapping, page, res); 367 handle_write_error(mapping, page, res);
368 if (res == AOP_WRITEPAGE_ACTIVATE) { 368 if (res == AOP_WRITEPAGE_ACTIVATE) {
369 ClearPageReclaim(page); 369 ClearPageReclaim(page);
370 return PAGE_ACTIVATE; 370 return PAGE_ACTIVATE;
371 } 371 }
372 372
373 /* 373 /*
374 * Wait on writeback if requested to. This happens when 374 * Wait on writeback if requested to. This happens when
375 * direct reclaiming a large contiguous area and the 375 * direct reclaiming a large contiguous area and the
376 * first attempt to free a range of pages fails. 376 * first attempt to free a range of pages fails.
377 */ 377 */
378 if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) 378 if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
379 wait_on_page_writeback(page); 379 wait_on_page_writeback(page);
380 380
381 if (!PageWriteback(page)) { 381 if (!PageWriteback(page)) {
382 /* synchronous write or broken a_ops? */ 382 /* synchronous write or broken a_ops? */
383 ClearPageReclaim(page); 383 ClearPageReclaim(page);
384 } 384 }
385 inc_zone_page_state(page, NR_VMSCAN_WRITE); 385 inc_zone_page_state(page, NR_VMSCAN_WRITE);
386 return PAGE_SUCCESS; 386 return PAGE_SUCCESS;
387 } 387 }
388 388
389 return PAGE_CLEAN; 389 return PAGE_CLEAN;
390 } 390 }
391 391
392 /* 392 /*
393 * Attempt to detach a locked page from its ->mapping. If it is dirty or if 393 * Attempt to detach a locked page from its ->mapping. If it is dirty or if
394 * someone else has a ref on the page, abort and return 0. If it was 394 * someone else has a ref on the page, abort and return 0. If it was
395 * successfully detached, return 1. Assumes the caller has a single ref on 395 * successfully detached, return 1. Assumes the caller has a single ref on
396 * this page. 396 * this page.
397 */ 397 */
398 int remove_mapping(struct address_space *mapping, struct page *page) 398 int remove_mapping(struct address_space *mapping, struct page *page)
399 { 399 {
400 BUG_ON(!PageLocked(page)); 400 BUG_ON(!PageLocked(page));
401 BUG_ON(mapping != page_mapping(page)); 401 BUG_ON(mapping != page_mapping(page));
402 402
403 write_lock_irq(&mapping->tree_lock); 403 write_lock_irq(&mapping->tree_lock);
404 /* 404 /*
405 * The non-racy check for a busy page. 405 * The non-racy check for a busy page.
406 * 406 *
407 * Must be careful with the order of the tests. When someone has 407 * Must be careful with the order of the tests. When someone has
408 * a ref to the page, it may be possible that they dirty it then 408 * a ref to the page, it may be possible that they dirty it then
409 * drop the reference. So if PageDirty is tested before page_count 409 * drop the reference. So if PageDirty is tested before page_count
410 * here, then the following race may occur: 410 * here, then the following race may occur:
411 * 411 *
412 * get_user_pages(&page); 412 * get_user_pages(&page);
413 * [user mapping goes away] 413 * [user mapping goes away]
414 * write_to(page); 414 * write_to(page);
415 * !PageDirty(page) [good] 415 * !PageDirty(page) [good]
416 * SetPageDirty(page); 416 * SetPageDirty(page);
417 * put_page(page); 417 * put_page(page);
418 * !page_count(page) [good, discard it] 418 * !page_count(page) [good, discard it]
419 * 419 *
420 * [oops, our write_to data is lost] 420 * [oops, our write_to data is lost]
421 * 421 *
422 * Reversing the order of the tests ensures such a situation cannot 422 * Reversing the order of the tests ensures such a situation cannot
423 * escape unnoticed. The smp_rmb is needed to ensure the page->flags 423 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
424 * load is not satisfied before that of page->_count. 424 * load is not satisfied before that of page->_count.
425 * 425 *
426 * Note that if SetPageDirty is always performed via set_page_dirty, 426 * Note that if SetPageDirty is always performed via set_page_dirty,
427 * and thus under tree_lock, then this ordering is not required. 427 * and thus under tree_lock, then this ordering is not required.
428 */ 428 */
429 if (unlikely(page_count(page) != 2)) 429 if (unlikely(page_count(page) != 2))
430 goto cannot_free; 430 goto cannot_free;
431 smp_rmb(); 431 smp_rmb();
432 if (unlikely(PageDirty(page))) 432 if (unlikely(PageDirty(page)))
433 goto cannot_free; 433 goto cannot_free;
434 434
435 if (PageSwapCache(page)) { 435 if (PageSwapCache(page)) {
436 swp_entry_t swap = { .val = page_private(page) }; 436 swp_entry_t swap = { .val = page_private(page) };
437 __delete_from_swap_cache(page); 437 __delete_from_swap_cache(page);
438 write_unlock_irq(&mapping->tree_lock); 438 write_unlock_irq(&mapping->tree_lock);
439 swap_free(swap); 439 swap_free(swap);
440 __put_page(page); /* The pagecache ref */ 440 __put_page(page); /* The pagecache ref */
441 return 1; 441 return 1;
442 } 442 }
443 443
444 __remove_from_page_cache(page); 444 __remove_from_page_cache(page);
445 write_unlock_irq(&mapping->tree_lock); 445 write_unlock_irq(&mapping->tree_lock);
446 __put_page(page); 446 __put_page(page);
447 return 1; 447 return 1;
448 448
449 cannot_free: 449 cannot_free:
450 write_unlock_irq(&mapping->tree_lock); 450 write_unlock_irq(&mapping->tree_lock);
451 return 0; 451 return 0;
452 } 452 }
453 453
454 /* 454 /*
455 * shrink_page_list() returns the number of reclaimed pages 455 * shrink_page_list() returns the number of reclaimed pages
456 */ 456 */
457 static unsigned long shrink_page_list(struct list_head *page_list, 457 static unsigned long shrink_page_list(struct list_head *page_list,
458 struct scan_control *sc, 458 struct scan_control *sc,
459 enum pageout_io sync_writeback) 459 enum pageout_io sync_writeback)
460 { 460 {
461 LIST_HEAD(ret_pages); 461 LIST_HEAD(ret_pages);
462 struct pagevec freed_pvec; 462 struct pagevec freed_pvec;
463 int pgactivate = 0; 463 int pgactivate = 0;
464 unsigned long nr_reclaimed = 0; 464 unsigned long nr_reclaimed = 0;
465 465
466 cond_resched(); 466 cond_resched();
467 467
468 pagevec_init(&freed_pvec, 1); 468 pagevec_init(&freed_pvec, 1);
469 while (!list_empty(page_list)) { 469 while (!list_empty(page_list)) {
470 struct address_space *mapping; 470 struct address_space *mapping;
471 struct page *page; 471 struct page *page;
472 int may_enter_fs; 472 int may_enter_fs;
473 int referenced; 473 int referenced;
474 474
475 cond_resched(); 475 cond_resched();
476 476
477 page = lru_to_page(page_list); 477 page = lru_to_page(page_list);
478 list_del(&page->lru); 478 list_del(&page->lru);
479 479
480 if (TestSetPageLocked(page)) 480 if (TestSetPageLocked(page))
481 goto keep; 481 goto keep;
482 482
483 VM_BUG_ON(PageActive(page)); 483 VM_BUG_ON(PageActive(page));
484 484
485 sc->nr_scanned++; 485 sc->nr_scanned++;
486 486
487 if (!sc->may_swap && page_mapped(page)) 487 if (!sc->may_swap && page_mapped(page))
488 goto keep_locked; 488 goto keep_locked;
489 489
490 /* Double the slab pressure for mapped and swapcache pages */ 490 /* Double the slab pressure for mapped and swapcache pages */
491 if (page_mapped(page) || PageSwapCache(page)) 491 if (page_mapped(page) || PageSwapCache(page))
492 sc->nr_scanned++; 492 sc->nr_scanned++;
493 493
494 may_enter_fs = (sc->gfp_mask & __GFP_FS) || 494 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
495 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 495 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
496 496
497 if (PageWriteback(page)) { 497 if (PageWriteback(page)) {
498 /* 498 /*
499 * Synchronous reclaim is performed in two passes, 499 * Synchronous reclaim is performed in two passes,
500 * first an asynchronous pass over the list to 500 * first an asynchronous pass over the list to
501 * start parallel writeback, and a second synchronous 501 * start parallel writeback, and a second synchronous
502 * pass to wait for the IO to complete. Wait here 502 * pass to wait for the IO to complete. Wait here
503 * for any page for which writeback has already 503 * for any page for which writeback has already
504 * started. 504 * started.
505 */ 505 */
506 if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) 506 if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
507 wait_on_page_writeback(page); 507 wait_on_page_writeback(page);
508 else 508 else
509 goto keep_locked; 509 goto keep_locked;
510 } 510 }
511 511
512 referenced = page_referenced(page, 1, sc->mem_cgroup); 512 referenced = page_referenced(page, 1, sc->mem_cgroup);
513 /* In active use or really unfreeable? Activate it. */ 513 /* In active use or really unfreeable? Activate it. */
514 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && 514 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
515 referenced && page_mapping_inuse(page)) 515 referenced && page_mapping_inuse(page))
516 goto activate_locked; 516 goto activate_locked;
517 517
518 #ifdef CONFIG_SWAP 518 #ifdef CONFIG_SWAP
519 /* 519 /*
520 * Anonymous process memory has backing store? 520 * Anonymous process memory has backing store?
521 * Try to allocate it some swap space here. 521 * Try to allocate it some swap space here.
522 */ 522 */
523 if (PageAnon(page) && !PageSwapCache(page)) 523 if (PageAnon(page) && !PageSwapCache(page))
524 if (!add_to_swap(page, GFP_ATOMIC)) 524 if (!add_to_swap(page, GFP_ATOMIC))
525 goto activate_locked; 525 goto activate_locked;
526 #endif /* CONFIG_SWAP */ 526 #endif /* CONFIG_SWAP */
527 527
528 mapping = page_mapping(page); 528 mapping = page_mapping(page);
529 529
530 /* 530 /*
531 * The page is mapped into the page tables of one or more 531 * The page is mapped into the page tables of one or more
532 * processes. Try to unmap it here. 532 * processes. Try to unmap it here.
533 */ 533 */
534 if (page_mapped(page) && mapping) { 534 if (page_mapped(page) && mapping) {
535 switch (try_to_unmap(page, 0)) { 535 switch (try_to_unmap(page, 0)) {
536 case SWAP_FAIL: 536 case SWAP_FAIL:
537 goto activate_locked; 537 goto activate_locked;
538 case SWAP_AGAIN: 538 case SWAP_AGAIN:
539 goto keep_locked; 539 goto keep_locked;
540 case SWAP_SUCCESS: 540 case SWAP_SUCCESS:
541 ; /* try to free the page below */ 541 ; /* try to free the page below */
542 } 542 }
543 } 543 }
544 544
545 if (PageDirty(page)) { 545 if (PageDirty(page)) {
546 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) 546 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
547 goto keep_locked; 547 goto keep_locked;
548 if (!may_enter_fs) 548 if (!may_enter_fs)
549 goto keep_locked; 549 goto keep_locked;
550 if (!sc->may_writepage) 550 if (!sc->may_writepage)
551 goto keep_locked; 551 goto keep_locked;
552 552
553 /* Page is dirty, try to write it out here */ 553 /* Page is dirty, try to write it out here */
554 switch (pageout(page, mapping, sync_writeback)) { 554 switch (pageout(page, mapping, sync_writeback)) {
555 case PAGE_KEEP: 555 case PAGE_KEEP:
556 goto keep_locked; 556 goto keep_locked;
557 case PAGE_ACTIVATE: 557 case PAGE_ACTIVATE:
558 goto activate_locked; 558 goto activate_locked;
559 case PAGE_SUCCESS: 559 case PAGE_SUCCESS:
560 if (PageWriteback(page) || PageDirty(page)) 560 if (PageWriteback(page) || PageDirty(page))
561 goto keep; 561 goto keep;
562 /* 562 /*
563 * A synchronous write - probably a ramdisk. Go 563 * A synchronous write - probably a ramdisk. Go
564 * ahead and try to reclaim the page. 564 * ahead and try to reclaim the page.
565 */ 565 */
566 if (TestSetPageLocked(page)) 566 if (TestSetPageLocked(page))
567 goto keep; 567 goto keep;
568 if (PageDirty(page) || PageWriteback(page)) 568 if (PageDirty(page) || PageWriteback(page))
569 goto keep_locked; 569 goto keep_locked;
570 mapping = page_mapping(page); 570 mapping = page_mapping(page);
571 case PAGE_CLEAN: 571 case PAGE_CLEAN:
572 ; /* try to free the page below */ 572 ; /* try to free the page below */
573 } 573 }
574 } 574 }
575 575
576 /* 576 /*
577 * If the page has buffers, try to free the buffer mappings 577 * If the page has buffers, try to free the buffer mappings
578 * associated with this page. If we succeed we try to free 578 * associated with this page. If we succeed we try to free
579 * the page as well. 579 * the page as well.
580 * 580 *
581 * We do this even if the page is PageDirty(). 581 * We do this even if the page is PageDirty().
582 * try_to_release_page() does not perform I/O, but it is 582 * try_to_release_page() does not perform I/O, but it is
583 * possible for a page to have PageDirty set, but it is actually 583 * possible for a page to have PageDirty set, but it is actually
584 * clean (all its buffers are clean). This happens if the 584 * clean (all its buffers are clean). This happens if the
585 * buffers were written out directly, with submit_bh(). ext3 585 * buffers were written out directly, with submit_bh(). ext3
586 * will do this, as well as the blockdev mapping. 586 * will do this, as well as the blockdev mapping.
587 * try_to_release_page() will discover that cleanness and will 587 * try_to_release_page() will discover that cleanness and will
588 * drop the buffers and mark the page clean - it can be freed. 588 * drop the buffers and mark the page clean - it can be freed.
589 * 589 *
590 * Rarely, pages can have buffers and no ->mapping. These are 590 * Rarely, pages can have buffers and no ->mapping. These are
591 * the pages which were not successfully invalidated in 591 * the pages which were not successfully invalidated in
592 * truncate_complete_page(). We try to drop those buffers here 592 * truncate_complete_page(). We try to drop those buffers here
593 * and if that worked, and the page is no longer mapped into 593 * and if that worked, and the page is no longer mapped into
594 * process address space (page_count == 1) it can be freed. 594 * process address space (page_count == 1) it can be freed.
595 * Otherwise, leave the page on the LRU so it is swappable. 595 * Otherwise, leave the page on the LRU so it is swappable.
596 */ 596 */
597 if (PagePrivate(page)) { 597 if (PagePrivate(page)) {
598 if (!try_to_release_page(page, sc->gfp_mask)) 598 if (!try_to_release_page(page, sc->gfp_mask))
599 goto activate_locked; 599 goto activate_locked;
600 if (!mapping && page_count(page) == 1) 600 if (!mapping && page_count(page) == 1)
601 goto free_it; 601 goto free_it;
602 } 602 }
603 603
604 if (!mapping || !remove_mapping(mapping, page)) 604 if (!mapping || !remove_mapping(mapping, page))
605 goto keep_locked; 605 goto keep_locked;
606 606
607 free_it: 607 free_it:
608 unlock_page(page); 608 unlock_page(page);
609 nr_reclaimed++; 609 nr_reclaimed++;
610 if (!pagevec_add(&freed_pvec, page)) 610 if (!pagevec_add(&freed_pvec, page))
611 __pagevec_release_nonlru(&freed_pvec); 611 __pagevec_release_nonlru(&freed_pvec);
612 continue; 612 continue;
613 613
614 activate_locked: 614 activate_locked:
615 SetPageActive(page); 615 SetPageActive(page);
616 pgactivate++; 616 pgactivate++;
617 keep_locked: 617 keep_locked:
618 unlock_page(page); 618 unlock_page(page);
619 keep: 619 keep:
620 list_add(&page->lru, &ret_pages); 620 list_add(&page->lru, &ret_pages);
621 VM_BUG_ON(PageLRU(page)); 621 VM_BUG_ON(PageLRU(page));
622 } 622 }
623 list_splice(&ret_pages, page_list); 623 list_splice(&ret_pages, page_list);
624 if (pagevec_count(&freed_pvec)) 624 if (pagevec_count(&freed_pvec))
625 __pagevec_release_nonlru(&freed_pvec); 625 __pagevec_release_nonlru(&freed_pvec);
626 count_vm_events(PGACTIVATE, pgactivate); 626 count_vm_events(PGACTIVATE, pgactivate);
627 return nr_reclaimed; 627 return nr_reclaimed;
628 } 628 }
629 629
630 /* LRU Isolation modes. */ 630 /* LRU Isolation modes. */
631 #define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */ 631 #define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */
632 #define ISOLATE_ACTIVE 1 /* Isolate active pages. */ 632 #define ISOLATE_ACTIVE 1 /* Isolate active pages. */
633 #define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */ 633 #define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */
634 634
635 /* 635 /*
636 * Attempt to remove the specified page from its LRU. Only take this page 636 * Attempt to remove the specified page from its LRU. Only take this page
637 * if it is of the appropriate PageActive status. Pages which are being 637 * if it is of the appropriate PageActive status. Pages which are being
638 * freed elsewhere are also ignored. 638 * freed elsewhere are also ignored.
639 * 639 *
640 * page: page to consider 640 * page: page to consider
641 * mode: one of the LRU isolation modes defined above 641 * mode: one of the LRU isolation modes defined above
642 * 642 *
643 * returns 0 on success, -ve errno on failure. 643 * returns 0 on success, -ve errno on failure.
644 */ 644 */
645 int __isolate_lru_page(struct page *page, int mode) 645 int __isolate_lru_page(struct page *page, int mode)
646 { 646 {
647 int ret = -EINVAL; 647 int ret = -EINVAL;
648 648
649 /* Only take pages on the LRU. */ 649 /* Only take pages on the LRU. */
650 if (!PageLRU(page)) 650 if (!PageLRU(page))
651 return ret; 651 return ret;
652 652
653 /* 653 /*
654 * When checking the active state, we need to be sure we are 654 * When checking the active state, we need to be sure we are
655 * dealing with comparable boolean values. Take the logical not 655 * dealing with comparable boolean values. Take the logical not
656 * of each. 656 * of each.
657 */ 657 */
658 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 658 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
659 return ret; 659 return ret;
660 660
661 ret = -EBUSY; 661 ret = -EBUSY;
662 if (likely(get_page_unless_zero(page))) { 662 if (likely(get_page_unless_zero(page))) {
663 /* 663 /*
664 * Be careful not to clear PageLRU until after we're 664 * Be careful not to clear PageLRU until after we're
665 * sure the page is not being freed elsewhere -- the 665 * sure the page is not being freed elsewhere -- the
666 * page release code relies on it. 666 * page release code relies on it.
667 */ 667 */
668 ClearPageLRU(page); 668 ClearPageLRU(page);
669 ret = 0; 669 ret = 0;
670 } 670 }
671 671
672 return ret; 672 return ret;
673 } 673 }
674 674
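A tiny user-space demonstration of why the comparison above is written as !PageActive(page) != !mode rather than a direct comparison; 0x40 is just an arbitrary stand-in for a non-boolean flag-test result.

#include <stdio.h>

int main(void)
{
	int active = 0x40;	/* "page is active": non-zero, but not 1 */
	int mode = 1;		/* ISOLATE_ACTIVE */

	printf("naive mismatch : %d\n", active != mode);	/* 1: would wrongly skip the page */
	printf("normalised     : %d\n", !active != !mode);	/* 0: states match, page is taken */
	return 0;
}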
675 /* 675 /*
676 * zone->lru_lock is heavily contended. Some of the functions that 676 * zone->lru_lock is heavily contended. Some of the functions that
677 * shrink the lists perform better by taking out a batch of pages 677 * shrink the lists perform better by taking out a batch of pages
678 * and working on them outside the LRU lock. 678 * and working on them outside the LRU lock.
679 * 679 *
680 * For pagecache intensive workloads, this function is the hottest 680 * For pagecache intensive workloads, this function is the hottest
681 * spot in the kernel (apart from copy_*_user functions). 681 * spot in the kernel (apart from copy_*_user functions).
682 * 682 *
683 * Appropriate locks must be held before calling this function. 683 * Appropriate locks must be held before calling this function.
684 * 684 *
685 * @nr_to_scan: The number of pages to look through on the list. 685 * @nr_to_scan: The number of pages to look through on the list.
686 * @src: The LRU list to pull pages off. 686 * @src: The LRU list to pull pages off.
687 * @dst: The temp list to put pages on to. 687 * @dst: The temp list to put pages on to.
688 * @scanned: The number of pages that were scanned. 688 * @scanned: The number of pages that were scanned.
689 * @order: The caller's attempted allocation order 689 * @order: The caller's attempted allocation order
690 * @mode: One of the LRU isolation modes 690 * @mode: One of the LRU isolation modes
691 * 691 *
692 * returns how many pages were moved onto *@dst. 692 * returns how many pages were moved onto *@dst.
693 */ 693 */
694 static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 694 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
695 struct list_head *src, struct list_head *dst, 695 struct list_head *src, struct list_head *dst,
696 unsigned long *scanned, int order, int mode) 696 unsigned long *scanned, int order, int mode)
697 { 697 {
698 unsigned long nr_taken = 0; 698 unsigned long nr_taken = 0;
699 unsigned long scan; 699 unsigned long scan;
700 700
701 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 701 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
702 struct page *page; 702 struct page *page;
703 unsigned long pfn; 703 unsigned long pfn;
704 unsigned long end_pfn; 704 unsigned long end_pfn;
705 unsigned long page_pfn; 705 unsigned long page_pfn;
706 int zone_id; 706 int zone_id;
707 707
708 page = lru_to_page(src); 708 page = lru_to_page(src);
709 prefetchw_prev_lru_page(page, src, flags); 709 prefetchw_prev_lru_page(page, src, flags);
710 710
711 VM_BUG_ON(!PageLRU(page)); 711 VM_BUG_ON(!PageLRU(page));
712 712
713 switch (__isolate_lru_page(page, mode)) { 713 switch (__isolate_lru_page(page, mode)) {
714 case 0: 714 case 0:
715 list_move(&page->lru, dst); 715 list_move(&page->lru, dst);
716 nr_taken++; 716 nr_taken++;
717 break; 717 break;
718 718
719 case -EBUSY: 719 case -EBUSY:
720 /* else it is being freed elsewhere */ 720 /* else it is being freed elsewhere */
721 list_move(&page->lru, src); 721 list_move(&page->lru, src);
722 continue; 722 continue;
723 723
724 default: 724 default:
725 BUG(); 725 BUG();
726 } 726 }
727 727
728 if (!order) 728 if (!order)
729 continue; 729 continue;
730 730
731 /* 731 /*
732 * Attempt to take all pages in the order aligned region 732 * Attempt to take all pages in the order aligned region
733 * surrounding the tag page. Only take those pages of 733 * surrounding the tag page. Only take those pages of
734 * the same active state as that tag page. We may safely 734 * the same active state as that tag page. We may safely
735 * round the target page pfn down to the requested order 735 * round the target page pfn down to the requested order
736 * as the mem_map is guaranteed valid out to MAX_ORDER. 736 * as the mem_map is guaranteed valid out to MAX_ORDER.
737 * If that page is in a different zone we will detect 737 * If that page is in a different zone we will detect
738 * it from its zone id and abort this block scan. 738 * it from its zone id and abort this block scan.
739 */ 739 */
740 zone_id = page_zone_id(page); 740 zone_id = page_zone_id(page);
741 page_pfn = page_to_pfn(page); 741 page_pfn = page_to_pfn(page);
742 pfn = page_pfn & ~((1 << order) - 1); 742 pfn = page_pfn & ~((1 << order) - 1);
743 end_pfn = pfn + (1 << order); 743 end_pfn = pfn + (1 << order);
744 for (; pfn < end_pfn; pfn++) { 744 for (; pfn < end_pfn; pfn++) {
745 struct page *cursor_page; 745 struct page *cursor_page;
746 746
747 /* The target page is in the block, ignore it. */ 747 /* The target page is in the block, ignore it. */
748 if (unlikely(pfn == page_pfn)) 748 if (unlikely(pfn == page_pfn))
749 continue; 749 continue;
750 750
751 /* Avoid holes within the zone. */ 751 /* Avoid holes within the zone. */
752 if (unlikely(!pfn_valid_within(pfn))) 752 if (unlikely(!pfn_valid_within(pfn)))
753 break; 753 break;
754 754
755 cursor_page = pfn_to_page(pfn); 755 cursor_page = pfn_to_page(pfn);
756 /* Check that we have not crossed a zone boundary. */ 756 /* Check that we have not crossed a zone boundary. */
757 if (unlikely(page_zone_id(cursor_page) != zone_id)) 757 if (unlikely(page_zone_id(cursor_page) != zone_id))
758 continue; 758 continue;
759 switch (__isolate_lru_page(cursor_page, mode)) { 759 switch (__isolate_lru_page(cursor_page, mode)) {
760 case 0: 760 case 0:
761 list_move(&cursor_page->lru, dst); 761 list_move(&cursor_page->lru, dst);
762 nr_taken++; 762 nr_taken++;
763 scan++; 763 scan++;
764 break; 764 break;
765 765
766 case -EBUSY: 766 case -EBUSY:
767 /* else it is being freed elsewhere */ 767 /* else it is being freed elsewhere */
768 list_move(&cursor_page->lru, src); 768 list_move(&cursor_page->lru, src);
769 default: 769 default:
770 break; 770 break;
771 } 771 }
772 } 772 }
773 } 773 }
774 774
775 *scanned = scan; 775 *scanned = scan;
776 return nr_taken; 776 return nr_taken;
777 } 777 }
778 778
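The order-aligned block scan above reduces to simple pfn arithmetic; a standalone sketch with illustrative numbers.

#include <stdio.h>

int main(void)
{
	unsigned long page_pfn = 0x12345;	/* arbitrary tag page */
	int order = 4;				/* 16-page block */

	unsigned long pfn = page_pfn & ~((1UL << order) - 1);
	unsigned long end_pfn = pfn + (1UL << order);

	/* prints: block 0x12340 .. 0x12350 (end exclusive) */
	printf("block %#lx .. %#lx\n", pfn, end_pfn);
	return 0;
}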
779 static unsigned long isolate_pages_global(unsigned long nr, 779 static unsigned long isolate_pages_global(unsigned long nr,
780 struct list_head *dst, 780 struct list_head *dst,
781 unsigned long *scanned, int order, 781 unsigned long *scanned, int order,
782 int mode, struct zone *z, 782 int mode, struct zone *z,
783 struct mem_cgroup *mem_cont, 783 struct mem_cgroup *mem_cont,
784 int active) 784 int active)
785 { 785 {
786 if (active) 786 if (active)
787 return isolate_lru_pages(nr, &z->active_list, dst, 787 return isolate_lru_pages(nr, &z->active_list, dst,
788 scanned, order, mode); 788 scanned, order, mode);
789 else 789 else
790 return isolate_lru_pages(nr, &z->inactive_list, dst, 790 return isolate_lru_pages(nr, &z->inactive_list, dst,
791 scanned, order, mode); 791 scanned, order, mode);
792 } 792 }
793 793
794 /* 794 /*
795 * clear_active_flags() is a helper for shrink_active_list(), clearing 795 * clear_active_flags() is a helper for shrink_active_list(), clearing
796 * any active bits from the pages in the list. 796 * any active bits from the pages in the list.
797 */ 797 */
798 static unsigned long clear_active_flags(struct list_head *page_list) 798 static unsigned long clear_active_flags(struct list_head *page_list)
799 { 799 {
800 int nr_active = 0; 800 int nr_active = 0;
801 struct page *page; 801 struct page *page;
802 802
803 list_for_each_entry(page, page_list, lru) 803 list_for_each_entry(page, page_list, lru)
804 if (PageActive(page)) { 804 if (PageActive(page)) {
805 ClearPageActive(page); 805 ClearPageActive(page);
806 nr_active++; 806 nr_active++;
807 } 807 }
808 808
809 return nr_active; 809 return nr_active;
810 } 810 }
811 811
812 /* 812 /*
813 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 813 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
814 * of reclaimed pages 814 * of reclaimed pages
815 */ 815 */
816 static unsigned long shrink_inactive_list(unsigned long max_scan, 816 static unsigned long shrink_inactive_list(unsigned long max_scan,
817 struct zone *zone, struct scan_control *sc) 817 struct zone *zone, struct scan_control *sc)
818 { 818 {
819 LIST_HEAD(page_list); 819 LIST_HEAD(page_list);
820 struct pagevec pvec; 820 struct pagevec pvec;
821 unsigned long nr_scanned = 0; 821 unsigned long nr_scanned = 0;
822 unsigned long nr_reclaimed = 0; 822 unsigned long nr_reclaimed = 0;
823 823
824 pagevec_init(&pvec, 1); 824 pagevec_init(&pvec, 1);
825 825
826 lru_add_drain(); 826 lru_add_drain();
827 spin_lock_irq(&zone->lru_lock); 827 spin_lock_irq(&zone->lru_lock);
828 do { 828 do {
829 struct page *page; 829 struct page *page;
830 unsigned long nr_taken; 830 unsigned long nr_taken;
831 unsigned long nr_scan; 831 unsigned long nr_scan;
832 unsigned long nr_freed; 832 unsigned long nr_freed;
833 unsigned long nr_active; 833 unsigned long nr_active;
834 834
835 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 835 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
836 &page_list, &nr_scan, sc->order, 836 &page_list, &nr_scan, sc->order,
837 (sc->order > PAGE_ALLOC_COSTLY_ORDER)? 837 (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
838 ISOLATE_BOTH : ISOLATE_INACTIVE, 838 ISOLATE_BOTH : ISOLATE_INACTIVE,
839 zone, sc->mem_cgroup, 0); 839 zone, sc->mem_cgroup, 0);
840 nr_active = clear_active_flags(&page_list); 840 nr_active = clear_active_flags(&page_list);
841 __count_vm_events(PGDEACTIVATE, nr_active); 841 __count_vm_events(PGDEACTIVATE, nr_active);
842 842
843 __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); 843 __mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
844 __mod_zone_page_state(zone, NR_INACTIVE, 844 __mod_zone_page_state(zone, NR_INACTIVE,
845 -(nr_taken - nr_active)); 845 -(nr_taken - nr_active));
846 if (scan_global_lru(sc)) 846 if (scan_global_lru(sc))
847 zone->pages_scanned += nr_scan; 847 zone->pages_scanned += nr_scan;
848 spin_unlock_irq(&zone->lru_lock); 848 spin_unlock_irq(&zone->lru_lock);
849 849
850 nr_scanned += nr_scan; 850 nr_scanned += nr_scan;
851 nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); 851 nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
852 852
853 /* 853 /*
854 * If we are direct reclaiming for contiguous pages and we do 854 * If we are direct reclaiming for contiguous pages and we do
855 * not reclaim everything in the list, try again and wait 855 * not reclaim everything in the list, try again and wait
856 * for IO to complete. This will stall high-order allocations 856 * for IO to complete. This will stall high-order allocations
857 * but that should be acceptable to the caller 857 * but that should be acceptable to the caller
858 */ 858 */
859 if (nr_freed < nr_taken && !current_is_kswapd() && 859 if (nr_freed < nr_taken && !current_is_kswapd() &&
860 sc->order > PAGE_ALLOC_COSTLY_ORDER) { 860 sc->order > PAGE_ALLOC_COSTLY_ORDER) {
861 congestion_wait(WRITE, HZ/10); 861 congestion_wait(WRITE, HZ/10);
862 862
863 /* 863 /*
864 * The attempt at page out may have made some 864 * The attempt at page out may have made some
865 * of the pages active, mark them inactive again. 865 * of the pages active, mark them inactive again.
866 */ 866 */
867 nr_active = clear_active_flags(&page_list); 867 nr_active = clear_active_flags(&page_list);
868 count_vm_events(PGDEACTIVATE, nr_active); 868 count_vm_events(PGDEACTIVATE, nr_active);
869 869
870 nr_freed += shrink_page_list(&page_list, sc, 870 nr_freed += shrink_page_list(&page_list, sc,
871 PAGEOUT_IO_SYNC); 871 PAGEOUT_IO_SYNC);
872 } 872 }
873 873
874 nr_reclaimed += nr_freed; 874 nr_reclaimed += nr_freed;
875 local_irq_disable(); 875 local_irq_disable();
876 if (current_is_kswapd()) { 876 if (current_is_kswapd()) {
877 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); 877 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
878 __count_vm_events(KSWAPD_STEAL, nr_freed); 878 __count_vm_events(KSWAPD_STEAL, nr_freed);
879 } else if (scan_global_lru(sc)) 879 } else if (scan_global_lru(sc))
880 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); 880 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
881 881
882 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 882 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
883 883
884 if (nr_taken == 0) 884 if (nr_taken == 0)
885 goto done; 885 goto done;
886 886
887 spin_lock(&zone->lru_lock); 887 spin_lock(&zone->lru_lock);
888 /* 888 /*
889 * Put back any unfreeable pages. 889 * Put back any unfreeable pages.
890 */ 890 */
891 while (!list_empty(&page_list)) { 891 while (!list_empty(&page_list)) {
892 page = lru_to_page(&page_list); 892 page = lru_to_page(&page_list);
893 VM_BUG_ON(PageLRU(page)); 893 VM_BUG_ON(PageLRU(page));
894 SetPageLRU(page); 894 SetPageLRU(page);
895 list_del(&page->lru); 895 list_del(&page->lru);
896 if (PageActive(page)) 896 if (PageActive(page))
897 add_page_to_active_list(zone, page); 897 add_page_to_active_list(zone, page);
898 else 898 else
899 add_page_to_inactive_list(zone, page); 899 add_page_to_inactive_list(zone, page);
900 if (!pagevec_add(&pvec, page)) { 900 if (!pagevec_add(&pvec, page)) {
901 spin_unlock_irq(&zone->lru_lock); 901 spin_unlock_irq(&zone->lru_lock);
902 __pagevec_release(&pvec); 902 __pagevec_release(&pvec);
903 spin_lock_irq(&zone->lru_lock); 903 spin_lock_irq(&zone->lru_lock);
904 } 904 }
905 } 905 }
906 } while (nr_scanned < max_scan); 906 } while (nr_scanned < max_scan);
907 spin_unlock(&zone->lru_lock); 907 spin_unlock(&zone->lru_lock);
908 done: 908 done:
909 local_irq_enable(); 909 local_irq_enable();
910 pagevec_release(&pvec); 910 pagevec_release(&pvec);
911 return nr_reclaimed; 911 return nr_reclaimed;
912 } 912 }
913 913
914 /* 914 /*
915 * We are about to scan this zone at a certain priority level. If that priority 915 * We are about to scan this zone at a certain priority level. If that priority
916 * level is smaller (ie: more urgent) than the previous priority, then note 916 * level is smaller (ie: more urgent) than the previous priority, then note
917 * that priority level within the zone. This is done so that when the next 917 * that priority level within the zone. This is done so that when the next
918 * process comes in to scan this zone, it will immediately start out at this 918 * process comes in to scan this zone, it will immediately start out at this
919 * priority level rather than having to build up its own scanning priority. 919 * priority level rather than having to build up its own scanning priority.
920 * Here, this priority affects only the reclaim-mapped threshold. 920 * Here, this priority affects only the reclaim-mapped threshold.
921 */ 921 */
922 static inline void note_zone_scanning_priority(struct zone *zone, int priority) 922 static inline void note_zone_scanning_priority(struct zone *zone, int priority)
923 { 923 {
924 if (priority < zone->prev_priority) 924 if (priority < zone->prev_priority)
925 zone->prev_priority = priority; 925 zone->prev_priority = priority;
926 } 926 }
927 927
928 static inline int zone_is_near_oom(struct zone *zone) 928 static inline int zone_is_near_oom(struct zone *zone)
929 { 929 {
930 return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE) 930 return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE)
931 + zone_page_state(zone, NR_INACTIVE))*3; 931 + zone_page_state(zone, NR_INACTIVE))*3;
932 } 932 }
933 933
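For illustration, the near-OOM test above treats a zone as being in trouble once it has been scanned roughly three times over without the scan counter being reset by successful reclaim. Below is a minimal userspace sketch of that threshold; the struct fields and numbers are stand-ins for the zone statistics, not kernel code.

#include <stdio.h>

/* Hypothetical stand-in for the zone counters used by zone_is_near_oom(). */
struct zone_stats {
	unsigned long pages_scanned;	/* pages scanned since last reclaim success */
	unsigned long nr_active;
	unsigned long nr_inactive;
};

static int zone_is_near_oom(const struct zone_stats *z)
{
	/* near OOM once we have scanned 3x the LRU size without progress */
	return z->pages_scanned >= (z->nr_active + z->nr_inactive) * 3;
}

int main(void)
{
	struct zone_stats z = { .pages_scanned = 70000,
				.nr_active = 10000, .nr_inactive = 12000 };
	printf("near oom: %d\n", zone_is_near_oom(&z));	/* 70000 >= 66000 -> 1 */
	return 0;
}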
934 /* 934 /*
935 * Determine whether we should try to reclaim mapped pages. 935 * Determine whether we should try to reclaim mapped pages.
936 * This is called only when sc->mem_cgroup is NULL. 936 * This is called only when sc->mem_cgroup is NULL.
937 */ 937 */
938 static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, 938 static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
939 int priority) 939 int priority)
940 { 940 {
941 long mapped_ratio; 941 long mapped_ratio;
942 long distress; 942 long distress;
943 long swap_tendency; 943 long swap_tendency;
944 long imbalance; 944 long imbalance;
945 int reclaim_mapped = 0; 945 int reclaim_mapped = 0;
946 int prev_priority; 946 int prev_priority;
947 947
948 if (scan_global_lru(sc) && zone_is_near_oom(zone)) 948 if (scan_global_lru(sc) && zone_is_near_oom(zone))
949 return 1; 949 return 1;
950 /* 950 /*
951 * `distress' is a measure of how much trouble we're having 951 * `distress' is a measure of how much trouble we're having
952 * reclaiming pages. 0 -> no problems. 100 -> great trouble. 952 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
953 */ 953 */
954 if (scan_global_lru(sc)) 954 if (scan_global_lru(sc))
955 prev_priority = zone->prev_priority; 955 prev_priority = zone->prev_priority;
956 else 956 else
957 prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup); 957 prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
958 958
959 distress = 100 >> min(prev_priority, priority); 959 distress = 100 >> min(prev_priority, priority);
960 960
961 /* 961 /*
962 * The point of this algorithm is to decide when to start 962 * The point of this algorithm is to decide when to start
963 * reclaiming mapped memory instead of just pagecache. Work out 963 * reclaiming mapped memory instead of just pagecache. Work out
964 * how much memory 964 * how much memory
965 * is mapped. 965 * is mapped.
966 */ 966 */
967 if (scan_global_lru(sc)) 967 if (scan_global_lru(sc))
968 mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + 968 mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
969 global_page_state(NR_ANON_PAGES)) * 100) / 969 global_page_state(NR_ANON_PAGES)) * 100) /
970 vm_total_pages; 970 vm_total_pages;
971 else 971 else
972 mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup); 972 mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
973 973
974 /* 974 /*
975 * Now decide how much we really want to unmap some pages. The 975 * Now decide how much we really want to unmap some pages. The
976 * mapped ratio is downgraded - just because there's a lot of 976 * mapped ratio is downgraded - just because there's a lot of
977 * mapped memory doesn't necessarily mean that page reclaim 977 * mapped memory doesn't necessarily mean that page reclaim
978 * isn't succeeding. 978 * isn't succeeding.
979 * 979 *
980 * The distress ratio is important - we don't want to start 980 * The distress ratio is important - we don't want to start
981 * going oom. 981 * going oom.
982 * 982 *
983 * A 100% value of vm_swappiness overrides this algorithm 983 * A 100% value of vm_swappiness overrides this algorithm
984 * altogether. 984 * altogether.
985 */ 985 */
986 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; 986 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
987 987
988 /* 988 /*
989 * If there's huge imbalance between active and inactive 989 * If there's huge imbalance between active and inactive
990 * (think active 100 times larger than inactive) we should 990 * (think active 100 times larger than inactive) we should
991 * become more permissive, or the system will take too much 991 * become more permissive, or the system will take too much
992 * cpu before it starts swapping during memory pressure. 992 * cpu before it starts swapping during memory pressure.
993 * Distress is about avoiding early-oom, this is about 993 * Distress is about avoiding early-oom, this is about
994 * making swappiness graceful despite setting it to low 994 * making swappiness graceful despite setting it to low
995 * values. 995 * values.
996 * 996 *
997 * Avoid div by zero with nr_inactive+1, and max resulting 997 * Avoid div by zero with nr_inactive+1, and max resulting
998 * value is vm_total_pages. 998 * value is vm_total_pages.
999 */ 999 */
1000 if (scan_global_lru(sc)) { 1000 if (scan_global_lru(sc)) {
1001 imbalance = zone_page_state(zone, NR_ACTIVE); 1001 imbalance = zone_page_state(zone, NR_ACTIVE);
1002 imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; 1002 imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
1003 } else 1003 } else
1004 imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup); 1004 imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
1005 1005
1006 /* 1006 /*
1007 * Reduce the effect of imbalance if swappiness is low, 1007 * Reduce the effect of imbalance if swappiness is low,
1008 * this means for a swappiness very low, the imbalance 1008 * this means for a swappiness very low, the imbalance
1009 * must be much higher than 100 for this logic to make 1009 * must be much higher than 100 for this logic to make
1010 * the difference. 1010 * the difference.
1011 * 1011 *
1012 * Max temporary value is vm_total_pages*100. 1012 * Max temporary value is vm_total_pages*100.
1013 */ 1013 */
1014 imbalance *= (vm_swappiness + 1); 1014 imbalance *= (vm_swappiness + 1);
1015 imbalance /= 100; 1015 imbalance /= 100;
1016 1016
1017 /* 1017 /*
1018 * If not much of the RAM is mapped, make the imbalance 1018 * If not much of the RAM is mapped, make the imbalance
1019 * less relevant; refilling the inactive list with mapped 1019 * less relevant; refilling the inactive list with mapped
1020 * pages is a high priority only when the ratio of mapped 1020 * pages is a high priority only when the ratio of mapped
1021 * pages is high. 1021 * pages is high.
1022 * 1022 *
1023 * Max temporary value is vm_total_pages*100. 1023 * Max temporary value is vm_total_pages*100.
1024 */ 1024 */
1025 imbalance *= mapped_ratio; 1025 imbalance *= mapped_ratio;
1026 imbalance /= 100; 1026 imbalance /= 100;
1027 1027
1028 /* apply imbalance feedback to swap_tendency */ 1028 /* apply imbalance feedback to swap_tendency */
1029 swap_tendency += imbalance; 1029 swap_tendency += imbalance;
1030 1030
1031 /* 1031 /*
1032 * Now use this metric to decide whether to start moving mapped 1032 * Now use this metric to decide whether to start moving mapped
1033 * memory onto the inactive list. 1033 * memory onto the inactive list.
1034 */ 1034 */
1035 if (swap_tendency >= 100) 1035 if (swap_tendency >= 100)
1036 reclaim_mapped = 1; 1036 reclaim_mapped = 1;
1037 1037
1038 return reclaim_mapped; 1038 return reclaim_mapped;
1039 } 1039 }
1040 1040
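To make the heuristic above concrete, here is a minimal userspace sketch that plugs made-up numbers into the same swap_tendency arithmetic (distress, mapped ratio, swappiness, imbalance feedback). The values and variable names are illustrative only; this is not kernel code.

#include <stdio.h>

int main(void)
{
	long swappiness = 60;		/* the vm_swappiness default */
	long prev_priority = 10, priority = 9;
	long mapped_ratio = 40;		/* say 40% of RAM is mapped */
	long nr_active = 200000, nr_inactive = 50000;

	/* distress: 0 = no trouble reclaiming, 100 = great trouble */
	long distress = 100 >> (prev_priority < priority ? prev_priority : priority);

	long swap_tendency = mapped_ratio / 2 + distress + swappiness;

	/* imbalance feedback: a huge active/inactive ratio pushes toward
	 * reclaiming mapped pages, damped by swappiness and by how much
	 * of RAM is mapped at all */
	long imbalance = nr_active / (nr_inactive + 1);
	imbalance = imbalance * (swappiness + 1) / 100;
	imbalance = imbalance * mapped_ratio / 100;
	swap_tendency += imbalance;

	/* mapped pages become reclaim candidates once this crosses 100 */
	printf("swap_tendency = %ld -> reclaim_mapped = %d\n",
	       swap_tendency, swap_tendency >= 100);
	return 0;
}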
1041 /* 1041 /*
1042 * This moves pages from the active list to the inactive list. 1042 * This moves pages from the active list to the inactive list.
1043 * 1043 *
1044 * We move them the other way if the page is referenced by one or more 1044 * We move them the other way if the page is referenced by one or more
1045 * processes, from rmap. 1045 * processes, from rmap.
1046 * 1046 *
1047 * If the pages are mostly unmapped, the processing is fast and it is 1047 * If the pages are mostly unmapped, the processing is fast and it is
1048 * appropriate to hold zone->lru_lock across the whole operation. But if 1048 * appropriate to hold zone->lru_lock across the whole operation. But if
1049 * the pages are mapped, the processing is slow (page_referenced()) so we 1049 * the pages are mapped, the processing is slow (page_referenced()) so we
1050 * should drop zone->lru_lock around each page. It's impossible to balance 1050 * should drop zone->lru_lock around each page. It's impossible to balance
1051 * this, so instead we remove the pages from the LRU while processing them. 1051 * this, so instead we remove the pages from the LRU while processing them.
1052 * It is safe to rely on PG_active against the non-LRU pages in here because 1052 * It is safe to rely on PG_active against the non-LRU pages in here because
1053 * nobody will play with that bit on a non-LRU page. 1053 * nobody will play with that bit on a non-LRU page.
1054 * 1054 *
1055 * The downside is that we have to touch page->_count against each page. 1055 * The downside is that we have to touch page->_count against each page.
1056 * But we had to alter page->flags anyway. 1056 * But we had to alter page->flags anyway.
1057 */ 1057 */
1058 1058
1059 1059
1060 static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1060 static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1061 struct scan_control *sc, int priority) 1061 struct scan_control *sc, int priority)
1062 { 1062 {
1063 unsigned long pgmoved; 1063 unsigned long pgmoved;
1064 int pgdeactivate = 0; 1064 int pgdeactivate = 0;
1065 unsigned long pgscanned; 1065 unsigned long pgscanned;
1066 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1066 LIST_HEAD(l_hold); /* The pages which were snipped off */
1067 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ 1067 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
1068 LIST_HEAD(l_active); /* Pages to go onto the active_list */ 1068 LIST_HEAD(l_active); /* Pages to go onto the active_list */
1069 struct page *page; 1069 struct page *page;
1070 struct pagevec pvec; 1070 struct pagevec pvec;
1071 int reclaim_mapped = 0; 1071 int reclaim_mapped = 0;
1072 1072
1073 if (sc->may_swap) 1073 if (sc->may_swap)
1074 reclaim_mapped = calc_reclaim_mapped(sc, zone, priority); 1074 reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
1075 1075
1076 lru_add_drain(); 1076 lru_add_drain();
1077 spin_lock_irq(&zone->lru_lock); 1077 spin_lock_irq(&zone->lru_lock);
1078 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, 1078 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1079 ISOLATE_ACTIVE, zone, 1079 ISOLATE_ACTIVE, zone,
1080 sc->mem_cgroup, 1); 1080 sc->mem_cgroup, 1);
1081 /* 1081 /*
1082 * zone->pages_scanned is used to detect a zone's oom state; 1082 * zone->pages_scanned is used to detect a zone's oom state;
1083 * mem_cgroup remembers nr_scan by itself. 1083 * mem_cgroup remembers nr_scan by itself.
1084 */ 1084 */
1085 if (scan_global_lru(sc)) 1085 if (scan_global_lru(sc))
1086 zone->pages_scanned += pgscanned; 1086 zone->pages_scanned += pgscanned;
1087 1087
1088 __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); 1088 __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
1089 spin_unlock_irq(&zone->lru_lock); 1089 spin_unlock_irq(&zone->lru_lock);
1090 1090
1091 while (!list_empty(&l_hold)) { 1091 while (!list_empty(&l_hold)) {
1092 cond_resched(); 1092 cond_resched();
1093 page = lru_to_page(&l_hold); 1093 page = lru_to_page(&l_hold);
1094 list_del(&page->lru); 1094 list_del(&page->lru);
1095 if (page_mapped(page)) { 1095 if (page_mapped(page)) {
1096 if (!reclaim_mapped || 1096 if (!reclaim_mapped ||
1097 (total_swap_pages == 0 && PageAnon(page)) || 1097 (total_swap_pages == 0 && PageAnon(page)) ||
1098 page_referenced(page, 0, sc->mem_cgroup)) { 1098 page_referenced(page, 0, sc->mem_cgroup)) {
1099 list_add(&page->lru, &l_active); 1099 list_add(&page->lru, &l_active);
1100 continue; 1100 continue;
1101 } 1101 }
1102 } 1102 }
1103 list_add(&page->lru, &l_inactive); 1103 list_add(&page->lru, &l_inactive);
1104 } 1104 }
1105 1105
1106 pagevec_init(&pvec, 1); 1106 pagevec_init(&pvec, 1);
1107 pgmoved = 0; 1107 pgmoved = 0;
1108 spin_lock_irq(&zone->lru_lock); 1108 spin_lock_irq(&zone->lru_lock);
1109 while (!list_empty(&l_inactive)) { 1109 while (!list_empty(&l_inactive)) {
1110 page = lru_to_page(&l_inactive); 1110 page = lru_to_page(&l_inactive);
1111 prefetchw_prev_lru_page(page, &l_inactive, flags); 1111 prefetchw_prev_lru_page(page, &l_inactive, flags);
1112 VM_BUG_ON(PageLRU(page)); 1112 VM_BUG_ON(PageLRU(page));
1113 SetPageLRU(page); 1113 SetPageLRU(page);
1114 VM_BUG_ON(!PageActive(page)); 1114 VM_BUG_ON(!PageActive(page));
1115 ClearPageActive(page); 1115 ClearPageActive(page);
1116 1116
1117 list_move(&page->lru, &zone->inactive_list); 1117 list_move(&page->lru, &zone->inactive_list);
1118 mem_cgroup_move_lists(page, false); 1118 mem_cgroup_move_lists(page, false);
1119 pgmoved++; 1119 pgmoved++;
1120 if (!pagevec_add(&pvec, page)) { 1120 if (!pagevec_add(&pvec, page)) {
1121 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1121 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
1122 spin_unlock_irq(&zone->lru_lock); 1122 spin_unlock_irq(&zone->lru_lock);
1123 pgdeactivate += pgmoved; 1123 pgdeactivate += pgmoved;
1124 pgmoved = 0; 1124 pgmoved = 0;
1125 if (buffer_heads_over_limit) 1125 if (buffer_heads_over_limit)
1126 pagevec_strip(&pvec); 1126 pagevec_strip(&pvec);
1127 __pagevec_release(&pvec); 1127 __pagevec_release(&pvec);
1128 spin_lock_irq(&zone->lru_lock); 1128 spin_lock_irq(&zone->lru_lock);
1129 } 1129 }
1130 } 1130 }
1131 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1131 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
1132 pgdeactivate += pgmoved; 1132 pgdeactivate += pgmoved;
1133 if (buffer_heads_over_limit) { 1133 if (buffer_heads_over_limit) {
1134 spin_unlock_irq(&zone->lru_lock); 1134 spin_unlock_irq(&zone->lru_lock);
1135 pagevec_strip(&pvec); 1135 pagevec_strip(&pvec);
1136 spin_lock_irq(&zone->lru_lock); 1136 spin_lock_irq(&zone->lru_lock);
1137 } 1137 }
1138 1138
1139 pgmoved = 0; 1139 pgmoved = 0;
1140 while (!list_empty(&l_active)) { 1140 while (!list_empty(&l_active)) {
1141 page = lru_to_page(&l_active); 1141 page = lru_to_page(&l_active);
1142 prefetchw_prev_lru_page(page, &l_active, flags); 1142 prefetchw_prev_lru_page(page, &l_active, flags);
1143 VM_BUG_ON(PageLRU(page)); 1143 VM_BUG_ON(PageLRU(page));
1144 SetPageLRU(page); 1144 SetPageLRU(page);
1145 VM_BUG_ON(!PageActive(page)); 1145 VM_BUG_ON(!PageActive(page));
1146 1146
1147 list_move(&page->lru, &zone->active_list); 1147 list_move(&page->lru, &zone->active_list);
1148 mem_cgroup_move_lists(page, true); 1148 mem_cgroup_move_lists(page, true);
1149 pgmoved++; 1149 pgmoved++;
1150 if (!pagevec_add(&pvec, page)) { 1150 if (!pagevec_add(&pvec, page)) {
1151 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); 1151 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
1152 pgmoved = 0; 1152 pgmoved = 0;
1153 spin_unlock_irq(&zone->lru_lock); 1153 spin_unlock_irq(&zone->lru_lock);
1154 __pagevec_release(&pvec); 1154 __pagevec_release(&pvec);
1155 spin_lock_irq(&zone->lru_lock); 1155 spin_lock_irq(&zone->lru_lock);
1156 } 1156 }
1157 } 1157 }
1158 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); 1158 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
1159 1159
1160 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1160 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1161 __count_vm_events(PGDEACTIVATE, pgdeactivate); 1161 __count_vm_events(PGDEACTIVATE, pgdeactivate);
1162 spin_unlock_irq(&zone->lru_lock); 1162 spin_unlock_irq(&zone->lru_lock);
1163 1163
1164 pagevec_release(&pvec); 1164 pagevec_release(&pvec);
1165 } 1165 }
1166 1166
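The per-page decision in the loop above can be summarised as: unmapped pages are always deactivated, while a mapped page stays active if we have not yet decided to reclaim mapped memory, if it is an anonymous page with no swap to go to, or if it was recently referenced. Below is a small standalone sketch of that triage; the struct and flags are stand-ins, not the real page flags.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for page_mapped(), PageAnon(), page_referenced(). */
struct page_info {
	bool mapped;
	bool anon;
	bool referenced;
};

/* Returns true if the page should go back onto the active list. */
static bool keep_active(const struct page_info *p,
			bool reclaim_mapped, bool have_swap)
{
	if (!p->mapped)
		return false;	/* unmapped pages are always deactivated */
	if (!reclaim_mapped)
		return true;	/* not reclaiming mapped memory yet */
	if (!have_swap && p->anon)
		return true;	/* nowhere to put anonymous memory */
	return p->referenced;	/* recently used mapped pages stay active */
}

int main(void)
{
	struct page_info p = { .mapped = true, .anon = true, .referenced = false };
	printf("keep active: %d\n", keep_active(&p, true, false));	/* 1 */
	return 0;
}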
1167 /* 1167 /*
1168 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1168 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1169 */ 1169 */
1170 static unsigned long shrink_zone(int priority, struct zone *zone, 1170 static unsigned long shrink_zone(int priority, struct zone *zone,
1171 struct scan_control *sc) 1171 struct scan_control *sc)
1172 { 1172 {
1173 unsigned long nr_active; 1173 unsigned long nr_active;
1174 unsigned long nr_inactive; 1174 unsigned long nr_inactive;
1175 unsigned long nr_to_scan; 1175 unsigned long nr_to_scan;
1176 unsigned long nr_reclaimed = 0; 1176 unsigned long nr_reclaimed = 0;
1177 1177
1178 if (scan_global_lru(sc)) { 1178 if (scan_global_lru(sc)) {
1179 /* 1179 /*
1180 * Add one to nr_to_scan just to make sure that the kernel 1180 * Add one to nr_to_scan just to make sure that the kernel
1181 * will slowly sift through the active list. 1181 * will slowly sift through the active list.
1182 */ 1182 */
1183 zone->nr_scan_active += 1183 zone->nr_scan_active +=
1184 (zone_page_state(zone, NR_ACTIVE) >> priority) + 1; 1184 (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
1185 nr_active = zone->nr_scan_active; 1185 nr_active = zone->nr_scan_active;
1186 zone->nr_scan_inactive += 1186 zone->nr_scan_inactive +=
1187 (zone_page_state(zone, NR_INACTIVE) >> priority) + 1; 1187 (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
1188 nr_inactive = zone->nr_scan_inactive; 1188 nr_inactive = zone->nr_scan_inactive;
1189 if (nr_inactive >= sc->swap_cluster_max) 1189 if (nr_inactive >= sc->swap_cluster_max)
1190 zone->nr_scan_inactive = 0; 1190 zone->nr_scan_inactive = 0;
1191 else 1191 else
1192 nr_inactive = 0; 1192 nr_inactive = 0;
1193 1193
1194 if (nr_active >= sc->swap_cluster_max) 1194 if (nr_active >= sc->swap_cluster_max)
1195 zone->nr_scan_active = 0; 1195 zone->nr_scan_active = 0;
1196 else 1196 else
1197 nr_active = 0; 1197 nr_active = 0;
1198 } else { 1198 } else {
1199 /* 1199 /*
1200 * This reclaim occurs not because of a zone memory shortage but 1200 * This reclaim occurs not because of a zone memory shortage but
1201 * because the memory controller has hit its limit, 1201 * because the memory controller has hit its limit,
1202 * so don't modify zone reclaim related data. 1202 * so don't modify zone reclaim related data.
1203 */ 1203 */
1204 nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup, 1204 nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup,
1205 zone, priority); 1205 zone, priority);
1206 1206
1207 nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup, 1207 nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup,
1208 zone, priority); 1208 zone, priority);
1209 } 1209 }
1210 1210
1211 1211
1212 while (nr_active || nr_inactive) { 1212 while (nr_active || nr_inactive) {
1213 if (nr_active) { 1213 if (nr_active) {
1214 nr_to_scan = min(nr_active, 1214 nr_to_scan = min(nr_active,
1215 (unsigned long)sc->swap_cluster_max); 1215 (unsigned long)sc->swap_cluster_max);
1216 nr_active -= nr_to_scan; 1216 nr_active -= nr_to_scan;
1217 shrink_active_list(nr_to_scan, zone, sc, priority); 1217 shrink_active_list(nr_to_scan, zone, sc, priority);
1218 } 1218 }
1219 1219
1220 if (nr_inactive) { 1220 if (nr_inactive) {
1221 nr_to_scan = min(nr_inactive, 1221 nr_to_scan = min(nr_inactive,
1222 (unsigned long)sc->swap_cluster_max); 1222 (unsigned long)sc->swap_cluster_max);
1223 nr_inactive -= nr_to_scan; 1223 nr_inactive -= nr_to_scan;
1224 nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, 1224 nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
1225 sc); 1225 sc);
1226 } 1226 }
1227 } 1227 }
1228 1228
1229 throttle_vm_writeout(sc->gfp_mask); 1229 throttle_vm_writeout(sc->gfp_mask);
1230 return nr_reclaimed; 1230 return nr_reclaimed;
1231 } 1231 }
1232 1232
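shrink_zone() above works its scan targets off in swap_cluster_max-sized batches, alternating between the active and inactive lists until both counts are exhausted. A minimal sketch of that batching with made-up counts follows; nothing here is kernel code.

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long nr_active = 100, nr_inactive = 70, batches = 0;

	while (nr_active || nr_inactive) {
		if (nr_active) {
			unsigned long n = min_ul(nr_active, SWAP_CLUSTER_MAX);
			nr_active -= n;		/* would call shrink_active_list(n, ...) */
			batches++;
		}
		if (nr_inactive) {
			unsigned long n = min_ul(nr_inactive, SWAP_CLUSTER_MAX);
			nr_inactive -= n;	/* would call shrink_inactive_list(n, ...) */
			batches++;
		}
	}
	printf("batches: %lu\n", batches);	/* 4 active + 3 inactive = 7 */
	return 0;
}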
1233 /* 1233 /*
1234 * This is the direct reclaim path, for page-allocating processes. We only 1234 * This is the direct reclaim path, for page-allocating processes. We only
1235 * try to reclaim pages from zones which will satisfy the caller's allocation 1235 * try to reclaim pages from zones which will satisfy the caller's allocation
1236 * request. 1236 * request.
1237 * 1237 *
1238 * We reclaim from a zone even if that zone is over pages_high. Because: 1238 * We reclaim from a zone even if that zone is over pages_high. Because:
1239 * a) The caller may be trying to free *extra* pages to satisfy a higher-order 1239 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
1240 * allocation or 1240 * allocation or
1241 * b) The zones may be over pages_high but they must go *over* pages_high to 1241 * b) The zones may be over pages_high but they must go *over* pages_high to
1242 * satisfy the `incremental min' zone defense algorithm. 1242 * satisfy the `incremental min' zone defense algorithm.
1243 * 1243 *
1244 * Returns the number of reclaimed pages. 1244 * Returns the number of reclaimed pages.
1245 * 1245 *
1246 * If a zone is deemed to be full of pinned pages then just give it a light 1246 * If a zone is deemed to be full of pinned pages then just give it a light
1247 * scan then give up on it. 1247 * scan then give up on it.
1248 */ 1248 */
1249 static unsigned long shrink_zones(int priority, struct zonelist *zonelist, 1249 static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1250 struct scan_control *sc) 1250 struct scan_control *sc)
1251 { 1251 {
1252 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1252 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1253 unsigned long nr_reclaimed = 0; 1253 unsigned long nr_reclaimed = 0;
1254 struct zoneref *z; 1254 struct zoneref *z;
1255 struct zone *zone; 1255 struct zone *zone;
1256 1256
1257 sc->all_unreclaimable = 1; 1257 sc->all_unreclaimable = 1;
1258 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1258 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1259 if (!populated_zone(zone)) 1259 if (!populated_zone(zone))
1260 continue; 1260 continue;
1261 /* 1261 /*
1262 * Take care that memory controller reclaim has only a small 1262 * Take care that memory controller reclaim has only a small
1263 * influence on the global LRU. 1263 * influence on the global LRU.
1264 */ 1264 */
1265 if (scan_global_lru(sc)) { 1265 if (scan_global_lru(sc)) {
1266 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1266 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1267 continue; 1267 continue;
1268 note_zone_scanning_priority(zone, priority); 1268 note_zone_scanning_priority(zone, priority);
1269 1269
1270 if (zone_is_all_unreclaimable(zone) && 1270 if (zone_is_all_unreclaimable(zone) &&
1271 priority != DEF_PRIORITY) 1271 priority != DEF_PRIORITY)
1272 continue; /* Let kswapd poll it */ 1272 continue; /* Let kswapd poll it */
1273 sc->all_unreclaimable = 0; 1273 sc->all_unreclaimable = 0;
1274 } else { 1274 } else {
1275 /* 1275 /*
1276 * Ignore cpuset limitation here. We just want to reduce 1276 * Ignore cpuset limitation here. We just want to reduce
1277 * # of used pages by us regardless of memory shortage. 1277 * # of used pages by us regardless of memory shortage.
1278 */ 1278 */
1279 sc->all_unreclaimable = 0; 1279 sc->all_unreclaimable = 0;
1280 mem_cgroup_note_reclaim_priority(sc->mem_cgroup, 1280 mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
1281 priority); 1281 priority);
1282 } 1282 }
1283 1283
1284 nr_reclaimed += shrink_zone(priority, zone, sc); 1284 nr_reclaimed += shrink_zone(priority, zone, sc);
1285 } 1285 }
1286 1286
1287 return nr_reclaimed; 1287 return nr_reclaimed;
1288 } 1288 }
1289 1289
1290 /* 1290 /*
1291 * This is the main entry point to direct page reclaim. 1291 * This is the main entry point to direct page reclaim.
1292 * 1292 *
1293 * If a full scan of the inactive list fails to free enough memory then we 1293 * If a full scan of the inactive list fails to free enough memory then we
1294 * are "out of memory" and something needs to be killed. 1294 * are "out of memory" and something needs to be killed.
1295 * 1295 *
1296 * If the caller is !__GFP_FS then the probability of a failure is reasonably 1296 * If the caller is !__GFP_FS then the probability of a failure is reasonably
1297 * high - the zone may be full of dirty or under-writeback pages, which this 1297 * high - the zone may be full of dirty or under-writeback pages, which this
1298 * caller can't do much about. We kick pdflush and take explicit naps in the 1298 * caller can't do much about. We kick pdflush and take explicit naps in the
1299 * hope that some of these pages can be written. But if the allocating task 1299 * hope that some of these pages can be written. But if the allocating task
1300 * holds filesystem locks which prevent writeout this might not work, and the 1300 * holds filesystem locks which prevent writeout this might not work, and the
1301 * allocation attempt will fail. 1301 * allocation attempt will fail.
1302 *
1303 * returns: 0, if no pages reclaimed
1304 * else, the number of pages reclaimed
1302 */ 1305 */
1303 static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 1306 static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1304 struct scan_control *sc) 1307 struct scan_control *sc)
1305 { 1308 {
1306 int priority; 1309 int priority;
1307 int ret = 0; 1310 int ret = 0;
1308 unsigned long total_scanned = 0; 1311 unsigned long total_scanned = 0;
1309 unsigned long nr_reclaimed = 0; 1312 unsigned long nr_reclaimed = 0;
1310 struct reclaim_state *reclaim_state = current->reclaim_state; 1313 struct reclaim_state *reclaim_state = current->reclaim_state;
1311 unsigned long lru_pages = 0; 1314 unsigned long lru_pages = 0;
1312 struct zoneref *z; 1315 struct zoneref *z;
1313 struct zone *zone; 1316 struct zone *zone;
1314 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1317 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1315 1318
1316 if (scan_global_lru(sc)) 1319 if (scan_global_lru(sc))
1317 count_vm_event(ALLOCSTALL); 1320 count_vm_event(ALLOCSTALL);
1318 /* 1321 /*
1319 * mem_cgroup will not do shrink_slab. 1322 * mem_cgroup will not do shrink_slab.
1320 */ 1323 */
1321 if (scan_global_lru(sc)) { 1324 if (scan_global_lru(sc)) {
1322 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1325 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1323 1326
1324 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1327 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1325 continue; 1328 continue;
1326 1329
1327 lru_pages += zone_page_state(zone, NR_ACTIVE) 1330 lru_pages += zone_page_state(zone, NR_ACTIVE)
1328 + zone_page_state(zone, NR_INACTIVE); 1331 + zone_page_state(zone, NR_INACTIVE);
1329 } 1332 }
1330 } 1333 }
1331 1334
1332 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1335 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1333 sc->nr_scanned = 0; 1336 sc->nr_scanned = 0;
1334 if (!priority) 1337 if (!priority)
1335 disable_swap_token(); 1338 disable_swap_token();
1336 nr_reclaimed += shrink_zones(priority, zonelist, sc); 1339 nr_reclaimed += shrink_zones(priority, zonelist, sc);
1337 /* 1340 /*
1338 * Don't shrink slabs when reclaiming memory from 1341 * Don't shrink slabs when reclaiming memory from
1339 * over limit cgroups 1342 * over limit cgroups
1340 */ 1343 */
1341 if (scan_global_lru(sc)) { 1344 if (scan_global_lru(sc)) {
1342 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 1345 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
1343 if (reclaim_state) { 1346 if (reclaim_state) {
1344 nr_reclaimed += reclaim_state->reclaimed_slab; 1347 nr_reclaimed += reclaim_state->reclaimed_slab;
1345 reclaim_state->reclaimed_slab = 0; 1348 reclaim_state->reclaimed_slab = 0;
1346 } 1349 }
1347 } 1350 }
1348 total_scanned += sc->nr_scanned; 1351 total_scanned += sc->nr_scanned;
1349 if (nr_reclaimed >= sc->swap_cluster_max) { 1352 if (nr_reclaimed >= sc->swap_cluster_max) {
1350 ret = 1; 1353 ret = nr_reclaimed;
1351 goto out; 1354 goto out;
1352 } 1355 }
1353 1356
1354 /* 1357 /*
1355 * Try to write back as many pages as we just scanned. This 1358 * Try to write back as many pages as we just scanned. This
1356 * tends to cause slow streaming writers to write data to the 1359 * tends to cause slow streaming writers to write data to the
1357 * disk smoothly, at the dirtying rate, which is nice. But 1360 * disk smoothly, at the dirtying rate, which is nice. But
1358 * that's undesirable in laptop mode, where we *want* lumpy 1361 * that's undesirable in laptop mode, where we *want* lumpy
1359 * writeout. So in laptop mode, write out the whole world. 1362 * writeout. So in laptop mode, write out the whole world.
1360 */ 1363 */
1361 if (total_scanned > sc->swap_cluster_max + 1364 if (total_scanned > sc->swap_cluster_max +
1362 sc->swap_cluster_max / 2) { 1365 sc->swap_cluster_max / 2) {
1363 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1366 wakeup_pdflush(laptop_mode ? 0 : total_scanned);
1364 sc->may_writepage = 1; 1367 sc->may_writepage = 1;
1365 } 1368 }
1366 1369
1367 /* Take a nap, wait for some writeback to complete */ 1370 /* Take a nap, wait for some writeback to complete */
1368 if (sc->nr_scanned && priority < DEF_PRIORITY - 2) 1371 if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
1369 congestion_wait(WRITE, HZ/10); 1372 congestion_wait(WRITE, HZ/10);
1370 } 1373 }
1371 /* top priority shrink_caches still had more to do? don't OOM, then */ 1374 /* top priority shrink_caches still had more to do? don't OOM, then */
1372 if (!sc->all_unreclaimable && scan_global_lru(sc)) 1375 if (!sc->all_unreclaimable && scan_global_lru(sc))
1373 ret = 1; 1376 ret = nr_reclaimed;
1374 out: 1377 out:
1375 /* 1378 /*
1376 * Now that we've scanned all the zones at this priority level, note 1379 * Now that we've scanned all the zones at this priority level, note
1377 * that level within the zone so that the next thread which performs 1380 * that level within the zone so that the next thread which performs
1378 * scanning of this zone will immediately start out at this priority 1381 * scanning of this zone will immediately start out at this priority
1379 * level. This affects only the decision whether or not to bring 1382 * level. This affects only the decision whether or not to bring
1380 * mapped pages onto the inactive list. 1383 * mapped pages onto the inactive list.
1381 */ 1384 */
1382 if (priority < 0) 1385 if (priority < 0)
1383 priority = 0; 1386 priority = 0;
1384 1387
1385 if (scan_global_lru(sc)) { 1388 if (scan_global_lru(sc)) {
1386 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1389 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1387 1390
1388 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1391 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1389 continue; 1392 continue;
1390 1393
1391 zone->prev_priority = priority; 1394 zone->prev_priority = priority;
1392 } 1395 }
1393 } else 1396 } else
1394 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); 1397 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1395 1398
1396 return ret; 1399 return ret;
1397 } 1400 }
1398 1401
1399 unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 1402 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1400 gfp_t gfp_mask) 1403 gfp_t gfp_mask)
1401 { 1404 {
1402 struct scan_control sc = { 1405 struct scan_control sc = {
1403 .gfp_mask = gfp_mask, 1406 .gfp_mask = gfp_mask,
1404 .may_writepage = !laptop_mode, 1407 .may_writepage = !laptop_mode,
1405 .swap_cluster_max = SWAP_CLUSTER_MAX, 1408 .swap_cluster_max = SWAP_CLUSTER_MAX,
1406 .may_swap = 1, 1409 .may_swap = 1,
1407 .swappiness = vm_swappiness, 1410 .swappiness = vm_swappiness,
1408 .order = order, 1411 .order = order,
1409 .mem_cgroup = NULL, 1412 .mem_cgroup = NULL,
1410 .isolate_pages = isolate_pages_global, 1413 .isolate_pages = isolate_pages_global,
1411 }; 1414 };
1412 1415
1413 return do_try_to_free_pages(zonelist, &sc); 1416 return do_try_to_free_pages(zonelist, &sc);
1414 } 1417 }
1415 1418
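Since do_try_to_free_pages() now returns the number of pages reclaimed rather than a bare 0/1, a caller making a costly-order allocation can keep a running total and stop retrying once at least 1 << order pages have been reclaimed. The sketch below only illustrates that caller-side idea with stand-in names and a fake reclaim function; it is not the actual page allocator code.

#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER 3

/* Stand-in for direct reclaim: pretend each call frees 16 pages. */
static unsigned long fake_try_to_free_pages(int order)
{
	(void)order;
	return 16;
}

int main(void)
{
	int order = 5;			/* e.g. a 32-page allocation */
	unsigned long pages_reclaimed = 0;
	int attempts = 0;

	do {
		pages_reclaimed += fake_try_to_free_pages(order);
		attempts++;
		/* For costly orders, stop once we have reclaimed at least
		 * 1 << order pages in total; lumpy reclaim has then had a
		 * fair chance to assemble a contiguous block. */
	} while (order > PAGE_ALLOC_COSTLY_ORDER &&
		 pages_reclaimed < (1UL << order));

	printf("stopped after %d reclaim passes, %lu pages freed\n",
	       attempts, pages_reclaimed);
	return 0;
}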
1416 #ifdef CONFIG_CGROUP_MEM_RES_CTLR 1419 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
1417 1420
1418 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 1421 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1419 gfp_t gfp_mask) 1422 gfp_t gfp_mask)
1420 { 1423 {
1421 struct scan_control sc = { 1424 struct scan_control sc = {
1422 .may_writepage = !laptop_mode, 1425 .may_writepage = !laptop_mode,
1423 .may_swap = 1, 1426 .may_swap = 1,
1424 .swap_cluster_max = SWAP_CLUSTER_MAX, 1427 .swap_cluster_max = SWAP_CLUSTER_MAX,
1425 .swappiness = vm_swappiness, 1428 .swappiness = vm_swappiness,
1426 .order = 0, 1429 .order = 0,
1427 .mem_cgroup = mem_cont, 1430 .mem_cgroup = mem_cont,
1428 .isolate_pages = mem_cgroup_isolate_pages, 1431 .isolate_pages = mem_cgroup_isolate_pages,
1429 }; 1432 };
1430 struct zonelist *zonelist; 1433 struct zonelist *zonelist;
1431 1434
1432 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 1435 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1433 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 1436 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1434 zonelist = NODE_DATA(numa_node_id())->node_zonelists; 1437 zonelist = NODE_DATA(numa_node_id())->node_zonelists;
1435 return do_try_to_free_pages(zonelist, &sc); 1438 return do_try_to_free_pages(zonelist, &sc);
1436 } 1439 }
1437 #endif 1440 #endif
1438 1441
1439 /* 1442 /*
1440 * For kswapd, balance_pgdat() will work across all this node's zones until 1443 * For kswapd, balance_pgdat() will work across all this node's zones until
1441 * they are all at pages_high. 1444 * they are all at pages_high.
1442 * 1445 *
1443 * Returns the number of pages which were actually freed. 1446 * Returns the number of pages which were actually freed.
1444 * 1447 *
1445 * There is special handling here for zones which are full of pinned pages. 1448 * There is special handling here for zones which are full of pinned pages.
1446 * This can happen if the pages are all mlocked, or if they are all used by 1449 * This can happen if the pages are all mlocked, or if they are all used by
1447 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. 1450 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
1448 * What we do is to detect the case where all pages in the zone have been 1451 * What we do is to detect the case where all pages in the zone have been
1449 * scanned twice and there has been zero successful reclaim. Mark the zone as 1452 * scanned twice and there has been zero successful reclaim. Mark the zone as
1450 * dead and from now on, only perform a short scan. Basically we're polling 1453 * dead and from now on, only perform a short scan. Basically we're polling
1451 * the zone for when the problem goes away. 1454 * the zone for when the problem goes away.
1452 * 1455 *
1453 * kswapd scans the zones in the highmem->normal->dma direction. It skips 1456 * kswapd scans the zones in the highmem->normal->dma direction. It skips
1454 * zones which have free_pages > pages_high, but once a zone is found to have 1457 * zones which have free_pages > pages_high, but once a zone is found to have
1455 * free_pages <= pages_high, we scan that zone and the lower zones regardless 1458 * free_pages <= pages_high, we scan that zone and the lower zones regardless
1456 * of the number of free pages in the lower zones. This interoperates with 1459 * of the number of free pages in the lower zones. This interoperates with
1457 * the page allocator fallback scheme to ensure that aging of pages is balanced 1460 * the page allocator fallback scheme to ensure that aging of pages is balanced
1458 * across the zones. 1461 * across the zones.
1459 */ 1462 */
1460 static unsigned long balance_pgdat(pg_data_t *pgdat, int order) 1463 static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1461 { 1464 {
1462 int all_zones_ok; 1465 int all_zones_ok;
1463 int priority; 1466 int priority;
1464 int i; 1467 int i;
1465 unsigned long total_scanned; 1468 unsigned long total_scanned;
1466 unsigned long nr_reclaimed; 1469 unsigned long nr_reclaimed;
1467 struct reclaim_state *reclaim_state = current->reclaim_state; 1470 struct reclaim_state *reclaim_state = current->reclaim_state;
1468 struct scan_control sc = { 1471 struct scan_control sc = {
1469 .gfp_mask = GFP_KERNEL, 1472 .gfp_mask = GFP_KERNEL,
1470 .may_swap = 1, 1473 .may_swap = 1,
1471 .swap_cluster_max = SWAP_CLUSTER_MAX, 1474 .swap_cluster_max = SWAP_CLUSTER_MAX,
1472 .swappiness = vm_swappiness, 1475 .swappiness = vm_swappiness,
1473 .order = order, 1476 .order = order,
1474 .mem_cgroup = NULL, 1477 .mem_cgroup = NULL,
1475 .isolate_pages = isolate_pages_global, 1478 .isolate_pages = isolate_pages_global,
1476 }; 1479 };
1477 /* 1480 /*
1478 * temp_priority is used to remember the scanning priority at which 1481 * temp_priority is used to remember the scanning priority at which
1479 * this zone was successfully refilled to free_pages == pages_high. 1482 * this zone was successfully refilled to free_pages == pages_high.
1480 */ 1483 */
1481 int temp_priority[MAX_NR_ZONES]; 1484 int temp_priority[MAX_NR_ZONES];
1482 1485
1483 loop_again: 1486 loop_again:
1484 total_scanned = 0; 1487 total_scanned = 0;
1485 nr_reclaimed = 0; 1488 nr_reclaimed = 0;
1486 sc.may_writepage = !laptop_mode; 1489 sc.may_writepage = !laptop_mode;
1487 count_vm_event(PAGEOUTRUN); 1490 count_vm_event(PAGEOUTRUN);
1488 1491
1489 for (i = 0; i < pgdat->nr_zones; i++) 1492 for (i = 0; i < pgdat->nr_zones; i++)
1490 temp_priority[i] = DEF_PRIORITY; 1493 temp_priority[i] = DEF_PRIORITY;
1491 1494
1492 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1495 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1493 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 1496 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
1494 unsigned long lru_pages = 0; 1497 unsigned long lru_pages = 0;
1495 1498
1496 /* The swap token gets in the way of swapout... */ 1499 /* The swap token gets in the way of swapout... */
1497 if (!priority) 1500 if (!priority)
1498 disable_swap_token(); 1501 disable_swap_token();
1499 1502
1500 all_zones_ok = 1; 1503 all_zones_ok = 1;
1501 1504
1502 /* 1505 /*
1503 * Scan in the highmem->dma direction for the highest 1506 * Scan in the highmem->dma direction for the highest
1504 * zone which needs scanning 1507 * zone which needs scanning
1505 */ 1508 */
1506 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 1509 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
1507 struct zone *zone = pgdat->node_zones + i; 1510 struct zone *zone = pgdat->node_zones + i;
1508 1511
1509 if (!populated_zone(zone)) 1512 if (!populated_zone(zone))
1510 continue; 1513 continue;
1511 1514
1512 if (zone_is_all_unreclaimable(zone) && 1515 if (zone_is_all_unreclaimable(zone) &&
1513 priority != DEF_PRIORITY) 1516 priority != DEF_PRIORITY)
1514 continue; 1517 continue;
1515 1518
1516 if (!zone_watermark_ok(zone, order, zone->pages_high, 1519 if (!zone_watermark_ok(zone, order, zone->pages_high,
1517 0, 0)) { 1520 0, 0)) {
1518 end_zone = i; 1521 end_zone = i;
1519 break; 1522 break;
1520 } 1523 }
1521 } 1524 }
1522 if (i < 0) 1525 if (i < 0)
1523 goto out; 1526 goto out;
1524 1527
1525 for (i = 0; i <= end_zone; i++) { 1528 for (i = 0; i <= end_zone; i++) {
1526 struct zone *zone = pgdat->node_zones + i; 1529 struct zone *zone = pgdat->node_zones + i;
1527 1530
1528 lru_pages += zone_page_state(zone, NR_ACTIVE) 1531 lru_pages += zone_page_state(zone, NR_ACTIVE)
1529 + zone_page_state(zone, NR_INACTIVE); 1532 + zone_page_state(zone, NR_INACTIVE);
1530 } 1533 }
1531 1534
1532 /* 1535 /*
1533 * Now scan the zone in the dma->highmem direction, stopping 1536 * Now scan the zone in the dma->highmem direction, stopping
1534 * at the last zone which needs scanning. 1537 * at the last zone which needs scanning.
1535 * 1538 *
1536 * We do this because the page allocator works in the opposite 1539 * We do this because the page allocator works in the opposite
1537 * direction. This prevents the page allocator from allocating 1540 * direction. This prevents the page allocator from allocating
1538 * pages behind kswapd's direction of progress, which would 1541 * pages behind kswapd's direction of progress, which would
1539 * cause too much scanning of the lower zones. 1542 * cause too much scanning of the lower zones.
1540 */ 1543 */
1541 for (i = 0; i <= end_zone; i++) { 1544 for (i = 0; i <= end_zone; i++) {
1542 struct zone *zone = pgdat->node_zones + i; 1545 struct zone *zone = pgdat->node_zones + i;
1543 int nr_slab; 1546 int nr_slab;
1544 1547
1545 if (!populated_zone(zone)) 1548 if (!populated_zone(zone))
1546 continue; 1549 continue;
1547 1550
1548 if (zone_is_all_unreclaimable(zone) && 1551 if (zone_is_all_unreclaimable(zone) &&
1549 priority != DEF_PRIORITY) 1552 priority != DEF_PRIORITY)
1550 continue; 1553 continue;
1551 1554
1552 if (!zone_watermark_ok(zone, order, zone->pages_high, 1555 if (!zone_watermark_ok(zone, order, zone->pages_high,
1553 end_zone, 0)) 1556 end_zone, 0))
1554 all_zones_ok = 0; 1557 all_zones_ok = 0;
1555 temp_priority[i] = priority; 1558 temp_priority[i] = priority;
1556 sc.nr_scanned = 0; 1559 sc.nr_scanned = 0;
1557 note_zone_scanning_priority(zone, priority); 1560 note_zone_scanning_priority(zone, priority);
1558 /* 1561 /*
1559 * We put equal pressure on every zone, unless one 1562 * We put equal pressure on every zone, unless one
1560 * zone has way too many pages free already. 1563 * zone has way too many pages free already.
1561 */ 1564 */
1562 if (!zone_watermark_ok(zone, order, 8*zone->pages_high, 1565 if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
1563 end_zone, 0)) 1566 end_zone, 0))
1564 nr_reclaimed += shrink_zone(priority, zone, &sc); 1567 nr_reclaimed += shrink_zone(priority, zone, &sc);
1565 reclaim_state->reclaimed_slab = 0; 1568 reclaim_state->reclaimed_slab = 0;
1566 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1569 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1567 lru_pages); 1570 lru_pages);
1568 nr_reclaimed += reclaim_state->reclaimed_slab; 1571 nr_reclaimed += reclaim_state->reclaimed_slab;
1569 total_scanned += sc.nr_scanned; 1572 total_scanned += sc.nr_scanned;
1570 if (zone_is_all_unreclaimable(zone)) 1573 if (zone_is_all_unreclaimable(zone))
1571 continue; 1574 continue;
1572 if (nr_slab == 0 && zone->pages_scanned >= 1575 if (nr_slab == 0 && zone->pages_scanned >=
1573 (zone_page_state(zone, NR_ACTIVE) 1576 (zone_page_state(zone, NR_ACTIVE)
1574 + zone_page_state(zone, NR_INACTIVE)) * 6) 1577 + zone_page_state(zone, NR_INACTIVE)) * 6)
1575 zone_set_flag(zone, 1578 zone_set_flag(zone,
1576 ZONE_ALL_UNRECLAIMABLE); 1579 ZONE_ALL_UNRECLAIMABLE);
1577 /* 1580 /*
1578 * If we've done a decent amount of scanning and 1581 * If we've done a decent amount of scanning and
1579 * the reclaim ratio is low, start doing writepage 1582 * the reclaim ratio is low, start doing writepage
1580 * even in laptop mode 1583 * even in laptop mode
1581 */ 1584 */
1582 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 1585 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1583 total_scanned > nr_reclaimed + nr_reclaimed / 2) 1586 total_scanned > nr_reclaimed + nr_reclaimed / 2)
1584 sc.may_writepage = 1; 1587 sc.may_writepage = 1;
1585 } 1588 }
1586 if (all_zones_ok) 1589 if (all_zones_ok)
1587 break; /* kswapd: all done */ 1590 break; /* kswapd: all done */
1588 /* 1591 /*
1589 * OK, kswapd is getting into trouble. Take a nap, then take 1592 * OK, kswapd is getting into trouble. Take a nap, then take
1590 * another pass across the zones. 1593 * another pass across the zones.
1591 */ 1594 */
1592 if (total_scanned && priority < DEF_PRIORITY - 2) 1595 if (total_scanned && priority < DEF_PRIORITY - 2)
1593 congestion_wait(WRITE, HZ/10); 1596 congestion_wait(WRITE, HZ/10);
1594 1597
1595 /* 1598 /*
1596 * We do this so kswapd doesn't build up large priorities for 1599 * We do this so kswapd doesn't build up large priorities for
1597 * example when it is freeing in parallel with allocators. It 1600 * example when it is freeing in parallel with allocators. It
1598 * matches the direct reclaim path behaviour in terms of impact 1601 * matches the direct reclaim path behaviour in terms of impact
1599 * on zone->*_priority. 1602 * on zone->*_priority.
1600 */ 1603 */
1601 if (nr_reclaimed >= SWAP_CLUSTER_MAX) 1604 if (nr_reclaimed >= SWAP_CLUSTER_MAX)
1602 break; 1605 break;
1603 } 1606 }
1604 out: 1607 out:
1605 /* 1608 /*
1606 * Note within each zone the priority level at which this zone was 1609 * Note within each zone the priority level at which this zone was
1607 * brought into a happy state. So that the next thread which scans this 1610 * brought into a happy state. So that the next thread which scans this
1608 * zone will start out at that priority level. 1611 * zone will start out at that priority level.
1609 */ 1612 */
1610 for (i = 0; i < pgdat->nr_zones; i++) { 1613 for (i = 0; i < pgdat->nr_zones; i++) {
1611 struct zone *zone = pgdat->node_zones + i; 1614 struct zone *zone = pgdat->node_zones + i;
1612 1615
1613 zone->prev_priority = temp_priority[i]; 1616 zone->prev_priority = temp_priority[i];
1614 } 1617 }
1615 if (!all_zones_ok) { 1618 if (!all_zones_ok) {
1616 cond_resched(); 1619 cond_resched();
1617 1620
1618 try_to_freeze(); 1621 try_to_freeze();
1619 1622
1620 goto loop_again; 1623 goto loop_again;
1621 } 1624 }
1622 1625
1623 return nr_reclaimed; 1626 return nr_reclaimed;
1624 } 1627 }
1625 1628
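balance_pgdat() above first scans highmem->dma for the highest zone below its pages_high watermark, then applies pressure from the lowest zone up to and including that one, so the allocator's fallback order cannot keep dirtying zones behind kswapd's progress. A toy sketch of that two-direction scan with invented zone counts; the arrays are stand-ins for the real zone state.

#include <stdio.h>

#define NR_ZONES 3	/* say: DMA, NORMAL, HIGHMEM */

int main(void)
{
	unsigned long free_pages[NR_ZONES] = { 900, 300, 5000 };
	unsigned long pages_high[NR_ZONES] = { 500, 1000, 2000 };
	int i, end_zone = -1;

	/* Scan highmem->dma for the highest zone that needs balancing. */
	for (i = NR_ZONES - 1; i >= 0; i--) {
		if (free_pages[i] < pages_high[i]) {
			end_zone = i;
			break;
		}
	}
	if (end_zone < 0) {
		printf("all zones ok\n");
		return 0;
	}

	/* Apply pressure dma->end_zone, matching the allocator's fallback order. */
	for (i = 0; i <= end_zone; i++)
		printf("shrink zone %d\n", i);
	return 0;
}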
1626 /* 1629 /*
1627 * The background pageout daemon, started as a kernel thread 1630 * The background pageout daemon, started as a kernel thread
1628 * from the init process. 1631 * from the init process.
1629 * 1632 *
1630 * This basically trickles out pages so that we have _some_ 1633 * This basically trickles out pages so that we have _some_
1631 * free memory available even if there is no other activity 1634 * free memory available even if there is no other activity
1632 * that frees anything up. This is needed for things like routing 1635 * that frees anything up. This is needed for things like routing
1633 * etc, where we otherwise might have all activity going on in 1636 * etc, where we otherwise might have all activity going on in
1634 * asynchronous contexts that cannot page things out. 1637 * asynchronous contexts that cannot page things out.
1635 * 1638 *
1636 * If there are applications that are active memory-allocators 1639 * If there are applications that are active memory-allocators
1637 * (most normal use), this basically shouldn't matter. 1640 * (most normal use), this basically shouldn't matter.
1638 */ 1641 */
1639 static int kswapd(void *p) 1642 static int kswapd(void *p)
1640 { 1643 {
1641 unsigned long order; 1644 unsigned long order;
1642 pg_data_t *pgdat = (pg_data_t*)p; 1645 pg_data_t *pgdat = (pg_data_t*)p;
1643 struct task_struct *tsk = current; 1646 struct task_struct *tsk = current;
1644 DEFINE_WAIT(wait); 1647 DEFINE_WAIT(wait);
1645 struct reclaim_state reclaim_state = { 1648 struct reclaim_state reclaim_state = {
1646 .reclaimed_slab = 0, 1649 .reclaimed_slab = 0,
1647 }; 1650 };
1648 node_to_cpumask_ptr(cpumask, pgdat->node_id); 1651 node_to_cpumask_ptr(cpumask, pgdat->node_id);
1649 1652
1650 if (!cpus_empty(*cpumask)) 1653 if (!cpus_empty(*cpumask))
1651 set_cpus_allowed_ptr(tsk, cpumask); 1654 set_cpus_allowed_ptr(tsk, cpumask);
1652 current->reclaim_state = &reclaim_state; 1655 current->reclaim_state = &reclaim_state;
1653 1656
1654 /* 1657 /*
1655 * Tell the memory management that we're a "memory allocator", 1658 * Tell the memory management that we're a "memory allocator",
1656 * and that if we need more memory we should get access to it 1659 * and that if we need more memory we should get access to it
1657 * regardless (see "__alloc_pages()"). "kswapd" should 1660 * regardless (see "__alloc_pages()"). "kswapd" should
1658 * never get caught in the normal page freeing logic. 1661 * never get caught in the normal page freeing logic.
1659 * 1662 *
1660 * (Kswapd normally doesn't need memory anyway, but sometimes 1663 * (Kswapd normally doesn't need memory anyway, but sometimes
1661 * you need a small amount of memory in order to be able to 1664 * you need a small amount of memory in order to be able to
1662 * page out something else, and this flag essentially protects 1665 * page out something else, and this flag essentially protects
1663 * us from recursively trying to free more memory as we're 1666 * us from recursively trying to free more memory as we're
1664 * trying to free the first piece of memory in the first place). 1667 * trying to free the first piece of memory in the first place).
1665 */ 1668 */
1666 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 1669 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
1667 set_freezable(); 1670 set_freezable();
1668 1671
1669 order = 0; 1672 order = 0;
1670 for ( ; ; ) { 1673 for ( ; ; ) {
1671 unsigned long new_order; 1674 unsigned long new_order;
1672 1675
1673 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 1676 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
1674 new_order = pgdat->kswapd_max_order; 1677 new_order = pgdat->kswapd_max_order;
1675 pgdat->kswapd_max_order = 0; 1678 pgdat->kswapd_max_order = 0;
1676 if (order < new_order) { 1679 if (order < new_order) {
1677 /* 1680 /*
1678 * Don't sleep if someone wants a larger 'order' 1681 * Don't sleep if someone wants a larger 'order'
1679 * allocation 1682 * allocation
1680 */ 1683 */
1681 order = new_order; 1684 order = new_order;
1682 } else { 1685 } else {
1683 if (!freezing(current)) 1686 if (!freezing(current))
1684 schedule(); 1687 schedule();
1685 1688
1686 order = pgdat->kswapd_max_order; 1689 order = pgdat->kswapd_max_order;
1687 } 1690 }
1688 finish_wait(&pgdat->kswapd_wait, &wait); 1691 finish_wait(&pgdat->kswapd_wait, &wait);
1689 1692
1690 if (!try_to_freeze()) { 1693 if (!try_to_freeze()) {
1691 /* We can speed up thawing tasks if we don't call 1694 /* We can speed up thawing tasks if we don't call
1692 * balance_pgdat after returning from the refrigerator 1695 * balance_pgdat after returning from the refrigerator
1693 */ 1696 */
1694 balance_pgdat(pgdat, order); 1697 balance_pgdat(pgdat, order);
1695 } 1698 }
1696 } 1699 }
1697 return 0; 1700 return 0;
1698 } 1701 }
1699 1702
1700 /* 1703 /*
1701 * A zone is low on free memory, so wake its kswapd task to service it. 1704 * A zone is low on free memory, so wake its kswapd task to service it.
1702 */ 1705 */
1703 void wakeup_kswapd(struct zone *zone, int order) 1706 void wakeup_kswapd(struct zone *zone, int order)
1704 { 1707 {
1705 pg_data_t *pgdat; 1708 pg_data_t *pgdat;
1706 1709
1707 if (!populated_zone(zone)) 1710 if (!populated_zone(zone))
1708 return; 1711 return;
1709 1712
1710 pgdat = zone->zone_pgdat; 1713 pgdat = zone->zone_pgdat;
1711 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) 1714 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
1712 return; 1715 return;
1713 if (pgdat->kswapd_max_order < order) 1716 if (pgdat->kswapd_max_order < order)
1714 pgdat->kswapd_max_order = order; 1717 pgdat->kswapd_max_order = order;
1715 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1718 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1716 return; 1719 return;
1717 if (!waitqueue_active(&pgdat->kswapd_wait)) 1720 if (!waitqueue_active(&pgdat->kswapd_wait))
1718 return; 1721 return;
1719 wake_up_interruptible(&pgdat->kswapd_wait); 1722 wake_up_interruptible(&pgdat->kswapd_wait);
1720 } 1723 }
1721 1724
1722 #ifdef CONFIG_PM 1725 #ifdef CONFIG_PM
1723 /* 1726 /*
1724 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 1727 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
1725 * from LRU lists system-wide, for given pass and priority, and returns the 1728 * from LRU lists system-wide, for given pass and priority, and returns the
1726 * number of reclaimed pages 1729 * number of reclaimed pages
1727 * 1730 *
1728 * For pass > 3 we also try to shrink the LRU lists that contain a few pages 1731 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
1729 */ 1732 */
1730 static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, 1733 static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1731 int pass, struct scan_control *sc) 1734 int pass, struct scan_control *sc)
1732 { 1735 {
1733 struct zone *zone; 1736 struct zone *zone;
1734 unsigned long nr_to_scan, ret = 0; 1737 unsigned long nr_to_scan, ret = 0;
1735 1738
1736 for_each_zone(zone) { 1739 for_each_zone(zone) {
1737 1740
1738 if (!populated_zone(zone)) 1741 if (!populated_zone(zone))
1739 continue; 1742 continue;
1740 1743
1741 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) 1744 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
1742 continue; 1745 continue;
1743 1746
1744 /* For pass = 0 we don't shrink the active list */ 1747 /* For pass = 0 we don't shrink the active list */
1745 if (pass > 0) { 1748 if (pass > 0) {
1746 zone->nr_scan_active += 1749 zone->nr_scan_active +=
1747 (zone_page_state(zone, NR_ACTIVE) >> prio) + 1; 1750 (zone_page_state(zone, NR_ACTIVE) >> prio) + 1;
1748 if (zone->nr_scan_active >= nr_pages || pass > 3) { 1751 if (zone->nr_scan_active >= nr_pages || pass > 3) {
1749 zone->nr_scan_active = 0; 1752 zone->nr_scan_active = 0;
1750 nr_to_scan = min(nr_pages, 1753 nr_to_scan = min(nr_pages,
1751 zone_page_state(zone, NR_ACTIVE)); 1754 zone_page_state(zone, NR_ACTIVE));
1752 shrink_active_list(nr_to_scan, zone, sc, prio); 1755 shrink_active_list(nr_to_scan, zone, sc, prio);
1753 } 1756 }
1754 } 1757 }
1755 1758
1756 zone->nr_scan_inactive += 1759 zone->nr_scan_inactive +=
1757 (zone_page_state(zone, NR_INACTIVE) >> prio) + 1; 1760 (zone_page_state(zone, NR_INACTIVE) >> prio) + 1;
1758 if (zone->nr_scan_inactive >= nr_pages || pass > 3) { 1761 if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
1759 zone->nr_scan_inactive = 0; 1762 zone->nr_scan_inactive = 0;
1760 nr_to_scan = min(nr_pages, 1763 nr_to_scan = min(nr_pages,
1761 zone_page_state(zone, NR_INACTIVE)); 1764 zone_page_state(zone, NR_INACTIVE));
1762 ret += shrink_inactive_list(nr_to_scan, zone, sc); 1765 ret += shrink_inactive_list(nr_to_scan, zone, sc);
1763 if (ret >= nr_pages) 1766 if (ret >= nr_pages)
1764 return ret; 1767 return ret;
1765 } 1768 }
1766 } 1769 }
1767 1770
1768 return ret; 1771 return ret;
1769 } 1772 }
1770 1773
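The per-zone accumulation above (add (lru_size >> prio) + 1 each priority round, scan only once the counter reaches nr_pages, or unconditionally on passes above 3) can be seen in isolation in this small sketch with invented sizes; it is an illustration only, not kernel code.

#include <stdio.h>

#define DEF_PRIORITY 12

int main(void)
{
	unsigned long nr_inactive = 4096;	/* pages on a zone's inactive list */
	unsigned long nr_pages = 512;		/* the reclaim target */
	unsigned long nr_scan = 0;
	int pass = 1;				/* pretend this is pass 1 */
	int prio;

	for (prio = DEF_PRIORITY; prio >= 0; prio--) {
		nr_scan += (nr_inactive >> prio) + 1;
		if (nr_scan >= nr_pages || pass > 3) {
			printf("prio %2d: scan the inactive list (accumulated %lu)\n",
			       prio, nr_scan);
			nr_scan = 0;
		}
	}
	return 0;
}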
1771 static unsigned long count_lru_pages(void) 1774 static unsigned long count_lru_pages(void)
1772 { 1775 {
1773 return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE); 1776 return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE);
1774 } 1777 }
1775 1778
1776 /* 1779 /*
1777 * Try to free `nr_pages' of memory, system-wide, and return the number of 1780 * Try to free `nr_pages' of memory, system-wide, and return the number of
1778 * freed pages. 1781 * freed pages.
1779 * 1782 *
1780 * Rather than trying to age LRUs the aim is to preserve the overall 1783 * Rather than trying to age LRUs the aim is to preserve the overall
1781 * LRU order by reclaiming preferentially 1784 * LRU order by reclaiming preferentially
1782 * inactive > active > active referenced > active mapped 1785 * inactive > active > active referenced > active mapped
1783 */ 1786 */
unsigned long shrink_all_memory(unsigned long nr_pages)
{
        unsigned long lru_pages, nr_slab;
        unsigned long ret = 0;
        int pass;
        struct reclaim_state reclaim_state;
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
                .may_swap = 0,
                .swap_cluster_max = nr_pages,
                .may_writepage = 1,
                .swappiness = vm_swappiness,
                .isolate_pages = isolate_pages_global,
        };

        current->reclaim_state = &reclaim_state;

        lru_pages = count_lru_pages();
        nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
        /* If slab caches are huge, it's better to hit them first */
        while (nr_slab >= lru_pages) {
                reclaim_state.reclaimed_slab = 0;
                shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
                if (!reclaim_state.reclaimed_slab)
                        break;

                ret += reclaim_state.reclaimed_slab;
                if (ret >= nr_pages)
                        goto out;

                nr_slab -= reclaim_state.reclaimed_slab;
        }

        /*
         * We try to shrink LRUs in 5 passes:
         * 0 = Reclaim from inactive_list only
         * 1 = Reclaim from active list but don't reclaim mapped
         * 2 = 2nd pass of type 1
         * 3 = Reclaim mapped (normal reclaim)
         * 4 = 2nd pass of type 3
         */
        for (pass = 0; pass < 5; pass++) {
                int prio;

                /* Force reclaiming mapped pages in the passes #3 and #4 */
                if (pass > 2) {
                        sc.may_swap = 1;
                        sc.swappiness = 100;
                }

                for (prio = DEF_PRIORITY; prio >= 0; prio--) {
                        unsigned long nr_to_scan = nr_pages - ret;

                        sc.nr_scanned = 0;
                        ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
                        if (ret >= nr_pages)
                                goto out;

                        reclaim_state.reclaimed_slab = 0;
                        shrink_slab(sc.nr_scanned, sc.gfp_mask,
                                        count_lru_pages());
                        ret += reclaim_state.reclaimed_slab;
                        if (ret >= nr_pages)
                                goto out;

                        if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
                                congestion_wait(WRITE, HZ / 10);
                }
        }

        /*
         * If ret = 0, we could not shrink LRUs, but there may be something
         * in slab caches
         */
        if (!ret) {
                do {
                        reclaim_state.reclaimed_slab = 0;
                        shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
                        ret += reclaim_state.reclaimed_slab;
                } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
        }

out:
        current->reclaim_state = NULL;

        return ret;
}
#endif
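As a rough model of the control flow above: five passes, each sweeping priorities from DEF_PRIORITY down to 0, bailing out as soon as the target is met. The sketch below is illustrative only; fake_reclaim() is a hypothetical stand-in for shrink_all_zones()/shrink_slab() and the numbers are invented:

#include <stdio.h>

#define NR_PASSES       5
#define DEF_PRIORITY    12      /* matches the kernel default of this era */

/* Hypothetical: pretend each sweep frees a bit more under higher pressure. */
static unsigned long fake_reclaim(int pass, int prio)
{
        return (unsigned long)(pass + 1) * (DEF_PRIORITY - prio + 1);
}

static unsigned long model_shrink_all_memory(unsigned long nr_pages)
{
        unsigned long ret = 0;
        int pass, prio;

        for (pass = 0; pass < NR_PASSES; pass++) {
                for (prio = DEF_PRIORITY; prio >= 0; prio--) {
                        ret += fake_reclaim(pass, prio);
                        if (ret >= nr_pages)
                                return ret;     /* target met, stop early */
                }
        }
        return ret;             /* best effort, possibly short of target */
}

int main(void)
{
        printf("freed %lu of 300 requested\n", model_shrink_all_memory(300));
        return 0;
}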
/* It's optimal to keep kswapds on the same CPUs as their memory, but
   not required for correctness. So if the last cpu in a node goes
   away, we get changed to run anywhere: as the first one comes back,
   restore their cpu bindings. */
static int __devinit cpu_callback(struct notifier_block *nfb,
                                  unsigned long action, void *hcpu)
{
        int nid;

        if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
                for_each_node_state(nid, N_HIGH_MEMORY) {
                        pg_data_t *pgdat = NODE_DATA(nid);
                        node_to_cpumask_ptr(mask, pgdat->node_id);

                        if (any_online_cpu(*mask) < nr_cpu_ids)
                                /* One of our CPUs online: restore mask */
                                set_cpus_allowed_ptr(pgdat->kswapd, mask);
                }
        }
        return NOTIFY_OK;
}
/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
 */
int kswapd_run(int nid)
{
        pg_data_t *pgdat = NODE_DATA(nid);
        int ret = 0;

        if (pgdat->kswapd)
                return 0;

        pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
        if (IS_ERR(pgdat->kswapd)) {
                /* failure at boot is fatal */
                BUG_ON(system_state == SYSTEM_BOOTING);
                printk("Failed to start kswapd on node %d\n", nid);
                ret = -1;
        }
        return ret;
}

static int __init kswapd_init(void)
{
        int nid;

        swap_setup();
        for_each_node_state(nid, N_HIGH_MEMORY)
                kswapd_run(nid);
        hotcpu_notifier(cpu_callback, 0);
        return 0;
}

module_init(kswapd_init)

#ifdef CONFIG_NUMA
/*
 * Zone reclaim mode
 *
 * If non-zero call zone_reclaim when the number of free pages falls below
 * the watermarks.
 */
int zone_reclaim_mode __read_mostly;

#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0)     /* Run shrink_cache on the zone */
#define RECLAIM_WRITE (1<<1)    /* Writeout pages during reclaim */
#define RECLAIM_SWAP (1<<2)     /* Swap pages out during reclaim */

/*
 * Priority for ZONE_RECLAIM. This determines the fraction of pages
 * of a node considered for each zone_reclaim. 4 scans 1/16th of
 * a zone.
 */
#define ZONE_RECLAIM_PRIORITY 4
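The 1/16th figure in the comment follows from the right shift used in the priority-based scan calculations: pages >> 4 == pages / 16. A tiny standalone check, using a hypothetical zone size:

#include <stdio.h>

int main(void)
{
        unsigned long zone_pages = 262144;      /* hypothetical 1GB zone of 4KB pages */
        int priority = 4;                       /* ZONE_RECLAIM_PRIORITY */

        /* pages >> priority is the slice considered per zone_reclaim pass */
        printf("%lu of %lu pages (1/%lu)\n",
               zone_pages >> priority, zone_pages, 1UL << priority);
        return 0;
}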
/*
 * Percentage of pages in a zone that must be unmapped for zone_reclaim to
 * occur.
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * If the number of slab pages in a zone grows beyond this percentage then
 * slab reclaim needs to occur.
 */
int sysctl_min_slab_ratio = 5;
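Assuming these ratios are applied as percentages of a zone's pages to derive the per-zone min_unmapped_pages/min_slab_pages thresholds used below (an assumption for illustration; the conversion happens elsewhere in the tree), the defaults work out roughly as:

#include <stdio.h>

int main(void)
{
        unsigned long zone_pages = 262144;      /* hypothetical zone size in pages */
        int min_unmapped_ratio = 1;             /* sysctl default */
        int min_slab_ratio = 5;                 /* sysctl default */

        /* Assumption: each threshold is ratio% of the zone's pages. */
        printf("min_unmapped_pages ~ %lu\n", zone_pages * min_unmapped_ratio / 100);
        printf("min_slab_pages     ~ %lu\n", zone_pages * min_slab_ratio / 100);
        return 0;
}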
/*
 * Try to free up some pages from this zone through reclaim.
 */
static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
        /* Minimum pages needed in order to stay on node */
        const unsigned long nr_pages = 1 << order;
        struct task_struct *p = current;
        struct reclaim_state reclaim_state;
        int priority;
        unsigned long nr_reclaimed = 0;
        struct scan_control sc = {
                .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
                .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
                .swap_cluster_max = max_t(unsigned long, nr_pages,
                                        SWAP_CLUSTER_MAX),
                .gfp_mask = gfp_mask,
                .swappiness = vm_swappiness,
                .isolate_pages = isolate_pages_global,
        };
        unsigned long slab_reclaimable;

        disable_swap_token();
        cond_resched();
        /*
         * We need to be able to allocate from the reserves for RECLAIM_SWAP
         * and we also need to be able to write out pages for RECLAIM_WRITE
         * and RECLAIM_SWAP.
         */
        p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;

        if (zone_page_state(zone, NR_FILE_PAGES) -
            zone_page_state(zone, NR_FILE_MAPPED) >
            zone->min_unmapped_pages) {
                /*
                 * Free memory by calling shrink zone with increasing
                 * priorities until we have enough memory freed.
                 */
                priority = ZONE_RECLAIM_PRIORITY;
                do {
                        note_zone_scanning_priority(zone, priority);
                        nr_reclaimed += shrink_zone(priority, zone, &sc);
                        priority--;
                } while (priority >= 0 && nr_reclaimed < nr_pages);
        }

        slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
        if (slab_reclaimable > zone->min_slab_pages) {
                /*
                 * shrink_slab() does not currently allow us to determine how
                 * many pages were freed in this zone. So we take the current
                 * number of slab pages and shake the slab until it is reduced
                 * by the same nr_pages that we used for reclaiming unmapped
                 * pages.
                 *
                 * Note that shrink_slab will free memory on all zones and may
                 * take a long time.
                 */
                while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
                        zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
                                slab_reclaimable - nr_pages)
                        ;

                /*
                 * Update nr_reclaimed by the number of slab pages we
                 * reclaimed from this zone.
                 */
                nr_reclaimed += slab_reclaimable -
                        zone_page_state(zone, NR_SLAB_RECLAIMABLE);
        }

        p->reclaim_state = NULL;
        current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
        return nr_reclaimed >= nr_pages;
}
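The slab loop above keeps shaking until the zone's reclaimable-slab counter has dropped by nr_pages or shrink_slab() reports no further progress, and the amount freed is then inferred from the before/after counter difference. A standalone model of that termination condition, with a fake shrinker and invented numbers:

#include <stdio.h>

static unsigned long slab_pages = 5000;         /* hypothetical NR_SLAB_RECLAIMABLE */

/* Fake shrinker: frees up to 100 pages per call, returns 0 when slab is empty. */
static unsigned long fake_shrink_slab(void)
{
        unsigned long freed = slab_pages < 100 ? slab_pages : 100;

        slab_pages -= freed;
        return freed;
}

int main(void)
{
        unsigned long nr_pages = 512;           /* 1 << order for the allocation */
        unsigned long before = slab_pages;

        /* Loop until the counter has dropped by nr_pages or progress stops. */
        while (fake_shrink_slab() && slab_pages > before - nr_pages)
                ;

        printf("slab pages reclaimed: %lu\n", before - slab_pages);
        return 0;
}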
int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
        int node_id;
        int ret;

        /*
         * Zone reclaim reclaims unmapped file backed pages and
         * slab pages if we are over the defined limits.
         *
         * A small portion of unmapped file backed pages is needed for
         * file I/O otherwise pages read by file I/O will be immediately
         * thrown out if the zone is overallocated. So we do not reclaim
         * if less than a specified percentage of the zone is used by
         * unmapped file backed pages.
         */
        if (zone_page_state(zone, NR_FILE_PAGES) -
            zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
            && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
                        <= zone->min_slab_pages)
                return 0;

        if (zone_is_all_unreclaimable(zone))
                return 0;

        /*
         * Do not scan if the allocation should not be delayed.
         */
        if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
                return 0;

        /*
         * Only run zone reclaim on the local zone or on zones that do not
         * have associated processors. This will favor the local processor
         * over remote processors and spread off node memory allocations
         * as wide as possible.
         */
        node_id = zone_to_nid(zone);
        if (node_state(node_id, N_CPU) && node_id != numa_node_id())
                return 0;

        if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
                return 0;
        ret = __zone_reclaim(zone, gfp_mask, order);
        zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);

        return ret;
}
#endif
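Taken together, zone_reclaim() only does real work when every gate above passes. A condensed, illustrative predicate summarizing those checks (hypothetical struct and field names; the per-zone ZONE_RECLAIM_LOCKED trylock is left out):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical snapshot of the state zone_reclaim() inspects. */
struct reclaim_gate {
        unsigned long unmapped_file_pages;      /* NR_FILE_PAGES - NR_FILE_MAPPED */
        unsigned long min_unmapped_pages;
        unsigned long reclaimable_slab;
        unsigned long min_slab_pages;
        bool zone_all_unreclaimable;
        bool gfp_wait_allowed;                  /* __GFP_WAIT set */
        bool caller_is_memalloc;                /* PF_MEMALLOC set */
        bool zone_is_local_or_cpuless;
};

static bool should_zone_reclaim(const struct reclaim_gate *g)
{
        if (g->unmapped_file_pages <= g->min_unmapped_pages &&
            g->reclaimable_slab <= g->min_slab_pages)
                return false;           /* under both limits: nothing to do */
        if (g->zone_all_unreclaimable)
                return false;
        if (!g->gfp_wait_allowed || g->caller_is_memalloc)
                return false;           /* allocation must not be delayed */
        return g->zone_is_local_or_cpuless;
}

int main(void)
{
        struct reclaim_gate g = {
                .unmapped_file_pages = 4096, .min_unmapped_pages = 2621,
                .reclaimable_slab = 1000, .min_slab_pages = 13107,
                .zone_all_unreclaimable = false,
                .gfp_wait_allowed = true, .caller_is_memalloc = false,
                .zone_is_local_or_cpuless = true,
        };

        printf("reclaim? %s\n", should_zone_reclaim(&g) ? "yes" : "no");
        return 0;
}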