Commit 7db8889ab05b57200158432755af318fb68854a2

Authored by Rik van Riel
Committed by Linus Torvalds
1 parent ab21588487

mm: have order > 0 compaction start off where it left

Order > 0 compaction stops when enough free pages of the correct page
order have been coalesced.  When doing subsequent higher order
allocations, it is possible for compaction to be invoked many times.

However, the compaction code always starts looking for pages to migrate at
the start of the zone, and for free pages to migrate them to at the end of
the zone.

This can cause quadratic behaviour, with isolate_freepages starting at the
end of the zone each time, even though previous invocations of the
compaction code already filled up all free memory on that end of the zone.

This can cause isolate_freepages to take enormous amounts of CPU with
certain workloads on larger memory systems.

The obvious solution is to have isolate_freepages remember where it left
off last time, and continue at that point the next time it gets invoked
for an order > 0 compaction.  This could cause compaction to fail if
cc->free_pfn and cc->migrate_pfn are close together initially; in that
case we restart from the end of the zone and try once more.

Forced full (order == -1) compactions are left alone.

[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: s/laste/last/, use 80 cols]
Signed-off-by: Rik van Riel <riel@redhat.com>
Reported-by: Jim Schutt <jaschut@sandia.gov>
Tested-by: Jim Schutt <jaschut@sandia.gov>
Cc: Minchan Kim <minchan.kim@gmail.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
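
The caching idea described above, resuming the free-page scanner from a remembered pfn instead of always restarting at the end of the zone, can be illustrated with a small standalone model. The following is a minimal sketch built on made-up toy_* types and helpers, not the actual mm/compaction.c code; only the compact_cached_free_pfn field corresponds to the zone member added by this patch.

#include <stdio.h>

struct toy_zone {
	unsigned long zone_start_pfn;
	unsigned long zone_end_pfn;
	/* pfn where the last incremental compaction isolated free pages */
	unsigned long compact_cached_free_pfn;
};

struct toy_compact_control {
	unsigned long free_pfn;		/* free page scanner, walks down the zone */
	unsigned long migrate_pfn;	/* migration scanner, walks up the zone */
	int order;			/* > 0 for normal compaction, -1 for forced full */
};

/*
 * Start the free scanner where the previous order > 0 run stopped,
 * instead of always restarting from the end of the zone.  Forced full
 * (order == -1) compactions ignore the cache, as the changelog says.
 */
static void toy_start_scanners(struct toy_zone *z, struct toy_compact_control *cc)
{
	cc->migrate_pfn = z->zone_start_pfn;
	if (cc->order > 0)
		cc->free_pfn = z->compact_cached_free_pfn;
	else
		cc->free_pfn = z->zone_end_pfn;
}

/* When the free scanner stops, remember where, for the next invocation. */
static void toy_finish_free_scan(struct toy_zone *z, struct toy_compact_control *cc,
				 unsigned long stopped_at)
{
	cc->free_pfn = stopped_at;
	if (cc->order > 0)
		z->compact_cached_free_pfn = stopped_at;
}

int main(void)
{
	struct toy_zone z = { 0, 1 << 18, 1 << 18 };	/* cache starts at zone end */
	struct toy_compact_control cc = { .order = 3 };

	toy_start_scanners(&z, &cc);
	printf("first run : free scanner starts at pfn %lu\n", cc.free_pfn);

	toy_finish_free_scan(&z, &cc, 200000);		/* pretend we stopped here */

	toy_start_scanners(&z, &cc);
	printf("second run: free scanner resumes at pfn %lu\n", cc.free_pfn);
	return 0;
}

Running the sketch shows the second invocation resuming at pfn 200000 rather than at the end of the zone, which is the behaviour change the patch aims for when order > 0.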

Showing 4 changed files with 73 additions and 5 deletions

include/linux/mmzone.h
@@ -368,6 +368,10 @@ struct zone {
 	 */
 	spinlock_t		lock;
 	int			all_unreclaimable; /* All pages pinned */
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+	/* pfn where the last incremental compaction isolated free pages */
+	unsigned long		compact_cached_free_pfn;
+#endif
 #ifdef CONFIG_MEMORY_HOTPLUG
 	/* see spanned/present_pages for more description */
 	seqlock_t		span_seqlock;
862 866
863 #else /* CONFIG_NEED_MULTIPLE_NODES */ 867 #else /* CONFIG_NEED_MULTIPLE_NODES */
864 868
865 #include <asm/mmzone.h> 869 #include <asm/mmzone.h>
866 870
867 #endif /* !CONFIG_NEED_MULTIPLE_NODES */ 871 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
868 872
869 extern struct pglist_data *first_online_pgdat(void); 873 extern struct pglist_data *first_online_pgdat(void);
870 extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); 874 extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
871 extern struct zone *next_zone(struct zone *zone); 875 extern struct zone *next_zone(struct zone *zone);
872 876
873 /** 877 /**
874 * for_each_online_pgdat - helper macro to iterate over all online nodes 878 * for_each_online_pgdat - helper macro to iterate over all online nodes
875 * @pgdat - pointer to a pg_data_t variable 879 * @pgdat - pointer to a pg_data_t variable
876 */ 880 */
877 #define for_each_online_pgdat(pgdat) \ 881 #define for_each_online_pgdat(pgdat) \
878 for (pgdat = first_online_pgdat(); \ 882 for (pgdat = first_online_pgdat(); \
879 pgdat; \ 883 pgdat; \
880 pgdat = next_online_pgdat(pgdat)) 884 pgdat = next_online_pgdat(pgdat))
881 /** 885 /**
882 * for_each_zone - helper macro to iterate over all memory zones 886 * for_each_zone - helper macro to iterate over all memory zones
883 * @zone - pointer to struct zone variable 887 * @zone - pointer to struct zone variable
884 * 888 *
885 * The user only needs to declare the zone variable, for_each_zone 889 * The user only needs to declare the zone variable, for_each_zone
886 * fills it in. 890 * fills it in.
887 */ 891 */
888 #define for_each_zone(zone) \ 892 #define for_each_zone(zone) \
889 for (zone = (first_online_pgdat())->node_zones; \ 893 for (zone = (first_online_pgdat())->node_zones; \
890 zone; \ 894 zone; \
891 zone = next_zone(zone)) 895 zone = next_zone(zone))
892 896
893 #define for_each_populated_zone(zone) \ 897 #define for_each_populated_zone(zone) \
894 for (zone = (first_online_pgdat())->node_zones; \ 898 for (zone = (first_online_pgdat())->node_zones; \
895 zone; \ 899 zone; \
896 zone = next_zone(zone)) \ 900 zone = next_zone(zone)) \
897 if (!populated_zone(zone)) \ 901 if (!populated_zone(zone)) \
898 ; /* do nothing */ \ 902 ; /* do nothing */ \
899 else 903 else
900 904
901 static inline struct zone *zonelist_zone(struct zoneref *zoneref) 905 static inline struct zone *zonelist_zone(struct zoneref *zoneref)
902 { 906 {
903 return zoneref->zone; 907 return zoneref->zone;
904 } 908 }
905 909
906 static inline int zonelist_zone_idx(struct zoneref *zoneref) 910 static inline int zonelist_zone_idx(struct zoneref *zoneref)
907 { 911 {
908 return zoneref->zone_idx; 912 return zoneref->zone_idx;
909 } 913 }
910 914
911 static inline int zonelist_node_idx(struct zoneref *zoneref) 915 static inline int zonelist_node_idx(struct zoneref *zoneref)
912 { 916 {
913 #ifdef CONFIG_NUMA 917 #ifdef CONFIG_NUMA
914 /* zone_to_nid not available in this context */ 918 /* zone_to_nid not available in this context */
915 return zoneref->zone->node; 919 return zoneref->zone->node;
916 #else 920 #else
917 return 0; 921 return 0;
918 #endif /* CONFIG_NUMA */ 922 #endif /* CONFIG_NUMA */
919 } 923 }
920 924
921 /** 925 /**
922 * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point 926 * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
923 * @z - The cursor used as a starting point for the search 927 * @z - The cursor used as a starting point for the search
924 * @highest_zoneidx - The zone index of the highest zone to return 928 * @highest_zoneidx - The zone index of the highest zone to return
925 * @nodes - An optional nodemask to filter the zonelist with 929 * @nodes - An optional nodemask to filter the zonelist with
926 * @zone - The first suitable zone found is returned via this parameter 930 * @zone - The first suitable zone found is returned via this parameter
927 * 931 *
928 * This function returns the next zone at or below a given zone index that is 932 * This function returns the next zone at or below a given zone index that is
929 * within the allowed nodemask using a cursor as the starting point for the 933 * within the allowed nodemask using a cursor as the starting point for the
930 * search. The zoneref returned is a cursor that represents the current zone 934 * search. The zoneref returned is a cursor that represents the current zone
931 * being examined. It should be advanced by one before calling 935 * being examined. It should be advanced by one before calling
932 * next_zones_zonelist again. 936 * next_zones_zonelist again.
933 */ 937 */
934 struct zoneref *next_zones_zonelist(struct zoneref *z, 938 struct zoneref *next_zones_zonelist(struct zoneref *z,
935 enum zone_type highest_zoneidx, 939 enum zone_type highest_zoneidx,
936 nodemask_t *nodes, 940 nodemask_t *nodes,
937 struct zone **zone); 941 struct zone **zone);
938 942
939 /** 943 /**
940 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist 944 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
941 * @zonelist - The zonelist to search for a suitable zone 945 * @zonelist - The zonelist to search for a suitable zone
942 * @highest_zoneidx - The zone index of the highest zone to return 946 * @highest_zoneidx - The zone index of the highest zone to return
943 * @nodes - An optional nodemask to filter the zonelist with 947 * @nodes - An optional nodemask to filter the zonelist with
944 * @zone - The first suitable zone found is returned via this parameter 948 * @zone - The first suitable zone found is returned via this parameter
945 * 949 *
946 * This function returns the first zone at or below a given zone index that is 950 * This function returns the first zone at or below a given zone index that is
947 * within the allowed nodemask. The zoneref returned is a cursor that can be 951 * within the allowed nodemask. The zoneref returned is a cursor that can be
948 * used to iterate the zonelist with next_zones_zonelist by advancing it by 952 * used to iterate the zonelist with next_zones_zonelist by advancing it by
949 * one before calling. 953 * one before calling.
950 */ 954 */
951 static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, 955 static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
952 enum zone_type highest_zoneidx, 956 enum zone_type highest_zoneidx,
953 nodemask_t *nodes, 957 nodemask_t *nodes,
954 struct zone **zone) 958 struct zone **zone)
955 { 959 {
956 return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes, 960 return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes,
957 zone); 961 zone);
958 } 962 }
959 963
960 /** 964 /**
961 * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask 965 * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
962 * @zone - The current zone in the iterator 966 * @zone - The current zone in the iterator
963 * @z - The current pointer within zonelist->zones being iterated 967 * @z - The current pointer within zonelist->zones being iterated
964 * @zlist - The zonelist being iterated 968 * @zlist - The zonelist being iterated
965 * @highidx - The zone index of the highest zone to return 969 * @highidx - The zone index of the highest zone to return
966 * @nodemask - Nodemask allowed by the allocator 970 * @nodemask - Nodemask allowed by the allocator
967 * 971 *
968 * This iterator iterates through all zones at or below a given zone index and 972
969 * within a given nodemask 973 * within a given nodemask
970 */ 974 */
971 #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ 975 #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
972 for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \ 976 for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \
973 zone; \ 977 zone; \
974 z = next_zones_zonelist(++z, highidx, nodemask, &zone)) \ 978 z = next_zones_zonelist(++z, highidx, nodemask, &zone)) \
975 979
976 /** 980 /**
977 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index 981 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
978 * @zone - The current zone in the iterator 982 * @zone - The current zone in the iterator
979 * @z - The current pointer within zonelist->zones being iterated 983 * @z - The current pointer within zonelist->zones being iterated
980 * @zlist - The zonelist being iterated 984 * @zlist - The zonelist being iterated
981 * @highidx - The zone index of the highest zone to return 985 * @highidx - The zone index of the highest zone to return
982 * 986 *
983 * This iterator iterates through all zones at or below a given zone index. 987 * This iterator iterates through all zones at or below a given zone index.
984 */ 988 */
985 #define for_each_zone_zonelist(zone, z, zlist, highidx) \ 989 #define for_each_zone_zonelist(zone, z, zlist, highidx) \
986 for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) 990 for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
987 991
988 #ifdef CONFIG_SPARSEMEM 992 #ifdef CONFIG_SPARSEMEM
989 #include <asm/sparsemem.h> 993 #include <asm/sparsemem.h>
990 #endif 994 #endif
991 995
992 #if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \ 996 #if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \
993 !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) 997 !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
994 static inline unsigned long early_pfn_to_nid(unsigned long pfn) 998 static inline unsigned long early_pfn_to_nid(unsigned long pfn)
995 { 999 {
996 return 0; 1000 return 0;
997 } 1001 }
998 #endif 1002 #endif
999 1003
1000 #ifdef CONFIG_FLATMEM 1004 #ifdef CONFIG_FLATMEM
1001 #define pfn_to_nid(pfn) (0) 1005 #define pfn_to_nid(pfn) (0)
1002 #endif 1006 #endif
1003 1007
1004 #ifdef CONFIG_SPARSEMEM 1008 #ifdef CONFIG_SPARSEMEM
1005 1009
1006 /* 1010 /*
1007 * SECTION_SHIFT #bits space required to store a section # 1011 * SECTION_SHIFT #bits space required to store a section #
1008 * 1012 *
1009 * PA_SECTION_SHIFT physical address to/from section number 1013 * PA_SECTION_SHIFT physical address to/from section number
1010 * PFN_SECTION_SHIFT pfn to/from section number 1014 * PFN_SECTION_SHIFT pfn to/from section number
1011 */ 1015 */
1012 #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) 1016 #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
1013 1017
1014 #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) 1018 #define PA_SECTION_SHIFT (SECTION_SIZE_BITS)
1015 #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) 1019 #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT)
1016 1020
1017 #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) 1021 #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT)
1018 1022
1019 #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) 1023 #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT)
1020 #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) 1024 #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1))
1021 1025
1022 #define SECTION_BLOCKFLAGS_BITS \ 1026 #define SECTION_BLOCKFLAGS_BITS \
1023 ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) 1027 ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
1024 1028
1025 #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS 1029 #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
1026 #error Allocator MAX_ORDER exceeds SECTION_SIZE 1030 #error Allocator MAX_ORDER exceeds SECTION_SIZE
1027 #endif 1031 #endif
1028 1032
1029 #define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) 1033 #define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT)
1030 #define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) 1034 #define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT)
1031 1035
1032 #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) 1036 #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
1033 #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) 1037 #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK)
1034 1038
1035 struct page; 1039 struct page;
1036 struct page_cgroup; 1040 struct page_cgroup;
1037 struct mem_section { 1041 struct mem_section {
1038 /* 1042 /*
1039 * This is, logically, a pointer to an array of struct 1043 * This is, logically, a pointer to an array of struct
1040 * pages. However, it is stored with some other magic. 1044 * pages. However, it is stored with some other magic.
1041 * (see sparse.c::sparse_init_one_section()) 1045 * (see sparse.c::sparse_init_one_section())
1042 * 1046 *
1043 * Additionally during early boot we encode node id of 1047 * Additionally during early boot we encode node id of
1044 * the location of the section here to guide allocation. 1048 * the location of the section here to guide allocation.
1045 * (see sparse.c::memory_present()) 1049 * (see sparse.c::memory_present())
1046 * 1050 *
1047 * Making it a UL at least makes someone do a cast 1051 * Making it a UL at least makes someone do a cast
1048 * before using it wrong. 1052 * before using it wrong.
1049 */ 1053 */
1050 unsigned long section_mem_map; 1054 unsigned long section_mem_map;
1051 1055
1052 /* See declaration of similar field in struct zone */ 1056 /* See declaration of similar field in struct zone */
1053 unsigned long *pageblock_flags; 1057 unsigned long *pageblock_flags;
1054 #ifdef CONFIG_MEMCG 1058 #ifdef CONFIG_MEMCG
1055 /* 1059 /*
1056 * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use 1060 * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
1057 * section. (see memcontrol.h/page_cgroup.h about this.) 1061 * section. (see memcontrol.h/page_cgroup.h about this.)
1058 */ 1062 */
1059 struct page_cgroup *page_cgroup; 1063 struct page_cgroup *page_cgroup;
1060 unsigned long pad; 1064 unsigned long pad;
1061 #endif 1065 #endif
1062 }; 1066 };
1063 1067
1064 #ifdef CONFIG_SPARSEMEM_EXTREME 1068 #ifdef CONFIG_SPARSEMEM_EXTREME
1065 #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) 1069 #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section))
1066 #else 1070 #else
1067 #define SECTIONS_PER_ROOT 1 1071 #define SECTIONS_PER_ROOT 1
1068 #endif 1072 #endif
1069 1073
1070 #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) 1074 #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT)
1071 #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT) 1075 #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
1072 #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) 1076 #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1)
1073 1077
1074 #ifdef CONFIG_SPARSEMEM_EXTREME 1078 #ifdef CONFIG_SPARSEMEM_EXTREME
1075 extern struct mem_section *mem_section[NR_SECTION_ROOTS]; 1079 extern struct mem_section *mem_section[NR_SECTION_ROOTS];
1076 #else 1080 #else
1077 extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; 1081 extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
1078 #endif 1082 #endif
1079 1083
1080 static inline struct mem_section *__nr_to_section(unsigned long nr) 1084 static inline struct mem_section *__nr_to_section(unsigned long nr)
1081 { 1085 {
1082 if (!mem_section[SECTION_NR_TO_ROOT(nr)]) 1086 if (!mem_section[SECTION_NR_TO_ROOT(nr)])
1083 return NULL; 1087 return NULL;
1084 return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; 1088 return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
1085 } 1089 }
1086 extern int __section_nr(struct mem_section* ms); 1090 extern int __section_nr(struct mem_section* ms);
1087 extern unsigned long usemap_size(void); 1091 extern unsigned long usemap_size(void);
1088 1092
1089 /* 1093 /*
1090 * We use the lower bits of the mem_map pointer to store 1094 * We use the lower bits of the mem_map pointer to store
1091 * a little bit of information. There should be at least 1095 * a little bit of information. There should be at least
1092 * 3 bits here due to 32-bit alignment. 1096 * 3 bits here due to 32-bit alignment.
1093 */ 1097 */
1094 #define SECTION_MARKED_PRESENT (1UL<<0) 1098 #define SECTION_MARKED_PRESENT (1UL<<0)
1095 #define SECTION_HAS_MEM_MAP (1UL<<1) 1099 #define SECTION_HAS_MEM_MAP (1UL<<1)
1096 #define SECTION_MAP_LAST_BIT (1UL<<2) 1100 #define SECTION_MAP_LAST_BIT (1UL<<2)
1097 #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) 1101 #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1))
1098 #define SECTION_NID_SHIFT 2 1102 #define SECTION_NID_SHIFT 2
1099 1103
1100 static inline struct page *__section_mem_map_addr(struct mem_section *section) 1104 static inline struct page *__section_mem_map_addr(struct mem_section *section)
1101 { 1105 {
1102 unsigned long map = section->section_mem_map; 1106 unsigned long map = section->section_mem_map;
1103 map &= SECTION_MAP_MASK; 1107 map &= SECTION_MAP_MASK;
1104 return (struct page *)map; 1108 return (struct page *)map;
1105 } 1109 }
1106 1110
1107 static inline int present_section(struct mem_section *section) 1111 static inline int present_section(struct mem_section *section)
1108 { 1112 {
1109 return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); 1113 return (section && (section->section_mem_map & SECTION_MARKED_PRESENT));
1110 } 1114 }
1111 1115
1112 static inline int present_section_nr(unsigned long nr) 1116 static inline int present_section_nr(unsigned long nr)
1113 { 1117 {
1114 return present_section(__nr_to_section(nr)); 1118 return present_section(__nr_to_section(nr));
1115 } 1119 }
1116 1120
1117 static inline int valid_section(struct mem_section *section) 1121 static inline int valid_section(struct mem_section *section)
1118 { 1122 {
1119 return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); 1123 return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP));
1120 } 1124 }
1121 1125
1122 static inline int valid_section_nr(unsigned long nr) 1126 static inline int valid_section_nr(unsigned long nr)
1123 { 1127 {
1124 return valid_section(__nr_to_section(nr)); 1128 return valid_section(__nr_to_section(nr));
1125 } 1129 }
1126 1130
1127 static inline struct mem_section *__pfn_to_section(unsigned long pfn) 1131 static inline struct mem_section *__pfn_to_section(unsigned long pfn)
1128 { 1132 {
1129 return __nr_to_section(pfn_to_section_nr(pfn)); 1133 return __nr_to_section(pfn_to_section_nr(pfn));
1130 } 1134 }
1131 1135
1132 #ifndef CONFIG_HAVE_ARCH_PFN_VALID 1136 #ifndef CONFIG_HAVE_ARCH_PFN_VALID
1133 static inline int pfn_valid(unsigned long pfn) 1137 static inline int pfn_valid(unsigned long pfn)
1134 { 1138 {
1135 if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) 1139 if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
1136 return 0; 1140 return 0;
1137 return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); 1141 return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
1138 } 1142 }
1139 #endif 1143 #endif
1140 1144
1141 static inline int pfn_present(unsigned long pfn) 1145 static inline int pfn_present(unsigned long pfn)
1142 { 1146 {
1143 if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) 1147 if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
1144 return 0; 1148 return 0;
1145 return present_section(__nr_to_section(pfn_to_section_nr(pfn))); 1149 return present_section(__nr_to_section(pfn_to_section_nr(pfn)));
1146 } 1150 }
1147 1151
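
A side note on how the SPARSEMEM helpers above fit together: the pfn-to-page translation is just a composition of them. The sketch below mirrors what the generic __pfn_to_page() macro does for classic (non-vmemmap) SPARSEMEM; the function name here is invented purely for illustration.

	/* Sketch only: pfn -> struct page under SPARSEMEM, built from the
	 * helpers declared above (essentially __pfn_to_page() from
	 * include/asm-generic/memory_model.h). */
	static inline struct page *sketch_pfn_to_page(unsigned long pfn)
	{
		struct mem_section *ms = __pfn_to_section(pfn);

		/* The mem_map pointer stored in section_mem_map is encoded so
		 * that indexing it by the full pfn lands on the right page
		 * (see sparse.c::sparse_init_one_section()). */
		return __section_mem_map_addr(ms) + pfn;
	}
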
1148 /* 1152 /*
1149 * These are _only_ used during initialisation, therefore they 1153 * These are _only_ used during initialisation, therefore they
1150 * can use __initdata ... They could have names to indicate 1154 * can use __initdata ... They could have names to indicate
1151 * this restriction. 1155 * this restriction.
1152 */ 1156 */
1153 #ifdef CONFIG_NUMA 1157 #ifdef CONFIG_NUMA
1154 #define pfn_to_nid(pfn) \ 1158 #define pfn_to_nid(pfn) \
1155 ({ \ 1159 ({ \
1156 unsigned long __pfn_to_nid_pfn = (pfn); \ 1160 unsigned long __pfn_to_nid_pfn = (pfn); \
1157 page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \ 1161 page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \
1158 }) 1162 })
1159 #else 1163 #else
1160 #define pfn_to_nid(pfn) (0) 1164 #define pfn_to_nid(pfn) (0)
1161 #endif 1165 #endif
1162 1166
1163 #define early_pfn_valid(pfn) pfn_valid(pfn) 1167 #define early_pfn_valid(pfn) pfn_valid(pfn)
1164 void sparse_init(void); 1168 void sparse_init(void);
1165 #else 1169 #else
1166 #define sparse_init() do {} while (0) 1170 #define sparse_init() do {} while (0)
1167 #define sparse_index_init(_sec, _nid) do {} while (0) 1171 #define sparse_index_init(_sec, _nid) do {} while (0)
1168 #endif /* CONFIG_SPARSEMEM */ 1172 #endif /* CONFIG_SPARSEMEM */
1169 1173
1170 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 1174 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
1171 bool early_pfn_in_nid(unsigned long pfn, int nid); 1175 bool early_pfn_in_nid(unsigned long pfn, int nid);
1172 #else 1176 #else
1173 #define early_pfn_in_nid(pfn, nid) (1) 1177 #define early_pfn_in_nid(pfn, nid) (1)
1174 #endif 1178 #endif
1175 1179
1176 #ifndef early_pfn_valid 1180 #ifndef early_pfn_valid
1177 #define early_pfn_valid(pfn) (1) 1181 #define early_pfn_valid(pfn) (1)
1178 #endif 1182 #endif
1179 1183
1180 void memory_present(int nid, unsigned long start, unsigned long end); 1184 void memory_present(int nid, unsigned long start, unsigned long end);
1181 unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); 1185 unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
1182 1186
1183 /* 1187 /*
1184 * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we 1188 * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
1185 * need to check pfn validity within that MAX_ORDER_NR_PAGES block. 1189 * need to check pfn validity within that MAX_ORDER_NR_PAGES block.
1186 * pfn_valid_within() should be used in this case; we optimise this away 1190 * pfn_valid_within() should be used in this case; we optimise this away
1187 * when we have no holes within a MAX_ORDER_NR_PAGES block. 1191 * when we have no holes within a MAX_ORDER_NR_PAGES block.
1188 */ 1192 */
1189 #ifdef CONFIG_HOLES_IN_ZONE 1193 #ifdef CONFIG_HOLES_IN_ZONE
1190 #define pfn_valid_within(pfn) pfn_valid(pfn) 1194 #define pfn_valid_within(pfn) pfn_valid(pfn)
1191 #else 1195 #else
1192 #define pfn_valid_within(pfn) (1) 1196 #define pfn_valid_within(pfn) (1)
1193 #endif 1197 #endif
1194 1198
1195 #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL 1199 #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
1196 /* 1200 /*
1197 * pfn_valid() is meant to be able to tell if a given PFN has valid memmap 1201 * pfn_valid() is meant to be able to tell if a given PFN has valid memmap
1198 * associated with it or not. In FLATMEM, it is expected that holes always 1202 * associated with it or not. In FLATMEM, it is expected that holes always
1199 * have valid memmap as long as there are valid PFNs on either side of the hole. 1203 * have valid memmap as long as there are valid PFNs on either side of the hole.
1200 * In SPARSEMEM, it is assumed that a valid section has a memmap for the 1204 * In SPARSEMEM, it is assumed that a valid section has a memmap for the
1201 * entire section. 1205 * entire section.
1202 * 1206 *
1203 * However, ARM, and maybe other embedded architectures in the future, 1207 * However, ARM, and maybe other embedded architectures in the future,
1204 * free memmap backing holes to save memory on the assumption the memmap is 1208 * free memmap backing holes to save memory on the assumption the memmap is
1205 * never used. The page_zone linkages are then broken even though pfn_valid() 1209 * never used. The page_zone linkages are then broken even though pfn_valid()
1206 * returns true. A walker of the full memmap must then do this additional 1210 * returns true. A walker of the full memmap must then do this additional
1207 * check to ensure the memmap they are looking at is sane by making sure 1211 * check to ensure the memmap they are looking at is sane by making sure
1208 * the zone and PFN linkages are still valid. This is expensive, but walkers 1212 * the zone and PFN linkages are still valid. This is expensive, but walkers
1209 * of the full memmap are extremely rare. 1213 * of the full memmap are extremely rare.
1210 */ 1214 */
1211 int memmap_valid_within(unsigned long pfn, 1215 int memmap_valid_within(unsigned long pfn,
1212 struct page *page, struct zone *zone); 1216 struct page *page, struct zone *zone);
1213 #else 1217 #else
1214 static inline int memmap_valid_within(unsigned long pfn, 1218 static inline int memmap_valid_within(unsigned long pfn,
1215 struct page *page, struct zone *zone) 1219 struct page *page, struct zone *zone)
1216 { 1220 {
1217 return 1; 1221 return 1;
1218 } 1222 }
1219 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 1223 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
1220 1224
1221 #endif /* !__GENERATING_BOUNDS.H */ 1225 #endif /* !__GENERATING_BOUNDS.H */
1222 #endif /* !__ASSEMBLY__ */ 1226 #endif /* !__ASSEMBLY__ */
1223 #endif /* _LINUX_MMZONE_H */ 1227 #endif /* _LINUX_MMZONE_H */
1224 1228
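For readers unfamiliar with the zonelist iterators declared in the header above, here is a minimal usage sketch. The allocation context is hypothetical; node_zonelist() and gfp_zone() are the usual gfp.h helpers, and the loop body is only a placeholder:

	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
	enum zone_type high_zoneidx = gfp_zone(GFP_KERNEL);
	struct zoneref *z;
	struct zone *zone;

	/* Walk every zone the allocation could use, highest zone first,
	 * optionally filtered by a nodemask (NULL means no filtering). */
	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, NULL) {
		if (!populated_zone(zone))
			continue;
		/* ... per-zone work ... */
	}
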
1 /* 1 /*
2 * linux/mm/compaction.c 2 * linux/mm/compaction.c
3 * 3 *
4 * Memory compaction for the reduction of external fragmentation. Note that 4 * Memory compaction for the reduction of external fragmentation. Note that
5 * this heavily depends upon page migration to do all the real heavy 5 * this heavily depends upon page migration to do all the real heavy
6 * lifting 6 * lifting
7 * 7 *
8 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie> 8 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
9 */ 9 */
10 #include <linux/swap.h> 10 #include <linux/swap.h>
11 #include <linux/migrate.h> 11 #include <linux/migrate.h>
12 #include <linux/compaction.h> 12 #include <linux/compaction.h>
13 #include <linux/mm_inline.h> 13 #include <linux/mm_inline.h>
14 #include <linux/backing-dev.h> 14 #include <linux/backing-dev.h>
15 #include <linux/sysctl.h> 15 #include <linux/sysctl.h>
16 #include <linux/sysfs.h> 16 #include <linux/sysfs.h>
17 #include "internal.h" 17 #include "internal.h"
18 18
19 #if defined CONFIG_COMPACTION || defined CONFIG_CMA 19 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
20 20
21 #define CREATE_TRACE_POINTS 21 #define CREATE_TRACE_POINTS
22 #include <trace/events/compaction.h> 22 #include <trace/events/compaction.h>
23 23
24 static unsigned long release_freepages(struct list_head *freelist) 24 static unsigned long release_freepages(struct list_head *freelist)
25 { 25 {
26 struct page *page, *next; 26 struct page *page, *next;
27 unsigned long count = 0; 27 unsigned long count = 0;
28 28
29 list_for_each_entry_safe(page, next, freelist, lru) { 29 list_for_each_entry_safe(page, next, freelist, lru) {
30 list_del(&page->lru); 30 list_del(&page->lru);
31 __free_page(page); 31 __free_page(page);
32 count++; 32 count++;
33 } 33 }
34 34
35 return count; 35 return count;
36 } 36 }
37 37
38 static void map_pages(struct list_head *list) 38 static void map_pages(struct list_head *list)
39 { 39 {
40 struct page *page; 40 struct page *page;
41 41
42 list_for_each_entry(page, list, lru) { 42 list_for_each_entry(page, list, lru) {
43 arch_alloc_page(page, 0); 43 arch_alloc_page(page, 0);
44 kernel_map_pages(page, 1, 1); 44 kernel_map_pages(page, 1, 1);
45 } 45 }
46 } 46 }
47 47
48 static inline bool migrate_async_suitable(int migratetype) 48 static inline bool migrate_async_suitable(int migratetype)
49 { 49 {
50 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; 50 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
51 } 51 }
52 52
53 /* 53 /*
54 * Isolate free pages onto a private freelist. Caller must hold zone->lock. 54 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
55 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free 55 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
56 * pages inside of the pageblock (even though it may still end up isolating 56 * pages inside of the pageblock (even though it may still end up isolating
57 * some pages). 57 * some pages).
58 */ 58 */
59 static unsigned long isolate_freepages_block(unsigned long blockpfn, 59 static unsigned long isolate_freepages_block(unsigned long blockpfn,
60 unsigned long end_pfn, 60 unsigned long end_pfn,
61 struct list_head *freelist, 61 struct list_head *freelist,
62 bool strict) 62 bool strict)
63 { 63 {
64 int nr_scanned = 0, total_isolated = 0; 64 int nr_scanned = 0, total_isolated = 0;
65 struct page *cursor; 65 struct page *cursor;
66 66
67 cursor = pfn_to_page(blockpfn); 67 cursor = pfn_to_page(blockpfn);
68 68
69 /* Isolate free pages. This assumes the block is valid */ 69 /* Isolate free pages. This assumes the block is valid */
70 for (; blockpfn < end_pfn; blockpfn++, cursor++) { 70 for (; blockpfn < end_pfn; blockpfn++, cursor++) {
71 int isolated, i; 71 int isolated, i;
72 struct page *page = cursor; 72 struct page *page = cursor;
73 73
74 if (!pfn_valid_within(blockpfn)) { 74 if (!pfn_valid_within(blockpfn)) {
75 if (strict) 75 if (strict)
76 return 0; 76 return 0;
77 continue; 77 continue;
78 } 78 }
79 nr_scanned++; 79 nr_scanned++;
80 80
81 if (!PageBuddy(page)) { 81 if (!PageBuddy(page)) {
82 if (strict) 82 if (strict)
83 return 0; 83 return 0;
84 continue; 84 continue;
85 } 85 }
86 86
87 /* Found a free page, break it into order-0 pages */ 87 /* Found a free page, break it into order-0 pages */
88 isolated = split_free_page(page); 88 isolated = split_free_page(page);
89 if (!isolated && strict) 89 if (!isolated && strict)
90 return 0; 90 return 0;
91 total_isolated += isolated; 91 total_isolated += isolated;
92 for (i = 0; i < isolated; i++) { 92 for (i = 0; i < isolated; i++) {
93 list_add(&page->lru, freelist); 93 list_add(&page->lru, freelist);
94 page++; 94 page++;
95 } 95 }
96 96
97 /* If a page was split, advance to the end of it */ 97 /* If a page was split, advance to the end of it */
98 if (isolated) { 98 if (isolated) {
99 blockpfn += isolated - 1; 99 blockpfn += isolated - 1;
100 cursor += isolated - 1; 100 cursor += isolated - 1;
101 } 101 }
102 } 102 }
103 103
104 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); 104 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
105 return total_isolated; 105 return total_isolated;
106 } 106 }
107 107
108 /** 108 /**
109 * isolate_freepages_range() - isolate free pages. 109 * isolate_freepages_range() - isolate free pages.
110 * @start_pfn: The first PFN to start isolating. 110 * @start_pfn: The first PFN to start isolating.
111 * @end_pfn: The one-past-last PFN. 111 * @end_pfn: The one-past-last PFN.
112 * 112 *
113 * Non-free pages, invalid PFNs, or zone boundaries within the 113 * Non-free pages, invalid PFNs, or zone boundaries within the
114 * [start_pfn, end_pfn) range are considered errors, cause function to 114 * [start_pfn, end_pfn) range are considered errors, cause function to
115 * undo its actions and return zero. 115 * undo its actions and return zero.
116 * 116 *
117 * Otherwise, function returns one-past-the-last PFN of isolated page 117 * Otherwise, function returns one-past-the-last PFN of isolated page
118 * (which may be greater than end_pfn if end fell in the middle of 118 * (which may be greater than end_pfn if end fell in the middle of
119 * a free page). 119 * a free page).
120 */ 120 */
121 unsigned long 121 unsigned long
122 isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) 122 isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
123 { 123 {
124 unsigned long isolated, pfn, block_end_pfn, flags; 124 unsigned long isolated, pfn, block_end_pfn, flags;
125 struct zone *zone = NULL; 125 struct zone *zone = NULL;
126 LIST_HEAD(freelist); 126 LIST_HEAD(freelist);
127 127
128 if (pfn_valid(start_pfn)) 128 if (pfn_valid(start_pfn))
129 zone = page_zone(pfn_to_page(start_pfn)); 129 zone = page_zone(pfn_to_page(start_pfn));
130 130
131 for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { 131 for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
132 if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) 132 if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn)))
133 break; 133 break;
134 134
135 /* 135 /*
136 * On subsequent iterations ALIGN() is actually not needed, 136 * On subsequent iterations ALIGN() is actually not needed,
137 * but we keep it so as not to complicate the code. 137 * but we keep it so as not to complicate the code.
138 */ 138 */
139 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); 139 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
140 block_end_pfn = min(block_end_pfn, end_pfn); 140 block_end_pfn = min(block_end_pfn, end_pfn);
141 141
142 spin_lock_irqsave(&zone->lock, flags); 142 spin_lock_irqsave(&zone->lock, flags);
143 isolated = isolate_freepages_block(pfn, block_end_pfn, 143 isolated = isolate_freepages_block(pfn, block_end_pfn,
144 &freelist, true); 144 &freelist, true);
145 spin_unlock_irqrestore(&zone->lock, flags); 145 spin_unlock_irqrestore(&zone->lock, flags);
146 146
147 /* 147 /*
148 * In strict mode, isolate_freepages_block() returns 0 if 148 * In strict mode, isolate_freepages_block() returns 0 if
149 * there are any holes in the block (ie. invalid PFNs or 149 * there are any holes in the block (ie. invalid PFNs or
150 * non-free pages). 150 * non-free pages).
151 */ 151 */
152 if (!isolated) 152 if (!isolated)
153 break; 153 break;
154 154
155 /* 155 /*
156 * If we managed to isolate pages, it is always (1 << n) * 156 * If we managed to isolate pages, it is always (1 << n) *
157 * pageblock_nr_pages for some non-negative n. (Max order 157 * pageblock_nr_pages for some non-negative n. (Max order
158 * page may span two pageblocks). 158 * page may span two pageblocks).
159 */ 159 */
160 } 160 }
161 161
162 /* split_free_page does not map the pages */ 162 /* split_free_page does not map the pages */
163 map_pages(&freelist); 163 map_pages(&freelist);
164 164
165 if (pfn < end_pfn) { 165 if (pfn < end_pfn) {
166 /* Loop terminated early, cleanup. */ 166 /* Loop terminated early, cleanup. */
167 release_freepages(&freelist); 167 release_freepages(&freelist);
168 return 0; 168 return 0;
169 } 169 }
170 170
171 /* We don't use freelists for anything. */ 171 /* We don't use freelists for anything. */
172 return pfn; 172 return pfn;
173 } 173 }
174 174
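To make the contract in the isolate_freepages_range() kerneldoc concrete, a hedged sketch of a caller follows; the range variables and the error handling are illustrative assumptions, not taken from this patch:

	unsigned long outer_end;

	/* Try to pull every free page in [start, end) onto a private list.
	 * A return of 0 means the range contained an invalid pfn or an
	 * in-use page, and anything already isolated was released again. */
	outer_end = isolate_freepages_range(start, end);
	if (!outer_end)
		return -EBUSY;

	/* Success: outer_end is one past the last isolated pfn and may lie
	 * beyond 'end' if the final buddy page straddled the boundary. */
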
175 /* Update the number of anon and file isolated pages in the zone */ 175 /* Update the number of anon and file isolated pages in the zone */
176 static void acct_isolated(struct zone *zone, struct compact_control *cc) 176 static void acct_isolated(struct zone *zone, struct compact_control *cc)
177 { 177 {
178 struct page *page; 178 struct page *page;
179 unsigned int count[2] = { 0, }; 179 unsigned int count[2] = { 0, };
180 180
181 list_for_each_entry(page, &cc->migratepages, lru) 181 list_for_each_entry(page, &cc->migratepages, lru)
182 count[!!page_is_file_cache(page)]++; 182 count[!!page_is_file_cache(page)]++;
183 183
184 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); 184 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
185 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); 185 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
186 } 186 }
187 187
188 /* Similar to reclaim, but different enough that they don't share logic */ 188 /* Similar to reclaim, but different enough that they don't share logic */
189 static bool too_many_isolated(struct zone *zone) 189 static bool too_many_isolated(struct zone *zone)
190 { 190 {
191 unsigned long active, inactive, isolated; 191 unsigned long active, inactive, isolated;
192 192
193 inactive = zone_page_state(zone, NR_INACTIVE_FILE) + 193 inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
194 zone_page_state(zone, NR_INACTIVE_ANON); 194 zone_page_state(zone, NR_INACTIVE_ANON);
195 active = zone_page_state(zone, NR_ACTIVE_FILE) + 195 active = zone_page_state(zone, NR_ACTIVE_FILE) +
196 zone_page_state(zone, NR_ACTIVE_ANON); 196 zone_page_state(zone, NR_ACTIVE_ANON);
197 isolated = zone_page_state(zone, NR_ISOLATED_FILE) + 197 isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
198 zone_page_state(zone, NR_ISOLATED_ANON); 198 zone_page_state(zone, NR_ISOLATED_ANON);
199 199
200 return isolated > (inactive + active) / 2; 200 return isolated > (inactive + active) / 2;
201 } 201 }
202 202
203 /** 203 /**
204 * isolate_migratepages_range() - isolate all migrate-able pages in range. 204 * isolate_migratepages_range() - isolate all migrate-able pages in range.
205 * @zone: Zone pages are in. 205 * @zone: Zone pages are in.
206 * @cc: Compaction control structure. 206 * @cc: Compaction control structure.
207 * @low_pfn: The first PFN of the range. 207 * @low_pfn: The first PFN of the range.
208 * @end_pfn: The one-past-the-last PFN of the range. 208 * @end_pfn: The one-past-the-last PFN of the range.
209 * 209 *
210 * Isolate all pages that can be migrated from the range specified by 210 * Isolate all pages that can be migrated from the range specified by
211 * [low_pfn, end_pfn). Returns zero if there is a fatal signal 211 * [low_pfn, end_pfn). Returns zero if there is a fatal signal
212 * pending, otherwise PFN of the first page that was not scanned 212 * pending, otherwise PFN of the first page that was not scanned
213 * (which may be less than, equal to or greater than end_pfn). 213 * (which may be less than, equal to or greater than end_pfn).
214 * 214 *
215 * Assumes that cc->migratepages is empty and cc->nr_migratepages is 215 * Assumes that cc->migratepages is empty and cc->nr_migratepages is
216 * zero. 216 * zero.
217 * 217 *
218 * Apart from cc->migratepages and cc->nr_migratepages this function 218 * Apart from cc->migratepages and cc->nr_migratepages this function
219 * does not modify any cc's fields, in particular it does not modify 219 * does not modify any cc's fields, in particular it does not modify
220 * (or read for that matter) cc->migrate_pfn. 220 * (or read for that matter) cc->migrate_pfn.
221 */ 221 */
222 unsigned long 222 unsigned long
223 isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 223 isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
224 unsigned long low_pfn, unsigned long end_pfn) 224 unsigned long low_pfn, unsigned long end_pfn)
225 { 225 {
226 unsigned long last_pageblock_nr = 0, pageblock_nr; 226 unsigned long last_pageblock_nr = 0, pageblock_nr;
227 unsigned long nr_scanned = 0, nr_isolated = 0; 227 unsigned long nr_scanned = 0, nr_isolated = 0;
228 struct list_head *migratelist = &cc->migratepages; 228 struct list_head *migratelist = &cc->migratepages;
229 isolate_mode_t mode = 0; 229 isolate_mode_t mode = 0;
230 struct lruvec *lruvec; 230 struct lruvec *lruvec;
231 231
232 /* 232 /*
233 * Ensure that there are not too many pages isolated from the LRU 233 * Ensure that there are not too many pages isolated from the LRU
234 * list by either parallel reclaimers or compaction. If there are, 234 * list by either parallel reclaimers or compaction. If there are,
235 * delay for some time until fewer pages are isolated 235 * delay for some time until fewer pages are isolated
236 */ 236 */
237 while (unlikely(too_many_isolated(zone))) { 237 while (unlikely(too_many_isolated(zone))) {
238 /* async migration should just abort */ 238 /* async migration should just abort */
239 if (!cc->sync) 239 if (!cc->sync)
240 return 0; 240 return 0;
241 241
242 congestion_wait(BLK_RW_ASYNC, HZ/10); 242 congestion_wait(BLK_RW_ASYNC, HZ/10);
243 243
244 if (fatal_signal_pending(current)) 244 if (fatal_signal_pending(current))
245 return 0; 245 return 0;
246 } 246 }
247 247
248 /* Time to isolate some pages for migration */ 248 /* Time to isolate some pages for migration */
249 cond_resched(); 249 cond_resched();
250 spin_lock_irq(&zone->lru_lock); 250 spin_lock_irq(&zone->lru_lock);
251 for (; low_pfn < end_pfn; low_pfn++) { 251 for (; low_pfn < end_pfn; low_pfn++) {
252 struct page *page; 252 struct page *page;
253 bool locked = true; 253 bool locked = true;
254 254
255 /* give a chance to irqs before checking need_resched() */ 255 /* give a chance to irqs before checking need_resched() */
256 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { 256 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
257 spin_unlock_irq(&zone->lru_lock); 257 spin_unlock_irq(&zone->lru_lock);
258 locked = false; 258 locked = false;
259 } 259 }
260 if (need_resched() || spin_is_contended(&zone->lru_lock)) { 260 if (need_resched() || spin_is_contended(&zone->lru_lock)) {
261 if (locked) 261 if (locked)
262 spin_unlock_irq(&zone->lru_lock); 262 spin_unlock_irq(&zone->lru_lock);
263 cond_resched(); 263 cond_resched();
264 spin_lock_irq(&zone->lru_lock); 264 spin_lock_irq(&zone->lru_lock);
265 if (fatal_signal_pending(current)) 265 if (fatal_signal_pending(current))
266 break; 266 break;
267 } else if (!locked) 267 } else if (!locked)
268 spin_lock_irq(&zone->lru_lock); 268 spin_lock_irq(&zone->lru_lock);
269 269
270 /* 270 /*
271 * migrate_pfn does not necessarily start aligned to a 271 * migrate_pfn does not necessarily start aligned to a
272 * pageblock. Ensure that pfn_valid is called when moving 272 * pageblock. Ensure that pfn_valid is called when moving
273 * into a new MAX_ORDER_NR_PAGES range in case of large 273 * into a new MAX_ORDER_NR_PAGES range in case of large
274 * memory holes within the zone 274 * memory holes within the zone
275 */ 275 */
276 if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { 276 if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
277 if (!pfn_valid(low_pfn)) { 277 if (!pfn_valid(low_pfn)) {
278 low_pfn += MAX_ORDER_NR_PAGES - 1; 278 low_pfn += MAX_ORDER_NR_PAGES - 1;
279 continue; 279 continue;
280 } 280 }
281 } 281 }
282 282
283 if (!pfn_valid_within(low_pfn)) 283 if (!pfn_valid_within(low_pfn))
284 continue; 284 continue;
285 nr_scanned++; 285 nr_scanned++;
286 286
287 /* 287 /*
288 * Get the page and ensure the page is within the same zone. 288 * Get the page and ensure the page is within the same zone.
289 * See the comment in isolate_freepages about overlapping 289 * See the comment in isolate_freepages about overlapping
290 * nodes. It is deliberate that the new zone lock is not taken 290 * nodes. It is deliberate that the new zone lock is not taken
291 * as memory compaction should not move pages between nodes. 291 * as memory compaction should not move pages between nodes.
292 */ 292 */
293 page = pfn_to_page(low_pfn); 293 page = pfn_to_page(low_pfn);
294 if (page_zone(page) != zone) 294 if (page_zone(page) != zone)
295 continue; 295 continue;
296 296
297 /* Skip if free */ 297 /* Skip if free */
298 if (PageBuddy(page)) 298 if (PageBuddy(page))
299 continue; 299 continue;
300 300
301 /* 301 /*
302 * For async migration, also only scan in MOVABLE blocks. Async 302 * For async migration, also only scan in MOVABLE blocks. Async
303 * migration is optimistic to see if the minimum amount of work 303 * migration is optimistic to see if the minimum amount of work
304 * satisfies the allocation 304 * satisfies the allocation
305 */ 305 */
306 pageblock_nr = low_pfn >> pageblock_order; 306 pageblock_nr = low_pfn >> pageblock_order;
307 if (!cc->sync && last_pageblock_nr != pageblock_nr && 307 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
308 !migrate_async_suitable(get_pageblock_migratetype(page))) { 308 !migrate_async_suitable(get_pageblock_migratetype(page))) {
309 low_pfn += pageblock_nr_pages; 309 low_pfn += pageblock_nr_pages;
310 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; 310 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
311 last_pageblock_nr = pageblock_nr; 311 last_pageblock_nr = pageblock_nr;
312 continue; 312 continue;
313 } 313 }
314 314
315 if (!PageLRU(page)) 315 if (!PageLRU(page))
316 continue; 316 continue;
317 317
318 /* 318 /*
319 * PageLRU is set, and lru_lock excludes isolation, 319 * PageLRU is set, and lru_lock excludes isolation,
320 * splitting and collapsing (collapsing has already 320 * splitting and collapsing (collapsing has already
321 * happened if PageLRU is set). 321 * happened if PageLRU is set).
322 */ 322 */
323 if (PageTransHuge(page)) { 323 if (PageTransHuge(page)) {
324 low_pfn += (1 << compound_order(page)) - 1; 324 low_pfn += (1 << compound_order(page)) - 1;
325 continue; 325 continue;
326 } 326 }
327 327
328 if (!cc->sync) 328 if (!cc->sync)
329 mode |= ISOLATE_ASYNC_MIGRATE; 329 mode |= ISOLATE_ASYNC_MIGRATE;
330 330
331 lruvec = mem_cgroup_page_lruvec(page, zone); 331 lruvec = mem_cgroup_page_lruvec(page, zone);
332 332
333 /* Try isolate the page */ 333 /* Try isolate the page */
334 if (__isolate_lru_page(page, mode) != 0) 334 if (__isolate_lru_page(page, mode) != 0)
335 continue; 335 continue;
336 336
337 VM_BUG_ON(PageTransCompound(page)); 337 VM_BUG_ON(PageTransCompound(page));
338 338
339 /* Successfully isolated */ 339 /* Successfully isolated */
340 del_page_from_lru_list(page, lruvec, page_lru(page)); 340 del_page_from_lru_list(page, lruvec, page_lru(page));
341 list_add(&page->lru, migratelist); 341 list_add(&page->lru, migratelist);
342 cc->nr_migratepages++; 342 cc->nr_migratepages++;
343 nr_isolated++; 343 nr_isolated++;
344 344
345 /* Avoid isolating too much */ 345 /* Avoid isolating too much */
346 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { 346 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
347 ++low_pfn; 347 ++low_pfn;
348 break; 348 break;
349 } 349 }
350 } 350 }
351 351
352 acct_isolated(zone, cc); 352 acct_isolated(zone, cc);
353 353
354 spin_unlock_irq(&zone->lru_lock); 354 spin_unlock_irq(&zone->lru_lock);
355 355
356 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 356 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
357 357
358 return low_pfn; 358 return low_pfn;
359 } 359 }
360 360
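A similarly hedged sketch of how isolate_migratepages_range() is meant to be called, relying only on the pre- and post-conditions spelled out in its kerneldoc; the surrounding compact_control setup is abbreviated and the return value of the caller is illustrative:

	unsigned long next_pfn;

	/* Kerneldoc precondition: nothing already queued for migration. */
	VM_BUG_ON(!list_empty(&cc->migratepages));
	cc->nr_migratepages = 0;

	next_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
	if (!next_pfn)
		return 0;	/* fatal signal pending, abort */

	/* Pages to migrate are now on cc->migratepages; next_pfn records
	 * where scanning stopped (it may be below or above end_pfn). */
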
361 #endif /* CONFIG_COMPACTION || CONFIG_CMA */ 361 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
362 #ifdef CONFIG_COMPACTION 362 #ifdef CONFIG_COMPACTION
363 363
364 /* Returns true if the page is within a block suitable for migration to */ 364 /* Returns true if the page is within a block suitable for migration to */
365 static bool suitable_migration_target(struct page *page) 365 static bool suitable_migration_target(struct page *page)
366 { 366 {
367 367
368 int migratetype = get_pageblock_migratetype(page); 368 int migratetype = get_pageblock_migratetype(page);
369 369
370 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ 370 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
371 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) 371 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
372 return false; 372 return false;
373 373
374 /* If the page is a large free page, then allow migration */ 374 /* If the page is a large free page, then allow migration */
375 if (PageBuddy(page) && page_order(page) >= pageblock_order) 375 if (PageBuddy(page) && page_order(page) >= pageblock_order)
376 return true; 376 return true;
377 377
378 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ 378 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
379 if (migrate_async_suitable(migratetype)) 379 if (migrate_async_suitable(migratetype))
380 return true; 380 return true;
381 381
382 /* Otherwise skip the block */ 382 /* Otherwise skip the block */
383 return false; 383 return false;
384 } 384 }
385 385
386 /* 386 /*
387 * Based on information in the current compact_control, find blocks 387 * Based on information in the current compact_control, find blocks
388 * suitable for isolating free pages from and then isolate them. 388 * suitable for isolating free pages from and then isolate them.
389 */ 389 */
390 static void isolate_freepages(struct zone *zone, 390 static void isolate_freepages(struct zone *zone,
391 struct compact_control *cc) 391 struct compact_control *cc)
392 { 392 {
393 struct page *page; 393 struct page *page;
394 unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; 394 unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
395 unsigned long flags; 395 unsigned long flags;
396 int nr_freepages = cc->nr_freepages; 396 int nr_freepages = cc->nr_freepages;
397 struct list_head *freelist = &cc->freepages; 397 struct list_head *freelist = &cc->freepages;
398 398
399 /* 399 /*
400 * Initialise the free scanner. The starting point is where we last 400 * Initialise the free scanner. The starting point is where we last
401 * scanned from (or the end of the zone if starting). The low point 401 * scanned from (or the end of the zone if starting). The low point
402 * is the end of the pageblock the migration scanner is using. 402 * is the end of the pageblock the migration scanner is using.
403 */ 403 */
404 pfn = cc->free_pfn; 404 pfn = cc->free_pfn;
405 low_pfn = cc->migrate_pfn + pageblock_nr_pages; 405 low_pfn = cc->migrate_pfn + pageblock_nr_pages;
406 406
407 /* 407 /*
408 * Take care that if the migration scanner is at the end of the zone 408 * Take care that if the migration scanner is at the end of the zone
409 * that the free scanner does not accidentally move to the next zone 409 * that the free scanner does not accidentally move to the next zone
410 * in the next isolation cycle. 410 * in the next isolation cycle.
411 */ 411 */
412 high_pfn = min(low_pfn, pfn); 412 high_pfn = min(low_pfn, pfn);
413 413
414 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 414 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
415 415
416 /* 416 /*
417 * Isolate free pages until enough are available to migrate the 417 * Isolate free pages until enough are available to migrate the
418 * pages on cc->migratepages. We stop searching if the migrate 418 * pages on cc->migratepages. We stop searching if the migrate
419 * and free page scanners meet or enough free pages are isolated. 419 * and free page scanners meet or enough free pages are isolated.
420 */ 420 */
421 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; 421 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
422 pfn -= pageblock_nr_pages) { 422 pfn -= pageblock_nr_pages) {
423 unsigned long isolated; 423 unsigned long isolated;
424 424
425 /*
426 * Skip ahead if another thread is compacting in the area
427 * simultaneously. If we wrapped around, we can only skip
428 * ahead if zone->compact_cached_free_pfn also wrapped to
429 * above our starting point.
430 */
431 if (cc->order > 0 && (!cc->wrapped ||
432 zone->compact_cached_free_pfn >
433 cc->start_free_pfn))
434 pfn = min(pfn, zone->compact_cached_free_pfn);
435
425 if (!pfn_valid(pfn)) 436 if (!pfn_valid(pfn))
426 continue; 437 continue;
427 438
428 /* 439 /*
429 * Check for overlapping nodes/zones. It's possible on some 440 * Check for overlapping nodes/zones. It's possible on some
430 * configurations to have a setup like 441 * configurations to have a setup like
431 * node0 node1 node0 442 * node0 node1 node0
432 * i.e. it's possible that all pages within a zones range of 443 * i.e. it's possible that all pages within a zones range of
433 * pages do not belong to a single zone. 444 * pages do not belong to a single zone.
434 */ 445 */
435 page = pfn_to_page(pfn); 446 page = pfn_to_page(pfn);
436 if (page_zone(page) != zone) 447 if (page_zone(page) != zone)
437 continue; 448 continue;
438 449
439 /* Check the block is suitable for migration */ 450 /* Check the block is suitable for migration */
440 if (!suitable_migration_target(page)) 451 if (!suitable_migration_target(page))
441 continue; 452 continue;
442 453
443 /* 454 /*
444 * Found a block suitable for isolating free pages from. Now 455 * Found a block suitable for isolating free pages from. Now
445 * we disabled interrupts, double check things are ok and 456 * we disabled interrupts, double check things are ok and
446 * isolate the pages. This is to minimise the time IRQs 457 * isolate the pages. This is to minimise the time IRQs
447 * are disabled 458 * are disabled
448 */ 459 */
449 isolated = 0; 460 isolated = 0;
450 spin_lock_irqsave(&zone->lock, flags); 461 spin_lock_irqsave(&zone->lock, flags);
451 if (suitable_migration_target(page)) { 462 if (suitable_migration_target(page)) {
452 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); 463 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
453 isolated = isolate_freepages_block(pfn, end_pfn, 464 isolated = isolate_freepages_block(pfn, end_pfn,
454 freelist, false); 465 freelist, false);
455 nr_freepages += isolated; 466 nr_freepages += isolated;
456 } 467 }
457 spin_unlock_irqrestore(&zone->lock, flags); 468 spin_unlock_irqrestore(&zone->lock, flags);
458 469
459 /* 470 /*
460 * Record the highest PFN we isolated pages from. When next 471 * Record the highest PFN we isolated pages from. When next
461 * looking for free pages, the search will restart here as 472 * looking for free pages, the search will restart here as
462 * page migration may have returned some pages to the allocator 473 * page migration may have returned some pages to the allocator
463 */ 474 */
464 if (isolated) 475 if (isolated) {
465 high_pfn = max(high_pfn, pfn); 476 high_pfn = max(high_pfn, pfn);
477 if (cc->order > 0)
478 zone->compact_cached_free_pfn = high_pfn;
479 }
466 } 480 }
467 481
468 /* split_free_page does not map the pages */ 482 /* split_free_page does not map the pages */
469 map_pages(freelist); 483 map_pages(freelist);
470 484
471 cc->free_pfn = high_pfn; 485 cc->free_pfn = high_pfn;
472 cc->nr_freepages = nr_freepages; 486 cc->nr_freepages = nr_freepages;
473 } 487 }
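The hunk above is the heart of the change in isolate_freepages(): an order > 0 run first jumps down to zone->compact_cached_free_pfn (where a previous run stopped) unless it has wrapped and the cache still points above its own starting pfn, and it pushes the cache down each time it isolates pages. A minimal userspace sketch of that skip-ahead rule, with illustrative pfn values (all names and numbers here exist only for the example, not in the kernel):

/* Plain C sketch of the skip-ahead rule; not kernel code. */
#include <stdio.h>

static unsigned long next_scan_pfn(unsigned long pfn,
                                   unsigned long cached_free_pfn,
                                   unsigned long start_free_pfn,
                                   int order, int wrapped)
{
        /*
         * Order > 0: jump to the cached position unless we wrapped and the
         * cache still points above where this run started.
         */
        if (order > 0 && (!wrapped || cached_free_pfn > start_free_pfn))
                return pfn < cached_free_pfn ? pfn : cached_free_pfn;
        return pfn;
}

int main(void)
{
        /* A previous run already consumed everything above pfn 0x9000. */
        printf("0x%lx\n", next_scan_pfn(0xa000, 0x9000, 0xa000, 3, 0)); /* 0x9000 */
        /* After wrapping, a cache below our start must not drag us down. */
        printf("0x%lx\n", next_scan_pfn(0xa000, 0x9000, 0xa000, 3, 1)); /* 0xa000 */
        return 0;
}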
474 488
475 /* 489 /*
476 * This is a migrate-callback that "allocates" freepages by taking pages 490 * This is a migrate-callback that "allocates" freepages by taking pages
477 * from the isolated freelists in the block we are migrating to. 491 * from the isolated freelists in the block we are migrating to.
478 */ 492 */
479 static struct page *compaction_alloc(struct page *migratepage, 493 static struct page *compaction_alloc(struct page *migratepage,
480 unsigned long data, 494 unsigned long data,
481 int **result) 495 int **result)
482 { 496 {
483 struct compact_control *cc = (struct compact_control *)data; 497 struct compact_control *cc = (struct compact_control *)data;
484 struct page *freepage; 498 struct page *freepage;
485 499
486 /* Isolate free pages if necessary */ 500 /* Isolate free pages if necessary */
487 if (list_empty(&cc->freepages)) { 501 if (list_empty(&cc->freepages)) {
488 isolate_freepages(cc->zone, cc); 502 isolate_freepages(cc->zone, cc);
489 503
490 if (list_empty(&cc->freepages)) 504 if (list_empty(&cc->freepages))
491 return NULL; 505 return NULL;
492 } 506 }
493 507
494 freepage = list_entry(cc->freepages.next, struct page, lru); 508 freepage = list_entry(cc->freepages.next, struct page, lru);
495 list_del(&freepage->lru); 509 list_del(&freepage->lru);
496 cc->nr_freepages--; 510 cc->nr_freepages--;
497 511
498 return freepage; 512 return freepage;
499 } 513 }
500 514
501 /* 515 /*
502 * We cannot control nr_migratepages and nr_freepages fully when migration is 516 * We cannot control nr_migratepages and nr_freepages fully when migration is
503 * running as migrate_pages() has no knowledge of compact_control. When 517 * running as migrate_pages() has no knowledge of compact_control. When
504 * migration is complete, we count the number of pages on the lists by hand. 518 * migration is complete, we count the number of pages on the lists by hand.
505 */ 519 */
506 static void update_nr_listpages(struct compact_control *cc) 520 static void update_nr_listpages(struct compact_control *cc)
507 { 521 {
508 int nr_migratepages = 0; 522 int nr_migratepages = 0;
509 int nr_freepages = 0; 523 int nr_freepages = 0;
510 struct page *page; 524 struct page *page;
511 525
512 list_for_each_entry(page, &cc->migratepages, lru) 526 list_for_each_entry(page, &cc->migratepages, lru)
513 nr_migratepages++; 527 nr_migratepages++;
514 list_for_each_entry(page, &cc->freepages, lru) 528 list_for_each_entry(page, &cc->freepages, lru)
515 nr_freepages++; 529 nr_freepages++;
516 530
517 cc->nr_migratepages = nr_migratepages; 531 cc->nr_migratepages = nr_migratepages;
518 cc->nr_freepages = nr_freepages; 532 cc->nr_freepages = nr_freepages;
519 } 533 }
520 534
521 /* possible outcome of isolate_migratepages */ 535 /* possible outcome of isolate_migratepages */
522 typedef enum { 536 typedef enum {
523 ISOLATE_ABORT, /* Abort compaction now */ 537 ISOLATE_ABORT, /* Abort compaction now */
524 ISOLATE_NONE, /* No pages isolated, continue scanning */ 538 ISOLATE_NONE, /* No pages isolated, continue scanning */
525 ISOLATE_SUCCESS, /* Pages isolated, migrate */ 539 ISOLATE_SUCCESS, /* Pages isolated, migrate */
526 } isolate_migrate_t; 540 } isolate_migrate_t;
527 541
528 /* 542 /*
529 * Isolate all pages that can be migrated from the block pointed to by 543 * Isolate all pages that can be migrated from the block pointed to by
530 * the migrate scanner within compact_control. 544 * the migrate scanner within compact_control.
531 */ 545 */
532 static isolate_migrate_t isolate_migratepages(struct zone *zone, 546 static isolate_migrate_t isolate_migratepages(struct zone *zone,
533 struct compact_control *cc) 547 struct compact_control *cc)
534 { 548 {
535 unsigned long low_pfn, end_pfn; 549 unsigned long low_pfn, end_pfn;
536 550
537 /* Do not scan outside zone boundaries */ 551 /* Do not scan outside zone boundaries */
538 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); 552 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
539 553
540 /* Only scan within a pageblock boundary */ 554 /* Only scan within a pageblock boundary */
541 end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); 555 end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
542 556
543 /* Do not cross the free scanner or scan within a memory hole */ 557 /* Do not cross the free scanner or scan within a memory hole */
544 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { 558 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
545 cc->migrate_pfn = end_pfn; 559 cc->migrate_pfn = end_pfn;
546 return ISOLATE_NONE; 560 return ISOLATE_NONE;
547 } 561 }
548 562
549 /* Perform the isolation */ 563 /* Perform the isolation */
550 low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); 564 low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
551 if (!low_pfn) 565 if (!low_pfn)
552 return ISOLATE_ABORT; 566 return ISOLATE_ABORT;
553 567
554 cc->migrate_pfn = low_pfn; 568 cc->migrate_pfn = low_pfn;
555 569
556 return ISOLATE_SUCCESS; 570 return ISOLATE_SUCCESS;
557 } 571 }
558 572
573 /*
574 * Returns the start pfn of the last page block in a zone. This is the starting
575 * point for full compaction of a zone. Compaction searches for free pages from
576 * the end of each zone, while isolate_freepages_block scans forward inside each
577 * page block.
578 */
579 static unsigned long start_free_pfn(struct zone *zone)
580 {
581 unsigned long free_pfn;
582 free_pfn = zone->zone_start_pfn + zone->spanned_pages;
583 free_pfn &= ~(pageblock_nr_pages-1);
584 return free_pfn;
585 }
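start_free_pfn() rounds the zone's end pfn down to a pageblock boundary so the free scanner always begins at the start of the last whole pageblock. A small standalone sketch of the same arithmetic; the pageblock size and zone geometry below are example values only:

/* Plain C sketch of the round-down arithmetic; example numbers only. */
#include <stdio.h>

int main(void)
{
        unsigned long zone_start_pfn = 0x10000;
        unsigned long spanned_pages = 0x12345;     /* zone ends mid-pageblock */
        unsigned long pageblock_nr_pages = 512;    /* assumed for the example */
        unsigned long free_pfn;

        free_pfn = zone_start_pfn + spanned_pages; /* 0x22345 */
        free_pfn &= ~(pageblock_nr_pages - 1);     /* 0x22200 */
        printf("free scanner starts at pfn 0x%lx\n", free_pfn);
        return 0;
}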
586
559 static int compact_finished(struct zone *zone, 587 static int compact_finished(struct zone *zone,
560 struct compact_control *cc) 588 struct compact_control *cc)
561 { 589 {
562 unsigned int order; 590 unsigned int order;
563 unsigned long watermark; 591 unsigned long watermark;
564 592
565 if (fatal_signal_pending(current)) 593 if (fatal_signal_pending(current))
566 return COMPACT_PARTIAL; 594 return COMPACT_PARTIAL;
567 595
568 /* Compaction run completes if the migrate and free scanner meet */ 596 /*
569 if (cc->free_pfn <= cc->migrate_pfn) 597 * A full (order == -1) compaction run starts at the beginning and
598 * end of a zone; it completes when the migrate and free scanner meet.
599 * A partial (order > 0) compaction can start with the free scanner
600 * at a random point in the zone, and may have to restart.
601 */
602 if (cc->free_pfn <= cc->migrate_pfn) {
603 if (cc->order > 0 && !cc->wrapped) {
604 /* We started partway through; restart at the end. */
605 unsigned long free_pfn = start_free_pfn(zone);
606 zone->compact_cached_free_pfn = free_pfn;
607 cc->free_pfn = free_pfn;
608 cc->wrapped = 1;
609 return COMPACT_CONTINUE;
610 }
570 return COMPACT_COMPLETE; 611 return COMPACT_COMPLETE;
612 }
571 613
614 /* We wrapped around and ended up where we started. */
615 if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
616 return COMPACT_COMPLETE;
617
572 /* 618 /*
573 * order == -1 is expected when compacting via 619 * order == -1 is expected when compacting via
574 * /proc/sys/vm/compact_memory 620 * /proc/sys/vm/compact_memory
575 */ 621 */
576 if (cc->order == -1) 622 if (cc->order == -1)
577 return COMPACT_CONTINUE; 623 return COMPACT_CONTINUE;
578 624
579 /* Compaction run is not finished if the watermark is not met */ 625 /* Compaction run is not finished if the watermark is not met */
580 watermark = low_wmark_pages(zone); 626 watermark = low_wmark_pages(zone);
581 watermark += (1 << cc->order); 627 watermark += (1 << cc->order);
582 628
583 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) 629 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
584 return COMPACT_CONTINUE; 630 return COMPACT_CONTINUE;
585 631
586 /* Direct compactor: Is a suitable page free? */ 632 /* Direct compactor: Is a suitable page free? */
587 for (order = cc->order; order < MAX_ORDER; order++) { 633 for (order = cc->order; order < MAX_ORDER; order++) {
588 /* Job done if page is free of the right migratetype */ 634 /* Job done if page is free of the right migratetype */
589 if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) 635 if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
590 return COMPACT_PARTIAL; 636 return COMPACT_PARTIAL;
591 637
592 /* Job done if allocation would set block type */ 638 /* Job done if allocation would set block type */
593 if (order >= pageblock_order && zone->free_area[order].nr_free) 639 if (order >= pageblock_order && zone->free_area[order].nr_free)
594 return COMPACT_PARTIAL; 640 return COMPACT_PARTIAL;
595 } 641 }
596 642
597 return COMPACT_CONTINUE; 643 return COMPACT_CONTINUE;
598 } 644 }
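To summarize the new termination rules in compact_finished(): when the scanners meet, an order > 0 run that began mid-zone gets one restart from the zone end (and sets wrapped), and a wrapped run finishes once the free scanner reaches the pfn it originally started from. A compilable sketch of just that decision, with all names local to the example:

/* Plain C sketch of the wrap/restart decision; not kernel code. */
enum { EX_CONTINUE, EX_COMPLETE };

struct ex_state {
        unsigned long free_pfn, migrate_pfn, start_free_pfn, zone_end_pfn;
        int order, wrapped;
};

static int ex_scanners_check(struct ex_state *s)
{
        if (s->free_pfn <= s->migrate_pfn) {
                if (s->order > 0 && !s->wrapped) {
                        /* Started partway through: retry once from the end. */
                        s->free_pfn = s->zone_end_pfn;  /* pageblock aligned */
                        s->wrapped = 1;
                        return EX_CONTINUE;
                }
                return EX_COMPLETE;
        }
        /* Wrapped around and came back to the original starting point. */
        if (s->wrapped && s->free_pfn <= s->start_free_pfn)
                return EX_COMPLETE;
        return EX_CONTINUE;
}

int main(void)
{
        struct ex_state s = { .free_pfn = 0x8000, .migrate_pfn = 0x8100,
                              .start_free_pfn = 0x9000, .zone_end_pfn = 0xa000,
                              .order = 3, .wrapped = 0 };
        /* Scanners met before wrapping: restart from the zone end. */
        return ex_scanners_check(&s);   /* EX_CONTINUE, s.wrapped is now 1 */
}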
599 645
600 /* 646 /*
601 * compaction_suitable: Is this suitable to run compaction on this zone now? 647 * compaction_suitable: Is this suitable to run compaction on this zone now?
602 * Returns 648 * Returns
603 * COMPACT_SKIPPED - If there are too few free pages for compaction 649 * COMPACT_SKIPPED - If there are too few free pages for compaction
604 * COMPACT_PARTIAL - If the allocation would succeed without compaction 650 * COMPACT_PARTIAL - If the allocation would succeed without compaction
605 * COMPACT_CONTINUE - If compaction should run now 651 * COMPACT_CONTINUE - If compaction should run now
606 */ 652 */
607 unsigned long compaction_suitable(struct zone *zone, int order) 653 unsigned long compaction_suitable(struct zone *zone, int order)
608 { 654 {
609 int fragindex; 655 int fragindex;
610 unsigned long watermark; 656 unsigned long watermark;
611 657
612 /* 658 /*
613 * order == -1 is expected when compacting via 659 * order == -1 is expected when compacting via
614 * /proc/sys/vm/compact_memory 660 * /proc/sys/vm/compact_memory
615 */ 661 */
616 if (order == -1) 662 if (order == -1)
617 return COMPACT_CONTINUE; 663 return COMPACT_CONTINUE;
618 664
619 /* 665 /*
620 * Watermarks for order-0 must be met for compaction. Note the 2UL. 666 * Watermarks for order-0 must be met for compaction. Note the 2UL.
621 * This is because during migration, copies of pages need to be 667 * This is because during migration, copies of pages need to be
622 * allocated and for a short time, the footprint is higher 668 * allocated and for a short time, the footprint is higher
623 */ 669 */
624 watermark = low_wmark_pages(zone) + (2UL << order); 670 watermark = low_wmark_pages(zone) + (2UL << order);
625 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 671 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
626 return COMPACT_SKIPPED; 672 return COMPACT_SKIPPED;
627 673
628 /* 674 /*
629 * fragmentation index determines if allocation failures are due to 675 * fragmentation index determines if allocation failures are due to
630 * low memory or external fragmentation 676 * low memory or external fragmentation
631 * 677 *
632 * index of -1000 implies allocations might succeed depending on 678 * index of -1000 implies allocations might succeed depending on
633 * watermarks 679 * watermarks
634 * index towards 0 implies failure is due to lack of memory 680 * index towards 0 implies failure is due to lack of memory
635 * index towards 1000 implies failure is due to fragmentation 681 * index towards 1000 implies failure is due to fragmentation
636 * 682 *
637 * Only compact if a failure would be due to fragmentation. 683 * Only compact if a failure would be due to fragmentation.
638 */ 684 */
639 fragindex = fragmentation_index(zone, order); 685 fragindex = fragmentation_index(zone, order);
640 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) 686 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
641 return COMPACT_SKIPPED; 687 return COMPACT_SKIPPED;
642 688
643 if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark, 689 if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
644 0, 0)) 690 0, 0))
645 return COMPACT_PARTIAL; 691 return COMPACT_PARTIAL;
646 692
647 return COMPACT_CONTINUE; 693 return COMPACT_CONTINUE;
648 } 694 }
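compaction_suitable() makes two cheap early decisions: the zone must have enough order-0 pages above the low watermark (plus a 2UL << order buffer for the migration copies), and the fragmentation index must point at fragmentation rather than plain lack of memory. A worked example with made-up watermark and index values (500 is the sysctl_extfrag_threshold default shown in this file):

/* Plain C sketch of the two early exits; example numbers only. */
#include <stdio.h>

int main(void)
{
        int order = 4;
        unsigned long low_wmark = 5000;      /* example value */
        unsigned long free_pages = 5100;     /* order-0 free pages in the zone */
        int fragindex = 420;                 /* example fragmentation index */
        int extfrag_threshold = 500;         /* default sysctl value */

        unsigned long watermark = low_wmark + (2UL << order);  /* 5000 + 32 */

        if (free_pages < watermark)
                printf("COMPACT_SKIPPED: too few free pages to migrate into\n");
        else if (fragindex >= 0 && fragindex <= extfrag_threshold)
                printf("COMPACT_SKIPPED: failure looks like low memory, not fragmentation\n");
        else
                printf("COMPACT_CONTINUE\n");
        return 0;
}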
649 695
650 static int compact_zone(struct zone *zone, struct compact_control *cc) 696 static int compact_zone(struct zone *zone, struct compact_control *cc)
651 { 697 {
652 int ret; 698 int ret;
653 699
654 ret = compaction_suitable(zone, cc->order); 700 ret = compaction_suitable(zone, cc->order);
655 switch (ret) { 701 switch (ret) {
656 case COMPACT_PARTIAL: 702 case COMPACT_PARTIAL:
657 case COMPACT_SKIPPED: 703 case COMPACT_SKIPPED:
658 /* Compaction is likely to fail */ 704 /* Compaction is likely to fail */
659 return ret; 705 return ret;
660 case COMPACT_CONTINUE: 706 case COMPACT_CONTINUE:
661 /* Fall through to compaction */ 707 /* Fall through to compaction */
662 ; 708 ;
663 } 709 }
664 710
665 /* Setup to move all movable pages to the end of the zone */ 711 /* Setup to move all movable pages to the end of the zone */
666 cc->migrate_pfn = zone->zone_start_pfn; 712 cc->migrate_pfn = zone->zone_start_pfn;
667 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; 713
668 cc->free_pfn &= ~(pageblock_nr_pages-1); 714 if (cc->order > 0) {
715 /* Incremental compaction. Start where the last one stopped. */
716 cc->free_pfn = zone->compact_cached_free_pfn;
717 cc->start_free_pfn = cc->free_pfn;
718 } else {
719 /* Order == -1 starts at the end of the zone. */
720 cc->free_pfn = start_free_pfn(zone);
721 }
669 722
670 migrate_prep_local(); 723 migrate_prep_local();
671 724
672 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { 725 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
673 unsigned long nr_migrate, nr_remaining; 726 unsigned long nr_migrate, nr_remaining;
674 int err; 727 int err;
675 728
676 switch (isolate_migratepages(zone, cc)) { 729 switch (isolate_migratepages(zone, cc)) {
677 case ISOLATE_ABORT: 730 case ISOLATE_ABORT:
678 ret = COMPACT_PARTIAL; 731 ret = COMPACT_PARTIAL;
679 goto out; 732 goto out;
680 case ISOLATE_NONE: 733 case ISOLATE_NONE:
681 continue; 734 continue;
682 case ISOLATE_SUCCESS: 735 case ISOLATE_SUCCESS:
683 ; 736 ;
684 } 737 }
685 738
686 nr_migrate = cc->nr_migratepages; 739 nr_migrate = cc->nr_migratepages;
687 err = migrate_pages(&cc->migratepages, compaction_alloc, 740 err = migrate_pages(&cc->migratepages, compaction_alloc,
688 (unsigned long)cc, false, 741 (unsigned long)cc, false,
689 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); 742 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
690 update_nr_listpages(cc); 743 update_nr_listpages(cc);
691 nr_remaining = cc->nr_migratepages; 744 nr_remaining = cc->nr_migratepages;
692 745
693 count_vm_event(COMPACTBLOCKS); 746 count_vm_event(COMPACTBLOCKS);
694 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); 747 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
695 if (nr_remaining) 748 if (nr_remaining)
696 count_vm_events(COMPACTPAGEFAILED, nr_remaining); 749 count_vm_events(COMPACTPAGEFAILED, nr_remaining);
697 trace_mm_compaction_migratepages(nr_migrate - nr_remaining, 750 trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
698 nr_remaining); 751 nr_remaining);
699 752
700 /* Release LRU pages not migrated */ 753 /* Release LRU pages not migrated */
701 if (err) { 754 if (err) {
702 putback_lru_pages(&cc->migratepages); 755 putback_lru_pages(&cc->migratepages);
703 cc->nr_migratepages = 0; 756 cc->nr_migratepages = 0;
704 if (err == -ENOMEM) { 757 if (err == -ENOMEM) {
705 ret = COMPACT_PARTIAL; 758 ret = COMPACT_PARTIAL;
706 goto out; 759 goto out;
707 } 760 }
708 } 761 }
709 } 762 }
710 763
711 out: 764 out:
712 /* Release free pages and check accounting */ 765 /* Release free pages and check accounting */
713 cc->nr_freepages -= release_freepages(&cc->freepages); 766 cc->nr_freepages -= release_freepages(&cc->freepages);
714 VM_BUG_ON(cc->nr_freepages != 0); 767 VM_BUG_ON(cc->nr_freepages != 0);
715 768
716 return ret; 769 return ret;
717 } 770 }
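The setup at the top of compact_zone() is where the incremental behaviour is wired in: an order > 0 run resumes the free scanner at zone->compact_cached_free_pfn and remembers that pfn in cc->start_free_pfn, while a full order == -1 run still starts from the end of the zone. A small sketch of how two back-to-back order > 0 runs share the cached pfn, so the second run does not rescan the already-compacted top of the zone (all numbers are illustrative):

/* Plain C sketch of consecutive incremental runs; example numbers only. */
#include <stdio.h>

int main(void)
{
        unsigned long zone_end_aligned = 0x20000;
        unsigned long cached_free_pfn = zone_end_aligned;  /* fresh zone */

        /* First order > 0 run: starts at the end, stops at 0x1c000. */
        unsigned long run1_start = cached_free_pfn;        /* 0x20000 */
        cached_free_pfn = 0x1c000;                         /* where it stopped */

        /* Second order > 0 run: resumes at the cached pfn, not the end. */
        unsigned long run2_start = cached_free_pfn;        /* 0x1c000 */

        printf("run1 start 0x%lx, run2 start 0x%lx\n", run1_start, run2_start);
        return 0;
}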
718 771
719 static unsigned long compact_zone_order(struct zone *zone, 772 static unsigned long compact_zone_order(struct zone *zone,
720 int order, gfp_t gfp_mask, 773 int order, gfp_t gfp_mask,
721 bool sync) 774 bool sync)
722 { 775 {
723 struct compact_control cc = { 776 struct compact_control cc = {
724 .nr_freepages = 0, 777 .nr_freepages = 0,
725 .nr_migratepages = 0, 778 .nr_migratepages = 0,
726 .order = order, 779 .order = order,
727 .migratetype = allocflags_to_migratetype(gfp_mask), 780 .migratetype = allocflags_to_migratetype(gfp_mask),
728 .zone = zone, 781 .zone = zone,
729 .sync = sync, 782 .sync = sync,
730 }; 783 };
731 INIT_LIST_HEAD(&cc.freepages); 784 INIT_LIST_HEAD(&cc.freepages);
732 INIT_LIST_HEAD(&cc.migratepages); 785 INIT_LIST_HEAD(&cc.migratepages);
733 786
734 return compact_zone(zone, &cc); 787 return compact_zone(zone, &cc);
735 } 788 }
736 789
737 int sysctl_extfrag_threshold = 500; 790 int sysctl_extfrag_threshold = 500;
738 791
739 /** 792 /**
740 * try_to_compact_pages - Direct compact to satisfy a high-order allocation 793 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
741 * @zonelist: The zonelist used for the current allocation 794 * @zonelist: The zonelist used for the current allocation
742 * @order: The order of the current allocation 795 * @order: The order of the current allocation
743 * @gfp_mask: The GFP mask of the current allocation 796 * @gfp_mask: The GFP mask of the current allocation
744 * @nodemask: The allowed nodes to allocate from 797 * @nodemask: The allowed nodes to allocate from
745 * @sync: Whether migration is synchronous or not 798 * @sync: Whether migration is synchronous or not
746 * 799 *
747 * This is the main entry point for direct page compaction. 800 * This is the main entry point for direct page compaction.
748 */ 801 */
749 unsigned long try_to_compact_pages(struct zonelist *zonelist, 802 unsigned long try_to_compact_pages(struct zonelist *zonelist,
750 int order, gfp_t gfp_mask, nodemask_t *nodemask, 803 int order, gfp_t gfp_mask, nodemask_t *nodemask,
751 bool sync) 804 bool sync)
752 { 805 {
753 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 806 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
754 int may_enter_fs = gfp_mask & __GFP_FS; 807 int may_enter_fs = gfp_mask & __GFP_FS;
755 int may_perform_io = gfp_mask & __GFP_IO; 808 int may_perform_io = gfp_mask & __GFP_IO;
756 struct zoneref *z; 809 struct zoneref *z;
757 struct zone *zone; 810 struct zone *zone;
758 int rc = COMPACT_SKIPPED; 811 int rc = COMPACT_SKIPPED;
759 812
760 /* 813 /*
761 * Check whether it is worth even starting compaction. The order check is 814 * Check whether it is worth even starting compaction. The order check is
762 * made because an assumption is made that the page allocator can satisfy 815 * made because an assumption is made that the page allocator can satisfy
763 * the "cheaper" orders without taking special steps 816 * the "cheaper" orders without taking special steps
764 */ 817 */
765 if (!order || !may_enter_fs || !may_perform_io) 818 if (!order || !may_enter_fs || !may_perform_io)
766 return rc; 819 return rc;
767 820
768 count_vm_event(COMPACTSTALL); 821 count_vm_event(COMPACTSTALL);
769 822
770 /* Compact each zone in the list */ 823 /* Compact each zone in the list */
771 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 824 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
772 nodemask) { 825 nodemask) {
773 int status; 826 int status;
774 827
775 status = compact_zone_order(zone, order, gfp_mask, sync); 828 status = compact_zone_order(zone, order, gfp_mask, sync);
776 rc = max(status, rc); 829 rc = max(status, rc);
777 830
778 /* If a normal allocation would succeed, stop compacting */ 831 /* If a normal allocation would succeed, stop compacting */
779 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) 832 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
780 break; 833 break;
781 } 834 }
782 835
783 return rc; 836 return rc;
784 } 837 }
785 838
786 839
787 /* Compact all zones within a node */ 840 /* Compact all zones within a node */
788 static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) 841 static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
789 { 842 {
790 int zoneid; 843 int zoneid;
791 struct zone *zone; 844 struct zone *zone;
792 845
793 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { 846 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
794 847
795 zone = &pgdat->node_zones[zoneid]; 848 zone = &pgdat->node_zones[zoneid];
796 if (!populated_zone(zone)) 849 if (!populated_zone(zone))
797 continue; 850 continue;
798 851
799 cc->nr_freepages = 0; 852 cc->nr_freepages = 0;
800 cc->nr_migratepages = 0; 853 cc->nr_migratepages = 0;
801 cc->zone = zone; 854 cc->zone = zone;
802 INIT_LIST_HEAD(&cc->freepages); 855 INIT_LIST_HEAD(&cc->freepages);
803 INIT_LIST_HEAD(&cc->migratepages); 856 INIT_LIST_HEAD(&cc->migratepages);
804 857
805 if (cc->order == -1 || !compaction_deferred(zone, cc->order)) 858 if (cc->order == -1 || !compaction_deferred(zone, cc->order))
806 compact_zone(zone, cc); 859 compact_zone(zone, cc);
807 860
808 if (cc->order > 0) { 861 if (cc->order > 0) {
809 int ok = zone_watermark_ok(zone, cc->order, 862 int ok = zone_watermark_ok(zone, cc->order,
810 low_wmark_pages(zone), 0, 0); 863 low_wmark_pages(zone), 0, 0);
811 if (ok && cc->order > zone->compact_order_failed) 864 if (ok && cc->order > zone->compact_order_failed)
812 zone->compact_order_failed = cc->order + 1; 865 zone->compact_order_failed = cc->order + 1;
813 /* Currently async compaction is never deferred. */ 866 /* Currently async compaction is never deferred. */
814 else if (!ok && cc->sync) 867 else if (!ok && cc->sync)
815 defer_compaction(zone, cc->order); 868 defer_compaction(zone, cc->order);
816 } 869 }
817 870
818 VM_BUG_ON(!list_empty(&cc->freepages)); 871 VM_BUG_ON(!list_empty(&cc->freepages));
819 VM_BUG_ON(!list_empty(&cc->migratepages)); 872 VM_BUG_ON(!list_empty(&cc->migratepages));
820 } 873 }
821 874
822 return 0; 875 return 0;
823 } 876 }
824 877
825 int compact_pgdat(pg_data_t *pgdat, int order) 878 int compact_pgdat(pg_data_t *pgdat, int order)
826 { 879 {
827 struct compact_control cc = { 880 struct compact_control cc = {
828 .order = order, 881 .order = order,
829 .sync = false, 882 .sync = false,
830 }; 883 };
831 884
832 return __compact_pgdat(pgdat, &cc); 885 return __compact_pgdat(pgdat, &cc);
833 } 886 }
834 887
835 static int compact_node(int nid) 888 static int compact_node(int nid)
836 { 889 {
837 struct compact_control cc = { 890 struct compact_control cc = {
838 .order = -1, 891 .order = -1,
839 .sync = true, 892 .sync = true,
840 }; 893 };
841 894
842 return __compact_pgdat(NODE_DATA(nid), &cc); 895 return __compact_pgdat(NODE_DATA(nid), &cc);
843 } 896 }
844 897
845 /* Compact all nodes in the system */ 898 /* Compact all nodes in the system */
846 static int compact_nodes(void) 899 static int compact_nodes(void)
847 { 900 {
848 int nid; 901 int nid;
849 902
850 /* Flush pending updates to the LRU lists */ 903 /* Flush pending updates to the LRU lists */
851 lru_add_drain_all(); 904 lru_add_drain_all();
852 905
853 for_each_online_node(nid) 906 for_each_online_node(nid)
854 compact_node(nid); 907 compact_node(nid);
855 908
856 return COMPACT_COMPLETE; 909 return COMPACT_COMPLETE;
857 } 910 }
858 911
859 /* The written value is actually unused, all memory is compacted */ 912 /* The written value is actually unused, all memory is compacted */
860 int sysctl_compact_memory; 913 int sysctl_compact_memory;
861 914
862 /* This is the entry point for compacting all nodes via /proc/sys/vm */ 915 /* This is the entry point for compacting all nodes via /proc/sys/vm */
863 int sysctl_compaction_handler(struct ctl_table *table, int write, 916 int sysctl_compaction_handler(struct ctl_table *table, int write,
864 void __user *buffer, size_t *length, loff_t *ppos) 917 void __user *buffer, size_t *length, loff_t *ppos)
865 { 918 {
866 if (write) 919 if (write)
867 return compact_nodes(); 920 return compact_nodes();
868 921
869 return 0; 922 return 0;
870 } 923 }
871 924
872 int sysctl_extfrag_handler(struct ctl_table *table, int write, 925 int sysctl_extfrag_handler(struct ctl_table *table, int write,
873 void __user *buffer, size_t *length, loff_t *ppos) 926 void __user *buffer, size_t *length, loff_t *ppos)
874 { 927 {
875 proc_dointvec_minmax(table, write, buffer, length, ppos); 928 proc_dointvec_minmax(table, write, buffer, length, ppos);
876 929
877 return 0; 930 return 0;
878 } 931 }
879 932
880 #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) 933 #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
881 ssize_t sysfs_compact_node(struct device *dev, 934 ssize_t sysfs_compact_node(struct device *dev,
882 struct device_attribute *attr, 935 struct device_attribute *attr,
883 const char *buf, size_t count) 936 const char *buf, size_t count)
884 { 937 {
885 int nid = dev->id; 938 int nid = dev->id;
886 939
887 if (nid >= 0 && nid < nr_node_ids && node_online(nid)) { 940 if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
888 /* Flush pending updates to the LRU lists */ 941 /* Flush pending updates to the LRU lists */
889 lru_add_drain_all(); 942 lru_add_drain_all();
890 943
891 compact_node(nid); 944 compact_node(nid);
892 } 945 }
893 946
894 return count; 947 return count;
895 } 948 }
896 static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node); 949 static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
897 950
898 int compaction_register_node(struct node *node) 951 int compaction_register_node(struct node *node)
899 { 952 {
900 return device_create_file(&node->dev, &dev_attr_compact); 953 return device_create_file(&node->dev, &dev_attr_compact);
901 } 954 }
902 955
903 void compaction_unregister_node(struct node *node) 956 void compaction_unregister_node(struct node *node)
904 { 957 {
905 return device_remove_file(&node->dev, &dev_attr_compact); 958 return device_remove_file(&node->dev, &dev_attr_compact);
906 } 959 }
907 #endif /* CONFIG_SYSFS && CONFIG_NUMA */ 960 #endif /* CONFIG_SYSFS && CONFIG_NUMA */
908 961
909 #endif /* CONFIG_COMPACTION */ 962 #endif /* CONFIG_COMPACTION */
910 963
1 /* internal.h: mm/ internal definitions 1 /* internal.h: mm/ internal definitions
2 * 2 *
3 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License 7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 #ifndef __MM_INTERNAL_H 11 #ifndef __MM_INTERNAL_H
12 #define __MM_INTERNAL_H 12 #define __MM_INTERNAL_H
13 13
14 #include <linux/mm.h> 14 #include <linux/mm.h>
15 15
16 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 16 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
17 unsigned long floor, unsigned long ceiling); 17 unsigned long floor, unsigned long ceiling);
18 18
19 static inline void set_page_count(struct page *page, int v) 19 static inline void set_page_count(struct page *page, int v)
20 { 20 {
21 atomic_set(&page->_count, v); 21 atomic_set(&page->_count, v);
22 } 22 }
23 23
24 /* 24 /*
25 * Turn a non-refcounted page (->_count == 0) into refcounted with 25 * Turn a non-refcounted page (->_count == 0) into refcounted with
26 * a count of one. 26 * a count of one.
27 */ 27 */
28 static inline void set_page_refcounted(struct page *page) 28 static inline void set_page_refcounted(struct page *page)
29 { 29 {
30 VM_BUG_ON(PageTail(page)); 30 VM_BUG_ON(PageTail(page));
31 VM_BUG_ON(atomic_read(&page->_count)); 31 VM_BUG_ON(atomic_read(&page->_count));
32 set_page_count(page, 1); 32 set_page_count(page, 1);
33 } 33 }
34 34
35 static inline void __put_page(struct page *page) 35 static inline void __put_page(struct page *page)
36 { 36 {
37 atomic_dec(&page->_count); 37 atomic_dec(&page->_count);
38 } 38 }
39 39
40 static inline void __get_page_tail_foll(struct page *page, 40 static inline void __get_page_tail_foll(struct page *page,
41 bool get_page_head) 41 bool get_page_head)
42 { 42 {
43 /* 43 /*
44 * If we're getting a tail page, the elevated page->_count is 44 * If we're getting a tail page, the elevated page->_count is
45 * required only in the head page and we will elevate the head 45 * required only in the head page and we will elevate the head
46 * page->_count and tail page->_mapcount. 46 * page->_count and tail page->_mapcount.
47 * 47 *
48 * We elevate page_tail->_mapcount for tail pages to force 48 * We elevate page_tail->_mapcount for tail pages to force
49 * page_tail->_count to be zero at all times to avoid getting 49 * page_tail->_count to be zero at all times to avoid getting
50 * false positives from get_page_unless_zero() with 50 * false positives from get_page_unless_zero() with
51 * speculative page access (like in 51 * speculative page access (like in
52 * page_cache_get_speculative()) on tail pages. 52 * page_cache_get_speculative()) on tail pages.
53 */ 53 */
54 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); 54 VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
55 VM_BUG_ON(atomic_read(&page->_count) != 0); 55 VM_BUG_ON(atomic_read(&page->_count) != 0);
56 VM_BUG_ON(page_mapcount(page) < 0); 56 VM_BUG_ON(page_mapcount(page) < 0);
57 if (get_page_head) 57 if (get_page_head)
58 atomic_inc(&page->first_page->_count); 58 atomic_inc(&page->first_page->_count);
59 atomic_inc(&page->_mapcount); 59 atomic_inc(&page->_mapcount);
60 } 60 }
61 61
62 /* 62 /*
63 * This is meant to be called as the FOLL_GET operation of 63 * This is meant to be called as the FOLL_GET operation of
64 * follow_page() and it must be called while holding the proper PT 64 * follow_page() and it must be called while holding the proper PT
65 * lock while the pte (or pmd_trans_huge) is still mapping the page. 65 * lock while the pte (or pmd_trans_huge) is still mapping the page.
66 */ 66 */
67 static inline void get_page_foll(struct page *page) 67 static inline void get_page_foll(struct page *page)
68 { 68 {
69 if (unlikely(PageTail(page))) 69 if (unlikely(PageTail(page)))
70 /* 70 /*
71 * This is safe only because 71 * This is safe only because
72 * __split_huge_page_refcount() can't run under 72 * __split_huge_page_refcount() can't run under
73 * get_page_foll() because we hold the proper PT lock. 73 * get_page_foll() because we hold the proper PT lock.
74 */ 74 */
75 __get_page_tail_foll(page, true); 75 __get_page_tail_foll(page, true);
76 else { 76 else {
77 /* 77 /*
78 * Getting a normal page or the head of a compound page 78 * Getting a normal page or the head of a compound page
79 * requires to already have an elevated page->_count. 79 * requires to already have an elevated page->_count.
80 */ 80 */
81 VM_BUG_ON(atomic_read(&page->_count) <= 0); 81 VM_BUG_ON(atomic_read(&page->_count) <= 0);
82 atomic_inc(&page->_count); 82 atomic_inc(&page->_count);
83 } 83 }
84 } 84 }
85 85
86 extern unsigned long highest_memmap_pfn; 86 extern unsigned long highest_memmap_pfn;
87 87
88 /* 88 /*
89 * in mm/vmscan.c: 89 * in mm/vmscan.c:
90 */ 90 */
91 extern int isolate_lru_page(struct page *page); 91 extern int isolate_lru_page(struct page *page);
92 extern void putback_lru_page(struct page *page); 92 extern void putback_lru_page(struct page *page);
93 93
94 /* 94 /*
95 * in mm/page_alloc.c 95 * in mm/page_alloc.c
96 */ 96 */
97 extern void __free_pages_bootmem(struct page *page, unsigned int order); 97 extern void __free_pages_bootmem(struct page *page, unsigned int order);
98 extern void prep_compound_page(struct page *page, unsigned long order); 98 extern void prep_compound_page(struct page *page, unsigned long order);
99 #ifdef CONFIG_MEMORY_FAILURE 99 #ifdef CONFIG_MEMORY_FAILURE
100 extern bool is_free_buddy_page(struct page *page); 100 extern bool is_free_buddy_page(struct page *page);
101 #endif 101 #endif
102 102
103 #if defined CONFIG_COMPACTION || defined CONFIG_CMA 103 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
104 104
105 /* 105 /*
106 * in mm/compaction.c 106 * in mm/compaction.c
107 */ 107 */
108 /* 108 /*
109 * compact_control is used to track pages being migrated and the free pages 109 * compact_control is used to track pages being migrated and the free pages
110 * they are being migrated to during memory compaction. The free_pfn starts 110 * they are being migrated to during memory compaction. The free_pfn starts
111 * at the end of a zone and migrate_pfn begins at the start. Movable pages 111 * at the end of a zone and migrate_pfn begins at the start. Movable pages
112 * are moved to the end of a zone during a compaction run and the run 112 * are moved to the end of a zone during a compaction run and the run
113 * completes when free_pfn <= migrate_pfn 113 * completes when free_pfn <= migrate_pfn
114 */ 114 */
115 struct compact_control { 115 struct compact_control {
116 struct list_head freepages; /* List of free pages to migrate to */ 116 struct list_head freepages; /* List of free pages to migrate to */
117 struct list_head migratepages; /* List of pages being migrated */ 117 struct list_head migratepages; /* List of pages being migrated */
118 unsigned long nr_freepages; /* Number of isolated free pages */ 118 unsigned long nr_freepages; /* Number of isolated free pages */
119 unsigned long nr_migratepages; /* Number of pages to migrate */ 119 unsigned long nr_migratepages; /* Number of pages to migrate */
120 unsigned long free_pfn; /* isolate_freepages search base */ 120 unsigned long free_pfn; /* isolate_freepages search base */
121 unsigned long start_free_pfn; /* where we started the search */
121 unsigned long migrate_pfn; /* isolate_migratepages search base */ 122 unsigned long migrate_pfn; /* isolate_migratepages search base */
122 bool sync; /* Synchronous migration */ 123 bool sync; /* Synchronous migration */
124 bool wrapped; /* Order > 0 compactions are
125 incremental, once free_pfn
126 and migrate_pfn meet, we restart
127 from the top of the zone;
128 remember we wrapped around. */
123 129
124 int order; /* order a direct compactor needs */ 130 int order; /* order a direct compactor needs */
125 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 131 int migratetype; /* MOVABLE, RECLAIMABLE etc */
126 struct zone *zone; 132 struct zone *zone;
127 }; 133 };
128 134
129 unsigned long 135 unsigned long
130 isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); 136 isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn);
131 unsigned long 137 unsigned long
132 isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 138 isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
133 unsigned long low_pfn, unsigned long end_pfn); 139 unsigned long low_pfn, unsigned long end_pfn);
134 140
135 #endif 141 #endif
136 142
137 /* 143 /*
138 * function for dealing with page's order in buddy system. 144 * function for dealing with page's order in buddy system.
139 * zone->lock is already acquired when we use these. 145 * zone->lock is already acquired when we use these.
140 * So, we don't need atomic page->flags operations here. 146 * So, we don't need atomic page->flags operations here.
141 */ 147 */
142 static inline unsigned long page_order(struct page *page) 148 static inline unsigned long page_order(struct page *page)
143 { 149 {
144 /* PageBuddy() must be checked by the caller */ 150 /* PageBuddy() must be checked by the caller */
145 return page_private(page); 151 return page_private(page);
146 } 152 }
147 153
148 /* mm/util.c */ 154 /* mm/util.c */
149 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, 155 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
150 struct vm_area_struct *prev, struct rb_node *rb_parent); 156 struct vm_area_struct *prev, struct rb_node *rb_parent);
151 157
152 #ifdef CONFIG_MMU 158 #ifdef CONFIG_MMU
153 extern long mlock_vma_pages_range(struct vm_area_struct *vma, 159 extern long mlock_vma_pages_range(struct vm_area_struct *vma,
154 unsigned long start, unsigned long end); 160 unsigned long start, unsigned long end);
155 extern void munlock_vma_pages_range(struct vm_area_struct *vma, 161 extern void munlock_vma_pages_range(struct vm_area_struct *vma,
156 unsigned long start, unsigned long end); 162 unsigned long start, unsigned long end);
157 static inline void munlock_vma_pages_all(struct vm_area_struct *vma) 163 static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
158 { 164 {
159 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); 165 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
160 } 166 }
161 167
162 /* 168 /*
163 * Called only in fault path via page_evictable() for a new page 169 * Called only in fault path via page_evictable() for a new page
164 * to determine if it's being mapped into a LOCKED vma. 170 * to determine if it's being mapped into a LOCKED vma.
165 * If so, mark page as mlocked. 171 * If so, mark page as mlocked.
166 */ 172 */
167 static inline int mlocked_vma_newpage(struct vm_area_struct *vma, 173 static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
168 struct page *page) 174 struct page *page)
169 { 175 {
170 VM_BUG_ON(PageLRU(page)); 176 VM_BUG_ON(PageLRU(page));
171 177
172 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) 178 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
173 return 0; 179 return 0;
174 180
175 if (!TestSetPageMlocked(page)) { 181 if (!TestSetPageMlocked(page)) {
176 inc_zone_page_state(page, NR_MLOCK); 182 inc_zone_page_state(page, NR_MLOCK);
177 count_vm_event(UNEVICTABLE_PGMLOCKED); 183 count_vm_event(UNEVICTABLE_PGMLOCKED);
178 } 184 }
179 return 1; 185 return 1;
180 } 186 }
181 187
182 /* 188 /*
183 * must be called with vma's mmap_sem held for read or write, and page locked. 189 * must be called with vma's mmap_sem held for read or write, and page locked.
184 */ 190 */
185 extern void mlock_vma_page(struct page *page); 191 extern void mlock_vma_page(struct page *page);
186 extern void munlock_vma_page(struct page *page); 192 extern void munlock_vma_page(struct page *page);
187 193
188 /* 194 /*
189 * Clear the page's PageMlocked(). This can be useful in a situation where 195 * Clear the page's PageMlocked(). This can be useful in a situation where
190 * we want to unconditionally remove a page from the pagecache -- e.g., 196 * we want to unconditionally remove a page from the pagecache -- e.g.,
191 * on truncation or freeing. 197 * on truncation or freeing.
192 * 198 *
193 * It is legal to call this function for any page, mlocked or not. 199 * It is legal to call this function for any page, mlocked or not.
194 * If called for a page that is still mapped by mlocked vmas, all we do 200 * If called for a page that is still mapped by mlocked vmas, all we do
195 * is revert to lazy LRU behaviour -- semantics are not broken. 201 * is revert to lazy LRU behaviour -- semantics are not broken.
196 */ 202 */
197 extern void __clear_page_mlock(struct page *page); 203 extern void __clear_page_mlock(struct page *page);
198 static inline void clear_page_mlock(struct page *page) 204 static inline void clear_page_mlock(struct page *page)
199 { 205 {
200 if (unlikely(TestClearPageMlocked(page))) 206 if (unlikely(TestClearPageMlocked(page)))
201 __clear_page_mlock(page); 207 __clear_page_mlock(page);
202 } 208 }
203 209
204 /* 210 /*
205 * mlock_migrate_page - called only from migrate_page_copy() to 211 * mlock_migrate_page - called only from migrate_page_copy() to
206 * migrate the Mlocked page flag; update statistics. 212 * migrate the Mlocked page flag; update statistics.
207 */ 213 */
208 static inline void mlock_migrate_page(struct page *newpage, struct page *page) 214 static inline void mlock_migrate_page(struct page *newpage, struct page *page)
209 { 215 {
210 if (TestClearPageMlocked(page)) { 216 if (TestClearPageMlocked(page)) {
211 unsigned long flags; 217 unsigned long flags;
212 218
213 local_irq_save(flags); 219 local_irq_save(flags);
214 __dec_zone_page_state(page, NR_MLOCK); 220 __dec_zone_page_state(page, NR_MLOCK);
215 SetPageMlocked(newpage); 221 SetPageMlocked(newpage);
216 __inc_zone_page_state(newpage, NR_MLOCK); 222 __inc_zone_page_state(newpage, NR_MLOCK);
217 local_irq_restore(flags); 223 local_irq_restore(flags);
218 } 224 }
219 } 225 }
220 226
221 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 227 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
222 extern unsigned long vma_address(struct page *page, 228 extern unsigned long vma_address(struct page *page,
223 struct vm_area_struct *vma); 229 struct vm_area_struct *vma);
224 #endif 230 #endif
225 #else /* !CONFIG_MMU */ 231 #else /* !CONFIG_MMU */
226 static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p) 232 static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
227 { 233 {
228 return 0; 234 return 0;
229 } 235 }
230 static inline void clear_page_mlock(struct page *page) { } 236 static inline void clear_page_mlock(struct page *page) { }
231 static inline void mlock_vma_page(struct page *page) { } 237 static inline void mlock_vma_page(struct page *page) { }
232 static inline void mlock_migrate_page(struct page *new, struct page *old) { } 238 static inline void mlock_migrate_page(struct page *new, struct page *old) { }
233 239
234 #endif /* !CONFIG_MMU */ 240 #endif /* !CONFIG_MMU */
235 241
236 /* 242 /*
237 * Return the mem_map entry representing the 'offset' subpage within 243 * Return the mem_map entry representing the 'offset' subpage within
238 * the maximally aligned gigantic page 'base'. Handle any discontiguity 244 * the maximally aligned gigantic page 'base'. Handle any discontiguity
239 * in the mem_map at MAX_ORDER_NR_PAGES boundaries. 245 * in the mem_map at MAX_ORDER_NR_PAGES boundaries.
240 */ 246 */
241 static inline struct page *mem_map_offset(struct page *base, int offset) 247 static inline struct page *mem_map_offset(struct page *base, int offset)
242 { 248 {
243 if (unlikely(offset >= MAX_ORDER_NR_PAGES)) 249 if (unlikely(offset >= MAX_ORDER_NR_PAGES))
244 return pfn_to_page(page_to_pfn(base) + offset); 250 return pfn_to_page(page_to_pfn(base) + offset);
245 return base + offset; 251 return base + offset;
246 } 252 }
247 253
248 /* 254 /*
249 * Iterator over all subpages within the maximally aligned gigantic 255 * Iterator over all subpages within the maximally aligned gigantic
250 * page 'base'. Handle any discontiguity in the mem_map. 256 * page 'base'. Handle any discontiguity in the mem_map.
251 */ 257 */
252 static inline struct page *mem_map_next(struct page *iter, 258 static inline struct page *mem_map_next(struct page *iter,
253 struct page *base, int offset) 259 struct page *base, int offset)
254 { 260 {
255 if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) { 261 if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
256 unsigned long pfn = page_to_pfn(base) + offset; 262 unsigned long pfn = page_to_pfn(base) + offset;
257 if (!pfn_valid(pfn)) 263 if (!pfn_valid(pfn))
258 return NULL; 264 return NULL;
259 return pfn_to_page(pfn); 265 return pfn_to_page(pfn);
260 } 266 }
261 return iter + 1; 267 return iter + 1;
262 } 268 }
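mem_map_offset() and mem_map_next() exist because the mem_map backing a gigantic page may be discontiguous across MAX_ORDER_NR_PAGES boundaries, so plain pointer arithmetic on struct page is not always safe. A hedged sketch of the walking pattern a caller might use, relying only on the definitions above; the visit() callback is hypothetical:

/* Sketch of walking every subpage of a gigantic page with the helpers
 * above; visit() is a hypothetical per-subpage callback. */
static void walk_gigantic_page(struct page *base, unsigned long nr_pages,
                               void (*visit)(struct page *))
{
        struct page *p = base;
        unsigned long i;

        for (i = 0; i < nr_pages; i++) {
                visit(p);
                /* Recompute the pointer across MAX_ORDER_NR_PAGES boundaries. */
                p = mem_map_next(p, base, i + 1);
                if (!p)
                        break;  /* hit an invalid pfn in a sparse mem_map */
        }
}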
263 269
264 /* 270 /*
265 * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, 271 * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
266 * so all functions starting at paging_init should be marked __init 272 * so all functions starting at paging_init should be marked __init
267 * in those cases. SPARSEMEM, however, allows for memory hotplug, 273 * in those cases. SPARSEMEM, however, allows for memory hotplug,
268 * and alloc_bootmem_node is not used. 274 * and alloc_bootmem_node is not used.
269 */ 275 */
270 #ifdef CONFIG_SPARSEMEM 276 #ifdef CONFIG_SPARSEMEM
271 #define __paginginit __meminit 277 #define __paginginit __meminit
272 #else 278 #else
273 #define __paginginit __init 279 #define __paginginit __init
274 #endif 280 #endif
275 281
276 /* Memory initialisation debug and verification */ 282 /* Memory initialisation debug and verification */
277 enum mminit_level { 283 enum mminit_level {
278 MMINIT_WARNING, 284 MMINIT_WARNING,
279 MMINIT_VERIFY, 285 MMINIT_VERIFY,
280 MMINIT_TRACE 286 MMINIT_TRACE
281 }; 287 };
282 288
283 #ifdef CONFIG_DEBUG_MEMORY_INIT 289 #ifdef CONFIG_DEBUG_MEMORY_INIT
284 290
285 extern int mminit_loglevel; 291 extern int mminit_loglevel;
286 292
287 #define mminit_dprintk(level, prefix, fmt, arg...) \ 293 #define mminit_dprintk(level, prefix, fmt, arg...) \
288 do { \ 294 do { \
289 if (level < mminit_loglevel) { \ 295 if (level < mminit_loglevel) { \
290 printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ 296 printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
291 printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ 297 printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
292 } \ 298 } \
293 } while (0) 299 } while (0)
294 300
295 extern void mminit_verify_pageflags_layout(void); 301 extern void mminit_verify_pageflags_layout(void);
296 extern void mminit_verify_page_links(struct page *page, 302 extern void mminit_verify_page_links(struct page *page,
297 enum zone_type zone, unsigned long nid, unsigned long pfn); 303 enum zone_type zone, unsigned long nid, unsigned long pfn);
298 extern void mminit_verify_zonelist(void); 304 extern void mminit_verify_zonelist(void);
299 305
300 #else 306 #else
301 307
302 static inline void mminit_dprintk(enum mminit_level level, 308 static inline void mminit_dprintk(enum mminit_level level,
303 const char *prefix, const char *fmt, ...) 309 const char *prefix, const char *fmt, ...)
304 { 310 {
305 } 311 }
306 312
307 static inline void mminit_verify_pageflags_layout(void) 313 static inline void mminit_verify_pageflags_layout(void)
308 { 314 {
309 } 315 }
310 316
311 static inline void mminit_verify_page_links(struct page *page, 317 static inline void mminit_verify_page_links(struct page *page,
312 enum zone_type zone, unsigned long nid, unsigned long pfn) 318 enum zone_type zone, unsigned long nid, unsigned long pfn)
313 { 319 {
314 } 320 }
315 321
316 static inline void mminit_verify_zonelist(void) 322 static inline void mminit_verify_zonelist(void)
317 { 323 {
318 } 324 }
319 #endif /* CONFIG_DEBUG_MEMORY_INIT */ 325 #endif /* CONFIG_DEBUG_MEMORY_INIT */
320 326
321 /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ 327 /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
322 #if defined(CONFIG_SPARSEMEM) 328 #if defined(CONFIG_SPARSEMEM)
323 extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, 329 extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
324 unsigned long *end_pfn); 330 unsigned long *end_pfn);
325 #else 331 #else
326 static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, 332 static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
327 unsigned long *end_pfn) 333 unsigned long *end_pfn)
328 { 334 {
329 } 335 }
330 #endif /* CONFIG_SPARSEMEM */ 336 #endif /* CONFIG_SPARSEMEM */
331 337
332 #define ZONE_RECLAIM_NOSCAN -2 338 #define ZONE_RECLAIM_NOSCAN -2
333 #define ZONE_RECLAIM_FULL -1 339 #define ZONE_RECLAIM_FULL -1
334 #define ZONE_RECLAIM_SOME 0 340 #define ZONE_RECLAIM_SOME 0
335 #define ZONE_RECLAIM_SUCCESS 1 341 #define ZONE_RECLAIM_SUCCESS 1
336 #endif 342 #endif
337 343
338 extern int hwpoison_filter(struct page *p); 344 extern int hwpoison_filter(struct page *p);
339 345
340 extern u32 hwpoison_filter_dev_major; 346 extern u32 hwpoison_filter_dev_major;
341 extern u32 hwpoison_filter_dev_minor; 347 extern u32 hwpoison_filter_dev_minor;
342 extern u64 hwpoison_filter_flags_mask; 348 extern u64 hwpoison_filter_flags_mask;
343 extern u64 hwpoison_filter_flags_value; 349 extern u64 hwpoison_filter_flags_value;
344 extern u64 hwpoison_filter_memcg; 350 extern u64 hwpoison_filter_memcg;
345 extern u32 hwpoison_filter_enable; 351 extern u32 hwpoison_filter_enable;
346 352
347 extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, 353 extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
348 unsigned long, unsigned long, 354 unsigned long, unsigned long,
349 unsigned long, unsigned long); 355 unsigned long, unsigned long);
350 356
1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/memory.h> 54 #include <linux/memory.h>
55 #include <linux/compaction.h> 55 #include <linux/compaction.h>
56 #include <trace/events/kmem.h> 56 #include <trace/events/kmem.h>
57 #include <linux/ftrace_event.h> 57 #include <linux/ftrace_event.h>
58 #include <linux/memcontrol.h> 58 #include <linux/memcontrol.h>
59 #include <linux/prefetch.h> 59 #include <linux/prefetch.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/page-debug-flags.h> 61 #include <linux/page-debug-flags.h>
62 62
63 #include <asm/tlbflush.h> 63 #include <asm/tlbflush.h>
64 #include <asm/div64.h> 64 #include <asm/div64.h>
65 #include "internal.h" 65 #include "internal.h"
66 66
67 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 67 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
68 DEFINE_PER_CPU(int, numa_node); 68 DEFINE_PER_CPU(int, numa_node);
69 EXPORT_PER_CPU_SYMBOL(numa_node); 69 EXPORT_PER_CPU_SYMBOL(numa_node);
70 #endif 70 #endif
71 71
72 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 72 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
73 /* 73 /*
74 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 74 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
75 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 75 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
76 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 76 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
77 * defined in <linux/topology.h>. 77 * defined in <linux/topology.h>.
78 */ 78 */
79 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 79 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
80 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 80 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
81 #endif 81 #endif
82 82
83 /* 83 /*
84 * Array of node states. 84 * Array of node states.
85 */ 85 */
86 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 86 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
87 [N_POSSIBLE] = NODE_MASK_ALL, 87 [N_POSSIBLE] = NODE_MASK_ALL,
88 [N_ONLINE] = { { [0] = 1UL } }, 88 [N_ONLINE] = { { [0] = 1UL } },
89 #ifndef CONFIG_NUMA 89 #ifndef CONFIG_NUMA
90 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 90 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
91 #ifdef CONFIG_HIGHMEM 91 #ifdef CONFIG_HIGHMEM
92 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 92 [N_HIGH_MEMORY] = { { [0] = 1UL } },
93 #endif 93 #endif
94 [N_CPU] = { { [0] = 1UL } }, 94 [N_CPU] = { { [0] = 1UL } },
95 #endif /* NUMA */ 95 #endif /* NUMA */
96 }; 96 };
97 EXPORT_SYMBOL(node_states); 97 EXPORT_SYMBOL(node_states);
98 98
99 unsigned long totalram_pages __read_mostly; 99 unsigned long totalram_pages __read_mostly;
100 unsigned long totalreserve_pages __read_mostly; 100 unsigned long totalreserve_pages __read_mostly;
101 /* 101 /*
102 * When calculating the number of globally allowed dirty pages, there 102 * When calculating the number of globally allowed dirty pages, there
103 * is a certain number of per-zone reserves that should not be 103 * is a certain number of per-zone reserves that should not be
104 * considered dirtyable memory. This is the sum of those reserves 104 * considered dirtyable memory. This is the sum of those reserves
105 * over all existing zones that contribute dirtyable memory. 105 * over all existing zones that contribute dirtyable memory.
106 */ 106 */
107 unsigned long dirty_balance_reserve __read_mostly; 107 unsigned long dirty_balance_reserve __read_mostly;
108 108
109 int percpu_pagelist_fraction; 109 int percpu_pagelist_fraction;
110 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 110 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
111 111
112 #ifdef CONFIG_PM_SLEEP 112 #ifdef CONFIG_PM_SLEEP
113 /* 113 /*
114 * The following functions are used by the suspend/hibernate code to temporarily 114 * The following functions are used by the suspend/hibernate code to temporarily
115 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 115 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
116 * while devices are suspended. To avoid races with the suspend/hibernate code, 116 * while devices are suspended. To avoid races with the suspend/hibernate code,
117 * they should always be called with pm_mutex held (gfp_allowed_mask also should 117 * they should always be called with pm_mutex held (gfp_allowed_mask also should
118 * only be modified with pm_mutex held, unless the suspend/hibernate code is 118 * only be modified with pm_mutex held, unless the suspend/hibernate code is
119 * guaranteed not to run in parallel with that modification). 119 * guaranteed not to run in parallel with that modification).
120 */ 120 */
121 121
122 static gfp_t saved_gfp_mask; 122 static gfp_t saved_gfp_mask;
123 123
124 void pm_restore_gfp_mask(void) 124 void pm_restore_gfp_mask(void)
125 { 125 {
126 WARN_ON(!mutex_is_locked(&pm_mutex)); 126 WARN_ON(!mutex_is_locked(&pm_mutex));
127 if (saved_gfp_mask) { 127 if (saved_gfp_mask) {
128 gfp_allowed_mask = saved_gfp_mask; 128 gfp_allowed_mask = saved_gfp_mask;
129 saved_gfp_mask = 0; 129 saved_gfp_mask = 0;
130 } 130 }
131 } 131 }
132 132
133 void pm_restrict_gfp_mask(void) 133 void pm_restrict_gfp_mask(void)
134 { 134 {
135 WARN_ON(!mutex_is_locked(&pm_mutex)); 135 WARN_ON(!mutex_is_locked(&pm_mutex));
136 WARN_ON(saved_gfp_mask); 136 WARN_ON(saved_gfp_mask);
137 saved_gfp_mask = gfp_allowed_mask; 137 saved_gfp_mask = gfp_allowed_mask;
138 gfp_allowed_mask &= ~GFP_IOFS; 138 gfp_allowed_mask &= ~GFP_IOFS;
139 } 139 }
140 140
141 bool pm_suspended_storage(void) 141 bool pm_suspended_storage(void)
142 { 142 {
143 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 143 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
144 return false; 144 return false;
145 return true; 145 return true;
146 } 146 }
147 #endif /* CONFIG_PM_SLEEP */ 147 #endif /* CONFIG_PM_SLEEP */
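The three helpers above work by masking the I/O and filesystem bits out of gfp_allowed_mask while devices are suspended, and pm_suspended_storage() reports storage as unusable once either bit has been cleared. A minimal standalone sketch of that mask-and-test pattern, using invented FAKE_GFP_* values rather than the real GFP_IOFS bits from the gfp headers:

/* Standalone sketch, not kernel code: the mask-and-test pattern behind
 * pm_restrict_gfp_mask()/pm_suspended_storage(). FAKE_GFP_* values are
 * invented for illustration; the real bits live in <linux/gfp.h>. */
#include <stdbool.h>
#include <stdio.h>

#define FAKE_GFP_IO   0x1u
#define FAKE_GFP_FS   0x2u
#define FAKE_GFP_IOFS (FAKE_GFP_IO | FAKE_GFP_FS)

static unsigned int allowed = 0xffu;    /* stands in for gfp_allowed_mask */

static bool suspended_storage(void)
{
        /* storage is only usable while both IO and FS are still allowed */
        return (allowed & FAKE_GFP_IOFS) != FAKE_GFP_IOFS;
}

int main(void)
{
        printf("before restrict: storage suspended = %d\n", suspended_storage());
        allowed &= ~FAKE_GFP_IOFS;      /* what pm_restrict_gfp_mask() does */
        printf("after restrict:  storage suspended = %d\n", suspended_storage());
        return 0;
}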
148 148
149 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 149 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
150 int pageblock_order __read_mostly; 150 int pageblock_order __read_mostly;
151 #endif 151 #endif
152 152
153 static void __free_pages_ok(struct page *page, unsigned int order); 153 static void __free_pages_ok(struct page *page, unsigned int order);
154 154
155 /* 155 /*
156 * results with 256, 32 in the lowmem_reserve sysctl: 156 * results with 256, 32 in the lowmem_reserve sysctl:
157 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 157 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
158 * 1G machine -> (16M dma, 784M normal, 224M high) 158 * 1G machine -> (16M dma, 784M normal, 224M high)
159 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 159 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
160 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 160 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
161 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA 161 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
162 * 162 *
163 * TBD: should special case ZONE_DMA32 machines here - in those we normally 163 * TBD: should special case ZONE_DMA32 machines here - in those we normally
164 * don't need any ZONE_NORMAL reservation 164 * don't need any ZONE_NORMAL reservation
165 */ 165 */
166 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 166 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
167 #ifdef CONFIG_ZONE_DMA 167 #ifdef CONFIG_ZONE_DMA
168 256, 168 256,
169 #endif 169 #endif
170 #ifdef CONFIG_ZONE_DMA32 170 #ifdef CONFIG_ZONE_DMA32
171 256, 171 256,
172 #endif 172 #endif
173 #ifdef CONFIG_HIGHMEM 173 #ifdef CONFIG_HIGHMEM
174 32, 174 32,
175 #endif 175 #endif
176 32, 176 32,
177 }; 177 };
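The ratios above are easiest to read together with the worked example in the comment: each lower zone holds back roughly the size of the higher zones an allocation could have used, divided by the lower zone's ratio, so a larger ratio means a smaller reserve. A standalone sketch of that arithmetic, using the illustrative 1G-machine split from the comment (784M normal, 224M highmem):

/* Standalone sketch, not kernel code: the lowmem_reserve arithmetic from the
 * comment above. Zone sizes are the illustrative 1G-machine numbers. */
#include <stdio.h>

int main(void)
{
        unsigned long normal_kb  = 784UL * 1024;        /* 784M ZONE_NORMAL */
        unsigned long highmem_kb = 224UL * 1024;        /* 224M ZONE_HIGHMEM */

        printf("DMA held back from NORMAL allocs:     ~%luK (784M/256)\n",
               normal_kb / 256);
        printf("NORMAL held back from HIGHMEM allocs: ~%luK (224M/32)\n",
               highmem_kb / 32);
        printf("DMA held back from HIGHMEM allocs:    ~%luK ((224M+784M)/256)\n",
               (highmem_kb + normal_kb) / 256);
        return 0;
}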
178 178
179 EXPORT_SYMBOL(totalram_pages); 179 EXPORT_SYMBOL(totalram_pages);
180 180
181 static char * const zone_names[MAX_NR_ZONES] = { 181 static char * const zone_names[MAX_NR_ZONES] = {
182 #ifdef CONFIG_ZONE_DMA 182 #ifdef CONFIG_ZONE_DMA
183 "DMA", 183 "DMA",
184 #endif 184 #endif
185 #ifdef CONFIG_ZONE_DMA32 185 #ifdef CONFIG_ZONE_DMA32
186 "DMA32", 186 "DMA32",
187 #endif 187 #endif
188 "Normal", 188 "Normal",
189 #ifdef CONFIG_HIGHMEM 189 #ifdef CONFIG_HIGHMEM
190 "HighMem", 190 "HighMem",
191 #endif 191 #endif
192 "Movable", 192 "Movable",
193 }; 193 };
194 194
195 int min_free_kbytes = 1024; 195 int min_free_kbytes = 1024;
196 196
197 static unsigned long __meminitdata nr_kernel_pages; 197 static unsigned long __meminitdata nr_kernel_pages;
198 static unsigned long __meminitdata nr_all_pages; 198 static unsigned long __meminitdata nr_all_pages;
199 static unsigned long __meminitdata dma_reserve; 199 static unsigned long __meminitdata dma_reserve;
200 200
201 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 201 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
202 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 202 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
203 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 203 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
204 static unsigned long __initdata required_kernelcore; 204 static unsigned long __initdata required_kernelcore;
205 static unsigned long __initdata required_movablecore; 205 static unsigned long __initdata required_movablecore;
206 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 206 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
207 207
208 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 208 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
209 int movable_zone; 209 int movable_zone;
210 EXPORT_SYMBOL(movable_zone); 210 EXPORT_SYMBOL(movable_zone);
211 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 211 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
212 212
213 #if MAX_NUMNODES > 1 213 #if MAX_NUMNODES > 1
214 int nr_node_ids __read_mostly = MAX_NUMNODES; 214 int nr_node_ids __read_mostly = MAX_NUMNODES;
215 int nr_online_nodes __read_mostly = 1; 215 int nr_online_nodes __read_mostly = 1;
216 EXPORT_SYMBOL(nr_node_ids); 216 EXPORT_SYMBOL(nr_node_ids);
217 EXPORT_SYMBOL(nr_online_nodes); 217 EXPORT_SYMBOL(nr_online_nodes);
218 #endif 218 #endif
219 219
220 int page_group_by_mobility_disabled __read_mostly; 220 int page_group_by_mobility_disabled __read_mostly;
221 221
222 static void set_pageblock_migratetype(struct page *page, int migratetype) 222 static void set_pageblock_migratetype(struct page *page, int migratetype)
223 { 223 {
224 224
225 if (unlikely(page_group_by_mobility_disabled)) 225 if (unlikely(page_group_by_mobility_disabled))
226 migratetype = MIGRATE_UNMOVABLE; 226 migratetype = MIGRATE_UNMOVABLE;
227 227
228 set_pageblock_flags_group(page, (unsigned long)migratetype, 228 set_pageblock_flags_group(page, (unsigned long)migratetype,
229 PB_migrate, PB_migrate_end); 229 PB_migrate, PB_migrate_end);
230 } 230 }
231 231
232 bool oom_killer_disabled __read_mostly; 232 bool oom_killer_disabled __read_mostly;
233 233
234 #ifdef CONFIG_DEBUG_VM 234 #ifdef CONFIG_DEBUG_VM
235 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 235 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
236 { 236 {
237 int ret = 0; 237 int ret = 0;
238 unsigned seq; 238 unsigned seq;
239 unsigned long pfn = page_to_pfn(page); 239 unsigned long pfn = page_to_pfn(page);
240 240
241 do { 241 do {
242 seq = zone_span_seqbegin(zone); 242 seq = zone_span_seqbegin(zone);
243 if (pfn >= zone->zone_start_pfn + zone->spanned_pages) 243 if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
244 ret = 1; 244 ret = 1;
245 else if (pfn < zone->zone_start_pfn) 245 else if (pfn < zone->zone_start_pfn)
246 ret = 1; 246 ret = 1;
247 } while (zone_span_seqretry(zone, seq)); 247 } while (zone_span_seqretry(zone, seq));
248 248
249 return ret; 249 return ret;
250 } 250 }
251 251
252 static int page_is_consistent(struct zone *zone, struct page *page) 252 static int page_is_consistent(struct zone *zone, struct page *page)
253 { 253 {
254 if (!pfn_valid_within(page_to_pfn(page))) 254 if (!pfn_valid_within(page_to_pfn(page)))
255 return 0; 255 return 0;
256 if (zone != page_zone(page)) 256 if (zone != page_zone(page))
257 return 0; 257 return 0;
258 258
259 return 1; 259 return 1;
260 } 260 }
261 /* 261 /*
262 * Temporary debugging check for pages not lying within a given zone. 262 * Temporary debugging check for pages not lying within a given zone.
263 */ 263 */
264 static int bad_range(struct zone *zone, struct page *page) 264 static int bad_range(struct zone *zone, struct page *page)
265 { 265 {
266 if (page_outside_zone_boundaries(zone, page)) 266 if (page_outside_zone_boundaries(zone, page))
267 return 1; 267 return 1;
268 if (!page_is_consistent(zone, page)) 268 if (!page_is_consistent(zone, page))
269 return 1; 269 return 1;
270 270
271 return 0; 271 return 0;
272 } 272 }
273 #else 273 #else
274 static inline int bad_range(struct zone *zone, struct page *page) 274 static inline int bad_range(struct zone *zone, struct page *page)
275 { 275 {
276 return 0; 276 return 0;
277 } 277 }
278 #endif 278 #endif
279 279
280 static void bad_page(struct page *page) 280 static void bad_page(struct page *page)
281 { 281 {
282 static unsigned long resume; 282 static unsigned long resume;
283 static unsigned long nr_shown; 283 static unsigned long nr_shown;
284 static unsigned long nr_unshown; 284 static unsigned long nr_unshown;
285 285
286 /* Don't complain about poisoned pages */ 286 /* Don't complain about poisoned pages */
287 if (PageHWPoison(page)) { 287 if (PageHWPoison(page)) {
288 reset_page_mapcount(page); /* remove PageBuddy */ 288 reset_page_mapcount(page); /* remove PageBuddy */
289 return; 289 return;
290 } 290 }
291 291
292 /* 292 /*
293 * Allow a burst of 60 reports, then keep quiet for that minute; 293 * Allow a burst of 60 reports, then keep quiet for that minute;
294 * or allow a steady drip of one report per second. 294 * or allow a steady drip of one report per second.
295 */ 295 */
296 if (nr_shown == 60) { 296 if (nr_shown == 60) {
297 if (time_before(jiffies, resume)) { 297 if (time_before(jiffies, resume)) {
298 nr_unshown++; 298 nr_unshown++;
299 goto out; 299 goto out;
300 } 300 }
301 if (nr_unshown) { 301 if (nr_unshown) {
302 printk(KERN_ALERT 302 printk(KERN_ALERT
303 "BUG: Bad page state: %lu messages suppressed\n", 303 "BUG: Bad page state: %lu messages suppressed\n",
304 nr_unshown); 304 nr_unshown);
305 nr_unshown = 0; 305 nr_unshown = 0;
306 } 306 }
307 nr_shown = 0; 307 nr_shown = 0;
308 } 308 }
309 if (nr_shown++ == 0) 309 if (nr_shown++ == 0)
310 resume = jiffies + 60 * HZ; 310 resume = jiffies + 60 * HZ;
311 311
312 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 312 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
313 current->comm, page_to_pfn(page)); 313 current->comm, page_to_pfn(page));
314 dump_page(page); 314 dump_page(page);
315 315
316 print_modules(); 316 print_modules();
317 dump_stack(); 317 dump_stack();
318 out: 318 out:
319 /* Leave bad fields for debug, except PageBuddy could make trouble */ 319 /* Leave bad fields for debug, except PageBuddy could make trouble */
320 reset_page_mapcount(page); /* remove PageBuddy */ 320 reset_page_mapcount(page); /* remove PageBuddy */
321 add_taint(TAINT_BAD_PAGE); 321 add_taint(TAINT_BAD_PAGE);
322 } 322 }
323 323
324 /* 324 /*
325 * Higher-order pages are called "compound pages". They are structured thusly: 325 * Higher-order pages are called "compound pages". They are structured thusly:
326 * 326 *
327 * The first PAGE_SIZE page is called the "head page". 327 * The first PAGE_SIZE page is called the "head page".
328 * 328 *
329 * The remaining PAGE_SIZE pages are called "tail pages". 329 * The remaining PAGE_SIZE pages are called "tail pages".
330 * 330 *
331 * All pages have PG_compound set. All tail pages have their ->first_page 331 * All pages have PG_compound set. All tail pages have their ->first_page
332 * pointing at the head page. 332 * pointing at the head page.
333 * 333 *
334 * The first tail page's ->lru.next holds the address of the compound page's 334 * The first tail page's ->lru.next holds the address of the compound page's
335 * put_page() function. Its ->lru.prev holds the order of allocation. 335 * put_page() function. Its ->lru.prev holds the order of allocation.
336 * This usage means that zero-order pages may not be compound. 336 * This usage means that zero-order pages may not be compound.
337 */ 337 */
338 338
339 static void free_compound_page(struct page *page) 339 static void free_compound_page(struct page *page)
340 { 340 {
341 __free_pages_ok(page, compound_order(page)); 341 __free_pages_ok(page, compound_order(page));
342 } 342 }
343 343
344 void prep_compound_page(struct page *page, unsigned long order) 344 void prep_compound_page(struct page *page, unsigned long order)
345 { 345 {
346 int i; 346 int i;
347 int nr_pages = 1 << order; 347 int nr_pages = 1 << order;
348 348
349 set_compound_page_dtor(page, free_compound_page); 349 set_compound_page_dtor(page, free_compound_page);
350 set_compound_order(page, order); 350 set_compound_order(page, order);
351 __SetPageHead(page); 351 __SetPageHead(page);
352 for (i = 1; i < nr_pages; i++) { 352 for (i = 1; i < nr_pages; i++) {
353 struct page *p = page + i; 353 struct page *p = page + i;
354 __SetPageTail(p); 354 __SetPageTail(p);
355 set_page_count(p, 0); 355 set_page_count(p, 0);
356 p->first_page = page; 356 p->first_page = page;
357 } 357 }
358 } 358 }
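prep_compound_page() implements the layout described in the comment above it: the first page becomes the head, and every following page is marked as a tail with ->first_page pointed back at the head. A toy standalone sketch of that wiring (struct toy_page is invented here and only mimics the two details that matter for the example):

/* Standalone sketch, not kernel code: the head/tail wiring done by
 * prep_compound_page(), modelled with a toy struct instead of struct page. */
#include <stdio.h>

struct toy_page {
        int head;                    /* stands in for the head/tail page flags */
        struct toy_page *first_page; /* tail pages point back at the head */
};

int main(void)
{
        enum { ORDER = 2, NR = 1 << ORDER };
        struct toy_page pages[NR] = { 0 };
        int i;

        pages[0].head = 1;
        for (i = 1; i < NR; i++)
                pages[i].first_page = &pages[0];

        for (i = 1; i < NR; i++)
                printf("tail %d -> head at offset %ld\n", i,
                       (long)(pages[i].first_page - pages));
        return 0;
}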
359 359
360 /* update __split_huge_page_refcount if you change this function */ 360 /* update __split_huge_page_refcount if you change this function */
361 static int destroy_compound_page(struct page *page, unsigned long order) 361 static int destroy_compound_page(struct page *page, unsigned long order)
362 { 362 {
363 int i; 363 int i;
364 int nr_pages = 1 << order; 364 int nr_pages = 1 << order;
365 int bad = 0; 365 int bad = 0;
366 366
367 if (unlikely(compound_order(page) != order) || 367 if (unlikely(compound_order(page) != order) ||
368 unlikely(!PageHead(page))) { 368 unlikely(!PageHead(page))) {
369 bad_page(page); 369 bad_page(page);
370 bad++; 370 bad++;
371 } 371 }
372 372
373 __ClearPageHead(page); 373 __ClearPageHead(page);
374 374
375 for (i = 1; i < nr_pages; i++) { 375 for (i = 1; i < nr_pages; i++) {
376 struct page *p = page + i; 376 struct page *p = page + i;
377 377
378 if (unlikely(!PageTail(p) || (p->first_page != page))) { 378 if (unlikely(!PageTail(p) || (p->first_page != page))) {
379 bad_page(page); 379 bad_page(page);
380 bad++; 380 bad++;
381 } 381 }
382 __ClearPageTail(p); 382 __ClearPageTail(p);
383 } 383 }
384 384
385 return bad; 385 return bad;
386 } 386 }
387 387
388 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 388 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
389 { 389 {
390 int i; 390 int i;
391 391
392 /* 392 /*
393 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 393 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
394 * and __GFP_HIGHMEM from hard or soft interrupt context. 394 * and __GFP_HIGHMEM from hard or soft interrupt context.
395 */ 395 */
396 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 396 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
397 for (i = 0; i < (1 << order); i++) 397 for (i = 0; i < (1 << order); i++)
398 clear_highpage(page + i); 398 clear_highpage(page + i);
399 } 399 }
400 400
401 #ifdef CONFIG_DEBUG_PAGEALLOC 401 #ifdef CONFIG_DEBUG_PAGEALLOC
402 unsigned int _debug_guardpage_minorder; 402 unsigned int _debug_guardpage_minorder;
403 403
404 static int __init debug_guardpage_minorder_setup(char *buf) 404 static int __init debug_guardpage_minorder_setup(char *buf)
405 { 405 {
406 unsigned long res; 406 unsigned long res;
407 407
408 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 408 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
409 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 409 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
410 return 0; 410 return 0;
411 } 411 }
412 _debug_guardpage_minorder = res; 412 _debug_guardpage_minorder = res;
413 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 413 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
414 return 0; 414 return 0;
415 } 415 }
416 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 416 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
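debug_guardpage_minorder_setup() is registered as the handler for the debug_guardpage_minorder= boot parameter; it accepts a decimal value and rejects anything above MAX_ORDER / 2. A standalone sketch of the same parse-and-bound check, with strtoul standing in for kstrtoul and a hard-coded MAX_ORDER purely for illustration:

/* Standalone sketch, not kernel code: the parse-and-bound check performed by
 * the boot-parameter handler above. TOY_MAX_ORDER is hard-coded here only so
 * the example is self-contained. */
#include <stdio.h>
#include <stdlib.h>

#define TOY_MAX_ORDER 11

int main(void)
{
        const char *buf = "2";          /* e.g. booting with debug_guardpage_minorder=2 */
        char *end;
        unsigned long res = strtoul(buf, &end, 10);

        if (end == buf || res > TOY_MAX_ORDER / 2) {
                fprintf(stderr, "Bad debug_guardpage_minorder value\n");
                return 1;
        }
        printf("Setting debug_guardpage_minorder to %lu\n", res);
        return 0;
}

Guard pages only come into play under CONFIG_DEBUG_PAGEALLOC, as the surrounding #ifdef and the expand() path further down show.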
417 417
418 static inline void set_page_guard_flag(struct page *page) 418 static inline void set_page_guard_flag(struct page *page)
419 { 419 {
420 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 420 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
421 } 421 }
422 422
423 static inline void clear_page_guard_flag(struct page *page) 423 static inline void clear_page_guard_flag(struct page *page)
424 { 424 {
425 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 425 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
426 } 426 }
427 #else 427 #else
428 static inline void set_page_guard_flag(struct page *page) { } 428 static inline void set_page_guard_flag(struct page *page) { }
429 static inline void clear_page_guard_flag(struct page *page) { } 429 static inline void clear_page_guard_flag(struct page *page) { }
430 #endif 430 #endif
431 431
432 static inline void set_page_order(struct page *page, int order) 432 static inline void set_page_order(struct page *page, int order)
433 { 433 {
434 set_page_private(page, order); 434 set_page_private(page, order);
435 __SetPageBuddy(page); 435 __SetPageBuddy(page);
436 } 436 }
437 437
438 static inline void rmv_page_order(struct page *page) 438 static inline void rmv_page_order(struct page *page)
439 { 439 {
440 __ClearPageBuddy(page); 440 __ClearPageBuddy(page);
441 set_page_private(page, 0); 441 set_page_private(page, 0);
442 } 442 }
443 443
444 /* 444 /*
445 * Locate the struct page for both the matching buddy in our 445 * Locate the struct page for both the matching buddy in our
446 * pair (buddy1) and the combined O(n+1) page they form (page). 446 * pair (buddy1) and the combined O(n+1) page they form (page).
447 * 447 *
448 * 1) Any buddy B1 will have an order O twin B2 which satisfies 448 * 1) Any buddy B1 will have an order O twin B2 which satisfies
449 * the following equation: 449 * the following equation:
450 * B2 = B1 ^ (1 << O) 450 * B2 = B1 ^ (1 << O)
451 * For example, if the starting buddy (buddy2) is #8 its order 451 * For example, if the starting buddy (buddy2) is #8 its order
452 * 1 buddy is #10: 452 * 1 buddy is #10:
453 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 453 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
454 * 454 *
455 * 2) Any buddy B will have an order O+1 parent P which 455 * 2) Any buddy B will have an order O+1 parent P which
456 * satisfies the following equation: 456 * satisfies the following equation:
457 * P = B & ~(1 << O) 457 * P = B & ~(1 << O)
458 * 458 *
459 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 459 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
460 */ 460 */
461 static inline unsigned long 461 static inline unsigned long
462 __find_buddy_index(unsigned long page_idx, unsigned int order) 462 __find_buddy_index(unsigned long page_idx, unsigned int order)
463 { 463 {
464 return page_idx ^ (1 << order); 464 return page_idx ^ (1 << order);
465 } 465 }
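The comment above states the two identities everything else leans on: XOR with (1 << order) yields the buddy index, and clearing that bit yields the start of the combined block. A quick standalone check of the example values the comment gives (page #8 at order 1):

/* Standalone sketch, not kernel code: the two buddy identities from the
 * comment above, checked against its own example (page #8, order 1). */
#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned long b1 = 8, order = 1;
        unsigned long b2 = b1 ^ (1UL << order);         /* buddy:  8 ^ 2 = 10 */
        unsigned long parent = b1 & ~(1UL << order);    /* combined block starts at 8 */

        assert(b2 == 10);
        printf("buddy of #%lu at order %lu is #%lu, combined block starts at #%lu\n",
               b1, order, b2, parent);
        return 0;
}

This is why __find_buddy_index() is a single XOR, and why the merge loop below can compute combined_idx with a plain AND of the two indices.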
466 466
467 /* 467 /*
468 * This function checks whether a page is free && is the buddy 468 * This function checks whether a page is free && is the buddy
469 * we can coalesce a page and its buddy if 469 * we can coalesce a page and its buddy if
470 * (a) the buddy is not in a hole && 470 * (a) the buddy is not in a hole &&
471 * (b) the buddy is in the buddy system && 471 * (b) the buddy is in the buddy system &&
472 * (c) a page and its buddy have the same order && 472 * (c) a page and its buddy have the same order &&
473 * (d) a page and its buddy are in the same zone. 473 * (d) a page and its buddy are in the same zone.
474 * 474 *
475 * For recording whether a page is in the buddy system, we set ->_mapcount -2. 475 * For recording whether a page is in the buddy system, we set ->_mapcount -2.
476 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. 476 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
477 * 477 *
478 * For recording page's order, we use page_private(page). 478 * For recording page's order, we use page_private(page).
479 */ 479 */
480 static inline int page_is_buddy(struct page *page, struct page *buddy, 480 static inline int page_is_buddy(struct page *page, struct page *buddy,
481 int order) 481 int order)
482 { 482 {
483 if (!pfn_valid_within(page_to_pfn(buddy))) 483 if (!pfn_valid_within(page_to_pfn(buddy)))
484 return 0; 484 return 0;
485 485
486 if (page_zone_id(page) != page_zone_id(buddy)) 486 if (page_zone_id(page) != page_zone_id(buddy))
487 return 0; 487 return 0;
488 488
489 if (page_is_guard(buddy) && page_order(buddy) == order) { 489 if (page_is_guard(buddy) && page_order(buddy) == order) {
490 VM_BUG_ON(page_count(buddy) != 0); 490 VM_BUG_ON(page_count(buddy) != 0);
491 return 1; 491 return 1;
492 } 492 }
493 493
494 if (PageBuddy(buddy) && page_order(buddy) == order) { 494 if (PageBuddy(buddy) && page_order(buddy) == order) {
495 VM_BUG_ON(page_count(buddy) != 0); 495 VM_BUG_ON(page_count(buddy) != 0);
496 return 1; 496 return 1;
497 } 497 }
498 return 0; 498 return 0;
499 } 499 }
500 500
501 /* 501 /*
502 * Freeing function for a buddy system allocator. 502 * Freeing function for a buddy system allocator.
503 * 503 *
504 * The concept of a buddy system is to maintain direct-mapped table 504 * The concept of a buddy system is to maintain direct-mapped table
505 * (containing bit values) for memory blocks of various "orders". 505 * (containing bit values) for memory blocks of various "orders".
506 * The bottom level table contains the map for the smallest allocatable 506 * The bottom level table contains the map for the smallest allocatable
507 * units of memory (here, pages), and each level above it describes 507 * units of memory (here, pages), and each level above it describes
508 * pairs of units from the levels below, hence, "buddies". 508 * pairs of units from the levels below, hence, "buddies".
509 * At a high level, all that happens here is marking the table entry 509 * At a high level, all that happens here is marking the table entry
510 * at the bottom level available, and propagating the changes upward 510 * at the bottom level available, and propagating the changes upward
511 * as necessary, plus some accounting needed to play nicely with other 511 * as necessary, plus some accounting needed to play nicely with other
512 * parts of the VM system. 512 * parts of the VM system.
513 * At each level, we keep a list of pages, which are heads of continuous 513 * At each level, we keep a list of pages, which are heads of continuous
514 * free pages of length of (1 << order) and marked with _mapcount -2. Page's 514 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
515 * order is recorded in page_private(page) field. 515 * order is recorded in page_private(page) field.
516 * So when we are allocating or freeing one, we can derive the state of the 516 * So when we are allocating or freeing one, we can derive the state of the
517 * other. That is, if we allocate a small block, and both were 517 * other. That is, if we allocate a small block, and both were
518 * free, the remainder of the region must be split into blocks. 518 * free, the remainder of the region must be split into blocks.
519 * If a block is freed, and its buddy is also free, then this 519 * If a block is freed, and its buddy is also free, then this
520 * triggers coalescing into a block of larger size. 520 * triggers coalescing into a block of larger size.
521 * 521 *
522 * -- wli 522 * -- wli
523 */ 523 */
524 524
525 static inline void __free_one_page(struct page *page, 525 static inline void __free_one_page(struct page *page,
526 struct zone *zone, unsigned int order, 526 struct zone *zone, unsigned int order,
527 int migratetype) 527 int migratetype)
528 { 528 {
529 unsigned long page_idx; 529 unsigned long page_idx;
530 unsigned long combined_idx; 530 unsigned long combined_idx;
531 unsigned long uninitialized_var(buddy_idx); 531 unsigned long uninitialized_var(buddy_idx);
532 struct page *buddy; 532 struct page *buddy;
533 533
534 if (unlikely(PageCompound(page))) 534 if (unlikely(PageCompound(page)))
535 if (unlikely(destroy_compound_page(page, order))) 535 if (unlikely(destroy_compound_page(page, order)))
536 return; 536 return;
537 537
538 VM_BUG_ON(migratetype == -1); 538 VM_BUG_ON(migratetype == -1);
539 539
540 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 540 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
541 541
542 VM_BUG_ON(page_idx & ((1 << order) - 1)); 542 VM_BUG_ON(page_idx & ((1 << order) - 1));
543 VM_BUG_ON(bad_range(zone, page)); 543 VM_BUG_ON(bad_range(zone, page));
544 544
545 while (order < MAX_ORDER-1) { 545 while (order < MAX_ORDER-1) {
546 buddy_idx = __find_buddy_index(page_idx, order); 546 buddy_idx = __find_buddy_index(page_idx, order);
547 buddy = page + (buddy_idx - page_idx); 547 buddy = page + (buddy_idx - page_idx);
548 if (!page_is_buddy(page, buddy, order)) 548 if (!page_is_buddy(page, buddy, order))
549 break; 549 break;
550 /* 550 /*
551 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 551 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
552 * merge with it and move up one order. 552 * merge with it and move up one order.
553 */ 553 */
554 if (page_is_guard(buddy)) { 554 if (page_is_guard(buddy)) {
555 clear_page_guard_flag(buddy); 555 clear_page_guard_flag(buddy);
556 set_page_private(page, 0); 556 set_page_private(page, 0);
557 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 557 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
558 } else { 558 } else {
559 list_del(&buddy->lru); 559 list_del(&buddy->lru);
560 zone->free_area[order].nr_free--; 560 zone->free_area[order].nr_free--;
561 rmv_page_order(buddy); 561 rmv_page_order(buddy);
562 } 562 }
563 combined_idx = buddy_idx & page_idx; 563 combined_idx = buddy_idx & page_idx;
564 page = page + (combined_idx - page_idx); 564 page = page + (combined_idx - page_idx);
565 page_idx = combined_idx; 565 page_idx = combined_idx;
566 order++; 566 order++;
567 } 567 }
568 set_page_order(page, order); 568 set_page_order(page, order);
569 569
570 /* 570 /*
571 * If this is not the largest possible page, check if the buddy 571 * If this is not the largest possible page, check if the buddy
572 * of the next-highest order is free. If it is, it's possible 572 * of the next-highest order is free. If it is, it's possible
573 * that pages are being freed that will coalesce soon. In case, 573 * that pages are being freed that will coalesce soon. In case,
574 * that is happening, add the free page to the tail of the list 574 * that is happening, add the free page to the tail of the list
575 * so it's less likely to be used soon and more likely to be merged 575 * so it's less likely to be used soon and more likely to be merged
576 * as a higher order page 576 * as a higher order page
577 */ 577 */
578 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 578 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
579 struct page *higher_page, *higher_buddy; 579 struct page *higher_page, *higher_buddy;
580 combined_idx = buddy_idx & page_idx; 580 combined_idx = buddy_idx & page_idx;
581 higher_page = page + (combined_idx - page_idx); 581 higher_page = page + (combined_idx - page_idx);
582 buddy_idx = __find_buddy_index(combined_idx, order + 1); 582 buddy_idx = __find_buddy_index(combined_idx, order + 1);
583 higher_buddy = page + (buddy_idx - combined_idx); 583 higher_buddy = page + (buddy_idx - combined_idx);
584 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 584 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
585 list_add_tail(&page->lru, 585 list_add_tail(&page->lru,
586 &zone->free_area[order].free_list[migratetype]); 586 &zone->free_area[order].free_list[migratetype]);
587 goto out; 587 goto out;
588 } 588 }
589 } 589 }
590 590
591 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 591 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
592 out: 592 out:
593 zone->free_area[order].nr_free++; 593 zone->free_area[order].nr_free++;
594 } 594 }
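The merge loop in __free_one_page() keeps folding the freed block into its buddy and climbing one order at a time until page_is_buddy() says no or MAX_ORDER-1 is reached. A standalone sketch of just the index arithmetic, under the simplifying assumption that every buddy it meets happens to be free; the list bookkeeping, the guard-page case and the tail-placement heuristic are left out:

/* Standalone sketch, not kernel code: the index arithmetic of the merge loop
 * above, assuming every buddy encountered is free. List bookkeeping, the
 * guard-page case and the tail-placement heuristic are omitted. */
#include <stdio.h>

int main(void)
{
        unsigned long page_idx = 10, order = 0, max_order = 4;

        while (order < max_order - 1) {
                unsigned long buddy_idx = page_idx ^ (1UL << order);
                unsigned long combined  = buddy_idx & page_idx;

                printf("order %lu: merge #%lu with buddy #%lu -> head #%lu\n",
                       order, page_idx, buddy_idx, combined);
                page_idx = combined;
                order++;
        }
        return 0;
}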
595 595
596 /* 596 /*
597 * free_page_mlock() -- clean up attempts to free and mlocked() page. 597 * free_page_mlock() -- clean up attempts to free and mlocked() page.
598 * Page should not be on lru, so no need to fix that up. 598 * Page should not be on lru, so no need to fix that up.
599 * free_pages_check() will verify... 599 * free_pages_check() will verify...
600 */ 600 */
601 static inline void free_page_mlock(struct page *page) 601 static inline void free_page_mlock(struct page *page)
602 { 602 {
603 __dec_zone_page_state(page, NR_MLOCK); 603 __dec_zone_page_state(page, NR_MLOCK);
604 __count_vm_event(UNEVICTABLE_MLOCKFREED); 604 __count_vm_event(UNEVICTABLE_MLOCKFREED);
605 } 605 }
606 606
607 static inline int free_pages_check(struct page *page) 607 static inline int free_pages_check(struct page *page)
608 { 608 {
609 if (unlikely(page_mapcount(page) | 609 if (unlikely(page_mapcount(page) |
610 (page->mapping != NULL) | 610 (page->mapping != NULL) |
611 (atomic_read(&page->_count) != 0) | 611 (atomic_read(&page->_count) != 0) |
612 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 612 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
613 (mem_cgroup_bad_page_check(page)))) { 613 (mem_cgroup_bad_page_check(page)))) {
614 bad_page(page); 614 bad_page(page);
615 return 1; 615 return 1;
616 } 616 }
617 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 617 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
618 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 618 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
619 return 0; 619 return 0;
620 } 620 }
621 621
622 /* 622 /*
623 * Frees a number of pages from the PCP lists 623 * Frees a number of pages from the PCP lists
624 * Assumes all pages on list are in same zone, and of same order. 624 * Assumes all pages on list are in same zone, and of same order.
625 * count is the number of pages to free. 625 * count is the number of pages to free.
626 * 626 *
627 * If the zone was previously in an "all pages pinned" state then look to 627 * If the zone was previously in an "all pages pinned" state then look to
628 * see if this freeing clears that state. 628 * see if this freeing clears that state.
629 * 629 *
630 * And clear the zone's pages_scanned counter, to hold off the "all pages are 630 * And clear the zone's pages_scanned counter, to hold off the "all pages are
631 * pinned" detection logic. 631 * pinned" detection logic.
632 */ 632 */
633 static void free_pcppages_bulk(struct zone *zone, int count, 633 static void free_pcppages_bulk(struct zone *zone, int count,
634 struct per_cpu_pages *pcp) 634 struct per_cpu_pages *pcp)
635 { 635 {
636 int migratetype = 0; 636 int migratetype = 0;
637 int batch_free = 0; 637 int batch_free = 0;
638 int to_free = count; 638 int to_free = count;
639 639
640 spin_lock(&zone->lock); 640 spin_lock(&zone->lock);
641 zone->all_unreclaimable = 0; 641 zone->all_unreclaimable = 0;
642 zone->pages_scanned = 0; 642 zone->pages_scanned = 0;
643 643
644 while (to_free) { 644 while (to_free) {
645 struct page *page; 645 struct page *page;
646 struct list_head *list; 646 struct list_head *list;
647 647
648 /* 648 /*
649 * Remove pages from lists in a round-robin fashion. A 649 * Remove pages from lists in a round-robin fashion. A
650 * batch_free count is maintained that is incremented when an 650 * batch_free count is maintained that is incremented when an
651 * empty list is encountered. This is so more pages are freed 651 * empty list is encountered. This is so more pages are freed
652 * off fuller lists instead of spinning excessively around empty 652 * off fuller lists instead of spinning excessively around empty
653 * lists 653 * lists
654 */ 654 */
655 do { 655 do {
656 batch_free++; 656 batch_free++;
657 if (++migratetype == MIGRATE_PCPTYPES) 657 if (++migratetype == MIGRATE_PCPTYPES)
658 migratetype = 0; 658 migratetype = 0;
659 list = &pcp->lists[migratetype]; 659 list = &pcp->lists[migratetype];
660 } while (list_empty(list)); 660 } while (list_empty(list));
661 661
662 /* This is the only non-empty list. Free them all. */ 662 /* This is the only non-empty list. Free them all. */
663 if (batch_free == MIGRATE_PCPTYPES) 663 if (batch_free == MIGRATE_PCPTYPES)
664 batch_free = to_free; 664 batch_free = to_free;
665 665
666 do { 666 do {
667 page = list_entry(list->prev, struct page, lru); 667 page = list_entry(list->prev, struct page, lru);
668 /* must delete as __free_one_page list manipulates */ 668 /* must delete as __free_one_page list manipulates */
669 list_del(&page->lru); 669 list_del(&page->lru);
670 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 670 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
671 __free_one_page(page, zone, 0, page_private(page)); 671 __free_one_page(page, zone, 0, page_private(page));
672 trace_mm_page_pcpu_drain(page, 0, page_private(page)); 672 trace_mm_page_pcpu_drain(page, 0, page_private(page));
673 } while (--to_free && --batch_free && !list_empty(list)); 673 } while (--to_free && --batch_free && !list_empty(list));
674 } 674 }
675 __mod_zone_page_state(zone, NR_FREE_PAGES, count); 675 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
676 spin_unlock(&zone->lock); 676 spin_unlock(&zone->lock);
677 } 677 }
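The round-robin in free_pcppages_bulk() is easiest to see with small numbers: every empty per-migratetype list the scan has to skip over bumps batch_free, so the next non-empty list gives up more pages in one go, and a list that turns out to be the only non-empty one absorbs the whole remaining count. A standalone sketch with the lists reduced to plain counters (the counts are made up):

/* Standalone sketch, not kernel code: the round-robin/batch_free scheme of
 * free_pcppages_bulk(), with the three pcp lists reduced to counters. */
#include <stdio.h>

#define MIGRATE_PCPTYPES 3

int main(void)
{
        int lists[MIGRATE_PCPTYPES] = { 5, 0, 2 };      /* made-up page counts */
        int to_free = 6, migratetype = 0;

        while (to_free) {
                int batch_free = 0;

                /* advance round-robin, counting the lists we had to touch */
                do {
                        batch_free++;
                        migratetype = (migratetype + 1) % MIGRATE_PCPTYPES;
                } while (lists[migratetype] == 0);

                /* only non-empty list left: let it absorb the rest */
                if (batch_free == MIGRATE_PCPTYPES)
                        batch_free = to_free;

                do {
                        lists[migratetype]--;
                        printf("freed one page from pcp list %d\n", migratetype);
                } while (--to_free && --batch_free && lists[migratetype]);
        }
        return 0;
}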
678 678
679 static void free_one_page(struct zone *zone, struct page *page, int order, 679 static void free_one_page(struct zone *zone, struct page *page, int order,
680 int migratetype) 680 int migratetype)
681 { 681 {
682 spin_lock(&zone->lock); 682 spin_lock(&zone->lock);
683 zone->all_unreclaimable = 0; 683 zone->all_unreclaimable = 0;
684 zone->pages_scanned = 0; 684 zone->pages_scanned = 0;
685 685
686 __free_one_page(page, zone, order, migratetype); 686 __free_one_page(page, zone, order, migratetype);
687 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 687 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
688 spin_unlock(&zone->lock); 688 spin_unlock(&zone->lock);
689 } 689 }
690 690
691 static bool free_pages_prepare(struct page *page, unsigned int order) 691 static bool free_pages_prepare(struct page *page, unsigned int order)
692 { 692 {
693 int i; 693 int i;
694 int bad = 0; 694 int bad = 0;
695 695
696 trace_mm_page_free(page, order); 696 trace_mm_page_free(page, order);
697 kmemcheck_free_shadow(page, order); 697 kmemcheck_free_shadow(page, order);
698 698
699 if (PageAnon(page)) 699 if (PageAnon(page))
700 page->mapping = NULL; 700 page->mapping = NULL;
701 for (i = 0; i < (1 << order); i++) 701 for (i = 0; i < (1 << order); i++)
702 bad += free_pages_check(page + i); 702 bad += free_pages_check(page + i);
703 if (bad) 703 if (bad)
704 return false; 704 return false;
705 705
706 if (!PageHighMem(page)) { 706 if (!PageHighMem(page)) {
707 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 707 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
708 debug_check_no_obj_freed(page_address(page), 708 debug_check_no_obj_freed(page_address(page),
709 PAGE_SIZE << order); 709 PAGE_SIZE << order);
710 } 710 }
711 arch_free_page(page, order); 711 arch_free_page(page, order);
712 kernel_map_pages(page, 1 << order, 0); 712 kernel_map_pages(page, 1 << order, 0);
713 713
714 return true; 714 return true;
715 } 715 }
716 716
717 static void __free_pages_ok(struct page *page, unsigned int order) 717 static void __free_pages_ok(struct page *page, unsigned int order)
718 { 718 {
719 unsigned long flags; 719 unsigned long flags;
720 int wasMlocked = __TestClearPageMlocked(page); 720 int wasMlocked = __TestClearPageMlocked(page);
721 721
722 if (!free_pages_prepare(page, order)) 722 if (!free_pages_prepare(page, order))
723 return; 723 return;
724 724
725 local_irq_save(flags); 725 local_irq_save(flags);
726 if (unlikely(wasMlocked)) 726 if (unlikely(wasMlocked))
727 free_page_mlock(page); 727 free_page_mlock(page);
728 __count_vm_events(PGFREE, 1 << order); 728 __count_vm_events(PGFREE, 1 << order);
729 free_one_page(page_zone(page), page, order, 729 free_one_page(page_zone(page), page, order,
730 get_pageblock_migratetype(page)); 730 get_pageblock_migratetype(page));
731 local_irq_restore(flags); 731 local_irq_restore(flags);
732 } 732 }
733 733
734 void __meminit __free_pages_bootmem(struct page *page, unsigned int order) 734 void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
735 { 735 {
736 unsigned int nr_pages = 1 << order; 736 unsigned int nr_pages = 1 << order;
737 unsigned int loop; 737 unsigned int loop;
738 738
739 prefetchw(page); 739 prefetchw(page);
740 for (loop = 0; loop < nr_pages; loop++) { 740 for (loop = 0; loop < nr_pages; loop++) {
741 struct page *p = &page[loop]; 741 struct page *p = &page[loop];
742 742
743 if (loop + 1 < nr_pages) 743 if (loop + 1 < nr_pages)
744 prefetchw(p + 1); 744 prefetchw(p + 1);
745 __ClearPageReserved(p); 745 __ClearPageReserved(p);
746 set_page_count(p, 0); 746 set_page_count(p, 0);
747 } 747 }
748 748
749 set_page_refcounted(page); 749 set_page_refcounted(page);
750 __free_pages(page, order); 750 __free_pages(page, order);
751 } 751 }
752 752
753 #ifdef CONFIG_CMA 753 #ifdef CONFIG_CMA
754 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 754 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
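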
755 void __init init_cma_reserved_pageblock(struct page *page) 755 void __init init_cma_reserved_pageblock(struct page *page)
756 { 756 {
757 unsigned i = pageblock_nr_pages; 757 unsigned i = pageblock_nr_pages;
758 struct page *p = page; 758 struct page *p = page;
759 759
760 do { 760 do {
761 __ClearPageReserved(p); 761 __ClearPageReserved(p);
762 set_page_count(p, 0); 762 set_page_count(p, 0);
763 } while (++p, --i); 763 } while (++p, --i);
764 764
765 set_page_refcounted(page); 765 set_page_refcounted(page);
766 set_pageblock_migratetype(page, MIGRATE_CMA); 766 set_pageblock_migratetype(page, MIGRATE_CMA);
767 __free_pages(page, pageblock_order); 767 __free_pages(page, pageblock_order);
768 totalram_pages += pageblock_nr_pages; 768 totalram_pages += pageblock_nr_pages;
769 } 769 }
770 #endif 770 #endif
771 771
772 /* 772 /*
773 * The order of subdivision here is critical for the IO subsystem. 773 * The order of subdivision here is critical for the IO subsystem.
774 * Please do not alter this order without good reasons and regression 774 * Please do not alter this order without good reasons and regression
775 * testing. Specifically, as large blocks of memory are subdivided, 775 * testing. Specifically, as large blocks of memory are subdivided,
776 * the order in which smaller blocks are delivered depends on the order 776 * the order in which smaller blocks are delivered depends on the order
777 * they're subdivided in this function. This is the primary factor 777 * they're subdivided in this function. This is the primary factor
778 * influencing the order in which pages are delivered to the IO 778 * influencing the order in which pages are delivered to the IO
779 * subsystem according to empirical testing, and this is also justified 779 * subsystem according to empirical testing, and this is also justified
780 * by considering the behavior of a buddy system containing a single 780 * by considering the behavior of a buddy system containing a single
781 * large block of memory acted on by a series of small allocations. 781 * large block of memory acted on by a series of small allocations.
782 * This behavior is a critical factor in sglist merging's success. 782 * This behavior is a critical factor in sglist merging's success.
783 * 783 *
784 * -- wli 784 * -- wli
785 */ 785 */
786 static inline void expand(struct zone *zone, struct page *page, 786 static inline void expand(struct zone *zone, struct page *page,
787 int low, int high, struct free_area *area, 787 int low, int high, struct free_area *area,
788 int migratetype) 788 int migratetype)
789 { 789 {
790 unsigned long size = 1 << high; 790 unsigned long size = 1 << high;
791 791
792 while (high > low) { 792 while (high > low) {
793 area--; 793 area--;
794 high--; 794 high--;
795 size >>= 1; 795 size >>= 1;
796 VM_BUG_ON(bad_range(zone, &page[size])); 796 VM_BUG_ON(bad_range(zone, &page[size]));
797 797
798 #ifdef CONFIG_DEBUG_PAGEALLOC 798 #ifdef CONFIG_DEBUG_PAGEALLOC
799 if (high < debug_guardpage_minorder()) { 799 if (high < debug_guardpage_minorder()) {
800 /* 800 /*
801 * Mark as guard pages (or page), that will allow to 801 * Mark as guard pages (or page), that will allow to
802 * merge back to allocator when buddy will be freed. 802 * merge back to allocator when buddy will be freed.
803 * Corresponding page table entries will not be touched, 803 * Corresponding page table entries will not be touched,
804 * pages will stay not present in virtual address space 804 * pages will stay not present in virtual address space
805 */ 805 */
806 INIT_LIST_HEAD(&page[size].lru); 806 INIT_LIST_HEAD(&page[size].lru);
807 set_page_guard_flag(&page[size]); 807 set_page_guard_flag(&page[size]);
808 set_page_private(&page[size], high); 808 set_page_private(&page[size], high);
809 /* Guard pages are not available for any usage */ 809 /* Guard pages are not available for any usage */
810 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); 810 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
811 continue; 811 continue;
812 } 812 }
813 #endif 813 #endif
814 list_add(&page[size].lru, &area->free_list[migratetype]); 814 list_add(&page[size].lru, &area->free_list[migratetype]);
815 area->nr_free++; 815 area->nr_free++;
816 set_page_order(&page[size], high); 816 set_page_order(&page[size], high);
817 } 817 }
818 } 818 }
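expand() hands the caller the low half of the block and returns each successively smaller upper half to the free list one order down, which is exactly the subdivision order the comment above cares about. A standalone sketch for an order-3 block serving an order-0 request (offsets are relative to the start of the block):

/* Standalone sketch, not kernel code: the halving done by expand() for an
 * order-3 block satisfying an order-0 request. Each pass returns the upper
 * half of the remaining block to the free list one order lower. */
#include <stdio.h>

int main(void)
{
        int low = 0, high = 3;
        unsigned long size = 1UL << high;

        while (high > low) {
                high--;
                size >>= 1;
                printf("put back pages [%lu..%lu) as an order-%d block\n",
                       size, 2 * size, high);
        }
        printf("pages [0..%lu) are handed to the caller\n", 1UL << low);
        return 0;
}

Under CONFIG_DEBUG_PAGEALLOC the same halves can instead be turned into guard pages, as the #ifdef branch inside expand() shows.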
819 819
820 /* 820 /*
821 * This page is about to be returned from the page allocator 821 * This page is about to be returned from the page allocator
822 */ 822 */
823 static inline int check_new_page(struct page *page) 823 static inline int check_new_page(struct page *page)
824 { 824 {
825 if (unlikely(page_mapcount(page) | 825 if (unlikely(page_mapcount(page) |
826 (page->mapping != NULL) | 826 (page->mapping != NULL) |
827 (atomic_read(&page->_count) != 0) | 827 (atomic_read(&page->_count) != 0) |
828 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 828 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
829 (mem_cgroup_bad_page_check(page)))) { 829 (mem_cgroup_bad_page_check(page)))) {
830 bad_page(page); 830 bad_page(page);
831 return 1; 831 return 1;
832 } 832 }
833 return 0; 833 return 0;
834 } 834 }
835 835
836 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 836 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
837 { 837 {
838 int i; 838 int i;
839 839
840 for (i = 0; i < (1 << order); i++) { 840 for (i = 0; i < (1 << order); i++) {
841 struct page *p = page + i; 841 struct page *p = page + i;
842 if (unlikely(check_new_page(p))) 842 if (unlikely(check_new_page(p)))
843 return 1; 843 return 1;
844 } 844 }
845 845
846 set_page_private(page, 0); 846 set_page_private(page, 0);
847 set_page_refcounted(page); 847 set_page_refcounted(page);
848 848
849 arch_alloc_page(page, order); 849 arch_alloc_page(page, order);
850 kernel_map_pages(page, 1 << order, 1); 850 kernel_map_pages(page, 1 << order, 1);
851 851
852 if (gfp_flags & __GFP_ZERO) 852 if (gfp_flags & __GFP_ZERO)
853 prep_zero_page(page, order, gfp_flags); 853 prep_zero_page(page, order, gfp_flags);
854 854
855 if (order && (gfp_flags & __GFP_COMP)) 855 if (order && (gfp_flags & __GFP_COMP))
856 prep_compound_page(page, order); 856 prep_compound_page(page, order);
857 857
858 return 0; 858 return 0;
859 } 859 }
860 860
861 /* 861 /*
862 * Go through the free lists for the given migratetype and remove 862 * Go through the free lists for the given migratetype and remove
863 * the smallest available page from the freelists 863 * the smallest available page from the freelists
864 */ 864 */
865 static inline 865 static inline
866 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 866 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
867 int migratetype) 867 int migratetype)
868 { 868 {
869 unsigned int current_order; 869 unsigned int current_order;
870 struct free_area * area; 870 struct free_area * area;
871 struct page *page; 871 struct page *page;
872 872
873 /* Find a page of the appropriate size in the preferred list */ 873 /* Find a page of the appropriate size in the preferred list */
874 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 874 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
875 area = &(zone->free_area[current_order]); 875 area = &(zone->free_area[current_order]);
876 if (list_empty(&area->free_list[migratetype])) 876 if (list_empty(&area->free_list[migratetype]))
877 continue; 877 continue;
878 878
879 page = list_entry(area->free_list[migratetype].next, 879 page = list_entry(area->free_list[migratetype].next,
880 struct page, lru); 880 struct page, lru);
881 list_del(&page->lru); 881 list_del(&page->lru);
882 rmv_page_order(page); 882 rmv_page_order(page);
883 area->nr_free--; 883 area->nr_free--;
884 expand(zone, page, order, current_order, area, migratetype); 884 expand(zone, page, order, current_order, area, migratetype);
885 return page; 885 return page;
886 } 886 }
887 887
888 return NULL; 888 return NULL;
889 } 889 }
890 890
891 891
892 /* 892 /*
893 * This array describes the order lists are fallen back to when 893 * This array describes the order lists are fallen back to when
894 * the free lists for the desirable migrate type are depleted 894 * the free lists for the desirable migrate type are depleted
895 */ 895 */
896 static int fallbacks[MIGRATE_TYPES][4] = { 896 static int fallbacks[MIGRATE_TYPES][4] = {
897 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 897 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
898 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 898 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
899 #ifdef CONFIG_CMA 899 #ifdef CONFIG_CMA
900 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 900 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
901 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 901 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
902 #else 902 #else
903 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 903 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
904 #endif 904 #endif
905 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 905 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
906 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 906 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
907 }; 907 };
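Each row above lists, in order, the migrate types __rmqueue_fallback() below is allowed to steal from when the requested type is exhausted, with MIGRATE_RESERVE acting as the stop marker. A standalone sketch of how one row is walked (migrate types reduced to strings, non-CMA configuration assumed):

/* Standalone sketch, not kernel code: walking one fallback row the way
 * __rmqueue_fallback() does, stopping at the RESERVE sentinel. */
#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *unmovable_fallbacks[] = {
                "RECLAIMABLE", "MOVABLE", "RESERVE"
        };
        int i;

        for (i = 0; ; i++) {
                const char *mt = unmovable_fallbacks[i];

                if (!strcmp(mt, "RESERVE"))
                        break;          /* handled later if necessary */
                printf("UNMOVABLE falls back to %s\n", mt);
        }
        return 0;
}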
908 908
909 /* 909 /*
910 * Move the free pages in a range to the free lists of the requested type. 910 * Move the free pages in a range to the free lists of the requested type.
911 * Note that start_page and end_pages are not aligned on a pageblock 911 * Note that start_page and end_pages are not aligned on a pageblock
912 * boundary. If alignment is required, use move_freepages_block() 912 * boundary. If alignment is required, use move_freepages_block()
913 */ 913 */
914 static int move_freepages(struct zone *zone, 914 static int move_freepages(struct zone *zone,
915 struct page *start_page, struct page *end_page, 915 struct page *start_page, struct page *end_page,
916 int migratetype) 916 int migratetype)
917 { 917 {
918 struct page *page; 918 struct page *page;
919 unsigned long order; 919 unsigned long order;
920 int pages_moved = 0; 920 int pages_moved = 0;
921 921
922 #ifndef CONFIG_HOLES_IN_ZONE 922 #ifndef CONFIG_HOLES_IN_ZONE
923 /* 923 /*
924 * page_zone is not safe to call in this context when 924 * page_zone is not safe to call in this context when
925 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 925 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
926 * anyway as we check zone boundaries in move_freepages_block(). 926 * anyway as we check zone boundaries in move_freepages_block().
927 * Remove at a later date when no bug reports exist related to 927 * Remove at a later date when no bug reports exist related to
928 * grouping pages by mobility 928 * grouping pages by mobility
929 */ 929 */
930 BUG_ON(page_zone(start_page) != page_zone(end_page)); 930 BUG_ON(page_zone(start_page) != page_zone(end_page));
931 #endif 931 #endif
932 932
933 for (page = start_page; page <= end_page;) { 933 for (page = start_page; page <= end_page;) {
934 /* Make sure we are not inadvertently changing nodes */ 934 /* Make sure we are not inadvertently changing nodes */
935 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 935 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
936 936
937 if (!pfn_valid_within(page_to_pfn(page))) { 937 if (!pfn_valid_within(page_to_pfn(page))) {
938 page++; 938 page++;
939 continue; 939 continue;
940 } 940 }
941 941
942 if (!PageBuddy(page)) { 942 if (!PageBuddy(page)) {
943 page++; 943 page++;
944 continue; 944 continue;
945 } 945 }
946 946
947 order = page_order(page); 947 order = page_order(page);
948 list_move(&page->lru, 948 list_move(&page->lru,
949 &zone->free_area[order].free_list[migratetype]); 949 &zone->free_area[order].free_list[migratetype]);
950 page += 1 << order; 950 page += 1 << order;
951 pages_moved += 1 << order; 951 pages_moved += 1 << order;
952 } 952 }
953 953
954 return pages_moved; 954 return pages_moved;
955 } 955 }
956 956
957 static int move_freepages_block(struct zone *zone, struct page *page, 957 static int move_freepages_block(struct zone *zone, struct page *page,
958 int migratetype) 958 int migratetype)
959 { 959 {
960 unsigned long start_pfn, end_pfn; 960 unsigned long start_pfn, end_pfn;
961 struct page *start_page, *end_page; 961 struct page *start_page, *end_page;
962 962
963 start_pfn = page_to_pfn(page); 963 start_pfn = page_to_pfn(page);
964 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 964 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
965 start_page = pfn_to_page(start_pfn); 965 start_page = pfn_to_page(start_pfn);
966 end_page = start_page + pageblock_nr_pages - 1; 966 end_page = start_page + pageblock_nr_pages - 1;
967 end_pfn = start_pfn + pageblock_nr_pages - 1; 967 end_pfn = start_pfn + pageblock_nr_pages - 1;
968 968
969 /* Do not cross zone boundaries */ 969 /* Do not cross zone boundaries */
970 if (start_pfn < zone->zone_start_pfn) 970 if (start_pfn < zone->zone_start_pfn)
971 start_page = page; 971 start_page = page;
972 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) 972 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
973 return 0; 973 return 0;
974 974
975 return move_freepages(zone, start_page, end_page, migratetype); 975 return move_freepages(zone, start_page, end_page, migratetype);
976 } 976 }
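move_freepages_block() rounds the page's pfn down to a pageblock boundary with a mask, derives the block's last pfn from that, and clamps the start or bails out entirely when the block straddles the zone edge. A standalone sketch of the rounding, with pageblock_nr_pages assumed to be 512 purely for illustration:

/* Standalone sketch, not kernel code: the pfn rounding move_freepages_block()
 * performs. pageblock_nr is assumed to be 512 here; in the kernel it is
 * 1 << pageblock_order. */
#include <stdio.h>

int main(void)
{
        unsigned long pageblock_nr = 512;       /* illustrative value */
        unsigned long pfn = 1300;

        unsigned long start_pfn = pfn & ~(pageblock_nr - 1);
        unsigned long end_pfn = start_pfn + pageblock_nr - 1;

        printf("pfn %lu lives in pageblock [%lu..%lu]\n", pfn, start_pfn, end_pfn);
        return 0;
}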
977 977
978 static void change_pageblock_range(struct page *pageblock_page, 978 static void change_pageblock_range(struct page *pageblock_page,
979 int start_order, int migratetype) 979 int start_order, int migratetype)
980 { 980 {
981 int nr_pageblocks = 1 << (start_order - pageblock_order); 981 int nr_pageblocks = 1 << (start_order - pageblock_order);
982 982
983 while (nr_pageblocks--) { 983 while (nr_pageblocks--) {
984 set_pageblock_migratetype(pageblock_page, migratetype); 984 set_pageblock_migratetype(pageblock_page, migratetype);
985 pageblock_page += pageblock_nr_pages; 985 pageblock_page += pageblock_nr_pages;
986 } 986 }
987 } 987 }
988 988
989 /* Remove an element from the buddy allocator from the fallback list */ 989 /* Remove an element from the buddy allocator from the fallback list */
990 static inline struct page * 990 static inline struct page *
991 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 991 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
992 { 992 {
993 struct free_area * area; 993 struct free_area * area;
994 int current_order; 994 int current_order;
995 struct page *page; 995 struct page *page;
996 int migratetype, i; 996 int migratetype, i;
997 997
998 /* Find the largest possible block of pages in the other list */ 998 /* Find the largest possible block of pages in the other list */
999 for (current_order = MAX_ORDER-1; current_order >= order; 999 for (current_order = MAX_ORDER-1; current_order >= order;
1000 --current_order) { 1000 --current_order) {
1001 for (i = 0;; i++) { 1001 for (i = 0;; i++) {
1002 migratetype = fallbacks[start_migratetype][i]; 1002 migratetype = fallbacks[start_migratetype][i];
1003 1003
1004 /* MIGRATE_RESERVE handled later if necessary */ 1004 /* MIGRATE_RESERVE handled later if necessary */
1005 if (migratetype == MIGRATE_RESERVE) 1005 if (migratetype == MIGRATE_RESERVE)
1006 break; 1006 break;
1007 1007
1008 area = &(zone->free_area[current_order]); 1008 area = &(zone->free_area[current_order]);
1009 if (list_empty(&area->free_list[migratetype])) 1009 if (list_empty(&area->free_list[migratetype]))
1010 continue; 1010 continue;
1011 1011
1012 page = list_entry(area->free_list[migratetype].next, 1012 page = list_entry(area->free_list[migratetype].next,
1013 struct page, lru); 1013 struct page, lru);
1014 area->nr_free--; 1014 area->nr_free--;
1015 1015
1016 /* 1016 /*
1017 * If breaking a large block of pages, move all free 1017 * If breaking a large block of pages, move all free
1018 * pages to the preferred allocation list. If falling 1018 * pages to the preferred allocation list. If falling
1019 * back for a reclaimable kernel allocation, be more 1019 * back for a reclaimable kernel allocation, be more
1020 * aggressive about taking ownership of free pages 1020 * aggressive about taking ownership of free pages
1021 * 1021 *
1022 * On the other hand, never change migration 1022 * On the other hand, never change migration
1023 * type of MIGRATE_CMA pageblocks nor move CMA 1023 * type of MIGRATE_CMA pageblocks nor move CMA
1024 * pages on different free lists. We don't 1024 * pages on different free lists. We don't
1025 * want unmovable pages to be allocated from 1025 * want unmovable pages to be allocated from
1026 * MIGRATE_CMA areas. 1026 * MIGRATE_CMA areas.
1027 */ 1027 */
1028 if (!is_migrate_cma(migratetype) && 1028 if (!is_migrate_cma(migratetype) &&
1029 (unlikely(current_order >= pageblock_order / 2) || 1029 (unlikely(current_order >= pageblock_order / 2) ||
1030 start_migratetype == MIGRATE_RECLAIMABLE || 1030 start_migratetype == MIGRATE_RECLAIMABLE ||
1031 page_group_by_mobility_disabled)) { 1031 page_group_by_mobility_disabled)) {
1032 int pages; 1032 int pages;
1033 pages = move_freepages_block(zone, page, 1033 pages = move_freepages_block(zone, page,
1034 start_migratetype); 1034 start_migratetype);
1035 1035
1036 /* Claim the whole block if over half of it is free */ 1036 /* Claim the whole block if over half of it is free */
1037 if (pages >= (1 << (pageblock_order-1)) || 1037 if (pages >= (1 << (pageblock_order-1)) ||
1038 page_group_by_mobility_disabled) 1038 page_group_by_mobility_disabled)
1039 set_pageblock_migratetype(page, 1039 set_pageblock_migratetype(page,
1040 start_migratetype); 1040 start_migratetype);
1041 1041
1042 migratetype = start_migratetype; 1042 migratetype = start_migratetype;
1043 } 1043 }
1044 1044
1045 /* Remove the page from the freelists */ 1045 /* Remove the page from the freelists */
1046 list_del(&page->lru); 1046 list_del(&page->lru);
1047 rmv_page_order(page); 1047 rmv_page_order(page);
1048 1048
1049 /* Take ownership for orders >= pageblock_order */ 1049 /* Take ownership for orders >= pageblock_order */
1050 if (current_order >= pageblock_order && 1050 if (current_order >= pageblock_order &&
1051 !is_migrate_cma(migratetype)) 1051 !is_migrate_cma(migratetype))
1052 change_pageblock_range(page, current_order, 1052 change_pageblock_range(page, current_order,
1053 start_migratetype); 1053 start_migratetype);
1054 1054
1055 expand(zone, page, order, current_order, area, 1055 expand(zone, page, order, current_order, area,
1056 is_migrate_cma(migratetype) 1056 is_migrate_cma(migratetype)
1057 ? migratetype : start_migratetype); 1057 ? migratetype : start_migratetype);
1058 1058
1059 trace_mm_page_alloc_extfrag(page, order, current_order, 1059 trace_mm_page_alloc_extfrag(page, order, current_order,
1060 start_migratetype, migratetype); 1060 start_migratetype, migratetype);
1061 1061
1062 return page; 1062 return page;
1063 } 1063 }
1064 } 1064 }
1065 1065
1066 return NULL; 1066 return NULL;
1067 } 1067 }
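
__rmqueue_fallback() above walks orders from largest to smallest and, within each order, tries the other migratetypes in a fixed preference order, so when it must steal from a foreign free list it steals the biggest block available and can often convert a whole pageblock at once. Below is a minimal userspace sketch of just that scan; the fallbacks table and free counts are illustrative stand-ins, not the kernel's actual fallbacks[] contents (which also knows about MIGRATE_CMA).

#include <stdio.h>

#define MAX_ORDER 11
enum { MT_UNMOVABLE, MT_RECLAIMABLE, MT_MOVABLE, MT_RESERVE, NR_MT };

/* Illustrative preference order only; the real table lives in mm/page_alloc.c. */
static const int fallbacks[NR_MT][3] = {
        [MT_UNMOVABLE]   = { MT_RECLAIMABLE, MT_MOVABLE,   MT_RESERVE },
        [MT_RECLAIMABLE] = { MT_UNMOVABLE,   MT_MOVABLE,   MT_RESERVE },
        [MT_MOVABLE]     = { MT_RECLAIMABLE, MT_UNMOVABLE, MT_RESERVE },
};

/* nr_free[order][migratetype]: how many free blocks of that size exist. */
static int nr_free[MAX_ORDER][NR_MT];

/* Scan like __rmqueue_fallback(): biggest order first, then the fallback
 * migratetypes in preference order, stopping at MIGRATE_RESERVE. */
static int find_fallback(int order, int start_mt, int *out_order)
{
        for (int current_order = MAX_ORDER - 1; current_order >= order; current_order--) {
                for (int i = 0; i < 3; i++) {
                        int mt = fallbacks[start_mt][i];
                        if (mt == MT_RESERVE)
                                break;          /* handled later by __rmqueue() */
                        if (nr_free[current_order][mt]) {
                                *out_order = current_order;
                                return mt;
                        }
                }
        }
        return -1;
}

int main(void)
{
        int got_order;

        nr_free[3][MT_MOVABLE] = 1;             /* one free order-3 movable block */
        int mt = find_fallback(1, MT_UNMOVABLE, &got_order);
        printf("steal order-%d block from migratetype %d\n", got_order, mt);
        return 0;
}
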
1068 1068
1069 /* 1069 /*
1070 * Do the hard work of removing an element from the buddy allocator. 1070 * Do the hard work of removing an element from the buddy allocator.
1071 * Call me with the zone->lock already held. 1071 * Call me with the zone->lock already held.
1072 */ 1072 */
1073 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1073 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1074 int migratetype) 1074 int migratetype)
1075 { 1075 {
1076 struct page *page; 1076 struct page *page;
1077 1077
1078 retry_reserve: 1078 retry_reserve:
1079 page = __rmqueue_smallest(zone, order, migratetype); 1079 page = __rmqueue_smallest(zone, order, migratetype);
1080 1080
1081 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1081 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1082 page = __rmqueue_fallback(zone, order, migratetype); 1082 page = __rmqueue_fallback(zone, order, migratetype);
1083 1083
1084 /* 1084 /*
1085 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1085 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1086 * is used because __rmqueue_smallest is an inline function 1086 * is used because __rmqueue_smallest is an inline function
1087 * and we want just one call site 1087 * and we want just one call site
1088 */ 1088 */
1089 if (!page) { 1089 if (!page) {
1090 migratetype = MIGRATE_RESERVE; 1090 migratetype = MIGRATE_RESERVE;
1091 goto retry_reserve; 1091 goto retry_reserve;
1092 } 1092 }
1093 } 1093 }
1094 1094
1095 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1095 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1096 return page; 1096 return page;
1097 } 1097 }
1098 1098
1099 /* 1099 /*
1100 * Obtain a specified number of elements from the buddy allocator, all under 1100 * Obtain a specified number of elements from the buddy allocator, all under
1101 * a single hold of the lock, for efficiency. Add them to the supplied list. 1101 * a single hold of the lock, for efficiency. Add them to the supplied list.
1102 * Returns the number of new pages which were placed at *list. 1102 * Returns the number of new pages which were placed at *list.
1103 */ 1103 */
1104 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1104 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1105 unsigned long count, struct list_head *list, 1105 unsigned long count, struct list_head *list,
1106 int migratetype, int cold) 1106 int migratetype, int cold)
1107 { 1107 {
1108 int mt = migratetype, i; 1108 int mt = migratetype, i;
1109 1109
1110 spin_lock(&zone->lock); 1110 spin_lock(&zone->lock);
1111 for (i = 0; i < count; ++i) { 1111 for (i = 0; i < count; ++i) {
1112 struct page *page = __rmqueue(zone, order, migratetype); 1112 struct page *page = __rmqueue(zone, order, migratetype);
1113 if (unlikely(page == NULL)) 1113 if (unlikely(page == NULL))
1114 break; 1114 break;
1115 1115
1116 /* 1116 /*
1117 * Split buddy pages returned by expand() are received here 1117 * Split buddy pages returned by expand() are received here
1118 * in physical page order. The page is added to the caller's 1118 * in physical page order. The page is added to the caller's
1119 * list and the list head then moves forward. From the caller's 1119 * list and the list head then moves forward. From the caller's
1120 * perspective, the linked list is ordered by page number in 1120 * perspective, the linked list is ordered by page number in
1121 * some conditions. This is useful for IO devices that can 1121 * some conditions. This is useful for IO devices that can
1122 * merge IO requests if the physical pages are ordered 1122 * merge IO requests if the physical pages are ordered
1123 * properly. 1123 * properly.
1124 */ 1124 */
1125 if (likely(cold == 0)) 1125 if (likely(cold == 0))
1126 list_add(&page->lru, list); 1126 list_add(&page->lru, list);
1127 else 1127 else
1128 list_add_tail(&page->lru, list); 1128 list_add_tail(&page->lru, list);
1129 if (IS_ENABLED(CONFIG_CMA)) { 1129 if (IS_ENABLED(CONFIG_CMA)) {
1130 mt = get_pageblock_migratetype(page); 1130 mt = get_pageblock_migratetype(page);
1131 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) 1131 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
1132 mt = migratetype; 1132 mt = migratetype;
1133 } 1133 }
1134 set_page_private(page, mt); 1134 set_page_private(page, mt);
1135 list = &page->lru; 1135 list = &page->lru;
1136 } 1136 }
1137 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1137 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1138 spin_unlock(&zone->lock); 1138 spin_unlock(&zone->lock);
1139 return i; 1139 return i;
1140 } 1140 }
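
The `list = &page->lru;` at the end of the loop above is what the comment about physical ordering relies on: each new page is linked in right after the previous one rather than at the original head, so a batch refill comes out in ascending page order. A small userspace sketch of just that list trick, using a toy version of the kernel's circular list and made-up pfn values:

#include <stdio.h>

/* Toy version of the kernel's circular doubly linked list. */
struct node { int pfn; struct node *prev, *next; };

static void list_add(struct node *new, struct node *head)  /* insert right after head */
{
        new->next = head->next;
        new->prev = head;
        head->next->prev = new;
        head->next = new;
}

int main(void)
{
        struct node head = { .pfn = -1 };
        struct node pages[4] = { { 100 }, { 101 }, { 102 }, { 103 } };
        struct node *cursor = &head;

        head.next = head.prev = &head;          /* empty list */

        /* Like rmqueue_bulk(): insert after a cursor that follows the last
         * insertion instead of always inserting at the fixed head. */
        for (int i = 0; i < 4; i++) {
                list_add(&pages[i], cursor);
                cursor = &pages[i];
        }

        for (struct node *n = head.next; n != &head; n = n->next)
                printf("%d ", n->pfn);          /* 100 101 102 103: ascending order */
        printf("\n");
        return 0;
}

Had the loop always inserted at the fixed head, the same pages would come out in descending order, which defeats the IO request merging the comment mentions.
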
1141 1141
1142 #ifdef CONFIG_NUMA 1142 #ifdef CONFIG_NUMA
1143 /* 1143 /*
1144 * Called from the vmstat counter updater to drain pagesets of this 1144 * Called from the vmstat counter updater to drain pagesets of this
1145 * currently executing processor on remote nodes after they have 1145 * currently executing processor on remote nodes after they have
1146 * expired. 1146 * expired.
1147 * 1147 *
1148 * Note that this function must be called with the thread pinned to 1148 * Note that this function must be called with the thread pinned to
1149 * a single processor. 1149 * a single processor.
1150 */ 1150 */
1151 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1151 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1152 { 1152 {
1153 unsigned long flags; 1153 unsigned long flags;
1154 int to_drain; 1154 int to_drain;
1155 1155
1156 local_irq_save(flags); 1156 local_irq_save(flags);
1157 if (pcp->count >= pcp->batch) 1157 if (pcp->count >= pcp->batch)
1158 to_drain = pcp->batch; 1158 to_drain = pcp->batch;
1159 else 1159 else
1160 to_drain = pcp->count; 1160 to_drain = pcp->count;
1161 if (to_drain > 0) { 1161 if (to_drain > 0) {
1162 free_pcppages_bulk(zone, to_drain, pcp); 1162 free_pcppages_bulk(zone, to_drain, pcp);
1163 pcp->count -= to_drain; 1163 pcp->count -= to_drain;
1164 } 1164 }
1165 local_irq_restore(flags); 1165 local_irq_restore(flags);
1166 } 1166 }
1167 #endif 1167 #endif
1168 1168
1169 /* 1169 /*
1170 * Drain pages of the indicated processor. 1170 * Drain pages of the indicated processor.
1171 * 1171 *
1172 * The processor must either be the current processor and the 1172 * The processor must either be the current processor and the
1173 * thread pinned to the current processor or a processor that 1173 * thread pinned to the current processor or a processor that
1174 * is not online. 1174 * is not online.
1175 */ 1175 */
1176 static void drain_pages(unsigned int cpu) 1176 static void drain_pages(unsigned int cpu)
1177 { 1177 {
1178 unsigned long flags; 1178 unsigned long flags;
1179 struct zone *zone; 1179 struct zone *zone;
1180 1180
1181 for_each_populated_zone(zone) { 1181 for_each_populated_zone(zone) {
1182 struct per_cpu_pageset *pset; 1182 struct per_cpu_pageset *pset;
1183 struct per_cpu_pages *pcp; 1183 struct per_cpu_pages *pcp;
1184 1184
1185 local_irq_save(flags); 1185 local_irq_save(flags);
1186 pset = per_cpu_ptr(zone->pageset, cpu); 1186 pset = per_cpu_ptr(zone->pageset, cpu);
1187 1187
1188 pcp = &pset->pcp; 1188 pcp = &pset->pcp;
1189 if (pcp->count) { 1189 if (pcp->count) {
1190 free_pcppages_bulk(zone, pcp->count, pcp); 1190 free_pcppages_bulk(zone, pcp->count, pcp);
1191 pcp->count = 0; 1191 pcp->count = 0;
1192 } 1192 }
1193 local_irq_restore(flags); 1193 local_irq_restore(flags);
1194 } 1194 }
1195 } 1195 }
1196 1196
1197 /* 1197 /*
1198 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1198 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1199 */ 1199 */
1200 void drain_local_pages(void *arg) 1200 void drain_local_pages(void *arg)
1201 { 1201 {
1202 drain_pages(smp_processor_id()); 1202 drain_pages(smp_processor_id());
1203 } 1203 }
1204 1204
1205 /* 1205 /*
1206 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1206 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1207 * 1207 *
1208 * Note that this code is protected against sending an IPI to an offline 1208 * Note that this code is protected against sending an IPI to an offline
1209 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1209 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1210 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1210 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1211 * nothing keeps CPUs from showing up after we populated the cpumask and 1211 * nothing keeps CPUs from showing up after we populated the cpumask and
1212 * before the call to on_each_cpu_mask(). 1212 * before the call to on_each_cpu_mask().
1213 */ 1213 */
1214 void drain_all_pages(void) 1214 void drain_all_pages(void)
1215 { 1215 {
1216 int cpu; 1216 int cpu;
1217 struct per_cpu_pageset *pcp; 1217 struct per_cpu_pageset *pcp;
1218 struct zone *zone; 1218 struct zone *zone;
1219 1219
1220 /* 1220 /*
1221 * Allocate in the BSS so we won't require allocation in 1221 * Allocate in the BSS so we won't require allocation in
1222 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1222 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1223 */ 1223 */
1224 static cpumask_t cpus_with_pcps; 1224 static cpumask_t cpus_with_pcps;
1225 1225
1226 /* 1226 /*
1227 * We don't care about racing with CPU hotplug event 1227 * We don't care about racing with CPU hotplug event
1228 * as offline notification will cause the notified 1228 * as offline notification will cause the notified
1229 * cpu to drain that CPU pcps and on_each_cpu_mask 1229 * cpu to drain that CPU pcps and on_each_cpu_mask
1230 * disables preemption as part of its processing 1230 * disables preemption as part of its processing
1231 */ 1231 */
1232 for_each_online_cpu(cpu) { 1232 for_each_online_cpu(cpu) {
1233 bool has_pcps = false; 1233 bool has_pcps = false;
1234 for_each_populated_zone(zone) { 1234 for_each_populated_zone(zone) {
1235 pcp = per_cpu_ptr(zone->pageset, cpu); 1235 pcp = per_cpu_ptr(zone->pageset, cpu);
1236 if (pcp->pcp.count) { 1236 if (pcp->pcp.count) {
1237 has_pcps = true; 1237 has_pcps = true;
1238 break; 1238 break;
1239 } 1239 }
1240 } 1240 }
1241 if (has_pcps) 1241 if (has_pcps)
1242 cpumask_set_cpu(cpu, &cpus_with_pcps); 1242 cpumask_set_cpu(cpu, &cpus_with_pcps);
1243 else 1243 else
1244 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1244 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1245 } 1245 }
1246 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1246 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1247 } 1247 }
1248 1248
1249 #ifdef CONFIG_HIBERNATION 1249 #ifdef CONFIG_HIBERNATION
1250 1250
1251 void mark_free_pages(struct zone *zone) 1251 void mark_free_pages(struct zone *zone)
1252 { 1252 {
1253 unsigned long pfn, max_zone_pfn; 1253 unsigned long pfn, max_zone_pfn;
1254 unsigned long flags; 1254 unsigned long flags;
1255 int order, t; 1255 int order, t;
1256 struct list_head *curr; 1256 struct list_head *curr;
1257 1257
1258 if (!zone->spanned_pages) 1258 if (!zone->spanned_pages)
1259 return; 1259 return;
1260 1260
1261 spin_lock_irqsave(&zone->lock, flags); 1261 spin_lock_irqsave(&zone->lock, flags);
1262 1262
1263 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1263 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1264 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1264 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1265 if (pfn_valid(pfn)) { 1265 if (pfn_valid(pfn)) {
1266 struct page *page = pfn_to_page(pfn); 1266 struct page *page = pfn_to_page(pfn);
1267 1267
1268 if (!swsusp_page_is_forbidden(page)) 1268 if (!swsusp_page_is_forbidden(page))
1269 swsusp_unset_page_free(page); 1269 swsusp_unset_page_free(page);
1270 } 1270 }
1271 1271
1272 for_each_migratetype_order(order, t) { 1272 for_each_migratetype_order(order, t) {
1273 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1273 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1274 unsigned long i; 1274 unsigned long i;
1275 1275
1276 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1276 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1277 for (i = 0; i < (1UL << order); i++) 1277 for (i = 0; i < (1UL << order); i++)
1278 swsusp_set_page_free(pfn_to_page(pfn + i)); 1278 swsusp_set_page_free(pfn_to_page(pfn + i));
1279 } 1279 }
1280 } 1280 }
1281 spin_unlock_irqrestore(&zone->lock, flags); 1281 spin_unlock_irqrestore(&zone->lock, flags);
1282 } 1282 }
1283 #endif /* CONFIG_HIBERNATION */ 1283 #endif /* CONFIG_HIBERNATION */
1284 1284
1285 /* 1285 /*
1286 * Free a 0-order page 1286 * Free a 0-order page
1287 * cold == 1 ? free a cold page : free a hot page 1287 * cold == 1 ? free a cold page : free a hot page
1288 */ 1288 */
1289 void free_hot_cold_page(struct page *page, int cold) 1289 void free_hot_cold_page(struct page *page, int cold)
1290 { 1290 {
1291 struct zone *zone = page_zone(page); 1291 struct zone *zone = page_zone(page);
1292 struct per_cpu_pages *pcp; 1292 struct per_cpu_pages *pcp;
1293 unsigned long flags; 1293 unsigned long flags;
1294 int migratetype; 1294 int migratetype;
1295 int wasMlocked = __TestClearPageMlocked(page); 1295 int wasMlocked = __TestClearPageMlocked(page);
1296 1296
1297 if (!free_pages_prepare(page, 0)) 1297 if (!free_pages_prepare(page, 0))
1298 return; 1298 return;
1299 1299
1300 migratetype = get_pageblock_migratetype(page); 1300 migratetype = get_pageblock_migratetype(page);
1301 set_page_private(page, migratetype); 1301 set_page_private(page, migratetype);
1302 local_irq_save(flags); 1302 local_irq_save(flags);
1303 if (unlikely(wasMlocked)) 1303 if (unlikely(wasMlocked))
1304 free_page_mlock(page); 1304 free_page_mlock(page);
1305 __count_vm_event(PGFREE); 1305 __count_vm_event(PGFREE);
1306 1306
1307 /* 1307 /*
1308 * We only track unmovable, reclaimable and movable on pcp lists. 1308 * We only track unmovable, reclaimable and movable on pcp lists.
1309 * Free ISOLATE pages back to the allocator because they are being 1309 * Free ISOLATE pages back to the allocator because they are being
1310 * offlined but treat RESERVE as movable pages so we can get those 1310 * offlined but treat RESERVE as movable pages so we can get those
1311 * areas back if necessary. Otherwise, we may have to free 1311 * areas back if necessary. Otherwise, we may have to free
1312 * excessively into the page allocator 1312 * excessively into the page allocator
1313 */ 1313 */
1314 if (migratetype >= MIGRATE_PCPTYPES) { 1314 if (migratetype >= MIGRATE_PCPTYPES) {
1315 if (unlikely(migratetype == MIGRATE_ISOLATE)) { 1315 if (unlikely(migratetype == MIGRATE_ISOLATE)) {
1316 free_one_page(zone, page, 0, migratetype); 1316 free_one_page(zone, page, 0, migratetype);
1317 goto out; 1317 goto out;
1318 } 1318 }
1319 migratetype = MIGRATE_MOVABLE; 1319 migratetype = MIGRATE_MOVABLE;
1320 } 1320 }
1321 1321
1322 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1322 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1323 if (cold) 1323 if (cold)
1324 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1324 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1325 else 1325 else
1326 list_add(&page->lru, &pcp->lists[migratetype]); 1326 list_add(&page->lru, &pcp->lists[migratetype]);
1327 pcp->count++; 1327 pcp->count++;
1328 if (pcp->count >= pcp->high) { 1328 if (pcp->count >= pcp->high) {
1329 free_pcppages_bulk(zone, pcp->batch, pcp); 1329 free_pcppages_bulk(zone, pcp->batch, pcp);
1330 pcp->count -= pcp->batch; 1330 pcp->count -= pcp->batch;
1331 } 1331 }
1332 1332
1333 out: 1333 out:
1334 local_irq_restore(flags); 1334 local_irq_restore(flags);
1335 } 1335 }
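
The effect of the pcp->high / pcp->batch pair in free_hot_cold_page() above: the per-cpu list absorbs frees until it reaches `high`, then a whole `batch` of pages is handed back to the buddy allocator at once, so the count saw-tooths instead of growing without bound. A toy trace (the high/batch numbers are made up; the real values are derived from the zone size):

#include <stdio.h>

int main(void)
{
        int high = 90, batch = 30;              /* illustrative values only */
        int count = 0;

        for (int freed = 1; freed <= 200; freed++) {
                count++;                        /* page lands on the pcp list */
                if (count >= high)
                        count -= batch;         /* free_pcppages_bulk(zone, batch, pcp) */
                if (freed % 40 == 0)
                        printf("after %3d frees: pcp count = %d\n", freed, count);
        }
        return 0;
}
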
1336 1336
1337 /* 1337 /*
1338 * Free a list of 0-order pages 1338 * Free a list of 0-order pages
1339 */ 1339 */
1340 void free_hot_cold_page_list(struct list_head *list, int cold) 1340 void free_hot_cold_page_list(struct list_head *list, int cold)
1341 { 1341 {
1342 struct page *page, *next; 1342 struct page *page, *next;
1343 1343
1344 list_for_each_entry_safe(page, next, list, lru) { 1344 list_for_each_entry_safe(page, next, list, lru) {
1345 trace_mm_page_free_batched(page, cold); 1345 trace_mm_page_free_batched(page, cold);
1346 free_hot_cold_page(page, cold); 1346 free_hot_cold_page(page, cold);
1347 } 1347 }
1348 } 1348 }
1349 1349
1350 /* 1350 /*
1351 * split_page takes a non-compound higher-order page, and splits it into 1351 * split_page takes a non-compound higher-order page, and splits it into
1352 * n (1<<order) sub-pages: page[0..n] 1352 * n (1<<order) sub-pages: page[0..n]
1353 * Each sub-page must be freed individually. 1353 * Each sub-page must be freed individually.
1354 * 1354 *
1355 * Note: this is probably too low level an operation for use in drivers. 1355 * Note: this is probably too low level an operation for use in drivers.
1356 * Please consult with lkml before using this in your driver. 1356 * Please consult with lkml before using this in your driver.
1357 */ 1357 */
1358 void split_page(struct page *page, unsigned int order) 1358 void split_page(struct page *page, unsigned int order)
1359 { 1359 {
1360 int i; 1360 int i;
1361 1361
1362 VM_BUG_ON(PageCompound(page)); 1362 VM_BUG_ON(PageCompound(page));
1363 VM_BUG_ON(!page_count(page)); 1363 VM_BUG_ON(!page_count(page));
1364 1364
1365 #ifdef CONFIG_KMEMCHECK 1365 #ifdef CONFIG_KMEMCHECK
1366 /* 1366 /*
1367 * Split shadow pages too, because free(page[0]) would 1367 * Split shadow pages too, because free(page[0]) would
1368 * otherwise free the whole shadow. 1368 * otherwise free the whole shadow.
1369 */ 1369 */
1370 if (kmemcheck_page_is_tracked(page)) 1370 if (kmemcheck_page_is_tracked(page))
1371 split_page(virt_to_page(page[0].shadow), order); 1371 split_page(virt_to_page(page[0].shadow), order);
1372 #endif 1372 #endif
1373 1373
1374 for (i = 1; i < (1 << order); i++) 1374 for (i = 1; i < (1 << order); i++)
1375 set_page_refcounted(page + i); 1375 set_page_refcounted(page + i);
1376 } 1376 }
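
split_page() exists precisely so that a higher-order allocation can later be freed page by page. A minimal kernel-side sketch of that pattern, not part of this patch, and per the comment above rarely the right tool for a driver:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/errno.h>

static int split_example(void)
{
        unsigned int order = 2;                 /* 4 contiguous pages */
        struct page *page = alloc_pages(GFP_KERNEL, order);
        int i;

        if (!page)
                return -ENOMEM;

        split_page(page, order);                /* give each sub-page its own refcount */

        for (i = 0; i < (1 << order); i++)
                __free_page(page + i);          /* now legal to free them one at a time */

        return 0;
}
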
1377 1377
1378 /* 1378 /*
1379 * Similar to split_page except the page is already free. As this is only 1379 * Similar to split_page except the page is already free. As this is only
1380 * being used for migration, the migratetype of the block also changes. 1380 * being used for migration, the migratetype of the block also changes.
1381 * As this is called with interrupts disabled, the caller is responsible 1381 * As this is called with interrupts disabled, the caller is responsible
1382 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1382 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1383 * are enabled. 1383 * are enabled.
1384 * 1384 *
1385 * Note: this is probably too low level an operation for use in drivers. 1385 * Note: this is probably too low level an operation for use in drivers.
1386 * Please consult with lkml before using this in your driver. 1386 * Please consult with lkml before using this in your driver.
1387 */ 1387 */
1388 int split_free_page(struct page *page) 1388 int split_free_page(struct page *page)
1389 { 1389 {
1390 unsigned int order; 1390 unsigned int order;
1391 unsigned long watermark; 1391 unsigned long watermark;
1392 struct zone *zone; 1392 struct zone *zone;
1393 1393
1394 BUG_ON(!PageBuddy(page)); 1394 BUG_ON(!PageBuddy(page));
1395 1395
1396 zone = page_zone(page); 1396 zone = page_zone(page);
1397 order = page_order(page); 1397 order = page_order(page);
1398 1398
1399 /* Obey watermarks as if the page was being allocated */ 1399 /* Obey watermarks as if the page was being allocated */
1400 watermark = low_wmark_pages(zone) + (1 << order); 1400 watermark = low_wmark_pages(zone) + (1 << order);
1401 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1401 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1402 return 0; 1402 return 0;
1403 1403
1404 /* Remove page from free list */ 1404 /* Remove page from free list */
1405 list_del(&page->lru); 1405 list_del(&page->lru);
1406 zone->free_area[order].nr_free--; 1406 zone->free_area[order].nr_free--;
1407 rmv_page_order(page); 1407 rmv_page_order(page);
1408 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); 1408 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
1409 1409
1410 /* Split into individual pages */ 1410 /* Split into individual pages */
1411 set_page_refcounted(page); 1411 set_page_refcounted(page);
1412 split_page(page, order); 1412 split_page(page, order);
1413 1413
1414 if (order >= pageblock_order - 1) { 1414 if (order >= pageblock_order - 1) {
1415 struct page *endpage = page + (1 << order) - 1; 1415 struct page *endpage = page + (1 << order) - 1;
1416 for (; page < endpage; page += pageblock_nr_pages) { 1416 for (; page < endpage; page += pageblock_nr_pages) {
1417 int mt = get_pageblock_migratetype(page); 1417 int mt = get_pageblock_migratetype(page);
1418 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) 1418 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
1419 set_pageblock_migratetype(page, 1419 set_pageblock_migratetype(page,
1420 MIGRATE_MOVABLE); 1420 MIGRATE_MOVABLE);
1421 } 1421 }
1422 } 1422 }
1423 1423
1424 return 1 << order; 1424 return 1 << order;
1425 } 1425 }
1426 1426
1427 /* 1427 /*
1428 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1428 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1429 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1429 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1430 * or two. 1430 * or two.
1431 */ 1431 */
1432 static inline 1432 static inline
1433 struct page *buffered_rmqueue(struct zone *preferred_zone, 1433 struct page *buffered_rmqueue(struct zone *preferred_zone,
1434 struct zone *zone, int order, gfp_t gfp_flags, 1434 struct zone *zone, int order, gfp_t gfp_flags,
1435 int migratetype) 1435 int migratetype)
1436 { 1436 {
1437 unsigned long flags; 1437 unsigned long flags;
1438 struct page *page; 1438 struct page *page;
1439 int cold = !!(gfp_flags & __GFP_COLD); 1439 int cold = !!(gfp_flags & __GFP_COLD);
1440 1440
1441 again: 1441 again:
1442 if (likely(order == 0)) { 1442 if (likely(order == 0)) {
1443 struct per_cpu_pages *pcp; 1443 struct per_cpu_pages *pcp;
1444 struct list_head *list; 1444 struct list_head *list;
1445 1445
1446 local_irq_save(flags); 1446 local_irq_save(flags);
1447 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1447 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1448 list = &pcp->lists[migratetype]; 1448 list = &pcp->lists[migratetype];
1449 if (list_empty(list)) { 1449 if (list_empty(list)) {
1450 pcp->count += rmqueue_bulk(zone, 0, 1450 pcp->count += rmqueue_bulk(zone, 0,
1451 pcp->batch, list, 1451 pcp->batch, list,
1452 migratetype, cold); 1452 migratetype, cold);
1453 if (unlikely(list_empty(list))) 1453 if (unlikely(list_empty(list)))
1454 goto failed; 1454 goto failed;
1455 } 1455 }
1456 1456
1457 if (cold) 1457 if (cold)
1458 page = list_entry(list->prev, struct page, lru); 1458 page = list_entry(list->prev, struct page, lru);
1459 else 1459 else
1460 page = list_entry(list->next, struct page, lru); 1460 page = list_entry(list->next, struct page, lru);
1461 1461
1462 list_del(&page->lru); 1462 list_del(&page->lru);
1463 pcp->count--; 1463 pcp->count--;
1464 } else { 1464 } else {
1465 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1465 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1466 /* 1466 /*
1467 * __GFP_NOFAIL is not to be used in new code. 1467 * __GFP_NOFAIL is not to be used in new code.
1468 * 1468 *
1469 * All __GFP_NOFAIL callers should be fixed so that they 1469 * All __GFP_NOFAIL callers should be fixed so that they
1470 * properly detect and handle allocation failures. 1470 * properly detect and handle allocation failures.
1471 * 1471 *
1472 * We most definitely don't want callers attempting to 1472 * We most definitely don't want callers attempting to
1473 * allocate greater than order-1 page units with 1473 * allocate greater than order-1 page units with
1474 * __GFP_NOFAIL. 1474 * __GFP_NOFAIL.
1475 */ 1475 */
1476 WARN_ON_ONCE(order > 1); 1476 WARN_ON_ONCE(order > 1);
1477 } 1477 }
1478 spin_lock_irqsave(&zone->lock, flags); 1478 spin_lock_irqsave(&zone->lock, flags);
1479 page = __rmqueue(zone, order, migratetype); 1479 page = __rmqueue(zone, order, migratetype);
1480 spin_unlock(&zone->lock); 1480 spin_unlock(&zone->lock);
1481 if (!page) 1481 if (!page)
1482 goto failed; 1482 goto failed;
1483 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); 1483 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1484 } 1484 }
1485 1485
1486 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1486 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1487 zone_statistics(preferred_zone, zone, gfp_flags); 1487 zone_statistics(preferred_zone, zone, gfp_flags);
1488 local_irq_restore(flags); 1488 local_irq_restore(flags);
1489 1489
1490 VM_BUG_ON(bad_range(zone, page)); 1490 VM_BUG_ON(bad_range(zone, page));
1491 if (prep_new_page(page, order, gfp_flags)) 1491 if (prep_new_page(page, order, gfp_flags))
1492 goto again; 1492 goto again;
1493 return page; 1493 return page;
1494 1494
1495 failed: 1495 failed:
1496 local_irq_restore(flags); 1496 local_irq_restore(flags);
1497 return NULL; 1497 return NULL;
1498 } 1498 }
1499 1499
1500 /* The ALLOC_WMARK bits are used as an index to zone->watermark */ 1500 /* The ALLOC_WMARK bits are used as an index to zone->watermark */
1501 #define ALLOC_WMARK_MIN WMARK_MIN 1501 #define ALLOC_WMARK_MIN WMARK_MIN
1502 #define ALLOC_WMARK_LOW WMARK_LOW 1502 #define ALLOC_WMARK_LOW WMARK_LOW
1503 #define ALLOC_WMARK_HIGH WMARK_HIGH 1503 #define ALLOC_WMARK_HIGH WMARK_HIGH
1504 #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ 1504 #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1505 1505
1506 /* Mask to get the watermark bits */ 1506 /* Mask to get the watermark bits */
1507 #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) 1507 #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1508 1508
1509 #define ALLOC_HARDER 0x10 /* try to alloc harder */ 1509 #define ALLOC_HARDER 0x10 /* try to alloc harder */
1510 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 1510 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1511 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 1511 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
1512 1512
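
The ALLOC_WMARK_* values double as an index: because ALLOC_NO_WATERMARKS is 0x04, masking with ALLOC_WMARK_MASK (0x03) recovers which of the three zone watermarks to test, while the higher bits carry the boolean ALLOC_* modifiers. A userspace illustration with stand-in values (the real indices come from enum zone_watermarks, where WMARK_MIN/LOW/HIGH are 0/1/2):

#include <stdio.h>

/* Stand-ins mirroring the defines above. */
#define ALLOC_WMARK_MIN      0          /* WMARK_MIN  */
#define ALLOC_WMARK_LOW      1          /* WMARK_LOW  */
#define ALLOC_WMARK_HIGH     2          /* WMARK_HIGH */
#define ALLOC_NO_WATERMARKS  0x04
#define ALLOC_WMARK_MASK     (ALLOC_NO_WATERMARKS - 1)
#define ALLOC_HARDER         0x10
#define ALLOC_HIGH           0x20

int main(void)
{
        unsigned long watermark[3] = { 128, 160, 192 };         /* made-up min/low/high */
        int alloc_flags = ALLOC_WMARK_LOW | ALLOC_HIGH | ALLOC_HARDER;

        if (alloc_flags & ALLOC_NO_WATERMARKS)
                printf("skip the watermark check entirely\n");
        else
                printf("check against watermark[%d] = %lu\n",
                       alloc_flags & ALLOC_WMARK_MASK,
                       watermark[alloc_flags & ALLOC_WMARK_MASK]); /* index 1 -> 160 */
        return 0;
}
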
1513 #ifdef CONFIG_FAIL_PAGE_ALLOC 1513 #ifdef CONFIG_FAIL_PAGE_ALLOC
1514 1514
1515 static struct { 1515 static struct {
1516 struct fault_attr attr; 1516 struct fault_attr attr;
1517 1517
1518 u32 ignore_gfp_highmem; 1518 u32 ignore_gfp_highmem;
1519 u32 ignore_gfp_wait; 1519 u32 ignore_gfp_wait;
1520 u32 min_order; 1520 u32 min_order;
1521 } fail_page_alloc = { 1521 } fail_page_alloc = {
1522 .attr = FAULT_ATTR_INITIALIZER, 1522 .attr = FAULT_ATTR_INITIALIZER,
1523 .ignore_gfp_wait = 1, 1523 .ignore_gfp_wait = 1,
1524 .ignore_gfp_highmem = 1, 1524 .ignore_gfp_highmem = 1,
1525 .min_order = 1, 1525 .min_order = 1,
1526 }; 1526 };
1527 1527
1528 static int __init setup_fail_page_alloc(char *str) 1528 static int __init setup_fail_page_alloc(char *str)
1529 { 1529 {
1530 return setup_fault_attr(&fail_page_alloc.attr, str); 1530 return setup_fault_attr(&fail_page_alloc.attr, str);
1531 } 1531 }
1532 __setup("fail_page_alloc=", setup_fail_page_alloc); 1532 __setup("fail_page_alloc=", setup_fail_page_alloc);
1533 1533
1534 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1534 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1535 { 1535 {
1536 if (order < fail_page_alloc.min_order) 1536 if (order < fail_page_alloc.min_order)
1537 return false; 1537 return false;
1538 if (gfp_mask & __GFP_NOFAIL) 1538 if (gfp_mask & __GFP_NOFAIL)
1539 return false; 1539 return false;
1540 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1540 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1541 return false; 1541 return false;
1542 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1542 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1543 return false; 1543 return false;
1544 1544
1545 return should_fail(&fail_page_alloc.attr, 1 << order); 1545 return should_fail(&fail_page_alloc.attr, 1 << order);
1546 } 1546 }
1547 1547
1548 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1548 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1549 1549
1550 static int __init fail_page_alloc_debugfs(void) 1550 static int __init fail_page_alloc_debugfs(void)
1551 { 1551 {
1552 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1552 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1553 struct dentry *dir; 1553 struct dentry *dir;
1554 1554
1555 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1555 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1556 &fail_page_alloc.attr); 1556 &fail_page_alloc.attr);
1557 if (IS_ERR(dir)) 1557 if (IS_ERR(dir))
1558 return PTR_ERR(dir); 1558 return PTR_ERR(dir);
1559 1559
1560 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1560 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1561 &fail_page_alloc.ignore_gfp_wait)) 1561 &fail_page_alloc.ignore_gfp_wait))
1562 goto fail; 1562 goto fail;
1563 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1563 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1564 &fail_page_alloc.ignore_gfp_highmem)) 1564 &fail_page_alloc.ignore_gfp_highmem))
1565 goto fail; 1565 goto fail;
1566 if (!debugfs_create_u32("min-order", mode, dir, 1566 if (!debugfs_create_u32("min-order", mode, dir,
1567 &fail_page_alloc.min_order)) 1567 &fail_page_alloc.min_order))
1568 goto fail; 1568 goto fail;
1569 1569
1570 return 0; 1570 return 0;
1571 fail: 1571 fail:
1572 debugfs_remove_recursive(dir); 1572 debugfs_remove_recursive(dir);
1573 1573
1574 return -ENOMEM; 1574 return -ENOMEM;
1575 } 1575 }
1576 1576
1577 late_initcall(fail_page_alloc_debugfs); 1577 late_initcall(fail_page_alloc_debugfs);
1578 1578
1579 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1579 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1580 1580
1581 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1581 #else /* CONFIG_FAIL_PAGE_ALLOC */
1582 1582
1583 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1583 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1584 { 1584 {
1585 return false; 1585 return false;
1586 } 1586 }
1587 1587
1588 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1588 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1589 1589
1590 /* 1590 /*
1591 * Return true if free pages are above 'mark'. This takes into account the order 1591 * Return true if free pages are above 'mark'. This takes into account the order
1592 * of the allocation. 1592 * of the allocation.
1593 */ 1593 */
1594 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1594 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1595 int classzone_idx, int alloc_flags, long free_pages) 1595 int classzone_idx, int alloc_flags, long free_pages)
1596 { 1596 {
1597 /* free_pages may go negative - that's OK */ 1597 /* free_pages may go negative - that's OK */
1598 long min = mark; 1598 long min = mark;
1599 int o; 1599 int o;
1600 1600
1601 free_pages -= (1 << order) - 1; 1601 free_pages -= (1 << order) - 1;
1602 if (alloc_flags & ALLOC_HIGH) 1602 if (alloc_flags & ALLOC_HIGH)
1603 min -= min / 2; 1603 min -= min / 2;
1604 if (alloc_flags & ALLOC_HARDER) 1604 if (alloc_flags & ALLOC_HARDER)
1605 min -= min / 4; 1605 min -= min / 4;
1606 1606
1607 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1607 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1608 return false; 1608 return false;
1609 for (o = 0; o < order; o++) { 1609 for (o = 0; o < order; o++) {
1610 /* At the next order, this order's pages become unavailable */ 1610 /* At the next order, this order's pages become unavailable */
1611 free_pages -= z->free_area[o].nr_free << o; 1611 free_pages -= z->free_area[o].nr_free << o;
1612 1612
1613 /* Require fewer higher order pages to be free */ 1613 /* Require fewer higher order pages to be free */
1614 min >>= 1; 1614 min >>= 1;
1615 1615
1616 if (free_pages <= min) 1616 if (free_pages <= min)
1617 return false; 1617 return false;
1618 } 1618 }
1619 return true; 1619 return true;
1620 } 1620 }
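
__zone_watermark_ok() does more than compare a single counter: for an order-N request it discounts pages sitting in blocks smaller than N while halving the required reserve at each step, so a fragmented zone can fail a high-order check even with plenty of free memory overall. A self-contained userspace model of the same arithmetic with toy numbers:

#include <stdio.h>
#include <stdbool.h>

#define MAX_ORDER 11

/* Userspace model of the check above: same arithmetic, made-up inputs. */
static bool watermark_ok(int order, long mark, long lowmem_reserve,
                         long free_pages, const long nr_free[MAX_ORDER])
{
        long min = mark;
        int o;

        free_pages -= (1 << order) - 1;
        if (free_pages <= min + lowmem_reserve)
                return false;
        for (o = 0; o < order; o++) {
                free_pages -= nr_free[o] << o;  /* lower orders can't serve this request */
                min >>= 1;                      /* but demand fewer free pages up here */
                if (free_pages <= min)
                        return false;
        }
        return true;
}

int main(void)
{
        /* 500 order-0 pages, 20 order-1, 4 order-2, nothing larger: 556 pages free. */
        long nr_free[MAX_ORDER] = { 500, 20, 4 };
        long free_pages = 500 + 20 * 2 + 4 * 4;

        printf("order 0: %d\n", watermark_ok(0, 128, 0, free_pages, nr_free)); /* 1: passes */
        printf("order 2: %d\n", watermark_ok(2, 128, 0, free_pages, nr_free)); /* 0: fails  */
        return 0;
}

With 556 pages free the order-0 check passes easily, but the order-2 check fails: once the 500 order-0 pages are discounted, only about 53 pages remain in order >= 1 blocks, below the halved requirement of 64.
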
1621 1621
1622 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1622 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1623 int classzone_idx, int alloc_flags) 1623 int classzone_idx, int alloc_flags)
1624 { 1624 {
1625 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1625 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1626 zone_page_state(z, NR_FREE_PAGES)); 1626 zone_page_state(z, NR_FREE_PAGES));
1627 } 1627 }
1628 1628
1629 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 1629 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1630 int classzone_idx, int alloc_flags) 1630 int classzone_idx, int alloc_flags)
1631 { 1631 {
1632 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1632 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1633 1633
1634 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1634 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1635 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1635 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1636 1636
1637 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1637 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1638 free_pages); 1638 free_pages);
1639 } 1639 }
1640 1640
1641 #ifdef CONFIG_NUMA 1641 #ifdef CONFIG_NUMA
1642 /* 1642 /*
1643 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1643 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1644 * skip over zones that are not allowed by the cpuset, or that have 1644 * skip over zones that are not allowed by the cpuset, or that have
1645 * been recently (in last second) found to be nearly full. See further 1645 * been recently (in last second) found to be nearly full. See further
1646 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1646 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1647 * that have to skip over a lot of full or unallowed zones. 1647 * that have to skip over a lot of full or unallowed zones.
1648 * 1648 *
1649 * If the zonelist cache is present in the passed in zonelist, then 1649 * If the zonelist cache is present in the passed in zonelist, then
1650 * returns a pointer to the allowed node mask (either the current 1650 * returns a pointer to the allowed node mask (either the current
1651 * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) 1651 * tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
1652 * 1652 *
1653 * If the zonelist cache is not available for this zonelist, does 1653 * If the zonelist cache is not available for this zonelist, does
1654 * nothing and returns NULL. 1654 * nothing and returns NULL.
1655 * 1655 *
1656 * If the fullzones BITMAP in the zonelist cache is stale (more than 1656 * If the fullzones BITMAP in the zonelist cache is stale (more than
1657 * a second since last zap'd) then we zap it out (clear its bits.) 1657 * a second since last zap'd) then we zap it out (clear its bits.)
1658 * 1658 *
1659 * We hold off even calling zlc_setup, until after we've checked the 1659 * We hold off even calling zlc_setup, until after we've checked the
1660 * first zone in the zonelist, on the theory that most allocations will 1660 * first zone in the zonelist, on the theory that most allocations will
1661 * be satisfied from that first zone, so best to examine that zone as 1661 * be satisfied from that first zone, so best to examine that zone as
1662 * quickly as we can. 1662 * quickly as we can.
1663 */ 1663 */
1664 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1664 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1665 { 1665 {
1666 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1666 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1667 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1667 nodemask_t *allowednodes; /* zonelist_cache approximation */
1668 1668
1669 zlc = zonelist->zlcache_ptr; 1669 zlc = zonelist->zlcache_ptr;
1670 if (!zlc) 1670 if (!zlc)
1671 return NULL; 1671 return NULL;
1672 1672
1673 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1673 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1674 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1674 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1675 zlc->last_full_zap = jiffies; 1675 zlc->last_full_zap = jiffies;
1676 } 1676 }
1677 1677
1678 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1678 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1679 &cpuset_current_mems_allowed : 1679 &cpuset_current_mems_allowed :
1680 &node_states[N_HIGH_MEMORY]; 1680 &node_states[N_HIGH_MEMORY];
1681 return allowednodes; 1681 return allowednodes;
1682 } 1682 }
1683 1683
1684 /* 1684 /*
1685 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1685 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1686 * if it is worth looking at further for free memory: 1686 * if it is worth looking at further for free memory:
1687 * 1) Check that the zone isn't thought to be full (doesn't have its 1687 * 1) Check that the zone isn't thought to be full (doesn't have its
1688 * bit set in the zonelist_cache fullzones BITMAP). 1688 * bit set in the zonelist_cache fullzones BITMAP).
1689 * 2) Check that the zones node (obtained from the zonelist_cache 1689 * 2) Check that the zones node (obtained from the zonelist_cache
1690 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1690 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1691 * Return true (non-zero) if zone is worth looking at further, or 1691 * Return true (non-zero) if zone is worth looking at further, or
1692 * else return false (zero) if it is not. 1692 * else return false (zero) if it is not.
1693 * 1693 *
1694 * This check -ignores- the distinction between various watermarks, 1694 * This check -ignores- the distinction between various watermarks,
1695 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1695 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1696 * found to be full for any variation of these watermarks, it will 1696 * found to be full for any variation of these watermarks, it will
1697 * be considered full for up to one second by all requests, unless 1697 * be considered full for up to one second by all requests, unless
1698 * we are so low on memory on all allowed nodes that we are forced 1698 * we are so low on memory on all allowed nodes that we are forced
1699 * into the second scan of the zonelist. 1699 * into the second scan of the zonelist.
1700 * 1700 *
1701 * In the second scan we ignore this zonelist cache and exactly 1701 * In the second scan we ignore this zonelist cache and exactly
1702 * apply the watermarks to all zones, even if it is slower to do so. 1702 * apply the watermarks to all zones, even if it is slower to do so.
1703 * We are low on memory in the second scan, and should leave no stone 1703 * We are low on memory in the second scan, and should leave no stone
1704 * unturned looking for a free page. 1704 * unturned looking for a free page.
1705 */ 1705 */
1706 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1706 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1707 nodemask_t *allowednodes) 1707 nodemask_t *allowednodes)
1708 { 1708 {
1709 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1709 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1710 int i; /* index of *z in zonelist zones */ 1710 int i; /* index of *z in zonelist zones */
1711 int n; /* node that zone *z is on */ 1711 int n; /* node that zone *z is on */
1712 1712
1713 zlc = zonelist->zlcache_ptr; 1713 zlc = zonelist->zlcache_ptr;
1714 if (!zlc) 1714 if (!zlc)
1715 return 1; 1715 return 1;
1716 1716
1717 i = z - zonelist->_zonerefs; 1717 i = z - zonelist->_zonerefs;
1718 n = zlc->z_to_n[i]; 1718 n = zlc->z_to_n[i];
1719 1719
1720 /* This zone is worth trying if it is allowed but not full */ 1720 /* This zone is worth trying if it is allowed but not full */
1721 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1721 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1722 } 1722 }
1723 1723
1724 /* 1724 /*
1725 * Given 'z' scanning a zonelist, set the corresponding bit in 1725 * Given 'z' scanning a zonelist, set the corresponding bit in
1726 * zlc->fullzones, so that subsequent attempts to allocate a page 1726 * zlc->fullzones, so that subsequent attempts to allocate a page
1727 * from that zone don't waste time re-examining it. 1727 * from that zone don't waste time re-examining it.
1728 */ 1728 */
1729 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1729 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1730 { 1730 {
1731 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1731 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1732 int i; /* index of *z in zonelist zones */ 1732 int i; /* index of *z in zonelist zones */
1733 1733
1734 zlc = zonelist->zlcache_ptr; 1734 zlc = zonelist->zlcache_ptr;
1735 if (!zlc) 1735 if (!zlc)
1736 return; 1736 return;
1737 1737
1738 i = z - zonelist->_zonerefs; 1738 i = z - zonelist->_zonerefs;
1739 1739
1740 set_bit(i, zlc->fullzones); 1740 set_bit(i, zlc->fullzones);
1741 } 1741 }
1742 1742
1743 /* 1743 /*
1744 * clear all zones full, called after direct reclaim makes progress so that 1744 * clear all zones full, called after direct reclaim makes progress so that
1745 * a zone that was recently full is not skipped over for up to a second 1745 * a zone that was recently full is not skipped over for up to a second
1746 */ 1746 */
1747 static void zlc_clear_zones_full(struct zonelist *zonelist) 1747 static void zlc_clear_zones_full(struct zonelist *zonelist)
1748 { 1748 {
1749 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1749 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1750 1750
1751 zlc = zonelist->zlcache_ptr; 1751 zlc = zonelist->zlcache_ptr;
1752 if (!zlc) 1752 if (!zlc)
1753 return; 1753 return;
1754 1754
1755 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1755 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1756 } 1756 }
1757 1757
1758 #else /* CONFIG_NUMA */ 1758 #else /* CONFIG_NUMA */
1759 1759
1760 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1760 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1761 { 1761 {
1762 return NULL; 1762 return NULL;
1763 } 1763 }
1764 1764
1765 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1765 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1766 nodemask_t *allowednodes) 1766 nodemask_t *allowednodes)
1767 { 1767 {
1768 return 1; 1768 return 1;
1769 } 1769 }
1770 1770
1771 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1771 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1772 { 1772 {
1773 } 1773 }
1774 1774
1775 static void zlc_clear_zones_full(struct zonelist *zonelist) 1775 static void zlc_clear_zones_full(struct zonelist *zonelist)
1776 { 1776 {
1777 } 1777 }
1778 #endif /* CONFIG_NUMA */ 1778 #endif /* CONFIG_NUMA */
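
The zonelist cache above is essentially a per-zonelist bitmap of "recently looked full" zones plus a timestamp; zlc_setup() wipes the bitmap once it is more than a second old, so a zone is never written off for long. A toy userspace model of that idea (the real code keys the bits by zoneref index and uses jiffies; here the zap is folded into the lookup for brevity):

#include <stdio.h>
#include <string.h>
#include <time.h>

#define MAX_ZONES 8

struct zlc_model {
        unsigned char full[MAX_ZONES];  /* stand-in for zlc->fullzones bitmap */
        time_t last_zap;                /* stand-in for zlc->last_full_zap    */
};

static int zone_worth_trying(struct zlc_model *zlc, int zone_idx)
{
        if (time(NULL) > zlc->last_zap + 1) {           /* stale after ~1 second */
                memset(zlc->full, 0, sizeof(zlc->full));
                zlc->last_zap = time(NULL);
        }
        return !zlc->full[zone_idx];
}

int main(void)
{
        struct zlc_model zlc = { .last_zap = time(NULL) };

        zlc.full[2] = 1;                                /* zone 2 failed a watermark check */
        printf("zone 2 worth trying now: %d\n", zone_worth_trying(&zlc, 2));  /* 0 */
        printf("zone 0 worth trying now: %d\n", zone_worth_trying(&zlc, 0));  /* 1 */
        return 0;
}
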
1779 1779
1780 /* 1780 /*
1781 * get_page_from_freelist goes through the zonelist trying to allocate 1781 * get_page_from_freelist goes through the zonelist trying to allocate
1782 * a page. 1782 * a page.
1783 */ 1783 */
1784 static struct page * 1784 static struct page *
1785 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1785 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1786 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1786 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1787 struct zone *preferred_zone, int migratetype) 1787 struct zone *preferred_zone, int migratetype)
1788 { 1788 {
1789 struct zoneref *z; 1789 struct zoneref *z;
1790 struct page *page = NULL; 1790 struct page *page = NULL;
1791 int classzone_idx; 1791 int classzone_idx;
1792 struct zone *zone; 1792 struct zone *zone;
1793 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1793 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1794 int zlc_active = 0; /* set if using zonelist_cache */ 1794 int zlc_active = 0; /* set if using zonelist_cache */
1795 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1795 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1796 1796
1797 classzone_idx = zone_idx(preferred_zone); 1797 classzone_idx = zone_idx(preferred_zone);
1798 zonelist_scan: 1798 zonelist_scan:
1799 /* 1799 /*
1800 * Scan zonelist, looking for a zone with enough free. 1800 * Scan zonelist, looking for a zone with enough free.
1801 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1801 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1802 */ 1802 */
1803 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1803 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1804 high_zoneidx, nodemask) { 1804 high_zoneidx, nodemask) {
1805 if (NUMA_BUILD && zlc_active && 1805 if (NUMA_BUILD && zlc_active &&
1806 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1806 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1807 continue; 1807 continue;
1808 if ((alloc_flags & ALLOC_CPUSET) && 1808 if ((alloc_flags & ALLOC_CPUSET) &&
1809 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1809 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1810 continue; 1810 continue;
1811 /* 1811 /*
1812 * When allocating a page cache page for writing, we 1812 * When allocating a page cache page for writing, we
1813 * want to get it from a zone that is within its dirty 1813 * want to get it from a zone that is within its dirty
1814 * limit, such that no single zone holds more than its 1814 * limit, such that no single zone holds more than its
1815 * proportional share of globally allowed dirty pages. 1815 * proportional share of globally allowed dirty pages.
1816 * The dirty limits take into account the zone's 1816 * The dirty limits take into account the zone's
1817 * lowmem reserves and high watermark so that kswapd 1817 * lowmem reserves and high watermark so that kswapd
1818 * should be able to balance it without having to 1818 * should be able to balance it without having to
1819 * write pages from its LRU list. 1819 * write pages from its LRU list.
1820 * 1820 *
1821 * This may look like it could increase pressure on 1821 * This may look like it could increase pressure on
1822 * lower zones by failing allocations in higher zones 1822 * lower zones by failing allocations in higher zones
1823 * before they are full. But the pages that do spill 1823 * before they are full. But the pages that do spill
1824 * over are limited as the lower zones are protected 1824 * over are limited as the lower zones are protected
1825 * by this very same mechanism. It should not become 1825 * by this very same mechanism. It should not become
1826 * a practical burden to them. 1826 * a practical burden to them.
1827 * 1827 *
1828 * XXX: For now, allow allocations to potentially 1828 * XXX: For now, allow allocations to potentially
1829 * exceed the per-zone dirty limit in the slowpath 1829 * exceed the per-zone dirty limit in the slowpath
1830 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1830 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1831 * which is important when on a NUMA setup the allowed 1831 * which is important when on a NUMA setup the allowed
1832 * zones are together not big enough to reach the 1832 * zones are together not big enough to reach the
1833 * global limit. The proper fix for these situations 1833 * global limit. The proper fix for these situations
1834 * will require awareness of zones in the 1834 * will require awareness of zones in the
1835 * dirty-throttling and the flusher threads. 1835 * dirty-throttling and the flusher threads.
1836 */ 1836 */
1837 if ((alloc_flags & ALLOC_WMARK_LOW) && 1837 if ((alloc_flags & ALLOC_WMARK_LOW) &&
1838 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 1838 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1839 goto this_zone_full; 1839 goto this_zone_full;
1840 1840
1841 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1841 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1842 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1842 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1843 unsigned long mark; 1843 unsigned long mark;
1844 int ret; 1844 int ret;
1845 1845
1846 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1846 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1847 if (zone_watermark_ok(zone, order, mark, 1847 if (zone_watermark_ok(zone, order, mark,
1848 classzone_idx, alloc_flags)) 1848 classzone_idx, alloc_flags))
1849 goto try_this_zone; 1849 goto try_this_zone;
1850 1850
1851 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { 1851 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1852 /* 1852 /*
1853 * we do zlc_setup if there are multiple nodes 1853 * we do zlc_setup if there are multiple nodes
1854 * and before considering the first zone allowed 1854 * and before considering the first zone allowed
1855 * by the cpuset. 1855 * by the cpuset.
1856 */ 1856 */
1857 allowednodes = zlc_setup(zonelist, alloc_flags); 1857 allowednodes = zlc_setup(zonelist, alloc_flags);
1858 zlc_active = 1; 1858 zlc_active = 1;
1859 did_zlc_setup = 1; 1859 did_zlc_setup = 1;
1860 } 1860 }
1861 1861
1862 if (zone_reclaim_mode == 0) 1862 if (zone_reclaim_mode == 0)
1863 goto this_zone_full; 1863 goto this_zone_full;
1864 1864
1865 /* 1865 /*
1866 * As we may have just activated ZLC, check if the first 1866 * As we may have just activated ZLC, check if the first
1867 * eligible zone has failed zone_reclaim recently. 1867 * eligible zone has failed zone_reclaim recently.
1868 */ 1868 */
1869 if (NUMA_BUILD && zlc_active && 1869 if (NUMA_BUILD && zlc_active &&
1870 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1870 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1871 continue; 1871 continue;
1872 1872
1873 ret = zone_reclaim(zone, gfp_mask, order); 1873 ret = zone_reclaim(zone, gfp_mask, order);
1874 switch (ret) { 1874 switch (ret) {
1875 case ZONE_RECLAIM_NOSCAN: 1875 case ZONE_RECLAIM_NOSCAN:
1876 /* did not scan */ 1876 /* did not scan */
1877 continue; 1877 continue;
1878 case ZONE_RECLAIM_FULL: 1878 case ZONE_RECLAIM_FULL:
1879 /* scanned but unreclaimable */ 1879 /* scanned but unreclaimable */
1880 continue; 1880 continue;
1881 default: 1881 default:
1882 /* did we reclaim enough */ 1882 /* did we reclaim enough */
1883 if (!zone_watermark_ok(zone, order, mark, 1883 if (!zone_watermark_ok(zone, order, mark,
1884 classzone_idx, alloc_flags)) 1884 classzone_idx, alloc_flags))
1885 goto this_zone_full; 1885 goto this_zone_full;
1886 } 1886 }
1887 } 1887 }
1888 1888
1889 try_this_zone: 1889 try_this_zone:
1890 page = buffered_rmqueue(preferred_zone, zone, order, 1890 page = buffered_rmqueue(preferred_zone, zone, order,
1891 gfp_mask, migratetype); 1891 gfp_mask, migratetype);
1892 if (page) 1892 if (page)
1893 break; 1893 break;
1894 this_zone_full: 1894 this_zone_full:
1895 if (NUMA_BUILD) 1895 if (NUMA_BUILD)
1896 zlc_mark_zone_full(zonelist, z); 1896 zlc_mark_zone_full(zonelist, z);
1897 } 1897 }
1898 1898
1899 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1899 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1900 /* Disable zlc cache for second zonelist scan */ 1900 /* Disable zlc cache for second zonelist scan */
1901 zlc_active = 0; 1901 zlc_active = 0;
1902 goto zonelist_scan; 1902 goto zonelist_scan;
1903 } 1903 }
1904 return page; 1904 return page;
1905 } 1905 }
1906 1906
1907 /* 1907 /*
1908 * Large machines with many possible nodes should not always dump per-node 1908 * Large machines with many possible nodes should not always dump per-node
1909 * meminfo in irq context. 1909 * meminfo in irq context.
1910 */ 1910 */
1911 static inline bool should_suppress_show_mem(void) 1911 static inline bool should_suppress_show_mem(void)
1912 { 1912 {
1913 bool ret = false; 1913 bool ret = false;
1914 1914
1915 #if NODES_SHIFT > 8 1915 #if NODES_SHIFT > 8
1916 ret = in_interrupt(); 1916 ret = in_interrupt();
1917 #endif 1917 #endif
1918 return ret; 1918 return ret;
1919 } 1919 }
1920 1920
1921 static DEFINE_RATELIMIT_STATE(nopage_rs, 1921 static DEFINE_RATELIMIT_STATE(nopage_rs,
1922 DEFAULT_RATELIMIT_INTERVAL, 1922 DEFAULT_RATELIMIT_INTERVAL,
1923 DEFAULT_RATELIMIT_BURST); 1923 DEFAULT_RATELIMIT_BURST);
1924 1924
1925 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 1925 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1926 { 1926 {
1927 unsigned int filter = SHOW_MEM_FILTER_NODES; 1927 unsigned int filter = SHOW_MEM_FILTER_NODES;
1928 1928
1929 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 1929 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
1930 debug_guardpage_minorder() > 0) 1930 debug_guardpage_minorder() > 0)
1931 return; 1931 return;
1932 1932
1933 /* 1933 /*
1934 * This documents exceptions given to allocations in certain 1934 * This documents exceptions given to allocations in certain
1935 * contexts that are allowed to allocate outside current's set 1935 * contexts that are allowed to allocate outside current's set
1936 * of allowed nodes. 1936 * of allowed nodes.
1937 */ 1937 */
1938 if (!(gfp_mask & __GFP_NOMEMALLOC)) 1938 if (!(gfp_mask & __GFP_NOMEMALLOC))
1939 if (test_thread_flag(TIF_MEMDIE) || 1939 if (test_thread_flag(TIF_MEMDIE) ||
1940 (current->flags & (PF_MEMALLOC | PF_EXITING))) 1940 (current->flags & (PF_MEMALLOC | PF_EXITING)))
1941 filter &= ~SHOW_MEM_FILTER_NODES; 1941 filter &= ~SHOW_MEM_FILTER_NODES;
1942 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 1942 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
1943 filter &= ~SHOW_MEM_FILTER_NODES; 1943 filter &= ~SHOW_MEM_FILTER_NODES;
1944 1944
1945 if (fmt) { 1945 if (fmt) {
1946 struct va_format vaf; 1946 struct va_format vaf;
1947 va_list args; 1947 va_list args;
1948 1948
1949 va_start(args, fmt); 1949 va_start(args, fmt);
1950 1950
1951 vaf.fmt = fmt; 1951 vaf.fmt = fmt;
1952 vaf.va = &args; 1952 vaf.va = &args;
1953 1953
1954 pr_warn("%pV", &vaf); 1954 pr_warn("%pV", &vaf);
1955 1955
1956 va_end(args); 1956 va_end(args);
1957 } 1957 }
1958 1958
1959 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 1959 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
1960 current->comm, order, gfp_mask); 1960 current->comm, order, gfp_mask);
1961 1961
1962 dump_stack(); 1962 dump_stack();
1963 if (!should_suppress_show_mem()) 1963 if (!should_suppress_show_mem())
1964 show_mem(filter); 1964 show_mem(filter);
1965 } 1965 }
1966 1966
1967 static inline int 1967 static inline int
1968 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 1968 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1969 unsigned long did_some_progress, 1969 unsigned long did_some_progress,
1970 unsigned long pages_reclaimed) 1970 unsigned long pages_reclaimed)
1971 { 1971 {
1972 /* Do not loop if specifically requested */ 1972 /* Do not loop if specifically requested */
1973 if (gfp_mask & __GFP_NORETRY) 1973 if (gfp_mask & __GFP_NORETRY)
1974 return 0; 1974 return 0;
1975 1975
1976 /* Always retry if specifically requested */ 1976 /* Always retry if specifically requested */
1977 if (gfp_mask & __GFP_NOFAIL) 1977 if (gfp_mask & __GFP_NOFAIL)
1978 return 1; 1978 return 1;
1979 1979
1980 /* 1980 /*
1981 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 1981 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
1982 * making forward progress without invoking OOM. Suspend also disables 1982 * making forward progress without invoking OOM. Suspend also disables
1983 * storage devices so kswapd will not help. Bail if we are suspending. 1983 * storage devices so kswapd will not help. Bail if we are suspending.
1984 */ 1984 */
1985 if (!did_some_progress && pm_suspended_storage()) 1985 if (!did_some_progress && pm_suspended_storage())
1986 return 0; 1986 return 0;
1987 1987
1988 /* 1988 /*
1989 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 1989 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1990 * means __GFP_NOFAIL, but that may not be true in other 1990 * means __GFP_NOFAIL, but that may not be true in other
1991 * implementations. 1991 * implementations.
1992 */ 1992 */
1993 if (order <= PAGE_ALLOC_COSTLY_ORDER) 1993 if (order <= PAGE_ALLOC_COSTLY_ORDER)
1994 return 1; 1994 return 1;
1995 1995
1996 /* 1996 /*
1997 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 1997 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1998 * specified, then we retry until we no longer reclaim any pages 1998 * specified, then we retry until we no longer reclaim any pages
1999 * (above), or we've reclaimed an order of pages at least as 1999 * (above), or we've reclaimed an order of pages at least as
2000 * large as the allocation's order. In both cases, if the 2000 * large as the allocation's order. In both cases, if the
2001 * allocation still fails, we stop retrying. 2001 * allocation still fails, we stop retrying.
2002 */ 2002 */
2003 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2003 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2004 return 1; 2004 return 1;
2005 2005
2006 return 0; 2006 return 0;
2007 } 2007 }
2008 2008
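/*
 * Editor's worked example for the retry rule above (illustrative, not
 * part of this commit): with PAGE_ALLOC_COSTLY_ORDER == 3, an order-2
 * GFP_KERNEL request always asks to retry here -- the "implicit
 * __GFP_NOFAIL" case. An order-4 request retries only if __GFP_REPEAT
 * is set and fewer than 1 << 4 = 16 pages have been reclaimed so far;
 * once pages_reclaimed reaches 16 without a successful allocation, the
 * caller gives up.
 */
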
2009 static inline struct page * 2009 static inline struct page *
2010 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2010 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2011 struct zonelist *zonelist, enum zone_type high_zoneidx, 2011 struct zonelist *zonelist, enum zone_type high_zoneidx,
2012 nodemask_t *nodemask, struct zone *preferred_zone, 2012 nodemask_t *nodemask, struct zone *preferred_zone,
2013 int migratetype) 2013 int migratetype)
2014 { 2014 {
2015 struct page *page; 2015 struct page *page;
2016 2016
2017 /* Acquire the OOM killer lock for the zones in zonelist */ 2017 /* Acquire the OOM killer lock for the zones in zonelist */
2018 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2018 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2019 schedule_timeout_uninterruptible(1); 2019 schedule_timeout_uninterruptible(1);
2020 return NULL; 2020 return NULL;
2021 } 2021 }
2022 2022
2023 /* 2023 /*
2024 * Go through the zonelist yet one more time, keep very high watermark 2024 * Go through the zonelist yet one more time, keep very high watermark
2025 * here, this is only to catch a parallel oom killing, we must fail if 2025 * here, this is only to catch a parallel oom killing, we must fail if
2026 * we're still under heavy pressure. 2026 * we're still under heavy pressure.
2027 */ 2027 */
2028 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2028 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2029 order, zonelist, high_zoneidx, 2029 order, zonelist, high_zoneidx,
2030 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2030 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2031 preferred_zone, migratetype); 2031 preferred_zone, migratetype);
2032 if (page) 2032 if (page)
2033 goto out; 2033 goto out;
2034 2034
2035 if (!(gfp_mask & __GFP_NOFAIL)) { 2035 if (!(gfp_mask & __GFP_NOFAIL)) {
2036 /* The OOM killer will not help higher order allocs */ 2036 /* The OOM killer will not help higher order allocs */
2037 if (order > PAGE_ALLOC_COSTLY_ORDER) 2037 if (order > PAGE_ALLOC_COSTLY_ORDER)
2038 goto out; 2038 goto out;
2039 /* The OOM killer does not needlessly kill tasks for lowmem */ 2039 /* The OOM killer does not needlessly kill tasks for lowmem */
2040 if (high_zoneidx < ZONE_NORMAL) 2040 if (high_zoneidx < ZONE_NORMAL)
2041 goto out; 2041 goto out;
2042 /* 2042 /*
2043 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2043 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2044 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2044 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2045 * The caller should handle page allocation failure by itself if 2045 * The caller should handle page allocation failure by itself if
2046 * it specifies __GFP_THISNODE. 2046 * it specifies __GFP_THISNODE.
2047 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2047 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2048 */ 2048 */
2049 if (gfp_mask & __GFP_THISNODE) 2049 if (gfp_mask & __GFP_THISNODE)
2050 goto out; 2050 goto out;
2051 } 2051 }
2052 /* Exhausted what can be done so it's blamo time */ 2052 /* Exhausted what can be done so it's blamo time */
2053 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2053 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2054 2054
2055 out: 2055 out:
2056 clear_zonelist_oom(zonelist, gfp_mask); 2056 clear_zonelist_oom(zonelist, gfp_mask);
2057 return page; 2057 return page;
2058 } 2058 }
2059 2059
2060 #ifdef CONFIG_COMPACTION 2060 #ifdef CONFIG_COMPACTION
2061 /* Try memory compaction for high-order allocations before reclaim */ 2061 /* Try memory compaction for high-order allocations before reclaim */
2062 static struct page * 2062 static struct page *
2063 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2063 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2064 struct zonelist *zonelist, enum zone_type high_zoneidx, 2064 struct zonelist *zonelist, enum zone_type high_zoneidx,
2065 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2065 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2066 int migratetype, bool sync_migration, 2066 int migratetype, bool sync_migration,
2067 bool *deferred_compaction, 2067 bool *deferred_compaction,
2068 unsigned long *did_some_progress) 2068 unsigned long *did_some_progress)
2069 { 2069 {
2070 struct page *page; 2070 struct page *page;
2071 2071
2072 if (!order) 2072 if (!order)
2073 return NULL; 2073 return NULL;
2074 2074
2075 if (compaction_deferred(preferred_zone, order)) { 2075 if (compaction_deferred(preferred_zone, order)) {
2076 *deferred_compaction = true; 2076 *deferred_compaction = true;
2077 return NULL; 2077 return NULL;
2078 } 2078 }
2079 2079
2080 current->flags |= PF_MEMALLOC; 2080 current->flags |= PF_MEMALLOC;
2081 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2081 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2082 nodemask, sync_migration); 2082 nodemask, sync_migration);
2083 current->flags &= ~PF_MEMALLOC; 2083 current->flags &= ~PF_MEMALLOC;
2084 if (*did_some_progress != COMPACT_SKIPPED) { 2084 if (*did_some_progress != COMPACT_SKIPPED) {
2085 2085
2086 /* Page migration frees to the PCP lists but we want merging */ 2086 /* Page migration frees to the PCP lists but we want merging */
2087 drain_pages(get_cpu()); 2087 drain_pages(get_cpu());
2088 put_cpu(); 2088 put_cpu();
2089 2089
2090 page = get_page_from_freelist(gfp_mask, nodemask, 2090 page = get_page_from_freelist(gfp_mask, nodemask,
2091 order, zonelist, high_zoneidx, 2091 order, zonelist, high_zoneidx,
2092 alloc_flags, preferred_zone, 2092 alloc_flags, preferred_zone,
2093 migratetype); 2093 migratetype);
2094 if (page) { 2094 if (page) {
2095 preferred_zone->compact_considered = 0; 2095 preferred_zone->compact_considered = 0;
2096 preferred_zone->compact_defer_shift = 0; 2096 preferred_zone->compact_defer_shift = 0;
2097 if (order >= preferred_zone->compact_order_failed) 2097 if (order >= preferred_zone->compact_order_failed)
2098 preferred_zone->compact_order_failed = order + 1; 2098 preferred_zone->compact_order_failed = order + 1;
2099 count_vm_event(COMPACTSUCCESS); 2099 count_vm_event(COMPACTSUCCESS);
2100 return page; 2100 return page;
2101 } 2101 }
2102 2102
2103 /* 2103 /*
2104 * It's bad if a compaction run occurs and fails. 2104 * It's bad if a compaction run occurs and fails.
2105 * The most likely reason is that pages exist, 2105 * The most likely reason is that pages exist,
2106 * but not enough to satisfy watermarks. 2106 * but not enough to satisfy watermarks.
2107 */ 2107 */
2108 count_vm_event(COMPACTFAIL); 2108 count_vm_event(COMPACTFAIL);
2109 2109
2110 /* 2110 /*
2111 * As async compaction considers a subset of pageblocks, only 2111 * As async compaction considers a subset of pageblocks, only
2112 * defer if the failure was a sync compaction failure. 2112 * defer if the failure was a sync compaction failure.
2113 */ 2113 */
2114 if (sync_migration) 2114 if (sync_migration)
2115 defer_compaction(preferred_zone, order); 2115 defer_compaction(preferred_zone, order);
2116 2116
2117 cond_resched(); 2117 cond_resched();
2118 } 2118 }
2119 2119
2120 return NULL; 2120 return NULL;
2121 } 2121 }
2122 #else 2122 #else
2123 static inline struct page * 2123 static inline struct page *
2124 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2124 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2125 struct zonelist *zonelist, enum zone_type high_zoneidx, 2125 struct zonelist *zonelist, enum zone_type high_zoneidx,
2126 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2126 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2127 int migratetype, bool sync_migration, 2127 int migratetype, bool sync_migration,
2128 bool *deferred_compaction, 2128 bool *deferred_compaction,
2129 unsigned long *did_some_progress) 2129 unsigned long *did_some_progress)
2130 { 2130 {
2131 return NULL; 2131 return NULL;
2132 } 2132 }
2133 #endif /* CONFIG_COMPACTION */ 2133 #endif /* CONFIG_COMPACTION */
2134 2134
2135 /* Perform direct synchronous page reclaim */ 2135 /* Perform direct synchronous page reclaim */
2136 static int 2136 static int
2137 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2137 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2138 nodemask_t *nodemask) 2138 nodemask_t *nodemask)
2139 { 2139 {
2140 struct reclaim_state reclaim_state; 2140 struct reclaim_state reclaim_state;
2141 int progress; 2141 int progress;
2142 2142
2143 cond_resched(); 2143 cond_resched();
2144 2144
2145 /* We now go into synchronous reclaim */ 2145 /* We now go into synchronous reclaim */
2146 cpuset_memory_pressure_bump(); 2146 cpuset_memory_pressure_bump();
2147 current->flags |= PF_MEMALLOC; 2147 current->flags |= PF_MEMALLOC;
2148 lockdep_set_current_reclaim_state(gfp_mask); 2148 lockdep_set_current_reclaim_state(gfp_mask);
2149 reclaim_state.reclaimed_slab = 0; 2149 reclaim_state.reclaimed_slab = 0;
2150 current->reclaim_state = &reclaim_state; 2150 current->reclaim_state = &reclaim_state;
2151 2151
2152 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2152 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2153 2153
2154 current->reclaim_state = NULL; 2154 current->reclaim_state = NULL;
2155 lockdep_clear_current_reclaim_state(); 2155 lockdep_clear_current_reclaim_state();
2156 current->flags &= ~PF_MEMALLOC; 2156 current->flags &= ~PF_MEMALLOC;
2157 2157
2158 cond_resched(); 2158 cond_resched();
2159 2159
2160 return progress; 2160 return progress;
2161 } 2161 }
2162 2162
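/*
 * Editor's sketch (hypothetical helper, not part of this commit): the
 * PF_MEMALLOC bracket in __perform_reclaim() above is the usual guard
 * against re-entering direct reclaim (see the "Avoid recursion of
 * direct reclaim" check further down). A caller that must not trigger
 * nested reclaim could use the same pattern, saving the old flag value
 * instead of clearing it unconditionally:
 */
static void example_no_reclaim_section(void)
{
        unsigned int pflags = current->flags & PF_MEMALLOC;

        current->flags |= PF_MEMALLOC;
        /* ... allocations here dip into reserves rather than reclaim ... */
        if (!pflags)
                current->flags &= ~PF_MEMALLOC;
}
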
2163 /* The really slow allocator path where we enter direct reclaim */ 2163 /* The really slow allocator path where we enter direct reclaim */
2164 static inline struct page * 2164 static inline struct page *
2165 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2165 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2166 struct zonelist *zonelist, enum zone_type high_zoneidx, 2166 struct zonelist *zonelist, enum zone_type high_zoneidx,
2167 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2167 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2168 int migratetype, unsigned long *did_some_progress) 2168 int migratetype, unsigned long *did_some_progress)
2169 { 2169 {
2170 struct page *page = NULL; 2170 struct page *page = NULL;
2171 bool drained = false; 2171 bool drained = false;
2172 2172
2173 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2173 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2174 nodemask); 2174 nodemask);
2175 if (unlikely(!(*did_some_progress))) 2175 if (unlikely(!(*did_some_progress)))
2176 return NULL; 2176 return NULL;
2177 2177
2178 /* After successful reclaim, reconsider all zones for allocation */ 2178 /* After successful reclaim, reconsider all zones for allocation */
2179 if (NUMA_BUILD) 2179 if (NUMA_BUILD)
2180 zlc_clear_zones_full(zonelist); 2180 zlc_clear_zones_full(zonelist);
2181 2181
2182 retry: 2182 retry:
2183 page = get_page_from_freelist(gfp_mask, nodemask, order, 2183 page = get_page_from_freelist(gfp_mask, nodemask, order,
2184 zonelist, high_zoneidx, 2184 zonelist, high_zoneidx,
2185 alloc_flags, preferred_zone, 2185 alloc_flags, preferred_zone,
2186 migratetype); 2186 migratetype);
2187 2187
2188 /* 2188 /*
2189 * If an allocation failed after direct reclaim, it could be because 2189 * If an allocation failed after direct reclaim, it could be because
2190 * pages are pinned on the per-cpu lists. Drain them and try again 2190 * pages are pinned on the per-cpu lists. Drain them and try again
2191 */ 2191 */
2192 if (!page && !drained) { 2192 if (!page && !drained) {
2193 drain_all_pages(); 2193 drain_all_pages();
2194 drained = true; 2194 drained = true;
2195 goto retry; 2195 goto retry;
2196 } 2196 }
2197 2197
2198 return page; 2198 return page;
2199 } 2199 }
2200 2200
2201 /* 2201 /*
2202 * This is called in the allocator slow-path if the allocation request is of 2202 * This is called in the allocator slow-path if the allocation request is of
2203 * sufficient urgency to ignore watermarks and take other desperate measures 2203 * sufficient urgency to ignore watermarks and take other desperate measures
2204 */ 2204 */
2205 static inline struct page * 2205 static inline struct page *
2206 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2206 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2207 struct zonelist *zonelist, enum zone_type high_zoneidx, 2207 struct zonelist *zonelist, enum zone_type high_zoneidx,
2208 nodemask_t *nodemask, struct zone *preferred_zone, 2208 nodemask_t *nodemask, struct zone *preferred_zone,
2209 int migratetype) 2209 int migratetype)
2210 { 2210 {
2211 struct page *page; 2211 struct page *page;
2212 2212
2213 do { 2213 do {
2214 page = get_page_from_freelist(gfp_mask, nodemask, order, 2214 page = get_page_from_freelist(gfp_mask, nodemask, order,
2215 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2215 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2216 preferred_zone, migratetype); 2216 preferred_zone, migratetype);
2217 2217
2218 if (!page && gfp_mask & __GFP_NOFAIL) 2218 if (!page && gfp_mask & __GFP_NOFAIL)
2219 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2219 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2220 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2220 } while (!page && (gfp_mask & __GFP_NOFAIL));
2221 2221
2222 return page; 2222 return page;
2223 } 2223 }
2224 2224
2225 static inline 2225 static inline
2226 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 2226 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
2227 enum zone_type high_zoneidx, 2227 enum zone_type high_zoneidx,
2228 enum zone_type classzone_idx) 2228 enum zone_type classzone_idx)
2229 { 2229 {
2230 struct zoneref *z; 2230 struct zoneref *z;
2231 struct zone *zone; 2231 struct zone *zone;
2232 2232
2233 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2233 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2234 wakeup_kswapd(zone, order, classzone_idx); 2234 wakeup_kswapd(zone, order, classzone_idx);
2235 } 2235 }
2236 2236
2237 static inline int 2237 static inline int
2238 gfp_to_alloc_flags(gfp_t gfp_mask) 2238 gfp_to_alloc_flags(gfp_t gfp_mask)
2239 { 2239 {
2240 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2240 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2241 const gfp_t wait = gfp_mask & __GFP_WAIT; 2241 const gfp_t wait = gfp_mask & __GFP_WAIT;
2242 2242
2243 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2243 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2244 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2244 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2245 2245
2246 /* 2246 /*
2247 * The caller may dip into page reserves a bit more if the caller 2247 * The caller may dip into page reserves a bit more if the caller
2248 * cannot run direct reclaim, or if the caller has realtime scheduling 2248 * cannot run direct reclaim, or if the caller has realtime scheduling
2249 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2249 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2250 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 2250 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
2251 */ 2251 */
2252 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2252 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2253 2253
2254 if (!wait) { 2254 if (!wait) {
2255 /* 2255 /*
2256 * Not worth trying to allocate harder for 2256 * Not worth trying to allocate harder for
2257 * __GFP_NOMEMALLOC even if it can't schedule. 2257 * __GFP_NOMEMALLOC even if it can't schedule.
2258 */ 2258 */
2259 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2259 if (!(gfp_mask & __GFP_NOMEMALLOC))
2260 alloc_flags |= ALLOC_HARDER; 2260 alloc_flags |= ALLOC_HARDER;
2261 /* 2261 /*
2262 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 2262 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
2263 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 2263 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
2264 */ 2264 */
2265 alloc_flags &= ~ALLOC_CPUSET; 2265 alloc_flags &= ~ALLOC_CPUSET;
2266 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2266 } else if (unlikely(rt_task(current)) && !in_interrupt())
2267 alloc_flags |= ALLOC_HARDER; 2267 alloc_flags |= ALLOC_HARDER;
2268 2268
2269 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2269 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2270 if (!in_interrupt() && 2270 if (!in_interrupt() &&
2271 ((current->flags & PF_MEMALLOC) || 2271 ((current->flags & PF_MEMALLOC) ||
2272 unlikely(test_thread_flag(TIF_MEMDIE)))) 2272 unlikely(test_thread_flag(TIF_MEMDIE))))
2273 alloc_flags |= ALLOC_NO_WATERMARKS; 2273 alloc_flags |= ALLOC_NO_WATERMARKS;
2274 } 2274 }
2275 2275
2276 return alloc_flags; 2276 return alloc_flags;
2277 } 2277 }
2278 2278
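/*
 * Editor's worked example for gfp_to_alloc_flags() above (illustrative,
 * not part of this commit). At the time of this code GFP_ATOMIC is
 * __GFP_HIGH without __GFP_WAIT, so it yields ALLOC_WMARK_MIN |
 * ALLOC_HIGH | ALLOC_HARDER with ALLOC_CPUSET cleared: atomic callers
 * may dig deeper into the reserves but cannot sleep. GFP_KERNEL has
 * __GFP_WAIT set, so it keeps ALLOC_WMARK_MIN | ALLOC_CPUSET and only
 * gains ALLOC_HARDER when the caller is a realtime task outside
 * interrupt context.
 */
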
2279 static inline struct page * 2279 static inline struct page *
2280 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2280 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2281 struct zonelist *zonelist, enum zone_type high_zoneidx, 2281 struct zonelist *zonelist, enum zone_type high_zoneidx,
2282 nodemask_t *nodemask, struct zone *preferred_zone, 2282 nodemask_t *nodemask, struct zone *preferred_zone,
2283 int migratetype) 2283 int migratetype)
2284 { 2284 {
2285 const gfp_t wait = gfp_mask & __GFP_WAIT; 2285 const gfp_t wait = gfp_mask & __GFP_WAIT;
2286 struct page *page = NULL; 2286 struct page *page = NULL;
2287 int alloc_flags; 2287 int alloc_flags;
2288 unsigned long pages_reclaimed = 0; 2288 unsigned long pages_reclaimed = 0;
2289 unsigned long did_some_progress; 2289 unsigned long did_some_progress;
2290 bool sync_migration = false; 2290 bool sync_migration = false;
2291 bool deferred_compaction = false; 2291 bool deferred_compaction = false;
2292 2292
2293 /* 2293 /*
2294 * In the slowpath, we sanity check order to avoid ever trying to 2294 * In the slowpath, we sanity check order to avoid ever trying to
2295 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2295 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2296 * be using allocators in order of preference for an area that is 2296 * be using allocators in order of preference for an area that is
2297 * too large. 2297 * too large.
2298 */ 2298 */
2299 if (order >= MAX_ORDER) { 2299 if (order >= MAX_ORDER) {
2300 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2300 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2301 return NULL; 2301 return NULL;
2302 } 2302 }
2303 2303
2304 /* 2304 /*
2305 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2305 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2306 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2306 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2307 * (e.g. slab) using GFP_THISNODE may choose to trigger reclaim 2307 * (e.g. slab) using GFP_THISNODE may choose to trigger reclaim
2308 * using a larger set of nodes after it has established that the 2308 * using a larger set of nodes after it has established that the
2309 * allowed per node queues are empty and that nodes are 2309 * allowed per node queues are empty and that nodes are
2310 * over allocated. 2310 * over allocated.
2311 */ 2311 */
2312 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2312 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2313 goto nopage; 2313 goto nopage;
2314 2314
2315 restart: 2315 restart:
2316 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2316 if (!(gfp_mask & __GFP_NO_KSWAPD))
2317 wake_all_kswapd(order, zonelist, high_zoneidx, 2317 wake_all_kswapd(order, zonelist, high_zoneidx,
2318 zone_idx(preferred_zone)); 2318 zone_idx(preferred_zone));
2319 2319
2320 /* 2320 /*
2321 * OK, we're below the kswapd watermark and have kicked background 2321 * OK, we're below the kswapd watermark and have kicked background
2322 * reclaim. Now things get more complex, so set up alloc_flags according 2322 * reclaim. Now things get more complex, so set up alloc_flags according
2323 * to how we want to proceed. 2323 * to how we want to proceed.
2324 */ 2324 */
2325 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2325 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2326 2326
2327 /* 2327 /*
2328 * Find the true preferred zone if the allocation is unconstrained by 2328 * Find the true preferred zone if the allocation is unconstrained by
2329 * cpusets. 2329 * cpusets.
2330 */ 2330 */
2331 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) 2331 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2332 first_zones_zonelist(zonelist, high_zoneidx, NULL, 2332 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2333 &preferred_zone); 2333 &preferred_zone);
2334 2334
2335 rebalance: 2335 rebalance:
2336 /* This is the last chance, in general, before the goto nopage. */ 2336 /* This is the last chance, in general, before the goto nopage. */
2337 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2337 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2338 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2338 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2339 preferred_zone, migratetype); 2339 preferred_zone, migratetype);
2340 if (page) 2340 if (page)
2341 goto got_pg; 2341 goto got_pg;
2342 2342
2343 /* Allocate without watermarks if the context allows */ 2343 /* Allocate without watermarks if the context allows */
2344 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2344 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2345 page = __alloc_pages_high_priority(gfp_mask, order, 2345 page = __alloc_pages_high_priority(gfp_mask, order,
2346 zonelist, high_zoneidx, nodemask, 2346 zonelist, high_zoneidx, nodemask,
2347 preferred_zone, migratetype); 2347 preferred_zone, migratetype);
2348 if (page) 2348 if (page)
2349 goto got_pg; 2349 goto got_pg;
2350 } 2350 }
2351 2351
2352 /* Atomic allocations - we can't balance anything */ 2352 /* Atomic allocations - we can't balance anything */
2353 if (!wait) 2353 if (!wait)
2354 goto nopage; 2354 goto nopage;
2355 2355
2356 /* Avoid recursion of direct reclaim */ 2356 /* Avoid recursion of direct reclaim */
2357 if (current->flags & PF_MEMALLOC) 2357 if (current->flags & PF_MEMALLOC)
2358 goto nopage; 2358 goto nopage;
2359 2359
2360 /* Avoid allocations with no watermarks from looping endlessly */ 2360 /* Avoid allocations with no watermarks from looping endlessly */
2361 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2361 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2362 goto nopage; 2362 goto nopage;
2363 2363
2364 /* 2364 /*
2365 * Try direct compaction. The first pass is asynchronous. Subsequent 2365 * Try direct compaction. The first pass is asynchronous. Subsequent
2366 * attempts after direct reclaim are synchronous 2366 * attempts after direct reclaim are synchronous
2367 */ 2367 */
2368 page = __alloc_pages_direct_compact(gfp_mask, order, 2368 page = __alloc_pages_direct_compact(gfp_mask, order,
2369 zonelist, high_zoneidx, 2369 zonelist, high_zoneidx,
2370 nodemask, 2370 nodemask,
2371 alloc_flags, preferred_zone, 2371 alloc_flags, preferred_zone,
2372 migratetype, sync_migration, 2372 migratetype, sync_migration,
2373 &deferred_compaction, 2373 &deferred_compaction,
2374 &did_some_progress); 2374 &did_some_progress);
2375 if (page) 2375 if (page)
2376 goto got_pg; 2376 goto got_pg;
2377 sync_migration = true; 2377 sync_migration = true;
2378 2378
2379 /* 2379 /*
2380 * If compaction is deferred for high-order allocations, it is because 2380 * If compaction is deferred for high-order allocations, it is because
2381 * sync compaction recently failed. If this is the case and the caller 2381 * sync compaction recently failed. If this is the case and the caller
2382 * has requested the system not be heavily disrupted, fail the 2382 * has requested the system not be heavily disrupted, fail the
2383 * allocation now instead of entering direct reclaim 2383 * allocation now instead of entering direct reclaim
2384 */ 2384 */
2385 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) 2385 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
2386 goto nopage; 2386 goto nopage;
2387 2387
2388 /* Try direct reclaim and then allocating */ 2388 /* Try direct reclaim and then allocating */
2389 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2389 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2390 zonelist, high_zoneidx, 2390 zonelist, high_zoneidx,
2391 nodemask, 2391 nodemask,
2392 alloc_flags, preferred_zone, 2392 alloc_flags, preferred_zone,
2393 migratetype, &did_some_progress); 2393 migratetype, &did_some_progress);
2394 if (page) 2394 if (page)
2395 goto got_pg; 2395 goto got_pg;
2396 2396
2397 /* 2397 /*
2398 * If we failed to make any progress reclaiming, then we are 2398 * If we failed to make any progress reclaiming, then we are
2399 * running out of options and have to consider going OOM 2399 * running out of options and have to consider going OOM
2400 */ 2400 */
2401 if (!did_some_progress) { 2401 if (!did_some_progress) {
2402 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2402 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2403 if (oom_killer_disabled) 2403 if (oom_killer_disabled)
2404 goto nopage; 2404 goto nopage;
2405 /* Coredumps can quickly deplete all memory reserves */ 2405 /* Coredumps can quickly deplete all memory reserves */
2406 if ((current->flags & PF_DUMPCORE) && 2406 if ((current->flags & PF_DUMPCORE) &&
2407 !(gfp_mask & __GFP_NOFAIL)) 2407 !(gfp_mask & __GFP_NOFAIL))
2408 goto nopage; 2408 goto nopage;
2409 page = __alloc_pages_may_oom(gfp_mask, order, 2409 page = __alloc_pages_may_oom(gfp_mask, order,
2410 zonelist, high_zoneidx, 2410 zonelist, high_zoneidx,
2411 nodemask, preferred_zone, 2411 nodemask, preferred_zone,
2412 migratetype); 2412 migratetype);
2413 if (page) 2413 if (page)
2414 goto got_pg; 2414 goto got_pg;
2415 2415
2416 if (!(gfp_mask & __GFP_NOFAIL)) { 2416 if (!(gfp_mask & __GFP_NOFAIL)) {
2417 /* 2417 /*
2418 * The oom killer is not called for high-order 2418 * The oom killer is not called for high-order
2419 * allocations that may fail, so if no progress 2419 * allocations that may fail, so if no progress
2420 * is being made, there are no other options and 2420 * is being made, there are no other options and
2421 * retrying is unlikely to help. 2421 * retrying is unlikely to help.
2422 */ 2422 */
2423 if (order > PAGE_ALLOC_COSTLY_ORDER) 2423 if (order > PAGE_ALLOC_COSTLY_ORDER)
2424 goto nopage; 2424 goto nopage;
2425 /* 2425 /*
2426 * The oom killer is not called for lowmem 2426 * The oom killer is not called for lowmem
2427 * allocations to prevent needlessly killing 2427 * allocations to prevent needlessly killing
2428 * innocent tasks. 2428 * innocent tasks.
2429 */ 2429 */
2430 if (high_zoneidx < ZONE_NORMAL) 2430 if (high_zoneidx < ZONE_NORMAL)
2431 goto nopage; 2431 goto nopage;
2432 } 2432 }
2433 2433
2434 goto restart; 2434 goto restart;
2435 } 2435 }
2436 } 2436 }
2437 2437
2438 /* Check if we should retry the allocation */ 2438 /* Check if we should retry the allocation */
2439 pages_reclaimed += did_some_progress; 2439 pages_reclaimed += did_some_progress;
2440 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2440 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2441 pages_reclaimed)) { 2441 pages_reclaimed)) {
2442 /* Wait for some write requests to complete then retry */ 2442 /* Wait for some write requests to complete then retry */
2443 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2443 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2444 goto rebalance; 2444 goto rebalance;
2445 } else { 2445 } else {
2446 /* 2446 /*
2447 * High-order allocations do not necessarily loop after 2447 * High-order allocations do not necessarily loop after
2448 * direct reclaim and reclaim/compaction depends on compaction 2448 * direct reclaim and reclaim/compaction depends on compaction
2449 * being called after reclaim so call directly if necessary 2449 * being called after reclaim so call directly if necessary
2450 */ 2450 */
2451 page = __alloc_pages_direct_compact(gfp_mask, order, 2451 page = __alloc_pages_direct_compact(gfp_mask, order,
2452 zonelist, high_zoneidx, 2452 zonelist, high_zoneidx,
2453 nodemask, 2453 nodemask,
2454 alloc_flags, preferred_zone, 2454 alloc_flags, preferred_zone,
2455 migratetype, sync_migration, 2455 migratetype, sync_migration,
2456 &deferred_compaction, 2456 &deferred_compaction,
2457 &did_some_progress); 2457 &did_some_progress);
2458 if (page) 2458 if (page)
2459 goto got_pg; 2459 goto got_pg;
2460 } 2460 }
2461 2461
2462 nopage: 2462 nopage:
2463 warn_alloc_failed(gfp_mask, order, NULL); 2463 warn_alloc_failed(gfp_mask, order, NULL);
2464 return page; 2464 return page;
2465 got_pg: 2465 got_pg:
2466 if (kmemcheck_enabled) 2466 if (kmemcheck_enabled)
2467 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2467 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2468 return page; 2468 return page;
2469 2469
2470 } 2470 }
2471 2471
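/*
 * Editor's summary of __alloc_pages_slowpath() above (descriptive
 * comment only, not part of this commit). The escalation order is:
 * wake kswapd, retry the freelists at the normal watermark, allocate
 * without watermarks if the context allows it, try async compaction,
 * try direct reclaim (draining per-cpu lists on failure), invoke the
 * OOM killer when reclaim made no progress (only for __GFP_FS
 * allocations without __GFP_NORETRY), and finally either loop back via
 * should_alloc_retry() or make one last synchronous compaction attempt.
 */
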
2472 /* 2472 /*
2473 * This is the 'heart' of the zoned buddy allocator. 2473 * This is the 'heart' of the zoned buddy allocator.
2474 */ 2474 */
2475 struct page * 2475 struct page *
2476 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2476 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2477 struct zonelist *zonelist, nodemask_t *nodemask) 2477 struct zonelist *zonelist, nodemask_t *nodemask)
2478 { 2478 {
2479 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2479 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2480 struct zone *preferred_zone; 2480 struct zone *preferred_zone;
2481 struct page *page = NULL; 2481 struct page *page = NULL;
2482 int migratetype = allocflags_to_migratetype(gfp_mask); 2482 int migratetype = allocflags_to_migratetype(gfp_mask);
2483 unsigned int cpuset_mems_cookie; 2483 unsigned int cpuset_mems_cookie;
2484 2484
2485 gfp_mask &= gfp_allowed_mask; 2485 gfp_mask &= gfp_allowed_mask;
2486 2486
2487 lockdep_trace_alloc(gfp_mask); 2487 lockdep_trace_alloc(gfp_mask);
2488 2488
2489 might_sleep_if(gfp_mask & __GFP_WAIT); 2489 might_sleep_if(gfp_mask & __GFP_WAIT);
2490 2490
2491 if (should_fail_alloc_page(gfp_mask, order)) 2491 if (should_fail_alloc_page(gfp_mask, order))
2492 return NULL; 2492 return NULL;
2493 2493
2494 /* 2494 /*
2495 * Check the zones suitable for the gfp_mask contain at least one 2495 * Check the zones suitable for the gfp_mask contain at least one
2496 * valid zone. It's possible to have an empty zonelist as a result 2496 * valid zone. It's possible to have an empty zonelist as a result
2497 * of GFP_THISNODE and a memoryless node 2497 * of GFP_THISNODE and a memoryless node
2498 */ 2498 */
2499 if (unlikely(!zonelist->_zonerefs->zone)) 2499 if (unlikely(!zonelist->_zonerefs->zone))
2500 return NULL; 2500 return NULL;
2501 2501
2502 retry_cpuset: 2502 retry_cpuset:
2503 cpuset_mems_cookie = get_mems_allowed(); 2503 cpuset_mems_cookie = get_mems_allowed();
2504 2504
2505 /* The preferred zone is used for statistics later */ 2505 /* The preferred zone is used for statistics later */
2506 first_zones_zonelist(zonelist, high_zoneidx, 2506 first_zones_zonelist(zonelist, high_zoneidx,
2507 nodemask ? : &cpuset_current_mems_allowed, 2507 nodemask ? : &cpuset_current_mems_allowed,
2508 &preferred_zone); 2508 &preferred_zone);
2509 if (!preferred_zone) 2509 if (!preferred_zone)
2510 goto out; 2510 goto out;
2511 2511
2512 /* First allocation attempt */ 2512 /* First allocation attempt */
2513 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2513 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2514 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, 2514 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
2515 preferred_zone, migratetype); 2515 preferred_zone, migratetype);
2516 if (unlikely(!page)) 2516 if (unlikely(!page))
2517 page = __alloc_pages_slowpath(gfp_mask, order, 2517 page = __alloc_pages_slowpath(gfp_mask, order,
2518 zonelist, high_zoneidx, nodemask, 2518 zonelist, high_zoneidx, nodemask,
2519 preferred_zone, migratetype); 2519 preferred_zone, migratetype);
2520 2520
2521 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2521 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2522 2522
2523 out: 2523 out:
2524 /* 2524 /*
2525 * When updating a task's mems_allowed, it is possible to race with 2525 * When updating a task's mems_allowed, it is possible to race with
2526 * parallel threads in such a way that an allocation can fail while 2526 * parallel threads in such a way that an allocation can fail while
2527 * the mask is being updated. If a page allocation is about to fail, 2527 * the mask is being updated. If a page allocation is about to fail,
2528 * check if the cpuset changed during allocation and if so, retry. 2528 * check if the cpuset changed during allocation and if so, retry.
2529 */ 2529 */
2530 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2530 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2531 goto retry_cpuset; 2531 goto retry_cpuset;
2532 2532
2533 return page; 2533 return page;
2534 } 2534 }
2535 EXPORT_SYMBOL(__alloc_pages_nodemask); 2535 EXPORT_SYMBOL(__alloc_pages_nodemask);
2536 2536
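/*
 * Editor's illustration (hypothetical caller, not part of this commit):
 * most callers reach __alloc_pages_nodemask() through wrappers such as
 * alloc_pages() from gfp.h, which supply the zonelist for the current
 * node. A minimal sketch of allocating and freeing an order-1 block:
 */
static void *example_alloc_two_pages(void)
{
        /* GFP_KERNEL may sleep; order 1 requests two contiguous pages */
        struct page *page = alloc_pages(GFP_KERNEL, 1);

        if (!page)
                return NULL;
        return page_address(page);      /* lowmem pages have a kernel mapping */
}

static void example_free_two_pages(void *addr)
{
        if (addr)
                __free_pages(virt_to_page(addr), 1);
}
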
2537 /* 2537 /*
2538 * Common helper functions. 2538 * Common helper functions.
2539 */ 2539 */
2540 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 2540 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2541 { 2541 {
2542 struct page *page; 2542 struct page *page;
2543 2543
2544 /* 2544 /*
2545 * __get_free_pages() returns a 32-bit address, which cannot represent 2545 * __get_free_pages() returns a 32-bit address, which cannot represent
2546 * a highmem page 2546 * a highmem page
2547 */ 2547 */
2548 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 2548 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2549 2549
2550 page = alloc_pages(gfp_mask, order); 2550 page = alloc_pages(gfp_mask, order);
2551 if (!page) 2551 if (!page)
2552 return 0; 2552 return 0;
2553 return (unsigned long) page_address(page); 2553 return (unsigned long) page_address(page);
2554 } 2554 }
2555 EXPORT_SYMBOL(__get_free_pages); 2555 EXPORT_SYMBOL(__get_free_pages);
2556 2556
2557 unsigned long get_zeroed_page(gfp_t gfp_mask) 2557 unsigned long get_zeroed_page(gfp_t gfp_mask)
2558 { 2558 {
2559 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 2559 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2560 } 2560 }
2561 EXPORT_SYMBOL(get_zeroed_page); 2561 EXPORT_SYMBOL(get_zeroed_page);
2562 2562
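/*
 * Editor's illustration (hypothetical caller, not part of this commit):
 * typical use of the helpers above for a single zeroed page. free_page()
 * is the order-0 wrapper around free_pages(), which is defined further
 * down.
 */
static int example_use_zeroed_page(void)
{
        unsigned long addr = get_zeroed_page(GFP_KERNEL);

        if (!addr)
                return -ENOMEM;
        /* ... use the PAGE_SIZE bytes at (void *)addr ... */
        free_page(addr);
        return 0;
}
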
2563 void __free_pages(struct page *page, unsigned int order) 2563 void __free_pages(struct page *page, unsigned int order)
2564 { 2564 {
2565 if (put_page_testzero(page)) { 2565 if (put_page_testzero(page)) {
2566 if (order == 0) 2566 if (order == 0)
2567 free_hot_cold_page(page, 0); 2567 free_hot_cold_page(page, 0);
2568 else 2568 else
2569 __free_pages_ok(page, order); 2569 __free_pages_ok(page, order);
2570 } 2570 }
2571 } 2571 }
2572 2572
2573 EXPORT_SYMBOL(__free_pages); 2573 EXPORT_SYMBOL(__free_pages);
2574 2574
2575 void free_pages(unsigned long addr, unsigned int order) 2575 void free_pages(unsigned long addr, unsigned int order)
2576 { 2576 {
2577 if (addr != 0) { 2577 if (addr != 0) {
2578 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2578 VM_BUG_ON(!virt_addr_valid((void *)addr));
2579 __free_pages(virt_to_page((void *)addr), order); 2579 __free_pages(virt_to_page((void *)addr), order);
2580 } 2580 }
2581 } 2581 }
2582 2582
2583 EXPORT_SYMBOL(free_pages); 2583 EXPORT_SYMBOL(free_pages);
2584 2584
2585 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2585 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2586 { 2586 {
2587 if (addr) { 2587 if (addr) {
2588 unsigned long alloc_end = addr + (PAGE_SIZE << order); 2588 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2589 unsigned long used = addr + PAGE_ALIGN(size); 2589 unsigned long used = addr + PAGE_ALIGN(size);
2590 2590
2591 split_page(virt_to_page((void *)addr), order); 2591 split_page(virt_to_page((void *)addr), order);
2592 while (used < alloc_end) { 2592 while (used < alloc_end) {
2593 free_page(used); 2593 free_page(used);
2594 used += PAGE_SIZE; 2594 used += PAGE_SIZE;
2595 } 2595 }
2596 } 2596 }
2597 return (void *)addr; 2597 return (void *)addr;
2598 } 2598 }
2599 2599
2600 /** 2600 /**
2601 * alloc_pages_exact - allocate an exact number of physically-contiguous pages. 2601 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
2602 * @size: the number of bytes to allocate 2602 * @size: the number of bytes to allocate
2603 * @gfp_mask: GFP flags for the allocation 2603 * @gfp_mask: GFP flags for the allocation
2604 * 2604 *
2605 * This function is similar to alloc_pages(), except that it allocates the 2605 * This function is similar to alloc_pages(), except that it allocates the
2606 * minimum number of pages to satisfy the request. alloc_pages() can only 2606 * minimum number of pages to satisfy the request. alloc_pages() can only
2607 * allocate memory in power-of-two pages. 2607 * allocate memory in power-of-two pages.
2608 * 2608 *
2609 * This function is also limited by MAX_ORDER. 2609 * This function is also limited by MAX_ORDER.
2610 * 2610 *
2611 * Memory allocated by this function must be released by free_pages_exact(). 2611 * Memory allocated by this function must be released by free_pages_exact().
2612 */ 2612 */
2613 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 2613 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2614 { 2614 {
2615 unsigned int order = get_order(size); 2615 unsigned int order = get_order(size);
2616 unsigned long addr; 2616 unsigned long addr;
2617 2617
2618 addr = __get_free_pages(gfp_mask, order); 2618 addr = __get_free_pages(gfp_mask, order);
2619 return make_alloc_exact(addr, order, size); 2619 return make_alloc_exact(addr, order, size);
2620 } 2620 }
2621 EXPORT_SYMBOL(alloc_pages_exact); 2621 EXPORT_SYMBOL(alloc_pages_exact);
2622 2622
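/*
 * Editor's worked example (not part of this commit): a request for
 * 5 * PAGE_SIZE bytes has get_order() == 3, so eight pages are obtained
 * and make_alloc_exact() gives the trailing three back, leaving exactly
 * five contiguous pages. A hypothetical caller pairs the allocation with
 * free_pages_exact(), defined below, passing the same size:
 */
static int example_exact_roundtrip(void)
{
        size_t size = 5 * PAGE_SIZE;    /* order-3 alloc, 3 pages trimmed */
        void *buf = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);

        if (!buf)
                return -ENOMEM;
        /* ... use exactly 'size' bytes at buf ... */
        free_pages_exact(buf, size);    /* must pass the same size */
        return 0;
}
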
2623 /** 2623 /**
2624 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 2624 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2625 * pages on a node. 2625 * pages on a node.
2626 * @nid: the preferred node ID where memory should be allocated 2626 * @nid: the preferred node ID where memory should be allocated
2627 * @size: the number of bytes to allocate 2627 * @size: the number of bytes to allocate
2628 * @gfp_mask: GFP flags for the allocation 2628 * @gfp_mask: GFP flags for the allocation
2629 * 2629 *
2630 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 2630 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2631 * back. 2631 * back.
2632 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2632 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2633 * but is not exact. 2633 * but is not exact.
2634 */ 2634 */
2635 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2635 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2636 { 2636 {
2637 unsigned order = get_order(size); 2637 unsigned order = get_order(size);
2638 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2638 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2639 if (!p) 2639 if (!p)
2640 return NULL; 2640 return NULL;
2641 return make_alloc_exact((unsigned long)page_address(p), order, size); 2641 return make_alloc_exact((unsigned long)page_address(p), order, size);
2642 } 2642 }
2643 EXPORT_SYMBOL(alloc_pages_exact_nid); 2643 EXPORT_SYMBOL(alloc_pages_exact_nid);
2644 2644
2645 /** 2645 /**
2646 * free_pages_exact - release memory allocated via alloc_pages_exact() 2646 * free_pages_exact - release memory allocated via alloc_pages_exact()
2647 * @virt: the value returned by alloc_pages_exact. 2647 * @virt: the value returned by alloc_pages_exact.
2648 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2648 * @size: size of allocation, same value as passed to alloc_pages_exact().
2649 * 2649 *
2650 * Release the memory allocated by a previous call to alloc_pages_exact. 2650 * Release the memory allocated by a previous call to alloc_pages_exact.
2651 */ 2651 */
2652 void free_pages_exact(void *virt, size_t size) 2652 void free_pages_exact(void *virt, size_t size)
2653 { 2653 {
2654 unsigned long addr = (unsigned long)virt; 2654 unsigned long addr = (unsigned long)virt;
2655 unsigned long end = addr + PAGE_ALIGN(size); 2655 unsigned long end = addr + PAGE_ALIGN(size);
2656 2656
2657 while (addr < end) { 2657 while (addr < end) {
2658 free_page(addr); 2658 free_page(addr);
2659 addr += PAGE_SIZE; 2659 addr += PAGE_SIZE;
2660 } 2660 }
2661 } 2661 }
2662 EXPORT_SYMBOL(free_pages_exact); 2662 EXPORT_SYMBOL(free_pages_exact);
2663 2663
2664 static unsigned int nr_free_zone_pages(int offset) 2664 static unsigned int nr_free_zone_pages(int offset)
2665 { 2665 {
2666 struct zoneref *z; 2666 struct zoneref *z;
2667 struct zone *zone; 2667 struct zone *zone;
2668 2668
2669 /* Just pick one node, since fallback list is circular */ 2669 /* Just pick one node, since fallback list is circular */
2670 unsigned int sum = 0; 2670 unsigned int sum = 0;
2671 2671
2672 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2672 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2673 2673
2674 for_each_zone_zonelist(zone, z, zonelist, offset) { 2674 for_each_zone_zonelist(zone, z, zonelist, offset) {
2675 unsigned long size = zone->present_pages; 2675 unsigned long size = zone->present_pages;
2676 unsigned long high = high_wmark_pages(zone); 2676 unsigned long high = high_wmark_pages(zone);
2677 if (size > high) 2677 if (size > high)
2678 sum += size - high; 2678 sum += size - high;
2679 } 2679 }
2680 2680
2681 return sum; 2681 return sum;
2682 } 2682 }
2683 2683
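/*
 * Editor's worked example for nr_free_zone_pages() above (illustrative,
 * not part of this commit): a zone with 262144 present pages and a high
 * watermark of 4096 pages contributes 258048 pages to the sum; a zone
 * whose watermark exceeds its size contributes nothing. The result is
 * therefore an estimate of pages allocatable above the high watermarks,
 * not a count of currently free memory.
 */
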
2684 /* 2684 /*
2685 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 2685 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
2686 */ 2686 */
2687 unsigned int nr_free_buffer_pages(void) 2687 unsigned int nr_free_buffer_pages(void)
2688 { 2688 {
2689 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2689 return nr_free_zone_pages(gfp_zone(GFP_USER));
2690 } 2690 }
2691 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2691 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2692 2692
2693 /* 2693 /*
2694 * Amount of free RAM allocatable within all zones 2694 * Amount of free RAM allocatable within all zones
2695 */ 2695 */
2696 unsigned int nr_free_pagecache_pages(void) 2696 unsigned int nr_free_pagecache_pages(void)
2697 { 2697 {
2698 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 2698 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2699 } 2699 }
2700 2700
2701 static inline void show_node(struct zone *zone) 2701 static inline void show_node(struct zone *zone)
2702 { 2702 {
2703 if (NUMA_BUILD) 2703 if (NUMA_BUILD)
2704 printk("Node %d ", zone_to_nid(zone)); 2704 printk("Node %d ", zone_to_nid(zone));
2705 } 2705 }
2706 2706
2707 void si_meminfo(struct sysinfo *val) 2707 void si_meminfo(struct sysinfo *val)
2708 { 2708 {
2709 val->totalram = totalram_pages; 2709 val->totalram = totalram_pages;
2710 val->sharedram = 0; 2710 val->sharedram = 0;
2711 val->freeram = global_page_state(NR_FREE_PAGES); 2711 val->freeram = global_page_state(NR_FREE_PAGES);
2712 val->bufferram = nr_blockdev_pages(); 2712 val->bufferram = nr_blockdev_pages();
2713 val->totalhigh = totalhigh_pages; 2713 val->totalhigh = totalhigh_pages;
2714 val->freehigh = nr_free_highpages(); 2714 val->freehigh = nr_free_highpages();
2715 val->mem_unit = PAGE_SIZE; 2715 val->mem_unit = PAGE_SIZE;
2716 } 2716 }
2717 2717
2718 EXPORT_SYMBOL(si_meminfo); 2718 EXPORT_SYMBOL(si_meminfo);
2719 2719
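/*
 * Editor's illustration (hypothetical caller, not part of this commit):
 * the counters filled in above are in units of val->mem_unit bytes
 * (PAGE_SIZE here), so converting to bytes is a single multiply.
 */
static u64 example_total_ram_bytes(void)
{
        struct sysinfo si;

        si_meminfo(&si);
        return (u64)si.totalram * si.mem_unit;
}
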
2720 #ifdef CONFIG_NUMA 2720 #ifdef CONFIG_NUMA
2721 void si_meminfo_node(struct sysinfo *val, int nid) 2721 void si_meminfo_node(struct sysinfo *val, int nid)
2722 { 2722 {
2723 pg_data_t *pgdat = NODE_DATA(nid); 2723 pg_data_t *pgdat = NODE_DATA(nid);
2724 2724
2725 val->totalram = pgdat->node_present_pages; 2725 val->totalram = pgdat->node_present_pages;
2726 val->freeram = node_page_state(nid, NR_FREE_PAGES); 2726 val->freeram = node_page_state(nid, NR_FREE_PAGES);
2727 #ifdef CONFIG_HIGHMEM 2727 #ifdef CONFIG_HIGHMEM
2728 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 2728 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
2729 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 2729 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
2730 NR_FREE_PAGES); 2730 NR_FREE_PAGES);
2731 #else 2731 #else
2732 val->totalhigh = 0; 2732 val->totalhigh = 0;
2733 val->freehigh = 0; 2733 val->freehigh = 0;
2734 #endif 2734 #endif
2735 val->mem_unit = PAGE_SIZE; 2735 val->mem_unit = PAGE_SIZE;
2736 } 2736 }
2737 #endif 2737 #endif
2738 2738
2739 /* 2739 /*
2740 * Determine whether the node should be displayed or not, depending on whether 2740 * Determine whether the node should be displayed or not, depending on whether
2741 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 2741 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
2742 */ 2742 */
2743 bool skip_free_areas_node(unsigned int flags, int nid) 2743 bool skip_free_areas_node(unsigned int flags, int nid)
2744 { 2744 {
2745 bool ret = false; 2745 bool ret = false;
2746 unsigned int cpuset_mems_cookie; 2746 unsigned int cpuset_mems_cookie;
2747 2747
2748 if (!(flags & SHOW_MEM_FILTER_NODES)) 2748 if (!(flags & SHOW_MEM_FILTER_NODES))
2749 goto out; 2749 goto out;
2750 2750
2751 do { 2751 do {
2752 cpuset_mems_cookie = get_mems_allowed(); 2752 cpuset_mems_cookie = get_mems_allowed();
2753 ret = !node_isset(nid, cpuset_current_mems_allowed); 2753 ret = !node_isset(nid, cpuset_current_mems_allowed);
2754 } while (!put_mems_allowed(cpuset_mems_cookie)); 2754 } while (!put_mems_allowed(cpuset_mems_cookie));
2755 out: 2755 out:
2756 return ret; 2756 return ret;
2757 } 2757 }
2758 2758
2759 #define K(x) ((x) << (PAGE_SHIFT-10)) 2759 #define K(x) ((x) << (PAGE_SHIFT-10))
2760 2760
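/*
 * Editor's note (illustrative, not part of this commit): K() converts a
 * page count to kibibytes. With 4 KiB pages, PAGE_SHIFT is 12, so
 * K(x) == x << 2 == x * 4, e.g. K(300) prints as 1200kB below.
 */
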
2761 /* 2761 /*
2762 * Show free area list (used inside shift_scroll-lock stuff) 2762 * Show free area list (used inside shift_scroll-lock stuff)
2763 * We also calculate the percentage fragmentation. We do this by counting the 2763 * We also calculate the percentage fragmentation. We do this by counting the
2764 * memory on each free list with the exception of the first item on the list. 2764 * memory on each free list with the exception of the first item on the list.
2765 * Suppresses nodes that are not allowed by current's cpuset if 2765 * Suppresses nodes that are not allowed by current's cpuset if
2766 * SHOW_MEM_FILTER_NODES is passed. 2766 * SHOW_MEM_FILTER_NODES is passed.
2767 */ 2767 */
2768 void show_free_areas(unsigned int filter) 2768 void show_free_areas(unsigned int filter)
2769 { 2769 {
2770 int cpu; 2770 int cpu;
2771 struct zone *zone; 2771 struct zone *zone;
2772 2772
2773 for_each_populated_zone(zone) { 2773 for_each_populated_zone(zone) {
2774 if (skip_free_areas_node(filter, zone_to_nid(zone))) 2774 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2775 continue; 2775 continue;
2776 show_node(zone); 2776 show_node(zone);
2777 printk("%s per-cpu:\n", zone->name); 2777 printk("%s per-cpu:\n", zone->name);
2778 2778
2779 for_each_online_cpu(cpu) { 2779 for_each_online_cpu(cpu) {
2780 struct per_cpu_pageset *pageset; 2780 struct per_cpu_pageset *pageset;
2781 2781
2782 pageset = per_cpu_ptr(zone->pageset, cpu); 2782 pageset = per_cpu_ptr(zone->pageset, cpu);
2783 2783
2784 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 2784 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
2785 cpu, pageset->pcp.high, 2785 cpu, pageset->pcp.high,
2786 pageset->pcp.batch, pageset->pcp.count); 2786 pageset->pcp.batch, pageset->pcp.count);
2787 } 2787 }
2788 } 2788 }
2789 2789
2790 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 2790 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
2791 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 2791 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
2792 " unevictable:%lu" 2792 " unevictable:%lu"
2793 " dirty:%lu writeback:%lu unstable:%lu\n" 2793 " dirty:%lu writeback:%lu unstable:%lu\n"
2794 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 2794 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2795 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", 2795 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
2796 global_page_state(NR_ACTIVE_ANON), 2796 global_page_state(NR_ACTIVE_ANON),
2797 global_page_state(NR_INACTIVE_ANON), 2797 global_page_state(NR_INACTIVE_ANON),
2798 global_page_state(NR_ISOLATED_ANON), 2798 global_page_state(NR_ISOLATED_ANON),
2799 global_page_state(NR_ACTIVE_FILE), 2799 global_page_state(NR_ACTIVE_FILE),
2800 global_page_state(NR_INACTIVE_FILE), 2800 global_page_state(NR_INACTIVE_FILE),
2801 global_page_state(NR_ISOLATED_FILE), 2801 global_page_state(NR_ISOLATED_FILE),
2802 global_page_state(NR_UNEVICTABLE), 2802 global_page_state(NR_UNEVICTABLE),
2803 global_page_state(NR_FILE_DIRTY), 2803 global_page_state(NR_FILE_DIRTY),
2804 global_page_state(NR_WRITEBACK), 2804 global_page_state(NR_WRITEBACK),
2805 global_page_state(NR_UNSTABLE_NFS), 2805 global_page_state(NR_UNSTABLE_NFS),
2806 global_page_state(NR_FREE_PAGES), 2806 global_page_state(NR_FREE_PAGES),
2807 global_page_state(NR_SLAB_RECLAIMABLE), 2807 global_page_state(NR_SLAB_RECLAIMABLE),
2808 global_page_state(NR_SLAB_UNRECLAIMABLE), 2808 global_page_state(NR_SLAB_UNRECLAIMABLE),
2809 global_page_state(NR_FILE_MAPPED), 2809 global_page_state(NR_FILE_MAPPED),
2810 global_page_state(NR_SHMEM), 2810 global_page_state(NR_SHMEM),
2811 global_page_state(NR_PAGETABLE), 2811 global_page_state(NR_PAGETABLE),
2812 global_page_state(NR_BOUNCE)); 2812 global_page_state(NR_BOUNCE));
2813 2813
2814 for_each_populated_zone(zone) { 2814 for_each_populated_zone(zone) {
2815 int i; 2815 int i;
2816 2816
2817 if (skip_free_areas_node(filter, zone_to_nid(zone))) 2817 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2818 continue; 2818 continue;
2819 show_node(zone); 2819 show_node(zone);
2820 printk("%s" 2820 printk("%s"
2821 " free:%lukB" 2821 " free:%lukB"
2822 " min:%lukB" 2822 " min:%lukB"
2823 " low:%lukB" 2823 " low:%lukB"
2824 " high:%lukB" 2824 " high:%lukB"
2825 " active_anon:%lukB" 2825 " active_anon:%lukB"
2826 " inactive_anon:%lukB" 2826 " inactive_anon:%lukB"
2827 " active_file:%lukB" 2827 " active_file:%lukB"
2828 " inactive_file:%lukB" 2828 " inactive_file:%lukB"
2829 " unevictable:%lukB" 2829 " unevictable:%lukB"
2830 " isolated(anon):%lukB" 2830 " isolated(anon):%lukB"
2831 " isolated(file):%lukB" 2831 " isolated(file):%lukB"
2832 " present:%lukB" 2832 " present:%lukB"
2833 " mlocked:%lukB" 2833 " mlocked:%lukB"
2834 " dirty:%lukB" 2834 " dirty:%lukB"
2835 " writeback:%lukB" 2835 " writeback:%lukB"
2836 " mapped:%lukB" 2836 " mapped:%lukB"
2837 " shmem:%lukB" 2837 " shmem:%lukB"
2838 " slab_reclaimable:%lukB" 2838 " slab_reclaimable:%lukB"
2839 " slab_unreclaimable:%lukB" 2839 " slab_unreclaimable:%lukB"
2840 " kernel_stack:%lukB" 2840 " kernel_stack:%lukB"
2841 " pagetables:%lukB" 2841 " pagetables:%lukB"
2842 " unstable:%lukB" 2842 " unstable:%lukB"
2843 " bounce:%lukB" 2843 " bounce:%lukB"
2844 " writeback_tmp:%lukB" 2844 " writeback_tmp:%lukB"
2845 " pages_scanned:%lu" 2845 " pages_scanned:%lu"
2846 " all_unreclaimable? %s" 2846 " all_unreclaimable? %s"
2847 "\n", 2847 "\n",
2848 zone->name, 2848 zone->name,
2849 K(zone_page_state(zone, NR_FREE_PAGES)), 2849 K(zone_page_state(zone, NR_FREE_PAGES)),
2850 K(min_wmark_pages(zone)), 2850 K(min_wmark_pages(zone)),
2851 K(low_wmark_pages(zone)), 2851 K(low_wmark_pages(zone)),
2852 K(high_wmark_pages(zone)), 2852 K(high_wmark_pages(zone)),
2853 K(zone_page_state(zone, NR_ACTIVE_ANON)), 2853 K(zone_page_state(zone, NR_ACTIVE_ANON)),
2854 K(zone_page_state(zone, NR_INACTIVE_ANON)), 2854 K(zone_page_state(zone, NR_INACTIVE_ANON)),
2855 K(zone_page_state(zone, NR_ACTIVE_FILE)), 2855 K(zone_page_state(zone, NR_ACTIVE_FILE)),
2856 K(zone_page_state(zone, NR_INACTIVE_FILE)), 2856 K(zone_page_state(zone, NR_INACTIVE_FILE)),
2857 K(zone_page_state(zone, NR_UNEVICTABLE)), 2857 K(zone_page_state(zone, NR_UNEVICTABLE)),
2858 K(zone_page_state(zone, NR_ISOLATED_ANON)), 2858 K(zone_page_state(zone, NR_ISOLATED_ANON)),
2859 K(zone_page_state(zone, NR_ISOLATED_FILE)), 2859 K(zone_page_state(zone, NR_ISOLATED_FILE)),
2860 K(zone->present_pages), 2860 K(zone->present_pages),
2861 K(zone_page_state(zone, NR_MLOCK)), 2861 K(zone_page_state(zone, NR_MLOCK)),
2862 K(zone_page_state(zone, NR_FILE_DIRTY)), 2862 K(zone_page_state(zone, NR_FILE_DIRTY)),
2863 K(zone_page_state(zone, NR_WRITEBACK)), 2863 K(zone_page_state(zone, NR_WRITEBACK)),
2864 K(zone_page_state(zone, NR_FILE_MAPPED)), 2864 K(zone_page_state(zone, NR_FILE_MAPPED)),
2865 K(zone_page_state(zone, NR_SHMEM)), 2865 K(zone_page_state(zone, NR_SHMEM)),
2866 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 2866 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
2867 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 2867 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
2868 zone_page_state(zone, NR_KERNEL_STACK) * 2868 zone_page_state(zone, NR_KERNEL_STACK) *
2869 THREAD_SIZE / 1024, 2869 THREAD_SIZE / 1024,
2870 K(zone_page_state(zone, NR_PAGETABLE)), 2870 K(zone_page_state(zone, NR_PAGETABLE)),
2871 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 2871 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
2872 K(zone_page_state(zone, NR_BOUNCE)), 2872 K(zone_page_state(zone, NR_BOUNCE)),
2873 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 2873 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2874 zone->pages_scanned, 2874 zone->pages_scanned,
2875 (zone->all_unreclaimable ? "yes" : "no") 2875 (zone->all_unreclaimable ? "yes" : "no")
2876 ); 2876 );
2877 printk("lowmem_reserve[]:"); 2877 printk("lowmem_reserve[]:");
2878 for (i = 0; i < MAX_NR_ZONES; i++) 2878 for (i = 0; i < MAX_NR_ZONES; i++)
2879 printk(" %lu", zone->lowmem_reserve[i]); 2879 printk(" %lu", zone->lowmem_reserve[i]);
2880 printk("\n"); 2880 printk("\n");
2881 } 2881 }
2882 2882
2883 for_each_populated_zone(zone) { 2883 for_each_populated_zone(zone) {
2884 unsigned long nr[MAX_ORDER], flags, order, total = 0; 2884 unsigned long nr[MAX_ORDER], flags, order, total = 0;
2885 2885
2886 if (skip_free_areas_node(filter, zone_to_nid(zone))) 2886 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2887 continue; 2887 continue;
2888 show_node(zone); 2888 show_node(zone);
2889 printk("%s: ", zone->name); 2889 printk("%s: ", zone->name);
2890 2890
2891 spin_lock_irqsave(&zone->lock, flags); 2891 spin_lock_irqsave(&zone->lock, flags);
2892 for (order = 0; order < MAX_ORDER; order++) { 2892 for (order = 0; order < MAX_ORDER; order++) {
2893 nr[order] = zone->free_area[order].nr_free; 2893 nr[order] = zone->free_area[order].nr_free;
2894 total += nr[order] << order; 2894 total += nr[order] << order;
2895 } 2895 }
2896 spin_unlock_irqrestore(&zone->lock, flags); 2896 spin_unlock_irqrestore(&zone->lock, flags);
2897 for (order = 0; order < MAX_ORDER; order++) 2897 for (order = 0; order < MAX_ORDER; order++)
2898 printk("%lu*%lukB ", nr[order], K(1UL) << order); 2898 printk("%lu*%lukB ", nr[order], K(1UL) << order);
2899 printk("= %lukB\n", K(total)); 2899 printk("= %lukB\n", K(total));
2900 } 2900 }
2901 2901
2902 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 2902 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
2903 2903
2904 show_swap_cache_info(); 2904 show_swap_cache_info();
2905 } 2905 }
2906 2906
2907 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 2907 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
2908 { 2908 {
2909 zoneref->zone = zone; 2909 zoneref->zone = zone;
2910 zoneref->zone_idx = zone_idx(zone); 2910 zoneref->zone_idx = zone_idx(zone);
2911 } 2911 }
2912 2912
2913 /* 2913 /*
2914 * Builds allocation fallback zone lists. 2914 * Builds allocation fallback zone lists.
2915 * 2915 *
2916 * Add all populated zones of a node to the zonelist. 2916 * Add all populated zones of a node to the zonelist.
2917 */ 2917 */
2918 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 2918 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
2919 int nr_zones, enum zone_type zone_type) 2919 int nr_zones, enum zone_type zone_type)
2920 { 2920 {
2921 struct zone *zone; 2921 struct zone *zone;
2922 2922
2923 BUG_ON(zone_type >= MAX_NR_ZONES); 2923 BUG_ON(zone_type >= MAX_NR_ZONES);
2924 zone_type++; 2924 zone_type++;
2925 2925
2926 do { 2926 do {
2927 zone_type--; 2927 zone_type--;
2928 zone = pgdat->node_zones + zone_type; 2928 zone = pgdat->node_zones + zone_type;
2929 if (populated_zone(zone)) { 2929 if (populated_zone(zone)) {
2930 zoneref_set_zone(zone, 2930 zoneref_set_zone(zone,
2931 &zonelist->_zonerefs[nr_zones++]); 2931 &zonelist->_zonerefs[nr_zones++]);
2932 check_highest_zone(zone_type); 2932 check_highest_zone(zone_type);
2933 } 2933 }
2934 2934
2935 } while (zone_type); 2935 } while (zone_type);
2936 return nr_zones; 2936 return nr_zones;
2937 } 2937 }
2938 2938
2939 2939
2940 /* 2940 /*
2941 * zonelist_order: 2941 * zonelist_order:
2942 * 0 = automatic detection of better ordering. 2942 * 0 = automatic detection of better ordering.
2943 * 1 = order by ([node] distance, -zonetype) 2943 * 1 = order by ([node] distance, -zonetype)
2944 * 2 = order by (-zonetype, [node] distance) 2944 * 2 = order by (-zonetype, [node] distance)
2945 * 2945 *
2946 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 2946 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
2947 * the same zonelist. So only NUMA can configure this param. 2947 * the same zonelist. So only NUMA can configure this param.
2948 */ 2948 */
2949 #define ZONELIST_ORDER_DEFAULT 0 2949 #define ZONELIST_ORDER_DEFAULT 0
2950 #define ZONELIST_ORDER_NODE 1 2950 #define ZONELIST_ORDER_NODE 1
2951 #define ZONELIST_ORDER_ZONE 2 2951 #define ZONELIST_ORDER_ZONE 2
2952 2952
2953 /* zonelist order in the kernel. 2953 /* zonelist order in the kernel.
2954 * set_zonelist_order() will set this to NODE or ZONE. 2954 * set_zonelist_order() will set this to NODE or ZONE.
2955 */ 2955 */
2956 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 2956 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
2957 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 2957 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
2958 2958
2959 2959
2960 #ifdef CONFIG_NUMA 2960 #ifdef CONFIG_NUMA
2961 /* The value the user specified, possibly changed by config */ 2961 /* The value the user specified, possibly changed by config */
2962 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 2962 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
2963 /* string for sysctl */ 2963 /* string for sysctl */
2964 #define NUMA_ZONELIST_ORDER_LEN 16 2964 #define NUMA_ZONELIST_ORDER_LEN 16
2965 char numa_zonelist_order[16] = "default"; 2965 char numa_zonelist_order[16] = "default";
2966 2966
2967 /* 2967 /*
2968 * Interface for configuring zonelist ordering. 2968 * Interface for configuring zonelist ordering.
2969 * command line option "numa_zonelist_order" 2969 * command line option "numa_zonelist_order"
2970 * = "[dD]efault" - default, automatic configuration. 2970 * = "[dD]efault" - default, automatic configuration.
2971 * = "[nN]ode" - order by node locality, then by zone within node 2971 * = "[nN]ode" - order by node locality, then by zone within node
2972 * = "[zZ]one" - order by zone, then by locality within zone 2972 * = "[zZ]one" - order by zone, then by locality within zone
2973 */ 2973 */
2974 2974
2975 static int __parse_numa_zonelist_order(char *s) 2975 static int __parse_numa_zonelist_order(char *s)
2976 { 2976 {
2977 if (*s == 'd' || *s == 'D') { 2977 if (*s == 'd' || *s == 'D') {
2978 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 2978 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
2979 } else if (*s == 'n' || *s == 'N') { 2979 } else if (*s == 'n' || *s == 'N') {
2980 user_zonelist_order = ZONELIST_ORDER_NODE; 2980 user_zonelist_order = ZONELIST_ORDER_NODE;
2981 } else if (*s == 'z' || *s == 'Z') { 2981 } else if (*s == 'z' || *s == 'Z') {
2982 user_zonelist_order = ZONELIST_ORDER_ZONE; 2982 user_zonelist_order = ZONELIST_ORDER_ZONE;
2983 } else { 2983 } else {
2984 printk(KERN_WARNING 2984 printk(KERN_WARNING
2985 "Ignoring invalid numa_zonelist_order value: " 2985 "Ignoring invalid numa_zonelist_order value: "
2986 "%s\n", s); 2986 "%s\n", s);
2987 return -EINVAL; 2987 return -EINVAL;
2988 } 2988 }
2989 return 0; 2989 return 0;
2990 } 2990 }
2991 2991
2992 static __init int setup_numa_zonelist_order(char *s) 2992 static __init int setup_numa_zonelist_order(char *s)
2993 { 2993 {
2994 int ret; 2994 int ret;
2995 2995
2996 if (!s) 2996 if (!s)
2997 return 0; 2997 return 0;
2998 2998
2999 ret = __parse_numa_zonelist_order(s); 2999 ret = __parse_numa_zonelist_order(s);
3000 if (ret == 0) 3000 if (ret == 0)
3001 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 3001 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3002 3002
3003 return ret; 3003 return ret;
3004 } 3004 }
3005 early_param("numa_zonelist_order", setup_numa_zonelist_order); 3005 early_param("numa_zonelist_order", setup_numa_zonelist_order);
3006 3006
3007 /* 3007 /*
3008 * sysctl handler for numa_zonelist_order 3008 * sysctl handler for numa_zonelist_order
3009 */ 3009 */
3010 int numa_zonelist_order_handler(ctl_table *table, int write, 3010 int numa_zonelist_order_handler(ctl_table *table, int write,
3011 void __user *buffer, size_t *length, 3011 void __user *buffer, size_t *length,
3012 loff_t *ppos) 3012 loff_t *ppos)
3013 { 3013 {
3014 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 3014 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3015 int ret; 3015 int ret;
3016 static DEFINE_MUTEX(zl_order_mutex); 3016 static DEFINE_MUTEX(zl_order_mutex);
3017 3017
3018 mutex_lock(&zl_order_mutex); 3018 mutex_lock(&zl_order_mutex);
3019 if (write) 3019 if (write)
3020 strcpy(saved_string, (char*)table->data); 3020 strcpy(saved_string, (char*)table->data);
3021 ret = proc_dostring(table, write, buffer, length, ppos); 3021 ret = proc_dostring(table, write, buffer, length, ppos);
3022 if (ret) 3022 if (ret)
3023 goto out; 3023 goto out;
3024 if (write) { 3024 if (write) {
3025 int oldval = user_zonelist_order; 3025 int oldval = user_zonelist_order;
3026 if (__parse_numa_zonelist_order((char*)table->data)) { 3026 if (__parse_numa_zonelist_order((char*)table->data)) {
3027 /* 3027 /*
3028 * bogus value. restore saved string 3028 * bogus value. restore saved string
3029 */ 3029 */
3030 strncpy((char*)table->data, saved_string, 3030 strncpy((char*)table->data, saved_string,
3031 NUMA_ZONELIST_ORDER_LEN); 3031 NUMA_ZONELIST_ORDER_LEN);
3032 user_zonelist_order = oldval; 3032 user_zonelist_order = oldval;
3033 } else if (oldval != user_zonelist_order) { 3033 } else if (oldval != user_zonelist_order) {
3034 mutex_lock(&zonelists_mutex); 3034 mutex_lock(&zonelists_mutex);
3035 build_all_zonelists(NULL); 3035 build_all_zonelists(NULL);
3036 mutex_unlock(&zonelists_mutex); 3036 mutex_unlock(&zonelists_mutex);
3037 } 3037 }
3038 } 3038 }
3039 out: 3039 out:
3040 mutex_unlock(&zl_order_mutex); 3040 mutex_unlock(&zl_order_mutex);
3041 return ret; 3041 return ret;
3042 } 3042 }
3043 3043
3044 3044
3045 #define MAX_NODE_LOAD (nr_online_nodes) 3045 #define MAX_NODE_LOAD (nr_online_nodes)
3046 static int node_load[MAX_NUMNODES]; 3046 static int node_load[MAX_NUMNODES];
3047 3047
3048 /** 3048 /**
3049 * find_next_best_node - find the next node that should appear in a given node's fallback list 3049 * find_next_best_node - find the next node that should appear in a given node's fallback list
3050 * @node: node whose fallback list we're appending 3050 * @node: node whose fallback list we're appending
3051 * @used_node_mask: nodemask_t of already used nodes 3051 * @used_node_mask: nodemask_t of already used nodes
3052 * 3052 *
3053 * We use a number of factors to determine which is the next node that should 3053 * We use a number of factors to determine which is the next node that should
3054 * appear on a given node's fallback list. The node should not have appeared 3054 * appear on a given node's fallback list. The node should not have appeared
3055 * already in @node's fallback list, and it should be the next closest node 3055 * already in @node's fallback list, and it should be the next closest node
3056 * according to the distance array (which contains arbitrary distance values 3056 * according to the distance array (which contains arbitrary distance values
3057 * from each node to each node in the system), and should also prefer nodes 3057 * from each node to each node in the system), and should also prefer nodes
3058 * with no CPUs, since presumably they'll have very little allocation pressure 3058 * with no CPUs, since presumably they'll have very little allocation pressure
3059 * on them otherwise. 3059 * on them otherwise.
3060 * It returns -1 if no node is found. 3060 * It returns -1 if no node is found.
3061 */ 3061 */
3062 static int find_next_best_node(int node, nodemask_t *used_node_mask) 3062 static int find_next_best_node(int node, nodemask_t *used_node_mask)
3063 { 3063 {
3064 int n, val; 3064 int n, val;
3065 int min_val = INT_MAX; 3065 int min_val = INT_MAX;
3066 int best_node = -1; 3066 int best_node = -1;
3067 const struct cpumask *tmp = cpumask_of_node(0); 3067 const struct cpumask *tmp = cpumask_of_node(0);
3068 3068
3069 /* Use the local node if we haven't already */ 3069 /* Use the local node if we haven't already */
3070 if (!node_isset(node, *used_node_mask)) { 3070 if (!node_isset(node, *used_node_mask)) {
3071 node_set(node, *used_node_mask); 3071 node_set(node, *used_node_mask);
3072 return node; 3072 return node;
3073 } 3073 }
3074 3074
3075 for_each_node_state(n, N_HIGH_MEMORY) { 3075 for_each_node_state(n, N_HIGH_MEMORY) {
3076 3076
3077 /* Don't want a node to appear more than once */ 3077 /* Don't want a node to appear more than once */
3078 if (node_isset(n, *used_node_mask)) 3078 if (node_isset(n, *used_node_mask))
3079 continue; 3079 continue;
3080 3080
3081 /* Use the distance array to find the distance */ 3081 /* Use the distance array to find the distance */
3082 val = node_distance(node, n); 3082 val = node_distance(node, n);
3083 3083
3084 /* Penalize nodes under us ("prefer the next node") */ 3084 /* Penalize nodes under us ("prefer the next node") */
3085 val += (n < node); 3085 val += (n < node);
3086 3086
3087 /* Give preference to headless and unused nodes */ 3087 /* Give preference to headless and unused nodes */
3088 tmp = cpumask_of_node(n); 3088 tmp = cpumask_of_node(n);
3089 if (!cpumask_empty(tmp)) 3089 if (!cpumask_empty(tmp))
3090 val += PENALTY_FOR_NODE_WITH_CPUS; 3090 val += PENALTY_FOR_NODE_WITH_CPUS;
3091 3091
3092 /* Slight preference for less loaded node */ 3092 /* Slight preference for less loaded node */
3093 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 3093 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3094 val += node_load[n]; 3094 val += node_load[n];
3095 3095
3096 if (val < min_val) { 3096 if (val < min_val) {
3097 min_val = val; 3097 min_val = val;
3098 best_node = n; 3098 best_node = n;
3099 } 3099 }
3100 } 3100 }
3101 3101
3102 if (best_node >= 0) 3102 if (best_node >= 0)
3103 node_set(best_node, *used_node_mask); 3103 node_set(best_node, *used_node_mask);
3104 3104
3105 return best_node; 3105 return best_node;
3106 } 3106 }
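
The scoring above is distance-dominated: node_distance() is nudged by one to prefer higher-numbered nodes, penalised by PENALTY_FOR_NODE_WITH_CPUS for nodes that own CPUs, and only then scaled by MAX_NODE_LOAD * MAX_NUMNODES, so node_load[] can never do more than break ties. A minimal userspace sketch of that ranking, with a made-up distance table and assuming the usual PENALTY_FOR_NODE_WITH_CPUS default of 1 (neither is taken from this diff):

#include <limits.h>
#include <stdio.h>

#define MAX_NUMNODES	4
#define MAX_NODE_LOAD	MAX_NUMNODES
#define PENALTY_FOR_NODE_WITH_CPUS 1	/* assumed default */

/* Hypothetical SLIT-style distances and per-node CPU presence. */
static const int dist[MAX_NUMNODES][MAX_NUMNODES] = {
	{ 10, 20, 20, 40 },
	{ 20, 10, 40, 20 },
	{ 20, 40, 10, 20 },
	{ 40, 20, 20, 10 },
};
static const int has_cpus[MAX_NUMNODES] = { 1, 1, 0, 1 };
static int node_load[MAX_NUMNODES];	/* stays 0 here: tie-breaker only */

static int next_best(int node, int *used)
{
	int n, best = -1, min_val = INT_MAX;

	for (n = 0; n < MAX_NUMNODES; n++) {
		int val;

		if (used[n])
			continue;
		val = dist[node][n];
		val += (n < node);		/* prefer the next node */
		if (has_cpus[n])
			val += PENALTY_FOR_NODE_WITH_CPUS;
		val *= MAX_NODE_LOAD * MAX_NUMNODES;
		val += node_load[n];
		if (val < min_val) {
			min_val = val;
			best = n;
		}
	}
	if (best >= 0)
		used[best] = 1;
	return best;
}

int main(void)
{
	int used[MAX_NUMNODES] = { 1, 0, 0, 0 };	/* local node 0 first */
	int n;

	printf("fallback order for node 0: 0");
	while ((n = next_best(0, used)) >= 0)
		printf(" %d", n);
	printf("\n");
	return 0;
}

With this table the order comes out as 0 2 1 3: node 2 beats node 1 at equal distance because it is headless, exactly the preference the comment above describes.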
3107 3107
3108 3108
3109 /* 3109 /*
3110 * Build zonelists ordered by node and zones within node. 3110 * Build zonelists ordered by node and zones within node.
3111 * This results in maximum locality--normal zone overflows into local 3111 * This results in maximum locality--normal zone overflows into local
3112 * DMA zone, if any--but risks exhausting DMA zone. 3112 * DMA zone, if any--but risks exhausting DMA zone.
3113 */ 3113 */
3114 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 3114 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3115 { 3115 {
3116 int j; 3116 int j;
3117 struct zonelist *zonelist; 3117 struct zonelist *zonelist;
3118 3118
3119 zonelist = &pgdat->node_zonelists[0]; 3119 zonelist = &pgdat->node_zonelists[0];
3120 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3120 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3121 ; 3121 ;
3122 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3122 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3123 MAX_NR_ZONES - 1); 3123 MAX_NR_ZONES - 1);
3124 zonelist->_zonerefs[j].zone = NULL; 3124 zonelist->_zonerefs[j].zone = NULL;
3125 zonelist->_zonerefs[j].zone_idx = 0; 3125 zonelist->_zonerefs[j].zone_idx = 0;
3126 } 3126 }
3127 3127
3128 /* 3128 /*
3129 * Build gfp_thisnode zonelists 3129 * Build gfp_thisnode zonelists
3130 */ 3130 */
3131 static void build_thisnode_zonelists(pg_data_t *pgdat) 3131 static void build_thisnode_zonelists(pg_data_t *pgdat)
3132 { 3132 {
3133 int j; 3133 int j;
3134 struct zonelist *zonelist; 3134 struct zonelist *zonelist;
3135 3135
3136 zonelist = &pgdat->node_zonelists[1]; 3136 zonelist = &pgdat->node_zonelists[1];
3137 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3137 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
3138 zonelist->_zonerefs[j].zone = NULL; 3138 zonelist->_zonerefs[j].zone = NULL;
3139 zonelist->_zonerefs[j].zone_idx = 0; 3139 zonelist->_zonerefs[j].zone_idx = 0;
3140 } 3140 }
3141 3141
3142 /* 3142 /*
3143 * Build zonelists ordered by zone and nodes within zones. 3143 * Build zonelists ordered by zone and nodes within zones.
3144 * This results in conserving DMA zone[s] until all Normal memory is 3144 * This results in conserving DMA zone[s] until all Normal memory is
3145 * exhausted, but results in overflowing to remote node while memory 3145 * exhausted, but results in overflowing to remote node while memory
3146 * may still exist in local DMA zone. 3146 * may still exist in local DMA zone.
3147 */ 3147 */
3148 static int node_order[MAX_NUMNODES]; 3148 static int node_order[MAX_NUMNODES];
3149 3149
3150 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 3150 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3151 { 3151 {
3152 int pos, j, node; 3152 int pos, j, node;
3153 int zone_type; /* needs to be signed */ 3153 int zone_type; /* needs to be signed */
3154 struct zone *z; 3154 struct zone *z;
3155 struct zonelist *zonelist; 3155 struct zonelist *zonelist;
3156 3156
3157 zonelist = &pgdat->node_zonelists[0]; 3157 zonelist = &pgdat->node_zonelists[0];
3158 pos = 0; 3158 pos = 0;
3159 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 3159 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3160 for (j = 0; j < nr_nodes; j++) { 3160 for (j = 0; j < nr_nodes; j++) {
3161 node = node_order[j]; 3161 node = node_order[j];
3162 z = &NODE_DATA(node)->node_zones[zone_type]; 3162 z = &NODE_DATA(node)->node_zones[zone_type];
3163 if (populated_zone(z)) { 3163 if (populated_zone(z)) {
3164 zoneref_set_zone(z, 3164 zoneref_set_zone(z,
3165 &zonelist->_zonerefs[pos++]); 3165 &zonelist->_zonerefs[pos++]);
3166 check_highest_zone(zone_type); 3166 check_highest_zone(zone_type);
3167 } 3167 }
3168 } 3168 }
3169 } 3169 }
3170 zonelist->_zonerefs[pos].zone = NULL; 3170 zonelist->_zonerefs[pos].zone = NULL;
3171 zonelist->_zonerefs[pos].zone_idx = 0; 3171 zonelist->_zonerefs[pos].zone_idx = 0;
3172 } 3172 }
3173 3173
3174 static int default_zonelist_order(void) 3174 static int default_zonelist_order(void)
3175 { 3175 {
3176 int nid, zone_type; 3176 int nid, zone_type;
3177 unsigned long low_kmem_size,total_size; 3177 unsigned long low_kmem_size,total_size;
3178 struct zone *z; 3178 struct zone *z;
3179 int average_size; 3179 int average_size;
3180 /* 3180 /*
3181 * ZONE_DMA and ZONE_DMA32 can be very small area in the system. 3181 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
3182 * If they are really small and used heavily, the system can fall 3182 * If they are really small and used heavily, the system can fall
3183 * into OOM very easily. 3183 * into OOM very easily.
3184 * This function detects the ZONE_DMA/DMA32 size and configures the zone order. 3184 * This function detects the ZONE_DMA/DMA32 size and configures the zone order.
3185 */ 3185 */
3186 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ 3186 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
3187 low_kmem_size = 0; 3187 low_kmem_size = 0;
3188 total_size = 0; 3188 total_size = 0;
3189 for_each_online_node(nid) { 3189 for_each_online_node(nid) {
3190 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3190 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3191 z = &NODE_DATA(nid)->node_zones[zone_type]; 3191 z = &NODE_DATA(nid)->node_zones[zone_type];
3192 if (populated_zone(z)) { 3192 if (populated_zone(z)) {
3193 if (zone_type < ZONE_NORMAL) 3193 if (zone_type < ZONE_NORMAL)
3194 low_kmem_size += z->present_pages; 3194 low_kmem_size += z->present_pages;
3195 total_size += z->present_pages; 3195 total_size += z->present_pages;
3196 } else if (zone_type == ZONE_NORMAL) { 3196 } else if (zone_type == ZONE_NORMAL) {
3197 /* 3197 /*
3198 * If any node has only lowmem, then node order 3198 * If any node has only lowmem, then node order
3199 * is preferred to allow kernel allocations 3199 * is preferred to allow kernel allocations
3200 * locally; otherwise, they can easily infringe 3200 * locally; otherwise, they can easily infringe
3201 * on other nodes when there is an abundance of 3201 * on other nodes when there is an abundance of
3202 * lowmem available to allocate from. 3202 * lowmem available to allocate from.
3203 */ 3203 */
3204 return ZONELIST_ORDER_NODE; 3204 return ZONELIST_ORDER_NODE;
3205 } 3205 }
3206 } 3206 }
3207 } 3207 }
3208 if (!low_kmem_size || /* there is no DMA area. */ 3208 if (!low_kmem_size || /* there is no DMA area. */
3209 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 3209 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3210 return ZONELIST_ORDER_NODE; 3210 return ZONELIST_ORDER_NODE;
3211 /* 3211 /*
3212 * Look into each node's config. 3212 * Look into each node's config.
3213 * If there is a node whose DMA/DMA32 memory makes up a very large 3213 * If there is a node whose DMA/DMA32 memory makes up a very large
3214 * share of its local memory, NODE_ORDER may be suitable. 3214 * share of its local memory, NODE_ORDER may be suitable.
3215 */ 3215 */
3216 average_size = total_size / 3216 average_size = total_size /
3217 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); 3217 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
3218 for_each_online_node(nid) { 3218 for_each_online_node(nid) {
3219 low_kmem_size = 0; 3219 low_kmem_size = 0;
3220 total_size = 0; 3220 total_size = 0;
3221 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3221 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3222 z = &NODE_DATA(nid)->node_zones[zone_type]; 3222 z = &NODE_DATA(nid)->node_zones[zone_type];
3223 if (populated_zone(z)) { 3223 if (populated_zone(z)) {
3224 if (zone_type < ZONE_NORMAL) 3224 if (zone_type < ZONE_NORMAL)
3225 low_kmem_size += z->present_pages; 3225 low_kmem_size += z->present_pages;
3226 total_size += z->present_pages; 3226 total_size += z->present_pages;
3227 } 3227 }
3228 } 3228 }
3229 if (low_kmem_size && 3229 if (low_kmem_size &&
3230 total_size > average_size && /* ignore small node */ 3230 total_size > average_size && /* ignore small node */
3231 low_kmem_size > total_size * 70/100) 3231 low_kmem_size > total_size * 70/100)
3232 return ZONELIST_ORDER_NODE; 3232 return ZONELIST_ORDER_NODE;
3233 } 3233 }
3234 return ZONELIST_ORDER_ZONE; 3234 return ZONELIST_ORDER_ZONE;
3235 } 3235 }
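
In short, default_zonelist_order() chooses node ordering whenever lowmem needs no protection (no DMA/DMA32 at all, DMA/DMA32 above half of total memory, a node with no populated ZONE_NORMAL, or any reasonably sized node that is more than 70% lowmem) and zone ordering otherwise. A condensed userspace sketch of the size-based checks only, with invented page counts and without the ZONE_NORMAL-presence shortcut:

#include <stdio.h>

/* Hypothetical per-node sizes in pages: lowmem (DMA/DMA32) and total. */
struct node_mem { unsigned long low, total; };

static const char *pick_order(const struct node_mem *nodes, int nr)
{
	unsigned long low = 0, total = 0, avg;
	int i;

	for (i = 0; i < nr; i++) {
		low += nodes[i].low;
		total += nodes[i].total;
	}
	if (!low || low > total / 2)	/* no DMA area, or DMA/DMA32 is big */
		return "Node";

	avg = total / (nr + 1);
	for (i = 0; i < nr; i++)	/* any big node that is mostly lowmem? */
		if (nodes[i].low && nodes[i].total > avg &&
		    nodes[i].low > nodes[i].total * 70 / 100)
			return "Node";
	return "Zone";
}

int main(void)
{
	/* Two 4 GiB nodes (4 KiB pages); node 0 is entirely DMA32. */
	struct node_mem ex[2] = { { 1048576, 1048576 }, { 0, 1048576 } };

	printf("zonelist order: %s\n", pick_order(ex, 2));	/* Node */
	return 0;
}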
3236 3236
3237 static void set_zonelist_order(void) 3237 static void set_zonelist_order(void)
3238 { 3238 {
3239 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 3239 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3240 current_zonelist_order = default_zonelist_order(); 3240 current_zonelist_order = default_zonelist_order();
3241 else 3241 else
3242 current_zonelist_order = user_zonelist_order; 3242 current_zonelist_order = user_zonelist_order;
3243 } 3243 }
3244 3244
3245 static void build_zonelists(pg_data_t *pgdat) 3245 static void build_zonelists(pg_data_t *pgdat)
3246 { 3246 {
3247 int j, node, load; 3247 int j, node, load;
3248 enum zone_type i; 3248 enum zone_type i;
3249 nodemask_t used_mask; 3249 nodemask_t used_mask;
3250 int local_node, prev_node; 3250 int local_node, prev_node;
3251 struct zonelist *zonelist; 3251 struct zonelist *zonelist;
3252 int order = current_zonelist_order; 3252 int order = current_zonelist_order;
3253 3253
3254 /* initialize zonelists */ 3254 /* initialize zonelists */
3255 for (i = 0; i < MAX_ZONELISTS; i++) { 3255 for (i = 0; i < MAX_ZONELISTS; i++) {
3256 zonelist = pgdat->node_zonelists + i; 3256 zonelist = pgdat->node_zonelists + i;
3257 zonelist->_zonerefs[0].zone = NULL; 3257 zonelist->_zonerefs[0].zone = NULL;
3258 zonelist->_zonerefs[0].zone_idx = 0; 3258 zonelist->_zonerefs[0].zone_idx = 0;
3259 } 3259 }
3260 3260
3261 /* NUMA-aware ordering of nodes */ 3261 /* NUMA-aware ordering of nodes */
3262 local_node = pgdat->node_id; 3262 local_node = pgdat->node_id;
3263 load = nr_online_nodes; 3263 load = nr_online_nodes;
3264 prev_node = local_node; 3264 prev_node = local_node;
3265 nodes_clear(used_mask); 3265 nodes_clear(used_mask);
3266 3266
3267 memset(node_order, 0, sizeof(node_order)); 3267 memset(node_order, 0, sizeof(node_order));
3268 j = 0; 3268 j = 0;
3269 3269
3270 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3270 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3271 int distance = node_distance(local_node, node); 3271 int distance = node_distance(local_node, node);
3272 3272
3273 /* 3273 /*
3274 * If another node is sufficiently far away then it is better 3274 * If another node is sufficiently far away then it is better
3275 * to reclaim pages in a zone before going off node. 3275 * to reclaim pages in a zone before going off node.
3276 */ 3276 */
3277 if (distance > RECLAIM_DISTANCE) 3277 if (distance > RECLAIM_DISTANCE)
3278 zone_reclaim_mode = 1; 3278 zone_reclaim_mode = 1;
3279 3279
3280 /* 3280 /*
3281 * We don't want to pressure a particular node. 3281 * We don't want to pressure a particular node.
3282 * So we add a penalty to the first node in the same 3282 * So we add a penalty to the first node in the same
3283 * distance group to make the ordering round-robin. 3283 * distance group to make the ordering round-robin.
3284 */ 3284 */
3285 if (distance != node_distance(local_node, prev_node)) 3285 if (distance != node_distance(local_node, prev_node))
3286 node_load[node] = load; 3286 node_load[node] = load;
3287 3287
3288 prev_node = node; 3288 prev_node = node;
3289 load--; 3289 load--;
3290 if (order == ZONELIST_ORDER_NODE) 3290 if (order == ZONELIST_ORDER_NODE)
3291 build_zonelists_in_node_order(pgdat, node); 3291 build_zonelists_in_node_order(pgdat, node);
3292 else 3292 else
3293 node_order[j++] = node; /* remember order */ 3293 node_order[j++] = node; /* remember order */
3294 } 3294 }
3295 3295
3296 if (order == ZONELIST_ORDER_ZONE) { 3296 if (order == ZONELIST_ORDER_ZONE) {
3297 /* calculate node order -- i.e., DMA last! */ 3297 /* calculate node order -- i.e., DMA last! */
3298 build_zonelists_in_zone_order(pgdat, j); 3298 build_zonelists_in_zone_order(pgdat, j);
3299 } 3299 }
3300 3300
3301 build_thisnode_zonelists(pgdat); 3301 build_thisnode_zonelists(pgdat);
3302 } 3302 }
3303 3303
3304 /* Construct the zonelist performance cache - see further mmzone.h */ 3304 /* Construct the zonelist performance cache - see further mmzone.h */
3305 static void build_zonelist_cache(pg_data_t *pgdat) 3305 static void build_zonelist_cache(pg_data_t *pgdat)
3306 { 3306 {
3307 struct zonelist *zonelist; 3307 struct zonelist *zonelist;
3308 struct zonelist_cache *zlc; 3308 struct zonelist_cache *zlc;
3309 struct zoneref *z; 3309 struct zoneref *z;
3310 3310
3311 zonelist = &pgdat->node_zonelists[0]; 3311 zonelist = &pgdat->node_zonelists[0];
3312 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 3312 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3313 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 3313 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3314 for (z = zonelist->_zonerefs; z->zone; z++) 3314 for (z = zonelist->_zonerefs; z->zone; z++)
3315 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 3315 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3316 } 3316 }
3317 3317
3318 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3318 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3319 /* 3319 /*
3320 * Return node id of node used for "local" allocations. 3320 * Return node id of node used for "local" allocations.
3321 * I.e., first node id of first zone in arg node's generic zonelist. 3321 * I.e., first node id of first zone in arg node's generic zonelist.
3322 * Used for initializing percpu 'numa_mem', which is used primarily 3322 * Used for initializing percpu 'numa_mem', which is used primarily
3323 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3323 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3324 */ 3324 */
3325 int local_memory_node(int node) 3325 int local_memory_node(int node)
3326 { 3326 {
3327 struct zone *zone; 3327 struct zone *zone;
3328 3328
3329 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3329 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3330 gfp_zone(GFP_KERNEL), 3330 gfp_zone(GFP_KERNEL),
3331 NULL, 3331 NULL,
3332 &zone); 3332 &zone);
3333 return zone->node; 3333 return zone->node;
3334 } 3334 }
3335 #endif 3335 #endif
3336 3336
3337 #else /* CONFIG_NUMA */ 3337 #else /* CONFIG_NUMA */
3338 3338
3339 static void set_zonelist_order(void) 3339 static void set_zonelist_order(void)
3340 { 3340 {
3341 current_zonelist_order = ZONELIST_ORDER_ZONE; 3341 current_zonelist_order = ZONELIST_ORDER_ZONE;
3342 } 3342 }
3343 3343
3344 static void build_zonelists(pg_data_t *pgdat) 3344 static void build_zonelists(pg_data_t *pgdat)
3345 { 3345 {
3346 int node, local_node; 3346 int node, local_node;
3347 enum zone_type j; 3347 enum zone_type j;
3348 struct zonelist *zonelist; 3348 struct zonelist *zonelist;
3349 3349
3350 local_node = pgdat->node_id; 3350 local_node = pgdat->node_id;
3351 3351
3352 zonelist = &pgdat->node_zonelists[0]; 3352 zonelist = &pgdat->node_zonelists[0];
3353 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3353 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
3354 3354
3355 /* 3355 /*
3356 * Now we build the zonelist so that it contains the zones 3356 * Now we build the zonelist so that it contains the zones
3357 * of all the other nodes. 3357 * of all the other nodes.
3358 * We don't want to pressure a particular node, so when 3358 * We don't want to pressure a particular node, so when
3359 * building the zones for node N, we make sure that the 3359 * building the zones for node N, we make sure that the
3360 * zones coming right after the local ones are those from 3360 * zones coming right after the local ones are those from
3361 * node N+1 (modulo the number of nodes) 3361 * node N+1 (modulo the number of nodes)
3362 */ 3362 */
3363 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3363 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3364 if (!node_online(node)) 3364 if (!node_online(node))
3365 continue; 3365 continue;
3366 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3366 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3367 MAX_NR_ZONES - 1); 3367 MAX_NR_ZONES - 1);
3368 } 3368 }
3369 for (node = 0; node < local_node; node++) { 3369 for (node = 0; node < local_node; node++) {
3370 if (!node_online(node)) 3370 if (!node_online(node))
3371 continue; 3371 continue;
3372 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3372 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3373 MAX_NR_ZONES - 1); 3373 MAX_NR_ZONES - 1);
3374 } 3374 }
3375 3375
3376 zonelist->_zonerefs[j].zone = NULL; 3376 zonelist->_zonerefs[j].zone = NULL;
3377 zonelist->_zonerefs[j].zone_idx = 0; 3377 zonelist->_zonerefs[j].zone_idx = 0;
3378 } 3378 }
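
The wrap-around described in the comment above means node N's zonelist is followed by the zones of N + 1, N + 2, ... back around to N - 1, so overflow traffic is spread rather than always landing on node 0. A tiny sketch of the resulting node order, assuming four online nodes:

#include <stdio.h>

#define NR_NODES 4	/* assumed number of online nodes */

int main(void)
{
	int local_node, node;

	for (local_node = 0; local_node < NR_NODES; local_node++) {
		printf("node %d zonelist visits nodes:", local_node);
		printf(" %d", local_node);
		for (node = local_node + 1; node < NR_NODES; node++)
			printf(" %d", node);
		for (node = 0; node < local_node; node++)
			printf(" %d", node);
		printf("\n");	/* e.g. node 2 -> 2 3 0 1 */
	}
	return 0;
}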
3379 3379
3380 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3380 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3381 static void build_zonelist_cache(pg_data_t *pgdat) 3381 static void build_zonelist_cache(pg_data_t *pgdat)
3382 { 3382 {
3383 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3383 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3384 } 3384 }
3385 3385
3386 #endif /* CONFIG_NUMA */ 3386 #endif /* CONFIG_NUMA */
3387 3387
3388 /* 3388 /*
3389 * Boot pageset table. One per cpu which is going to be used for all 3389 * Boot pageset table. One per cpu which is going to be used for all
3390 * zones and all nodes. The parameters will be set in such a way 3390 * zones and all nodes. The parameters will be set in such a way
3391 * that an item put on a list will immediately be handed over to 3391 * that an item put on a list will immediately be handed over to
3392 * the buddy list. This is safe since pageset manipulation is done 3392 * the buddy list. This is safe since pageset manipulation is done
3393 * with interrupts disabled. 3393 * with interrupts disabled.
3394 * 3394 *
3395 * The boot_pagesets must be kept even after bootup is complete for 3395 * The boot_pagesets must be kept even after bootup is complete for
3396 * unused processors and/or zones. They do play a role for bootstrapping 3396 * unused processors and/or zones. They do play a role for bootstrapping
3397 * hotplugged processors. 3397 * hotplugged processors.
3398 * 3398 *
3399 * zoneinfo_show() and maybe other functions do 3399 * zoneinfo_show() and maybe other functions do
3400 * not check if the processor is online before following the pageset pointer. 3400 * not check if the processor is online before following the pageset pointer.
3401 * Other parts of the kernel may not check if the zone is available. 3401 * Other parts of the kernel may not check if the zone is available.
3402 */ 3402 */
3403 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3403 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3404 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3404 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3405 static void setup_zone_pageset(struct zone *zone); 3405 static void setup_zone_pageset(struct zone *zone);
3406 3406
3407 /* 3407 /*
3408 * Global mutex to protect against size modification of zonelists 3408 * Global mutex to protect against size modification of zonelists
3409 * as well as to serialize pageset setup for the new populated zone. 3409 * as well as to serialize pageset setup for the new populated zone.
3410 */ 3410 */
3411 DEFINE_MUTEX(zonelists_mutex); 3411 DEFINE_MUTEX(zonelists_mutex);
3412 3412
3413 /* The return value is int just for stop_machine() */ 3413 /* The return value is int just for stop_machine() */
3414 static __init_refok int __build_all_zonelists(void *data) 3414 static __init_refok int __build_all_zonelists(void *data)
3415 { 3415 {
3416 int nid; 3416 int nid;
3417 int cpu; 3417 int cpu;
3418 3418
3419 #ifdef CONFIG_NUMA 3419 #ifdef CONFIG_NUMA
3420 memset(node_load, 0, sizeof(node_load)); 3420 memset(node_load, 0, sizeof(node_load));
3421 #endif 3421 #endif
3422 for_each_online_node(nid) { 3422 for_each_online_node(nid) {
3423 pg_data_t *pgdat = NODE_DATA(nid); 3423 pg_data_t *pgdat = NODE_DATA(nid);
3424 3424
3425 build_zonelists(pgdat); 3425 build_zonelists(pgdat);
3426 build_zonelist_cache(pgdat); 3426 build_zonelist_cache(pgdat);
3427 } 3427 }
3428 3428
3429 /* 3429 /*
3430 * Initialize the boot_pagesets that are going to be used 3430 * Initialize the boot_pagesets that are going to be used
3431 * for bootstrapping processors. The real pagesets for 3431 * for bootstrapping processors. The real pagesets for
3432 * each zone will be allocated later when the per cpu 3432 * each zone will be allocated later when the per cpu
3433 * allocator is available. 3433 * allocator is available.
3434 * 3434 *
3435 * boot_pagesets are used also for bootstrapping offline 3435 * boot_pagesets are used also for bootstrapping offline
3436 * cpus if the system is already booted because the pagesets 3436 * cpus if the system is already booted because the pagesets
3437 * are needed to initialize allocators on a specific cpu too. 3437 * are needed to initialize allocators on a specific cpu too.
3438 * F.e. the percpu allocator needs the page allocator which 3438 * F.e. the percpu allocator needs the page allocator which
3439 * needs the percpu allocator in order to allocate its pagesets 3439 * needs the percpu allocator in order to allocate its pagesets
3440 * (a chicken-egg dilemma). 3440 * (a chicken-egg dilemma).
3441 */ 3441 */
3442 for_each_possible_cpu(cpu) { 3442 for_each_possible_cpu(cpu) {
3443 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3443 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3444 3444
3445 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3445 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3446 /* 3446 /*
3447 * We now know the "local memory node" for each node-- 3447 * We now know the "local memory node" for each node--
3448 * i.e., the node of the first zone in the generic zonelist. 3448 * i.e., the node of the first zone in the generic zonelist.
3449 * Set up numa_mem percpu variable for on-line cpus. During 3449 * Set up numa_mem percpu variable for on-line cpus. During
3450 * boot, only the boot cpu should be on-line; we'll init the 3450 * boot, only the boot cpu should be on-line; we'll init the
3451 * secondary cpus' numa_mem as they come on-line. During 3451 * secondary cpus' numa_mem as they come on-line. During
3452 * node/memory hotplug, we'll fixup all on-line cpus. 3452 * node/memory hotplug, we'll fixup all on-line cpus.
3453 */ 3453 */
3454 if (cpu_online(cpu)) 3454 if (cpu_online(cpu))
3455 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3455 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3456 #endif 3456 #endif
3457 } 3457 }
3458 3458
3459 return 0; 3459 return 0;
3460 } 3460 }
3461 3461
3462 /* 3462 /*
3463 * Called with zonelists_mutex held always 3463 * Called with zonelists_mutex held always
3464 * unless system_state == SYSTEM_BOOTING. 3464 * unless system_state == SYSTEM_BOOTING.
3465 */ 3465 */
3466 void __ref build_all_zonelists(void *data) 3466 void __ref build_all_zonelists(void *data)
3467 { 3467 {
3468 set_zonelist_order(); 3468 set_zonelist_order();
3469 3469
3470 if (system_state == SYSTEM_BOOTING) { 3470 if (system_state == SYSTEM_BOOTING) {
3471 __build_all_zonelists(NULL); 3471 __build_all_zonelists(NULL);
3472 mminit_verify_zonelist(); 3472 mminit_verify_zonelist();
3473 cpuset_init_current_mems_allowed(); 3473 cpuset_init_current_mems_allowed();
3474 } else { 3474 } else {
3475 /* we have to stop all cpus to guarantee there is no user 3475 /* we have to stop all cpus to guarantee there is no user
3476 of zonelist */ 3476 of zonelist */
3477 #ifdef CONFIG_MEMORY_HOTPLUG 3477 #ifdef CONFIG_MEMORY_HOTPLUG
3478 if (data) 3478 if (data)
3479 setup_zone_pageset((struct zone *)data); 3479 setup_zone_pageset((struct zone *)data);
3480 #endif 3480 #endif
3481 stop_machine(__build_all_zonelists, NULL, NULL); 3481 stop_machine(__build_all_zonelists, NULL, NULL);
3482 /* cpuset refresh routine should be here */ 3482 /* cpuset refresh routine should be here */
3483 } 3483 }
3484 vm_total_pages = nr_free_pagecache_pages(); 3484 vm_total_pages = nr_free_pagecache_pages();
3485 /* 3485 /*
3486 * Disable grouping by mobility if the number of pages in the 3486 * Disable grouping by mobility if the number of pages in the
3487 * system is too low to allow the mechanism to work. It would be 3487 * system is too low to allow the mechanism to work. It would be
3488 * more accurate, but expensive to check per-zone. This check is 3488 * more accurate, but expensive to check per-zone. This check is
3489 * made on memory-hotadd so a system can start with mobility 3489 * made on memory-hotadd so a system can start with mobility
3490 * disabled and enable it later 3490 * disabled and enable it later
3491 */ 3491 */
3492 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3492 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3493 page_group_by_mobility_disabled = 1; 3493 page_group_by_mobility_disabled = 1;
3494 else 3494 else
3495 page_group_by_mobility_disabled = 0; 3495 page_group_by_mobility_disabled = 0;
3496 3496
3497 printk("Built %i zonelists in %s order, mobility grouping %s. " 3497 printk("Built %i zonelists in %s order, mobility grouping %s. "
3498 "Total pages: %ld\n", 3498 "Total pages: %ld\n",
3499 nr_online_nodes, 3499 nr_online_nodes,
3500 zonelist_order_name[current_zonelist_order], 3500 zonelist_order_name[current_zonelist_order],
3501 page_group_by_mobility_disabled ? "off" : "on", 3501 page_group_by_mobility_disabled ? "off" : "on",
3502 vm_total_pages); 3502 vm_total_pages);
3503 #ifdef CONFIG_NUMA 3503 #ifdef CONFIG_NUMA
3504 printk("Policy zone: %s\n", zone_names[policy_zone]); 3504 printk("Policy zone: %s\n", zone_names[policy_zone]);
3505 #endif 3505 #endif
3506 } 3506 }
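
The cutoff used just above is pageblock_nr_pages * MIGRATE_TYPES free pages; below that, grouping by mobility cannot keep even one pageblock per migrate type and is switched off. With assumed x86-64 values (2 MiB pageblocks of 4 KiB pages and five migrate types, i.e. no CMA) the threshold is a modest 2560 pages:

#include <stdio.h>

int main(void)
{
	/* Assumed values, not taken from this diff. */
	unsigned long pageblock_nr_pages = 1UL << 9;	/* 512 x 4 KiB = 2 MiB */
	unsigned long migrate_types = 5;
	unsigned long threshold = pageblock_nr_pages * migrate_types;

	printf("mobility grouping disabled below %lu pages (%lu MiB)\n",
	       threshold, threshold * 4096 / (1024 * 1024));	/* 2560, 10 */
	return 0;
}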
3507 3507
3508 /* 3508 /*
3509 * Helper functions to size the waitqueue hash table. 3509 * Helper functions to size the waitqueue hash table.
3510 * Essentially these want to choose hash table sizes sufficiently 3510 * Essentially these want to choose hash table sizes sufficiently
3511 * large so that collisions trying to wait on pages are rare. 3511 * large so that collisions trying to wait on pages are rare.
3512 * But in fact, the number of active page waitqueues on typical 3512 * But in fact, the number of active page waitqueues on typical
3513 * systems is ridiculously low, less than 200. So this is even 3513 * systems is ridiculously low, less than 200. So this is even
3514 * conservative, even though it seems large. 3514 * conservative, even though it seems large.
3515 * 3515 *
3516 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3516 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3517 * waitqueues, i.e. the size of the waitq table given the number of pages. 3517 * waitqueues, i.e. the size of the waitq table given the number of pages.
3518 */ 3518 */
3519 #define PAGES_PER_WAITQUEUE 256 3519 #define PAGES_PER_WAITQUEUE 256
3520 3520
3521 #ifndef CONFIG_MEMORY_HOTPLUG 3521 #ifndef CONFIG_MEMORY_HOTPLUG
3522 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3522 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3523 { 3523 {
3524 unsigned long size = 1; 3524 unsigned long size = 1;
3525 3525
3526 pages /= PAGES_PER_WAITQUEUE; 3526 pages /= PAGES_PER_WAITQUEUE;
3527 3527
3528 while (size < pages) 3528 while (size < pages)
3529 size <<= 1; 3529 size <<= 1;
3530 3530
3531 /* 3531 /*
3532 * Once we have dozens or even hundreds of threads sleeping 3532 * Once we have dozens or even hundreds of threads sleeping
3533 * on IO we've got bigger problems than wait queue collision. 3533 * on IO we've got bigger problems than wait queue collision.
3534 * Limit the size of the wait table to a reasonable size. 3534 * Limit the size of the wait table to a reasonable size.
3535 */ 3535 */
3536 size = min(size, 4096UL); 3536 size = min(size, 4096UL);
3537 3537
3538 return max(size, 4UL); 3538 return max(size, 4UL);
3539 } 3539 }
3540 #else 3540 #else
3541 /* 3541 /*
3542 * A zone's size might be changed by hot-add, so it is not possible to determine 3542 * A zone's size might be changed by hot-add, so it is not possible to determine
3543 * a suitable size for its wait_table. So we use the maximum size now. 3543 * a suitable size for its wait_table. So we use the maximum size now.
3544 * 3544 *
3545 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3545 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3546 * 3546 *
3547 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3547 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3548 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3548 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3549 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3549 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3550 * 3550 *
3551 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 3551 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
3552 * or more by the traditional way. (See above). It equals: 3552 * or more by the traditional way. (See above). It equals:
3553 * 3553 *
3554 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3554 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3555 * ia64(16K page size) : = ( 8G + 4M)byte. 3555 * ia64(16K page size) : = ( 8G + 4M)byte.
3556 * powerpc (64K page size) : = (32G +16M)byte. 3556 * powerpc (64K page size) : = (32G +16M)byte.
3557 */ 3557 */
3558 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3558 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3559 { 3559 {
3560 return 4096UL; 3560 return 4096UL;
3561 } 3561 }
3562 #endif 3562 #endif
3563 3563
3564 /* 3564 /*
3565 * This is an integer logarithm so that shifts can be used later 3565 * This is an integer logarithm so that shifts can be used later
3566 * to extract the more random high bits from the multiplicative 3566 * to extract the more random high bits from the multiplicative
3567 * hash function before the remainder is taken. 3567 * hash function before the remainder is taken.
3568 */ 3568 */
3569 static inline unsigned long wait_table_bits(unsigned long size) 3569 static inline unsigned long wait_table_bits(unsigned long size)
3570 { 3570 {
3571 return ffz(~size); 3571 return ffz(~size);
3572 } 3572 }
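
Taken together, a zone gets the next power of two above pages / PAGES_PER_WAITQUEUE wait queue heads, clamped to the range 4..4096, and wait_table_bits() is simply log2 of that power of two (ffz of its complement). A standalone sketch of the same arithmetic for an assumed 1 GiB zone of 4 KiB pages:

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

/* Same sizing rule as the !CONFIG_MEMORY_HOTPLUG variant above. */
static unsigned long wait_table_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096UL)	/* min(size, 4096UL) */
		size = 4096UL;
	if (size < 4UL)		/* max(size, 4UL) */
		size = 4UL;
	return size;
}

/* ffz(~size) counts the trailing set bits of size: log2 for a power of two. */
static unsigned long wait_table_bits(unsigned long size)
{
	unsigned long bits = 0;

	while (size > 1) {
		size >>= 1;
		bits++;
	}
	return bits;
}

int main(void)
{
	/* Assumed example: a 1 GiB zone of 4 KiB pages = 262144 pages. */
	unsigned long pages = 262144, size = wait_table_entries(pages);

	printf("%lu pages -> %lu waitqueues, %lu hash bits\n",
	       pages, size, wait_table_bits(size));	/* 262144 -> 1024, 10 */
	return 0;
}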
3573 3573
3574 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3574 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3575 3575
3576 /* 3576 /*
3577 * Check if a pageblock contains reserved pages 3577 * Check if a pageblock contains reserved pages
3578 */ 3578 */
3579 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3579 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3580 { 3580 {
3581 unsigned long pfn; 3581 unsigned long pfn;
3582 3582
3583 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3583 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3584 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3584 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3585 return 1; 3585 return 1;
3586 } 3586 }
3587 return 0; 3587 return 0;
3588 } 3588 }
3589 3589
3590 /* 3590 /*
3591 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3591 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3592 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3592 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3593 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3593 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3594 * higher will lead to a bigger reserve which will get freed as contiguous 3594 * higher will lead to a bigger reserve which will get freed as contiguous
3595 * blocks as reclaim kicks in 3595 * blocks as reclaim kicks in
3596 */ 3596 */
3597 static void setup_zone_migrate_reserve(struct zone *zone) 3597 static void setup_zone_migrate_reserve(struct zone *zone)
3598 { 3598 {
3599 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3599 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3600 struct page *page; 3600 struct page *page;
3601 unsigned long block_migratetype; 3601 unsigned long block_migratetype;
3602 int reserve; 3602 int reserve;
3603 3603
3604 /* 3604 /*
3605 * Get the start pfn, end pfn and the number of blocks to reserve 3605 * Get the start pfn, end pfn and the number of blocks to reserve
3606 * We have to be careful to be aligned to pageblock_nr_pages to 3606 * We have to be careful to be aligned to pageblock_nr_pages to
3607 * make sure that we always check pfn_valid for the first page in 3607 * make sure that we always check pfn_valid for the first page in
3608 * the block. 3608 * the block.
3609 */ 3609 */
3610 start_pfn = zone->zone_start_pfn; 3610 start_pfn = zone->zone_start_pfn;
3611 end_pfn = start_pfn + zone->spanned_pages; 3611 end_pfn = start_pfn + zone->spanned_pages;
3612 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3612 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3613 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3613 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3614 pageblock_order; 3614 pageblock_order;
3615 3615
3616 /* 3616 /*
3617 * Reserve blocks are generally in place to help high-order atomic 3617 * Reserve blocks are generally in place to help high-order atomic
3618 * allocations that are short-lived. A min_free_kbytes value that 3618 * allocations that are short-lived. A min_free_kbytes value that
3619 * would result in more than 2 reserve blocks for atomic allocations 3619 * would result in more than 2 reserve blocks for atomic allocations
3620 * is assumed to be in place to help anti-fragmentation for the 3620 * is assumed to be in place to help anti-fragmentation for the
3621 * future allocation of hugepages at runtime. 3621 * future allocation of hugepages at runtime.
3622 */ 3622 */
3623 reserve = min(2, reserve); 3623 reserve = min(2, reserve);
3624 3624
3625 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3625 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3626 if (!pfn_valid(pfn)) 3626 if (!pfn_valid(pfn))
3627 continue; 3627 continue;
3628 page = pfn_to_page(pfn); 3628 page = pfn_to_page(pfn);
3629 3629
3630 /* Watch out for overlapping nodes */ 3630 /* Watch out for overlapping nodes */
3631 if (page_to_nid(page) != zone_to_nid(zone)) 3631 if (page_to_nid(page) != zone_to_nid(zone))
3632 continue; 3632 continue;
3633 3633
3634 block_migratetype = get_pageblock_migratetype(page); 3634 block_migratetype = get_pageblock_migratetype(page);
3635 3635
3636 /* Only test what is necessary when the reserves are not met */ 3636 /* Only test what is necessary when the reserves are not met */
3637 if (reserve > 0) { 3637 if (reserve > 0) {
3638 /* 3638 /*
3639 * Blocks with reserved pages will never free, skip 3639 * Blocks with reserved pages will never free, skip
3640 * them. 3640 * them.
3641 */ 3641 */
3642 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 3642 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3643 if (pageblock_is_reserved(pfn, block_end_pfn)) 3643 if (pageblock_is_reserved(pfn, block_end_pfn))
3644 continue; 3644 continue;
3645 3645
3646 /* If this block is reserved, account for it */ 3646 /* If this block is reserved, account for it */
3647 if (block_migratetype == MIGRATE_RESERVE) { 3647 if (block_migratetype == MIGRATE_RESERVE) {
3648 reserve--; 3648 reserve--;
3649 continue; 3649 continue;
3650 } 3650 }
3651 3651
3652 /* Suitable for reserving if this block is movable */ 3652 /* Suitable for reserving if this block is movable */
3653 if (block_migratetype == MIGRATE_MOVABLE) { 3653 if (block_migratetype == MIGRATE_MOVABLE) {
3654 set_pageblock_migratetype(page, 3654 set_pageblock_migratetype(page,
3655 MIGRATE_RESERVE); 3655 MIGRATE_RESERVE);
3656 move_freepages_block(zone, page, 3656 move_freepages_block(zone, page,
3657 MIGRATE_RESERVE); 3657 MIGRATE_RESERVE);
3658 reserve--; 3658 reserve--;
3659 continue; 3659 continue;
3660 } 3660 }
3661 } 3661 }
3662 3662
3663 /* 3663 /*
3664 * If the reserve is met and this is a previous reserved block, 3664 * If the reserve is met and this is a previous reserved block,
3665 * take it back 3665 * take it back
3666 */ 3666 */
3667 if (block_migratetype == MIGRATE_RESERVE) { 3667 if (block_migratetype == MIGRATE_RESERVE) {
3668 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3668 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3669 move_freepages_block(zone, page, MIGRATE_MOVABLE); 3669 move_freepages_block(zone, page, MIGRATE_MOVABLE);
3670 } 3670 }
3671 } 3671 }
3672 } 3672 }
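
The reserve size works out to min_wmark_pages(zone) rounded up to whole pageblocks and then capped at two blocks. A small standalone sketch of that calculation, assuming 2 MiB pageblocks (pageblock_order 9) and a zone whose min watermark is 5632 pages; both numbers are illustrative, not taken from the diff:

#include <stdio.h>

/* Assumed x86-64-style values. */
#define PAGEBLOCK_ORDER    9
#define PAGEBLOCK_NR_PAGES (1UL << PAGEBLOCK_ORDER)	/* 512 pages = 2 MiB */

static unsigned long roundup_pages(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

int main(void)
{
	unsigned long min_wmark = 5632;	/* assumed zone min watermark, in pages */
	unsigned long reserve;

	reserve = roundup_pages(min_wmark, PAGEBLOCK_NR_PAGES) >> PAGEBLOCK_ORDER;
	if (reserve > 2)	/* never reserve more than 2 blocks */
		reserve = 2;
	printf("MIGRATE_RESERVE pageblocks: %lu\n", reserve);	/* 2 */
	return 0;
}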
3673 3673
3674 /* 3674 /*
3675 * Initially all pages are reserved - free ones are freed 3675 * Initially all pages are reserved - free ones are freed
3676 * up by free_all_bootmem() once the early boot process is 3676 * up by free_all_bootmem() once the early boot process is
3677 * done. Non-atomic initialization, single-pass. 3677 * done. Non-atomic initialization, single-pass.
3678 */ 3678 */
3679 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 3679 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3680 unsigned long start_pfn, enum memmap_context context) 3680 unsigned long start_pfn, enum memmap_context context)
3681 { 3681 {
3682 struct page *page; 3682 struct page *page;
3683 unsigned long end_pfn = start_pfn + size; 3683 unsigned long end_pfn = start_pfn + size;
3684 unsigned long pfn; 3684 unsigned long pfn;
3685 struct zone *z; 3685 struct zone *z;
3686 3686
3687 if (highest_memmap_pfn < end_pfn - 1) 3687 if (highest_memmap_pfn < end_pfn - 1)
3688 highest_memmap_pfn = end_pfn - 1; 3688 highest_memmap_pfn = end_pfn - 1;
3689 3689
3690 z = &NODE_DATA(nid)->node_zones[zone]; 3690 z = &NODE_DATA(nid)->node_zones[zone];
3691 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3691 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3692 /* 3692 /*
3693 * There can be holes in boot-time mem_map[]s 3693 * There can be holes in boot-time mem_map[]s
3694 * handed to this function. They do not 3694 * handed to this function. They do not
3695 * exist on hotplugged memory. 3695 * exist on hotplugged memory.
3696 */ 3696 */
3697 if (context == MEMMAP_EARLY) { 3697 if (context == MEMMAP_EARLY) {
3698 if (!early_pfn_valid(pfn)) 3698 if (!early_pfn_valid(pfn))
3699 continue; 3699 continue;
3700 if (!early_pfn_in_nid(pfn, nid)) 3700 if (!early_pfn_in_nid(pfn, nid))
3701 continue; 3701 continue;
3702 } 3702 }
3703 page = pfn_to_page(pfn); 3703 page = pfn_to_page(pfn);
3704 set_page_links(page, zone, nid, pfn); 3704 set_page_links(page, zone, nid, pfn);
3705 mminit_verify_page_links(page, zone, nid, pfn); 3705 mminit_verify_page_links(page, zone, nid, pfn);
3706 init_page_count(page); 3706 init_page_count(page);
3707 reset_page_mapcount(page); 3707 reset_page_mapcount(page);
3708 SetPageReserved(page); 3708 SetPageReserved(page);
3709 /* 3709 /*
3710 * Mark the block movable so that blocks are reserved for 3710 * Mark the block movable so that blocks are reserved for
3711 * movable at startup. This will force kernel allocations 3711 * movable at startup. This will force kernel allocations
3712 * to reserve their blocks rather than leaking throughout 3712 * to reserve their blocks rather than leaking throughout
3713 * the address space during boot when many long-lived 3713 * the address space during boot when many long-lived
3714 * kernel allocations are made. Later some blocks near 3714 * kernel allocations are made. Later some blocks near
3715 * the start are marked MIGRATE_RESERVE by 3715 * the start are marked MIGRATE_RESERVE by
3716 * setup_zone_migrate_reserve() 3716 * setup_zone_migrate_reserve()
3717 * 3717 *
3718 * The bitmap is created for the zone's valid pfn range, but the 3718 * The bitmap is created for the zone's valid pfn range, but the
3719 * memmap may also cover invalid pages (for alignment), so 3719 * memmap may also cover invalid pages (for alignment), so
3720 * check here that set_pageblock_migratetype() is not called 3720 * check here that set_pageblock_migratetype() is not called
3721 * for a pfn outside the zone. 3721 * for a pfn outside the zone.
3722 */ 3722 */
3723 if ((z->zone_start_pfn <= pfn) 3723 if ((z->zone_start_pfn <= pfn)
3724 && (pfn < z->zone_start_pfn + z->spanned_pages) 3724 && (pfn < z->zone_start_pfn + z->spanned_pages)
3725 && !(pfn & (pageblock_nr_pages - 1))) 3725 && !(pfn & (pageblock_nr_pages - 1)))
3726 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3726 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3727 3727
3728 INIT_LIST_HEAD(&page->lru); 3728 INIT_LIST_HEAD(&page->lru);
3729 #ifdef WANT_PAGE_VIRTUAL 3729 #ifdef WANT_PAGE_VIRTUAL
3730 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 3730 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
3731 if (!is_highmem_idx(zone)) 3731 if (!is_highmem_idx(zone))
3732 set_page_address(page, __va(pfn << PAGE_SHIFT)); 3732 set_page_address(page, __va(pfn << PAGE_SHIFT));
3733 #endif 3733 #endif
3734 } 3734 }
3735 } 3735 }
3736 3736
3737 static void __meminit zone_init_free_lists(struct zone *zone) 3737 static void __meminit zone_init_free_lists(struct zone *zone)
3738 { 3738 {
3739 int order, t; 3739 int order, t;
3740 for_each_migratetype_order(order, t) { 3740 for_each_migratetype_order(order, t) {
3741 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 3741 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
3742 zone->free_area[order].nr_free = 0; 3742 zone->free_area[order].nr_free = 0;
3743 } 3743 }
3744 } 3744 }
3745 3745
3746 #ifndef __HAVE_ARCH_MEMMAP_INIT 3746 #ifndef __HAVE_ARCH_MEMMAP_INIT
3747 #define memmap_init(size, nid, zone, start_pfn) \ 3747 #define memmap_init(size, nid, zone, start_pfn) \
3748 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 3748 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3749 #endif 3749 #endif
3750 3750
3751 static int zone_batchsize(struct zone *zone) 3751 static int zone_batchsize(struct zone *zone)
3752 { 3752 {
3753 #ifdef CONFIG_MMU 3753 #ifdef CONFIG_MMU
3754 int batch; 3754 int batch;
3755 3755
3756 /* 3756 /*
3757 * The per-cpu-pages pools are set to around 1000th of the 3757 * The per-cpu-pages pools are set to around 1000th of the
3758 * size of the zone. But no more than 1/2 of a meg. 3758 * size of the zone. But no more than 1/2 of a meg.
3759 * 3759 *
3760 * OK, so we don't know how big the cache is. So guess. 3760 * OK, so we don't know how big the cache is. So guess.
3761 */ 3761 */
3762 batch = zone->present_pages / 1024; 3762 batch = zone->present_pages / 1024;
3763 if (batch * PAGE_SIZE > 512 * 1024) 3763 if (batch * PAGE_SIZE > 512 * 1024)
3764 batch = (512 * 1024) / PAGE_SIZE; 3764 batch = (512 * 1024) / PAGE_SIZE;
3765 batch /= 4; /* We effectively *= 4 below */ 3765 batch /= 4; /* We effectively *= 4 below */
3766 if (batch < 1) 3766 if (batch < 1)
3767 batch = 1; 3767 batch = 1;
3768 3768
3769 /* 3769 /*
3770 * Clamp the batch to a 2^n - 1 value. Having a power 3770 * Clamp the batch to a 2^n - 1 value. Having a power
3771 * of 2 value was found to be more likely to have 3771 * of 2 value was found to be more likely to have
3772 * suboptimal cache aliasing properties in some cases. 3772 * suboptimal cache aliasing properties in some cases.
3773 * 3773 *
3774 * For example if 2 tasks are alternately allocating 3774 * For example if 2 tasks are alternately allocating
3775 * batches of pages, one task can end up with a lot 3775 * batches of pages, one task can end up with a lot
3776 * of pages of one half of the possible page colors 3776 * of pages of one half of the possible page colors
3777 * and the other with pages of the other colors. 3777 * and the other with pages of the other colors.
3778 */ 3778 */
3779 batch = rounddown_pow_of_two(batch + batch/2) - 1; 3779 batch = rounddown_pow_of_two(batch + batch/2) - 1;
3780 3780
3781 return batch; 3781 return batch;
3782 3782
3783 #else 3783 #else
3784 /* The deferral and batching of frees should be suppressed under NOMMU 3784 /* The deferral and batching of frees should be suppressed under NOMMU
3785 * conditions. 3785 * conditions.
3786 * 3786 *
3787 * The problem is that NOMMU needs to be able to allocate large chunks 3787 * The problem is that NOMMU needs to be able to allocate large chunks
3788 * of contiguous memory as there's no hardware page translation to 3788 * of contiguous memory as there's no hardware page translation to
3789 * assemble apparent contiguous memory from discontiguous pages. 3789 * assemble apparent contiguous memory from discontiguous pages.
3790 * 3790 *
3791 * Queueing large contiguous runs of pages for batching, however, 3791 * Queueing large contiguous runs of pages for batching, however,
3792 * causes the pages to actually be freed in smaller chunks. As there 3792 * causes the pages to actually be freed in smaller chunks. As there
3793 * can be a significant delay between the individual batches being 3793 * can be a significant delay between the individual batches being
3794 * recycled, this leads to the once large chunks of space being 3794 * recycled, this leads to the once large chunks of space being
3795 * fragmented and becoming unavailable for high-order allocations. 3795 * fragmented and becoming unavailable for high-order allocations.
3796 */ 3796 */
3797 return 0; 3797 return 0;
3798 #endif 3798 #endif
3799 } 3799 }
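
To make the arithmetic above concrete, here is a standalone user-space sketch of the same batch calculation; the 4 KiB page size and 1 GiB zone are assumed values for illustration, and rounddown_pow_of_two() is re-implemented locally rather than taken from the kernel:

#include <stdio.h>

/* Local stand-in for the kernel's rounddown_pow_of_two(). */
static unsigned long rounddown_pow_of_two(unsigned long n)
{
        unsigned long p = 1;

        while (p * 2 <= n)
                p *= 2;
        return p;
}

int main(void)
{
        unsigned long page_size = 4096;         /* assumed PAGE_SIZE */
        unsigned long present_pages = 262144;   /* assumed 1 GiB zone */
        unsigned long batch;

        batch = present_pages / 1024;           /* 256 pages, ~1 MiB */
        if (batch * page_size > 512 * 1024)
                batch = (512 * 1024) / page_size;       /* capped at 128 */
        batch /= 4;                             /* 32 */
        if (batch < 1)
                batch = 1;

        /* 32 + 16 = 48, rounded down to 32, minus 1 -> 31 */
        batch = rounddown_pow_of_two(batch + batch / 2) - 1;

        printf("per-cpu batch: %lu pages\n", batch);    /* prints 31 */
        return 0;
}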
3800 3800
3801 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 3801 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
3802 { 3802 {
3803 struct per_cpu_pages *pcp; 3803 struct per_cpu_pages *pcp;
3804 int migratetype; 3804 int migratetype;
3805 3805
3806 memset(p, 0, sizeof(*p)); 3806 memset(p, 0, sizeof(*p));
3807 3807
3808 pcp = &p->pcp; 3808 pcp = &p->pcp;
3809 pcp->count = 0; 3809 pcp->count = 0;
3810 pcp->high = 6 * batch; 3810 pcp->high = 6 * batch;
3811 pcp->batch = max(1UL, 1 * batch); 3811 pcp->batch = max(1UL, 1 * batch);
3812 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 3812 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
3813 INIT_LIST_HEAD(&pcp->lists[migratetype]); 3813 INIT_LIST_HEAD(&pcp->lists[migratetype]);
3814 } 3814 }
3815 3815
3816 /* 3816 /*
3817 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 3817 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
3818 * to the value high for the pageset p. 3818 * to the value high for the pageset p.
3819 */ 3819 */
3820 3820
3821 static void setup_pagelist_highmark(struct per_cpu_pageset *p, 3821 static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3822 unsigned long high) 3822 unsigned long high)
3823 { 3823 {
3824 struct per_cpu_pages *pcp; 3824 struct per_cpu_pages *pcp;
3825 3825
3826 pcp = &p->pcp; 3826 pcp = &p->pcp;
3827 pcp->high = high; 3827 pcp->high = high;
3828 pcp->batch = max(1UL, high/4); 3828 pcp->batch = max(1UL, high/4);
3829 if ((high/4) > (PAGE_SHIFT * 8)) 3829 if ((high/4) > (PAGE_SHIFT * 8))
3830 pcp->batch = PAGE_SHIFT * 8; 3830 pcp->batch = PAGE_SHIFT * 8;
3831 } 3831 }
3832 3832
3833 static void setup_zone_pageset(struct zone *zone) 3833 static void setup_zone_pageset(struct zone *zone)
3834 { 3834 {
3835 int cpu; 3835 int cpu;
3836 3836
3837 zone->pageset = alloc_percpu(struct per_cpu_pageset); 3837 zone->pageset = alloc_percpu(struct per_cpu_pageset);
3838 3838
3839 for_each_possible_cpu(cpu) { 3839 for_each_possible_cpu(cpu) {
3840 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 3840 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3841 3841
3842 setup_pageset(pcp, zone_batchsize(zone)); 3842 setup_pageset(pcp, zone_batchsize(zone));
3843 3843
3844 if (percpu_pagelist_fraction) 3844 if (percpu_pagelist_fraction)
3845 setup_pagelist_highmark(pcp, 3845 setup_pagelist_highmark(pcp,
3846 (zone->present_pages / 3846 (zone->present_pages /
3847 percpu_pagelist_fraction)); 3847 percpu_pagelist_fraction));
3848 } 3848 }
3849 } 3849 }
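
When the percpu_pagelist_fraction sysctl is non-zero, the call above overrides the defaults through setup_pagelist_highmark(); a rough sketch of the resulting per-cpu limits, using assumed values (1 GiB zone, 4 KiB pages, a fraction of 8):

#include <stdio.h>

int main(void)
{
        unsigned long page_shift = 12;          /* assumed PAGE_SHIFT */
        unsigned long present_pages = 262144;   /* assumed 1 GiB zone */
        unsigned long fraction = 8;             /* assumed percpu_pagelist_fraction */
        unsigned long high, batch;

        high = present_pages / fraction;        /* keep at most 32768 pages per CPU */
        batch = high / 4;                       /* 8192 */
        if (batch < 1)
                batch = 1;
        if (high / 4 > page_shift * 8)
                batch = page_shift * 8;         /* capped at 96 pages per refill */

        printf("pcp high=%lu batch=%lu\n", high, batch);        /* 32768 / 96 */
        return 0;
}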
3850 3850
3851 /* 3851 /*
3852 * Allocate per cpu pagesets and initialize them. 3852 * Allocate per cpu pagesets and initialize them.
3853 * Before this call only boot pagesets were available. 3853 * Before this call only boot pagesets were available.
3854 */ 3854 */
3855 void __init setup_per_cpu_pageset(void) 3855 void __init setup_per_cpu_pageset(void)
3856 { 3856 {
3857 struct zone *zone; 3857 struct zone *zone;
3858 3858
3859 for_each_populated_zone(zone) 3859 for_each_populated_zone(zone)
3860 setup_zone_pageset(zone); 3860 setup_zone_pageset(zone);
3861 } 3861 }
3862 3862
3863 static noinline __init_refok 3863 static noinline __init_refok
3864 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 3864 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3865 { 3865 {
3866 int i; 3866 int i;
3867 struct pglist_data *pgdat = zone->zone_pgdat; 3867 struct pglist_data *pgdat = zone->zone_pgdat;
3868 size_t alloc_size; 3868 size_t alloc_size;
3869 3869
3870 /* 3870 /*
3871 * The per-page waitqueue mechanism uses hashed waitqueues 3871 * The per-page waitqueue mechanism uses hashed waitqueues
3872 * per zone. 3872 * per zone.
3873 */ 3873 */
3874 zone->wait_table_hash_nr_entries = 3874 zone->wait_table_hash_nr_entries =
3875 wait_table_hash_nr_entries(zone_size_pages); 3875 wait_table_hash_nr_entries(zone_size_pages);
3876 zone->wait_table_bits = 3876 zone->wait_table_bits =
3877 wait_table_bits(zone->wait_table_hash_nr_entries); 3877 wait_table_bits(zone->wait_table_hash_nr_entries);
3878 alloc_size = zone->wait_table_hash_nr_entries 3878 alloc_size = zone->wait_table_hash_nr_entries
3879 * sizeof(wait_queue_head_t); 3879 * sizeof(wait_queue_head_t);
3880 3880
3881 if (!slab_is_available()) { 3881 if (!slab_is_available()) {
3882 zone->wait_table = (wait_queue_head_t *) 3882 zone->wait_table = (wait_queue_head_t *)
3883 alloc_bootmem_node_nopanic(pgdat, alloc_size); 3883 alloc_bootmem_node_nopanic(pgdat, alloc_size);
3884 } else { 3884 } else {
3885 /* 3885 /*
3886 * This case means that a zone whose size was 0 gets new memory 3886 * This case means that a zone whose size was 0 gets new memory
3887 * via memory hot-add. 3887 * via memory hot-add.
3888 * But it may be the case that a new node was hot-added. In 3888 * But it may be the case that a new node was hot-added. In
3889 * this case vmalloc() will not be able to use this new node's 3889 * this case vmalloc() will not be able to use this new node's
3890 * memory - this wait_table must be initialized to use this new 3890 * memory - this wait_table must be initialized to use this new
3891 * node itself as well. 3891 * node itself as well.
3892 * To use this new node's memory, further consideration will be 3892 * To use this new node's memory, further consideration will be
3893 * necessary. 3893 * necessary.
3894 */ 3894 */
3895 zone->wait_table = vmalloc(alloc_size); 3895 zone->wait_table = vmalloc(alloc_size);
3896 } 3896 }
3897 if (!zone->wait_table) 3897 if (!zone->wait_table)
3898 return -ENOMEM; 3898 return -ENOMEM;
3899 3899
3900 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 3900 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
3901 init_waitqueue_head(zone->wait_table + i); 3901 init_waitqueue_head(zone->wait_table + i);
3902 3902
3903 return 0; 3903 return 0;
3904 } 3904 }
3905 3905
3906 static int __zone_pcp_update(void *data) 3906 static int __zone_pcp_update(void *data)
3907 { 3907 {
3908 struct zone *zone = data; 3908 struct zone *zone = data;
3909 int cpu; 3909 int cpu;
3910 unsigned long batch = zone_batchsize(zone), flags; 3910 unsigned long batch = zone_batchsize(zone), flags;
3911 3911
3912 for_each_possible_cpu(cpu) { 3912 for_each_possible_cpu(cpu) {
3913 struct per_cpu_pageset *pset; 3913 struct per_cpu_pageset *pset;
3914 struct per_cpu_pages *pcp; 3914 struct per_cpu_pages *pcp;
3915 3915
3916 pset = per_cpu_ptr(zone->pageset, cpu); 3916 pset = per_cpu_ptr(zone->pageset, cpu);
3917 pcp = &pset->pcp; 3917 pcp = &pset->pcp;
3918 3918
3919 local_irq_save(flags); 3919 local_irq_save(flags);
3920 if (pcp->count > 0) 3920 if (pcp->count > 0)
3921 free_pcppages_bulk(zone, pcp->count, pcp); 3921 free_pcppages_bulk(zone, pcp->count, pcp);
3922 setup_pageset(pset, batch); 3922 setup_pageset(pset, batch);
3923 local_irq_restore(flags); 3923 local_irq_restore(flags);
3924 } 3924 }
3925 return 0; 3925 return 0;
3926 } 3926 }
3927 3927
3928 void zone_pcp_update(struct zone *zone) 3928 void zone_pcp_update(struct zone *zone)
3929 { 3929 {
3930 stop_machine(__zone_pcp_update, zone, NULL); 3930 stop_machine(__zone_pcp_update, zone, NULL);
3931 } 3931 }
3932 3932
3933 static __meminit void zone_pcp_init(struct zone *zone) 3933 static __meminit void zone_pcp_init(struct zone *zone)
3934 { 3934 {
3935 /* 3935 /*
3936 * per cpu subsystem is not up at this point. The following code 3936 * per cpu subsystem is not up at this point. The following code
3937 * relies on the ability of the linker to provide the 3937 * relies on the ability of the linker to provide the
3938 * offset of a (static) per cpu variable into the per cpu area. 3938 * offset of a (static) per cpu variable into the per cpu area.
3939 */ 3939 */
3940 zone->pageset = &boot_pageset; 3940 zone->pageset = &boot_pageset;
3941 3941
3942 if (zone->present_pages) 3942 if (zone->present_pages)
3943 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 3943 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
3944 zone->name, zone->present_pages, 3944 zone->name, zone->present_pages,
3945 zone_batchsize(zone)); 3945 zone_batchsize(zone));
3946 } 3946 }
3947 3947
3948 __meminit int init_currently_empty_zone(struct zone *zone, 3948 __meminit int init_currently_empty_zone(struct zone *zone,
3949 unsigned long zone_start_pfn, 3949 unsigned long zone_start_pfn,
3950 unsigned long size, 3950 unsigned long size,
3951 enum memmap_context context) 3951 enum memmap_context context)
3952 { 3952 {
3953 struct pglist_data *pgdat = zone->zone_pgdat; 3953 struct pglist_data *pgdat = zone->zone_pgdat;
3954 int ret; 3954 int ret;
3955 ret = zone_wait_table_init(zone, size); 3955 ret = zone_wait_table_init(zone, size);
3956 if (ret) 3956 if (ret)
3957 return ret; 3957 return ret;
3958 pgdat->nr_zones = zone_idx(zone) + 1; 3958 pgdat->nr_zones = zone_idx(zone) + 1;
3959 3959
3960 zone->zone_start_pfn = zone_start_pfn; 3960 zone->zone_start_pfn = zone_start_pfn;
3961 3961
3962 mminit_dprintk(MMINIT_TRACE, "memmap_init", 3962 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3963 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 3963 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
3964 pgdat->node_id, 3964 pgdat->node_id,
3965 (unsigned long)zone_idx(zone), 3965 (unsigned long)zone_idx(zone),
3966 zone_start_pfn, (zone_start_pfn + size)); 3966 zone_start_pfn, (zone_start_pfn + size));
3967 3967
3968 zone_init_free_lists(zone); 3968 zone_init_free_lists(zone);
3969 3969
3970 return 0; 3970 return 0;
3971 } 3971 }
3972 3972
3973 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 3973 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
3974 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 3974 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
3975 /* 3975 /*
3976 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 3976 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
3977 * Architectures may implement their own version but if add_active_range() 3977 * Architectures may implement their own version but if add_active_range()
3978 * was used and there are no special requirements, this is a convenient 3978 * was used and there are no special requirements, this is a convenient
3979 * alternative 3979 * alternative
3980 */ 3980 */
3981 int __meminit __early_pfn_to_nid(unsigned long pfn) 3981 int __meminit __early_pfn_to_nid(unsigned long pfn)
3982 { 3982 {
3983 unsigned long start_pfn, end_pfn; 3983 unsigned long start_pfn, end_pfn;
3984 int i, nid; 3984 int i, nid;
3985 3985
3986 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 3986 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
3987 if (start_pfn <= pfn && pfn < end_pfn) 3987 if (start_pfn <= pfn && pfn < end_pfn)
3988 return nid; 3988 return nid;
3989 /* This is a memory hole */ 3989 /* This is a memory hole */
3990 return -1; 3990 return -1;
3991 } 3991 }
3992 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 3992 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
3993 3993
3994 int __meminit early_pfn_to_nid(unsigned long pfn) 3994 int __meminit early_pfn_to_nid(unsigned long pfn)
3995 { 3995 {
3996 int nid; 3996 int nid;
3997 3997
3998 nid = __early_pfn_to_nid(pfn); 3998 nid = __early_pfn_to_nid(pfn);
3999 if (nid >= 0) 3999 if (nid >= 0)
4000 return nid; 4000 return nid;
4001 /* just returns 0 */ 4001 /* just returns 0 */
4002 return 0; 4002 return 0;
4003 } 4003 }
4004 4004
4005 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4005 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4006 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4006 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4007 { 4007 {
4008 int nid; 4008 int nid;
4009 4009
4010 nid = __early_pfn_to_nid(pfn); 4010 nid = __early_pfn_to_nid(pfn);
4011 if (nid >= 0 && nid != node) 4011 if (nid >= 0 && nid != node)
4012 return false; 4012 return false;
4013 return true; 4013 return true;
4014 } 4014 }
4015 #endif 4015 #endif
4016 4016
4017 /** 4017 /**
4018 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4018 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4019 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4019 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4020 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4020 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4021 * 4021 *
4022 * If an architecture guarantees that all ranges registered with 4022 * If an architecture guarantees that all ranges registered with
4023 * add_active_ranges() contain no holes and may be freed, 4023 * add_active_ranges() contain no holes and may be freed,
4024 * this function may be used instead of calling free_bootmem() manually. 4024 * this function may be used instead of calling free_bootmem() manually.
4025 */ 4025 */
4026 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4026 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4027 { 4027 {
4028 unsigned long start_pfn, end_pfn; 4028 unsigned long start_pfn, end_pfn;
4029 int i, this_nid; 4029 int i, this_nid;
4030 4030
4031 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4031 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4032 start_pfn = min(start_pfn, max_low_pfn); 4032 start_pfn = min(start_pfn, max_low_pfn);
4033 end_pfn = min(end_pfn, max_low_pfn); 4033 end_pfn = min(end_pfn, max_low_pfn);
4034 4034
4035 if (start_pfn < end_pfn) 4035 if (start_pfn < end_pfn)
4036 free_bootmem_node(NODE_DATA(this_nid), 4036 free_bootmem_node(NODE_DATA(this_nid),
4037 PFN_PHYS(start_pfn), 4037 PFN_PHYS(start_pfn),
4038 (end_pfn - start_pfn) << PAGE_SHIFT); 4038 (end_pfn - start_pfn) << PAGE_SHIFT);
4039 } 4039 }
4040 } 4040 }
4041 4041
4042 /** 4042 /**
4043 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4043 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4044 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4044 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4045 * 4045 *
4046 * If an architecture guarantees that all ranges registered with 4046 * If an architecture guarantees that all ranges registered with
4047 * add_active_ranges() contain no holes and may be freed, this 4047 * add_active_ranges() contain no holes and may be freed, this
4048 * function may be used instead of calling memory_present() manually. 4048 * function may be used instead of calling memory_present() manually.
4049 */ 4049 */
4050 void __init sparse_memory_present_with_active_regions(int nid) 4050 void __init sparse_memory_present_with_active_regions(int nid)
4051 { 4051 {
4052 unsigned long start_pfn, end_pfn; 4052 unsigned long start_pfn, end_pfn;
4053 int i, this_nid; 4053 int i, this_nid;
4054 4054
4055 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4055 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4056 memory_present(this_nid, start_pfn, end_pfn); 4056 memory_present(this_nid, start_pfn, end_pfn);
4057 } 4057 }
4058 4058
4059 /** 4059 /**
4060 * get_pfn_range_for_nid - Return the start and end page frames for a node 4060 * get_pfn_range_for_nid - Return the start and end page frames for a node
4061 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4061 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4062 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4062 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4063 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4063 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4064 * 4064 *
4065 * It returns the start and end page frame of a node based on information 4065 * It returns the start and end page frame of a node based on information
4066 * provided by an arch calling add_active_range(). If called for a node 4066 * provided by an arch calling add_active_range(). If called for a node
4067 * with no available memory, a warning is printed and the start and end 4067 * with no available memory, a warning is printed and the start and end
4068 * PFNs will be 0. 4068 * PFNs will be 0.
4069 */ 4069 */
4070 void __meminit get_pfn_range_for_nid(unsigned int nid, 4070 void __meminit get_pfn_range_for_nid(unsigned int nid,
4071 unsigned long *start_pfn, unsigned long *end_pfn) 4071 unsigned long *start_pfn, unsigned long *end_pfn)
4072 { 4072 {
4073 unsigned long this_start_pfn, this_end_pfn; 4073 unsigned long this_start_pfn, this_end_pfn;
4074 int i; 4074 int i;
4075 4075
4076 *start_pfn = -1UL; 4076 *start_pfn = -1UL;
4077 *end_pfn = 0; 4077 *end_pfn = 0;
4078 4078
4079 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4079 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4080 *start_pfn = min(*start_pfn, this_start_pfn); 4080 *start_pfn = min(*start_pfn, this_start_pfn);
4081 *end_pfn = max(*end_pfn, this_end_pfn); 4081 *end_pfn = max(*end_pfn, this_end_pfn);
4082 } 4082 }
4083 4083
4084 if (*start_pfn == -1UL) 4084 if (*start_pfn == -1UL)
4085 *start_pfn = 0; 4085 *start_pfn = 0;
4086 } 4086 }
4087 4087
4088 /* 4088 /*
4089 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4089 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4090 * assumption is made that zones within a node are ordered in monotonic 4090 * assumption is made that zones within a node are ordered in monotonic
4091 * increasing memory addresses so that the "highest" populated zone is used 4091 * increasing memory addresses so that the "highest" populated zone is used
4092 */ 4092 */
4093 static void __init find_usable_zone_for_movable(void) 4093 static void __init find_usable_zone_for_movable(void)
4094 { 4094 {
4095 int zone_index; 4095 int zone_index;
4096 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4096 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4097 if (zone_index == ZONE_MOVABLE) 4097 if (zone_index == ZONE_MOVABLE)
4098 continue; 4098 continue;
4099 4099
4100 if (arch_zone_highest_possible_pfn[zone_index] > 4100 if (arch_zone_highest_possible_pfn[zone_index] >
4101 arch_zone_lowest_possible_pfn[zone_index]) 4101 arch_zone_lowest_possible_pfn[zone_index])
4102 break; 4102 break;
4103 } 4103 }
4104 4104
4105 VM_BUG_ON(zone_index == -1); 4105 VM_BUG_ON(zone_index == -1);
4106 movable_zone = zone_index; 4106 movable_zone = zone_index;
4107 } 4107 }
4108 4108
4109 /* 4109 /*
4110 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4110 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4111 * because it is sized independent of architecture. Unlike the other zones, 4111 * because it is sized independent of architecture. Unlike the other zones,
4112 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4112 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4113 * in each node depending on the size of each node and how evenly kernelcore 4113 * in each node depending on the size of each node and how evenly kernelcore
4114 * is distributed. This helper function adjusts the zone ranges 4114 * is distributed. This helper function adjusts the zone ranges
4115 * provided by the architecture for a given node by using the end of the 4115 * provided by the architecture for a given node by using the end of the
4116 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4116 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4117 * zones within a node are in order of monotonically increasing memory addresses 4117 * zones within a node are in order of monotonically increasing memory addresses
4118 */ 4118 */
4119 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4119 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4120 unsigned long zone_type, 4120 unsigned long zone_type,
4121 unsigned long node_start_pfn, 4121 unsigned long node_start_pfn,
4122 unsigned long node_end_pfn, 4122 unsigned long node_end_pfn,
4123 unsigned long *zone_start_pfn, 4123 unsigned long *zone_start_pfn,
4124 unsigned long *zone_end_pfn) 4124 unsigned long *zone_end_pfn)
4125 { 4125 {
4126 /* Only adjust if ZONE_MOVABLE is on this node */ 4126 /* Only adjust if ZONE_MOVABLE is on this node */
4127 if (zone_movable_pfn[nid]) { 4127 if (zone_movable_pfn[nid]) {
4128 /* Size ZONE_MOVABLE */ 4128 /* Size ZONE_MOVABLE */
4129 if (zone_type == ZONE_MOVABLE) { 4129 if (zone_type == ZONE_MOVABLE) {
4130 *zone_start_pfn = zone_movable_pfn[nid]; 4130 *zone_start_pfn = zone_movable_pfn[nid];
4131 *zone_end_pfn = min(node_end_pfn, 4131 *zone_end_pfn = min(node_end_pfn,
4132 arch_zone_highest_possible_pfn[movable_zone]); 4132 arch_zone_highest_possible_pfn[movable_zone]);
4133 4133
4134 /* Adjust for ZONE_MOVABLE starting within this range */ 4134 /* Adjust for ZONE_MOVABLE starting within this range */
4135 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4135 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4136 *zone_end_pfn > zone_movable_pfn[nid]) { 4136 *zone_end_pfn > zone_movable_pfn[nid]) {
4137 *zone_end_pfn = zone_movable_pfn[nid]; 4137 *zone_end_pfn = zone_movable_pfn[nid];
4138 4138
4139 /* Check if this whole range is within ZONE_MOVABLE */ 4139 /* Check if this whole range is within ZONE_MOVABLE */
4140 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4140 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4141 *zone_start_pfn = *zone_end_pfn; 4141 *zone_start_pfn = *zone_end_pfn;
4142 } 4142 }
4143 } 4143 }
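
A self-contained sketch of the two non-ZONE_MOVABLE branches above, with the kernel globals replaced by a plain movable_start parameter; all pfn values are made up for illustration:

#include <stdio.h>

static void adjust_for_movable(unsigned long movable_start,
                               unsigned long *zone_start_pfn,
                               unsigned long *zone_end_pfn)
{
        if (!movable_start)
                return;         /* no ZONE_MOVABLE configured on this node */

        if (*zone_start_pfn < movable_start && *zone_end_pfn > movable_start)
                *zone_end_pfn = movable_start;          /* ZONE_MOVABLE starts inside */
        else if (*zone_start_pfn >= movable_start)
                *zone_start_pfn = *zone_end_pfn;        /* zone is entirely movable */
}

int main(void)
{
        unsigned long start = 0x10000, end = 0x40000;   /* hypothetical zone */

        adjust_for_movable(0x30000, &start, &end);
        printf("zone now spans %#lx-%#lx\n", start, end);       /* 0x10000-0x30000 */
        return 0;
}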
4144 4144
4145 /* 4145 /*
4146 * Return the number of pages a zone spans in a node, including holes 4146 * Return the number of pages a zone spans in a node, including holes
4147 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4147 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4148 */ 4148 */
4149 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4149 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4150 unsigned long zone_type, 4150 unsigned long zone_type,
4151 unsigned long *ignored) 4151 unsigned long *ignored)
4152 { 4152 {
4153 unsigned long node_start_pfn, node_end_pfn; 4153 unsigned long node_start_pfn, node_end_pfn;
4154 unsigned long zone_start_pfn, zone_end_pfn; 4154 unsigned long zone_start_pfn, zone_end_pfn;
4155 4155
4156 /* Get the start and end of the node and zone */ 4156 /* Get the start and end of the node and zone */
4157 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 4157 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4158 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4158 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4159 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4159 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4160 adjust_zone_range_for_zone_movable(nid, zone_type, 4160 adjust_zone_range_for_zone_movable(nid, zone_type,
4161 node_start_pfn, node_end_pfn, 4161 node_start_pfn, node_end_pfn,
4162 &zone_start_pfn, &zone_end_pfn); 4162 &zone_start_pfn, &zone_end_pfn);
4163 4163
4164 /* Check that this node has pages within the zone's required range */ 4164 /* Check that this node has pages within the zone's required range */
4165 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4165 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4166 return 0; 4166 return 0;
4167 4167
4168 /* Move the zone boundaries inside the node if necessary */ 4168 /* Move the zone boundaries inside the node if necessary */
4169 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4169 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4170 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4170 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4171 4171
4172 /* Return the spanned pages */ 4172 /* Return the spanned pages */
4173 return zone_end_pfn - zone_start_pfn; 4173 return zone_end_pfn - zone_start_pfn;
4174 } 4174 }
4175 4175
4176 /* 4176 /*
4177 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4177 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4178 * then all holes in the requested range will be accounted for. 4178 * then all holes in the requested range will be accounted for.
4179 */ 4179 */
4180 unsigned long __meminit __absent_pages_in_range(int nid, 4180 unsigned long __meminit __absent_pages_in_range(int nid,
4181 unsigned long range_start_pfn, 4181 unsigned long range_start_pfn,
4182 unsigned long range_end_pfn) 4182 unsigned long range_end_pfn)
4183 { 4183 {
4184 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4184 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4185 unsigned long start_pfn, end_pfn; 4185 unsigned long start_pfn, end_pfn;
4186 int i; 4186 int i;
4187 4187
4188 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4188 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4189 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4189 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4190 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4190 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4191 nr_absent -= end_pfn - start_pfn; 4191 nr_absent -= end_pfn - start_pfn;
4192 } 4192 }
4193 return nr_absent; 4193 return nr_absent;
4194 } 4194 }
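
As a worked example of the hole accounting above: start from the full window size and subtract every registered range clamped to that window. The two memory ranges below are hypothetical:

#include <stdio.h>

static unsigned long clamp_pfn(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        unsigned long mem[][2] = { { 0x000, 0x400 }, { 0x600, 0xa00 } };
        unsigned long range_start = 0x000, range_end = 0x1000;
        unsigned long nr_absent = range_end - range_start;
        unsigned int i;

        for (i = 0; i < 2; i++) {
                unsigned long s = clamp_pfn(mem[i][0], range_start, range_end);
                unsigned long e = clamp_pfn(mem[i][1], range_start, range_end);

                nr_absent -= e - s;     /* subtract the part that is backed */
        }
        printf("page frames in holes: %#lx\n", nr_absent);      /* 0x800 */
        return 0;
}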
4195 4195
4196 /** 4196 /**
4197 * absent_pages_in_range - Return number of page frames in holes within a range 4197 * absent_pages_in_range - Return number of page frames in holes within a range
4198 * @start_pfn: The start PFN to start searching for holes 4198 * @start_pfn: The start PFN to start searching for holes
4199 * @end_pfn: The end PFN to stop searching for holes 4199 * @end_pfn: The end PFN to stop searching for holes
4200 * 4200 *
4201 * It returns the number of page frames in memory holes within a range. 4201 * It returns the number of page frames in memory holes within a range.
4202 */ 4202 */
4203 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4203 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4204 unsigned long end_pfn) 4204 unsigned long end_pfn)
4205 { 4205 {
4206 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4206 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4207 } 4207 }
4208 4208
4209 /* Return the number of page frames in holes in a zone on a node */ 4209 /* Return the number of page frames in holes in a zone on a node */
4210 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4210 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4211 unsigned long zone_type, 4211 unsigned long zone_type,
4212 unsigned long *ignored) 4212 unsigned long *ignored)
4213 { 4213 {
4214 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4214 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4215 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4215 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4216 unsigned long node_start_pfn, node_end_pfn; 4216 unsigned long node_start_pfn, node_end_pfn;
4217 unsigned long zone_start_pfn, zone_end_pfn; 4217 unsigned long zone_start_pfn, zone_end_pfn;
4218 4218
4219 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 4219 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4220 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4220 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4221 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4221 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4222 4222
4223 adjust_zone_range_for_zone_movable(nid, zone_type, 4223 adjust_zone_range_for_zone_movable(nid, zone_type,
4224 node_start_pfn, node_end_pfn, 4224 node_start_pfn, node_end_pfn,
4225 &zone_start_pfn, &zone_end_pfn); 4225 &zone_start_pfn, &zone_end_pfn);
4226 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4226 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4227 } 4227 }
4228 4228
4229 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4229 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4230 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4230 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4231 unsigned long zone_type, 4231 unsigned long zone_type,
4232 unsigned long *zones_size) 4232 unsigned long *zones_size)
4233 { 4233 {
4234 return zones_size[zone_type]; 4234 return zones_size[zone_type];
4235 } 4235 }
4236 4236
4237 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4237 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4238 unsigned long zone_type, 4238 unsigned long zone_type,
4239 unsigned long *zholes_size) 4239 unsigned long *zholes_size)
4240 { 4240 {
4241 if (!zholes_size) 4241 if (!zholes_size)
4242 return 0; 4242 return 0;
4243 4243
4244 return zholes_size[zone_type]; 4244 return zholes_size[zone_type];
4245 } 4245 }
4246 4246
4247 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4247 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4248 4248
4249 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4249 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4250 unsigned long *zones_size, unsigned long *zholes_size) 4250 unsigned long *zones_size, unsigned long *zholes_size)
4251 { 4251 {
4252 unsigned long realtotalpages, totalpages = 0; 4252 unsigned long realtotalpages, totalpages = 0;
4253 enum zone_type i; 4253 enum zone_type i;
4254 4254
4255 for (i = 0; i < MAX_NR_ZONES; i++) 4255 for (i = 0; i < MAX_NR_ZONES; i++)
4256 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4256 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4257 zones_size); 4257 zones_size);
4258 pgdat->node_spanned_pages = totalpages; 4258 pgdat->node_spanned_pages = totalpages;
4259 4259
4260 realtotalpages = totalpages; 4260 realtotalpages = totalpages;
4261 for (i = 0; i < MAX_NR_ZONES; i++) 4261 for (i = 0; i < MAX_NR_ZONES; i++)
4262 realtotalpages -= 4262 realtotalpages -=
4263 zone_absent_pages_in_node(pgdat->node_id, i, 4263 zone_absent_pages_in_node(pgdat->node_id, i,
4264 zholes_size); 4264 zholes_size);
4265 pgdat->node_present_pages = realtotalpages; 4265 pgdat->node_present_pages = realtotalpages;
4266 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4266 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4267 realtotalpages); 4267 realtotalpages);
4268 } 4268 }
4269 4269
4270 #ifndef CONFIG_SPARSEMEM 4270 #ifndef CONFIG_SPARSEMEM
4271 /* 4271 /*
4272 * Calculate the size of the zone->blockflags rounded to an unsigned long 4272 * Calculate the size of the zone->blockflags rounded to an unsigned long
4273 * Start by making sure zonesize is a multiple of pageblock_order by rounding 4273 * Start by making sure zonesize is a multiple of pageblock_order by rounding
4274 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 4274 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
4275 * round what is now in bits to nearest long in bits, then return it in 4275 * round what is now in bits to nearest long in bits, then return it in
4276 * bytes. 4276 * bytes.
4277 */ 4277 */
4278 static unsigned long __init usemap_size(unsigned long zonesize) 4278 static unsigned long __init usemap_size(unsigned long zonesize)
4279 { 4279 {
4280 unsigned long usemapsize; 4280 unsigned long usemapsize;
4281 4281
4282 usemapsize = roundup(zonesize, pageblock_nr_pages); 4282 usemapsize = roundup(zonesize, pageblock_nr_pages);
4283 usemapsize = usemapsize >> pageblock_order; 4283 usemapsize = usemapsize >> pageblock_order;
4284 usemapsize *= NR_PAGEBLOCK_BITS; 4284 usemapsize *= NR_PAGEBLOCK_BITS;
4285 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4285 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4286 4286
4287 return usemapsize / 8; 4287 return usemapsize / 8;
4288 } 4288 }
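
For a feel for the sizes involved, a standalone sketch of the same calculation; the zone size, pageblock order and bits-per-pageblock values are all assumptions for illustration (the last one stands in for NR_PAGEBLOCK_BITS):

#include <stdio.h>

static unsigned long roundup_to(unsigned long v, unsigned long m)
{
        return ((v + m - 1) / m) * m;
}

int main(void)
{
        unsigned long zonesize = 262144;        /* assumed 1 GiB zone, 4 KiB pages */
        unsigned long pageblock_order = 9;      /* assumed 2 MiB pageblocks */
        unsigned long bits_per_block = 4;       /* assumed bits per pageblock */
        unsigned long pageblock_nr_pages = 1UL << pageblock_order;
        unsigned long usemapsize;

        usemapsize = roundup_to(zonesize, pageblock_nr_pages);  /* 262144 */
        usemapsize >>= pageblock_order;                         /* 512 pageblocks */
        usemapsize *= bits_per_block;                           /* 2048 bits */
        usemapsize = roundup_to(usemapsize, 8 * sizeof(unsigned long));

        printf("pageblock flags bitmap: %lu bytes\n", usemapsize / 8);  /* 256 */
        return 0;
}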
4289 4289
4290 static void __init setup_usemap(struct pglist_data *pgdat, 4290 static void __init setup_usemap(struct pglist_data *pgdat,
4291 struct zone *zone, unsigned long zonesize) 4291 struct zone *zone, unsigned long zonesize)
4292 { 4292 {
4293 unsigned long usemapsize = usemap_size(zonesize); 4293 unsigned long usemapsize = usemap_size(zonesize);
4294 zone->pageblock_flags = NULL; 4294 zone->pageblock_flags = NULL;
4295 if (usemapsize) 4295 if (usemapsize)
4296 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4296 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4297 usemapsize); 4297 usemapsize);
4298 } 4298 }
4299 #else 4299 #else
4300 static inline void setup_usemap(struct pglist_data *pgdat, 4300 static inline void setup_usemap(struct pglist_data *pgdat,
4301 struct zone *zone, unsigned long zonesize) {} 4301 struct zone *zone, unsigned long zonesize) {}
4302 #endif /* CONFIG_SPARSEMEM */ 4302 #endif /* CONFIG_SPARSEMEM */
4303 4303
4304 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4304 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4305 4305
4306 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4306 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4307 static inline void __init set_pageblock_order(void) 4307 static inline void __init set_pageblock_order(void)
4308 { 4308 {
4309 unsigned int order; 4309 unsigned int order;
4310 4310
4311 /* Check that pageblock_nr_pages has not already been setup */ 4311 /* Check that pageblock_nr_pages has not already been setup */
4312 if (pageblock_order) 4312 if (pageblock_order)
4313 return; 4313 return;
4314 4314
4315 if (HPAGE_SHIFT > PAGE_SHIFT) 4315 if (HPAGE_SHIFT > PAGE_SHIFT)
4316 order = HUGETLB_PAGE_ORDER; 4316 order = HUGETLB_PAGE_ORDER;
4317 else 4317 else
4318 order = MAX_ORDER - 1; 4318 order = MAX_ORDER - 1;
4319 4319
4320 /* 4320 /*
4321 * Assume the largest contiguous order of interest is a huge page. 4321 * Assume the largest contiguous order of interest is a huge page.
4322 * This value may be variable depending on boot parameters on IA64 and 4322 * This value may be variable depending on boot parameters on IA64 and
4323 * powerpc. 4323 * powerpc.
4324 */ 4324 */
4325 pageblock_order = order; 4325 pageblock_order = order;
4326 } 4326 }
4327 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4327 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4328 4328
4329 /* 4329 /*
4330 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4330 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4331 * is unused as pageblock_order is set at compile-time. See 4331 * is unused as pageblock_order is set at compile-time. See
4332 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4332 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4333 * the kernel config 4333 * the kernel config
4334 */ 4334 */
4335 static inline void set_pageblock_order(void) 4335 static inline void set_pageblock_order(void)
4336 { 4336 {
4337 } 4337 }
4338 4338
4339 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4339 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4340 4340
4341 /* 4341 /*
4342 * Set up the zone data structures: 4342 * Set up the zone data structures:
4343 * - mark all pages reserved 4343 * - mark all pages reserved
4344 * - mark all memory queues empty 4344 * - mark all memory queues empty
4345 * - clear the memory bitmaps 4345 * - clear the memory bitmaps
4346 */ 4346 */
4347 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4347 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4348 unsigned long *zones_size, unsigned long *zholes_size) 4348 unsigned long *zones_size, unsigned long *zholes_size)
4349 { 4349 {
4350 enum zone_type j; 4350 enum zone_type j;
4351 int nid = pgdat->node_id; 4351 int nid = pgdat->node_id;
4352 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4352 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4353 int ret; 4353 int ret;
4354 4354
4355 pgdat_resize_init(pgdat); 4355 pgdat_resize_init(pgdat);
4356 pgdat->nr_zones = 0; 4356 pgdat->nr_zones = 0;
4357 init_waitqueue_head(&pgdat->kswapd_wait); 4357 init_waitqueue_head(&pgdat->kswapd_wait);
4358 pgdat->kswapd_max_order = 0; 4358 pgdat->kswapd_max_order = 0;
4359 pgdat_page_cgroup_init(pgdat); 4359 pgdat_page_cgroup_init(pgdat);
4360 4360
4361 for (j = 0; j < MAX_NR_ZONES; j++) { 4361 for (j = 0; j < MAX_NR_ZONES; j++) {
4362 struct zone *zone = pgdat->node_zones + j; 4362 struct zone *zone = pgdat->node_zones + j;
4363 unsigned long size, realsize, memmap_pages; 4363 unsigned long size, realsize, memmap_pages;
4364 4364
4365 size = zone_spanned_pages_in_node(nid, j, zones_size); 4365 size = zone_spanned_pages_in_node(nid, j, zones_size);
4366 realsize = size - zone_absent_pages_in_node(nid, j, 4366 realsize = size - zone_absent_pages_in_node(nid, j,
4367 zholes_size); 4367 zholes_size);
4368 4368
4369 /* 4369 /*
4370 * Adjust realsize so that it accounts for how much memory 4370 * Adjust realsize so that it accounts for how much memory
4371 * is used by this zone for memmap. This affects the watermark 4371 * is used by this zone for memmap. This affects the watermark
4372 * and per-cpu initialisations 4372 * and per-cpu initialisations
4373 */ 4373 */
4374 memmap_pages = 4374 memmap_pages =
4375 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 4375 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
4376 if (realsize >= memmap_pages) { 4376 if (realsize >= memmap_pages) {
4377 realsize -= memmap_pages; 4377 realsize -= memmap_pages;
4378 if (memmap_pages) 4378 if (memmap_pages)
4379 printk(KERN_DEBUG 4379 printk(KERN_DEBUG
4380 " %s zone: %lu pages used for memmap\n", 4380 " %s zone: %lu pages used for memmap\n",
4381 zone_names[j], memmap_pages); 4381 zone_names[j], memmap_pages);
4382 } else 4382 } else
4383 printk(KERN_WARNING 4383 printk(KERN_WARNING
4384 " %s zone: %lu pages exceeds realsize %lu\n", 4384 " %s zone: %lu pages exceeds realsize %lu\n",
4385 zone_names[j], memmap_pages, realsize); 4385 zone_names[j], memmap_pages, realsize);
4386 4386
4387 /* Account for reserved pages */ 4387 /* Account for reserved pages */
4388 if (j == 0 && realsize > dma_reserve) { 4388 if (j == 0 && realsize > dma_reserve) {
4389 realsize -= dma_reserve; 4389 realsize -= dma_reserve;
4390 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4390 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4391 zone_names[0], dma_reserve); 4391 zone_names[0], dma_reserve);
4392 } 4392 }
4393 4393
4394 if (!is_highmem_idx(j)) 4394 if (!is_highmem_idx(j))
4395 nr_kernel_pages += realsize; 4395 nr_kernel_pages += realsize;
4396 nr_all_pages += realsize; 4396 nr_all_pages += realsize;
4397 4397
4398 zone->spanned_pages = size; 4398 zone->spanned_pages = size;
4399 zone->present_pages = realsize; 4399 zone->present_pages = realsize;
4400 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
4401 zone->compact_cached_free_pfn = zone->zone_start_pfn +
4402 zone->spanned_pages;
4403 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
4404 #endif
4400 #ifdef CONFIG_NUMA 4405 #ifdef CONFIG_NUMA
4401 zone->node = nid; 4406 zone->node = nid;
4402 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4407 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
4403 / 100; 4408 / 100;
4404 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 4409 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
4405 #endif 4410 #endif
4406 zone->name = zone_names[j]; 4411 zone->name = zone_names[j];
4407 spin_lock_init(&zone->lock); 4412 spin_lock_init(&zone->lock);
4408 spin_lock_init(&zone->lru_lock); 4413 spin_lock_init(&zone->lru_lock);
4409 zone_seqlock_init(zone); 4414 zone_seqlock_init(zone);
4410 zone->zone_pgdat = pgdat; 4415 zone->zone_pgdat = pgdat;
4411 4416
4412 zone_pcp_init(zone); 4417 zone_pcp_init(zone);
4413 lruvec_init(&zone->lruvec, zone); 4418 lruvec_init(&zone->lruvec, zone);
4414 zap_zone_vm_stats(zone); 4419 zap_zone_vm_stats(zone);
4415 zone->flags = 0; 4420 zone->flags = 0;
4416 if (!size) 4421 if (!size)
4417 continue; 4422 continue;
4418 4423
4419 set_pageblock_order(); 4424 set_pageblock_order();
4420 setup_usemap(pgdat, zone, size); 4425 setup_usemap(pgdat, zone, size);
4421 ret = init_currently_empty_zone(zone, zone_start_pfn, 4426 ret = init_currently_empty_zone(zone, zone_start_pfn,
4422 size, MEMMAP_EARLY); 4427 size, MEMMAP_EARLY);
4423 BUG_ON(ret); 4428 BUG_ON(ret);
4424 memmap_init(size, nid, j, zone_start_pfn); 4429 memmap_init(size, nid, j, zone_start_pfn);
4425 zone_start_pfn += size; 4430 zone_start_pfn += size;
4426 } 4431 }
4427 } 4432 }
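
The lines added to this function seed zone->compact_cached_free_pfn at the end of the zone, rounded down to a pageblock boundary, which is where the free-page scanner resumes on the next order > 0 compaction. A quick sketch of that rounding with made-up numbers (the pageblock order and pfns are assumptions):

#include <stdio.h>

int main(void)
{
        unsigned long pageblock_nr_pages = 1UL << 9;    /* assumed 512-page blocks */
        unsigned long zone_start_pfn = 0x100;           /* hypothetical zone */
        unsigned long spanned_pages = 0x8765;
        unsigned long free_pfn;

        free_pfn = zone_start_pfn + spanned_pages;      /* 0x8865, one past the zone */
        free_pfn &= ~(pageblock_nr_pages - 1);          /* 0x8800, pageblock aligned */

        printf("free scanner cached at pfn %#lx\n", free_pfn);
        return 0;
}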
4428 4433
4429 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4434 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4430 { 4435 {
4431 /* Skip empty nodes */ 4436 /* Skip empty nodes */
4432 if (!pgdat->node_spanned_pages) 4437 if (!pgdat->node_spanned_pages)
4433 return; 4438 return;
4434 4439
4435 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4440 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4436 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4441 /* ia64 gets its own node_mem_map, before this, without bootmem */
4437 if (!pgdat->node_mem_map) { 4442 if (!pgdat->node_mem_map) {
4438 unsigned long size, start, end; 4443 unsigned long size, start, end;
4439 struct page *map; 4444 struct page *map;
4440 4445
4441 /* 4446 /*
4442 * The zone's endpoints aren't required to be MAX_ORDER 4447 * The zone's endpoints aren't required to be MAX_ORDER
4443 * aligned but the node_mem_map endpoints must be in order 4448 * aligned but the node_mem_map endpoints must be in order
4444 * for the buddy allocator to function correctly. 4449 * for the buddy allocator to function correctly.
4445 */ 4450 */
4446 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4451 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4447 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 4452 end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
4448 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4453 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4449 size = (end - start) * sizeof(struct page); 4454 size = (end - start) * sizeof(struct page);
4450 map = alloc_remap(pgdat->node_id, size); 4455 map = alloc_remap(pgdat->node_id, size);
4451 if (!map) 4456 if (!map)
4452 map = alloc_bootmem_node_nopanic(pgdat, size); 4457 map = alloc_bootmem_node_nopanic(pgdat, size);
4453 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4458 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4454 } 4459 }
4455 #ifndef CONFIG_NEED_MULTIPLE_NODES 4460 #ifndef CONFIG_NEED_MULTIPLE_NODES
4456 /* 4461 /*
4457 * With no DISCONTIG, the global mem_map is just set as node 0's 4462 * With no DISCONTIG, the global mem_map is just set as node 0's
4458 */ 4463 */
4459 if (pgdat == NODE_DATA(0)) { 4464 if (pgdat == NODE_DATA(0)) {
4460 mem_map = NODE_DATA(0)->node_mem_map; 4465 mem_map = NODE_DATA(0)->node_mem_map;
4461 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4466 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4462 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4467 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4463 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4468 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4464 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4469 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4465 } 4470 }
4466 #endif 4471 #endif
4467 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4472 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4468 } 4473 }
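
A numeric sketch of the alignment above; MAX_ORDER_NR_PAGES of 1024 matches the default MAX_ORDER of 11, while the node pfns and the 64-byte struct page size are assumptions for illustration:

#include <stdio.h>

int main(void)
{
        unsigned long max_order_nr_pages = 1024;        /* 1 << (MAX_ORDER - 1) */
        unsigned long node_start_pfn = 0x10203;         /* hypothetical node */
        unsigned long node_spanned_pages = 0x7f000;
        unsigned long struct_page_size = 64;            /* assumed sizeof(struct page) */
        unsigned long start, end;

        start = node_start_pfn & ~(max_order_nr_pages - 1);                     /* 0x10000 */
        end = node_start_pfn + node_spanned_pages;                              /* 0x8f203 */
        end = (end + max_order_nr_pages - 1) & ~(max_order_nr_pages - 1);       /* 0x8f400 */

        printf("node_mem_map spans pfns %#lx-%#lx, %lu bytes\n",
               start, end, (end - start) * struct_page_size);
        return 0;
}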
4469 4474
4470 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4475 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4471 unsigned long node_start_pfn, unsigned long *zholes_size) 4476 unsigned long node_start_pfn, unsigned long *zholes_size)
4472 { 4477 {
4473 pg_data_t *pgdat = NODE_DATA(nid); 4478 pg_data_t *pgdat = NODE_DATA(nid);
4474 4479
4475 pgdat->node_id = nid; 4480 pgdat->node_id = nid;
4476 pgdat->node_start_pfn = node_start_pfn; 4481 pgdat->node_start_pfn = node_start_pfn;
4477 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4482 calculate_node_totalpages(pgdat, zones_size, zholes_size);
4478 4483
4479 alloc_node_mem_map(pgdat); 4484 alloc_node_mem_map(pgdat);
4480 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4485 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4481 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4486 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4482 nid, (unsigned long)pgdat, 4487 nid, (unsigned long)pgdat,
4483 (unsigned long)pgdat->node_mem_map); 4488 (unsigned long)pgdat->node_mem_map);
4484 #endif 4489 #endif
4485 4490
4486 free_area_init_core(pgdat, zones_size, zholes_size); 4491 free_area_init_core(pgdat, zones_size, zholes_size);
4487 } 4492 }
4488 4493
4489 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4494 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4490 4495
4491 #if MAX_NUMNODES > 1 4496 #if MAX_NUMNODES > 1
4492 /* 4497 /*
4493 * Figure out the number of possible node ids. 4498 * Figure out the number of possible node ids.
4494 */ 4499 */
4495 static void __init setup_nr_node_ids(void) 4500 static void __init setup_nr_node_ids(void)
4496 { 4501 {
4497 unsigned int node; 4502 unsigned int node;
4498 unsigned int highest = 0; 4503 unsigned int highest = 0;
4499 4504
4500 for_each_node_mask(node, node_possible_map) 4505 for_each_node_mask(node, node_possible_map)
4501 highest = node; 4506 highest = node;
4502 nr_node_ids = highest + 1; 4507 nr_node_ids = highest + 1;
4503 } 4508 }
4504 #else 4509 #else
4505 static inline void setup_nr_node_ids(void) 4510 static inline void setup_nr_node_ids(void)
4506 { 4511 {
4507 } 4512 }
4508 #endif 4513 #endif
4509 4514
4510 /** 4515 /**
4511 * node_map_pfn_alignment - determine the maximum internode alignment 4516 * node_map_pfn_alignment - determine the maximum internode alignment
4512 * 4517 *
4513 * This function should be called after node map is populated and sorted. 4518 * This function should be called after node map is populated and sorted.
4514 * It calculates the maximum power of two alignment which can distinguish 4519 * It calculates the maximum power of two alignment which can distinguish
4515 * all the nodes. 4520 * all the nodes.
4516 * 4521 *
4517 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4522 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4518 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4523 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4519 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 4524 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4520 * shifted, 1GiB is enough and this function will indicate so. 4525 * shifted, 1GiB is enough and this function will indicate so.
4521 * 4526 *
4522 * This is used to test whether pfn -> nid mapping of the chosen memory 4527 * This is used to test whether pfn -> nid mapping of the chosen memory
4523 * model has fine enough granularity to avoid incorrect mapping for the 4528 * model has fine enough granularity to avoid incorrect mapping for the
4524 * populated node map. 4529 * populated node map.
4525 * 4530 *
4526 * Returns the determined alignment in pfn's. 0 if there is no alignment 4531 * Returns the determined alignment in pfn's. 0 if there is no alignment
4527 * requirement (single node). 4532 * requirement (single node).
4528 */ 4533 */
4529 unsigned long __init node_map_pfn_alignment(void) 4534 unsigned long __init node_map_pfn_alignment(void)
4530 { 4535 {
4531 unsigned long accl_mask = 0, last_end = 0; 4536 unsigned long accl_mask = 0, last_end = 0;
4532 unsigned long start, end, mask; 4537 unsigned long start, end, mask;
4533 int last_nid = -1; 4538 int last_nid = -1;
4534 int i, nid; 4539 int i, nid;
4535 4540
4536 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 4541 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
4537 if (!start || last_nid < 0 || last_nid == nid) { 4542 if (!start || last_nid < 0 || last_nid == nid) {
4538 last_nid = nid; 4543 last_nid = nid;
4539 last_end = end; 4544 last_end = end;
4540 continue; 4545 continue;
4541 } 4546 }
4542 4547
4543 /* 4548 /*
4544 * Start with a mask granular enough to pin-point to the 4549 * Start with a mask granular enough to pin-point to the
4545 * start pfn and tick off bits one-by-one until it becomes 4550 * start pfn and tick off bits one-by-one until it becomes
4546 * too coarse to separate the current node from the last. 4551 * too coarse to separate the current node from the last.
4547 */ 4552 */
4548 mask = ~((1 << __ffs(start)) - 1); 4553 mask = ~((1 << __ffs(start)) - 1);
4549 while (mask && last_end <= (start & (mask << 1))) 4554 while (mask && last_end <= (start & (mask << 1)))
4550 mask <<= 1; 4555 mask <<= 1;
4551 4556
4552 /* accumulate all internode masks */ 4557 /* accumulate all internode masks */
4553 accl_mask |= mask; 4558 accl_mask |= mask;
4554 } 4559 }
4555 4560
4556 /* convert mask to number of pages */ 4561 /* convert mask to number of pages */
4557 return ~accl_mask + 1; 4562 return ~accl_mask + 1;
4558 } 4563 }
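
To see the mask walk above in action, here is a standalone sketch with two hypothetical nodes of 1 GiB each (4 KiB pages, so 0x40000 pfns per node); ffs_index() is a local stand-in for the kernel's __ffs():

#include <stdio.h>

/* Index of the lowest set bit; n must be non-zero. */
static unsigned long ffs_index(unsigned long n)
{
        unsigned long i = 0;

        while (!(n & 1)) {
                n >>= 1;
                i++;
        }
        return i;
}

int main(void)
{
        unsigned long ranges[][2] = { { 0x00000, 0x40000 }, { 0x40000, 0x80000 } };
        int nids[] = { 0, 1 };
        unsigned long accl_mask = 0, last_end = 0, mask;
        int last_nid = -1, i;

        for (i = 0; i < 2; i++) {
                unsigned long start = ranges[i][0];

                if (!start || last_nid < 0 || last_nid == nids[i]) {
                        last_nid = nids[i];
                        last_end = ranges[i][1];
                        continue;
                }

                /* widen the mask until it can no longer separate the nodes */
                mask = ~((1UL << ffs_index(start)) - 1);
                while (mask && last_end <= (start & (mask << 1)))
                        mask <<= 1;
                accl_mask |= mask;
        }

        printf("internode alignment: %#lx pfns\n", ~accl_mask + 1);     /* 0x40000 */
        return 0;
}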
4559 4564
4560 /* Find the lowest pfn for a node */ 4565 /* Find the lowest pfn for a node */
4561 static unsigned long __init find_min_pfn_for_node(int nid) 4566 static unsigned long __init find_min_pfn_for_node(int nid)
4562 { 4567 {
4563 unsigned long min_pfn = ULONG_MAX; 4568 unsigned long min_pfn = ULONG_MAX;
4564 unsigned long start_pfn; 4569 unsigned long start_pfn;
4565 int i; 4570 int i;
4566 4571
4567 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 4572 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
4568 min_pfn = min(min_pfn, start_pfn); 4573 min_pfn = min(min_pfn, start_pfn);
4569 4574
4570 if (min_pfn == ULONG_MAX) { 4575 if (min_pfn == ULONG_MAX) {
4571 printk(KERN_WARNING 4576 printk(KERN_WARNING
4572 "Could not find start_pfn for node %d\n", nid); 4577 "Could not find start_pfn for node %d\n", nid);
4573 return 0; 4578 return 0;
4574 } 4579 }
4575 4580
4576 return min_pfn; 4581 return min_pfn;
4577 } 4582 }
4578 4583
4579 /** 4584 /**
4580 * find_min_pfn_with_active_regions - Find the minimum PFN registered 4585 * find_min_pfn_with_active_regions - Find the minimum PFN registered
4581 * 4586 *
4582 * It returns the minimum PFN based on information provided via 4587 * It returns the minimum PFN based on information provided via
4583 * add_active_range(). 4588 * add_active_range().
4584 */ 4589 */
4585 unsigned long __init find_min_pfn_with_active_regions(void) 4590 unsigned long __init find_min_pfn_with_active_regions(void)
4586 { 4591 {
4587 return find_min_pfn_for_node(MAX_NUMNODES); 4592 return find_min_pfn_for_node(MAX_NUMNODES);
4588 } 4593 }
4589 4594
4590 /* 4595 /*
4591 * early_calculate_totalpages() 4596 * early_calculate_totalpages()
4592 * Sum pages in active regions for movable zone. 4597 * Sum pages in active regions for movable zone.
4593 * Populate N_HIGH_MEMORY for calculating usable_nodes. 4598 * Populate N_HIGH_MEMORY for calculating usable_nodes.
4594 */ 4599 */
4595 static unsigned long __init early_calculate_totalpages(void) 4600 static unsigned long __init early_calculate_totalpages(void)
4596 { 4601 {
4597 unsigned long totalpages = 0; 4602 unsigned long totalpages = 0;
4598 unsigned long start_pfn, end_pfn; 4603 unsigned long start_pfn, end_pfn;
4599 int i, nid; 4604 int i, nid;
4600 4605
4601 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 4606 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
4602 unsigned long pages = end_pfn - start_pfn; 4607 unsigned long pages = end_pfn - start_pfn;
4603 4608
4604 totalpages += pages; 4609 totalpages += pages;
4605 if (pages) 4610 if (pages)
4606 node_set_state(nid, N_HIGH_MEMORY); 4611 node_set_state(nid, N_HIGH_MEMORY);
4607 } 4612 }
4608 return totalpages; 4613 return totalpages;
4609 } 4614 }
4610 4615
4611 /* 4616 /*
4612 * Find the PFN the Movable zone begins in each node. Kernel memory 4617 * Find the PFN the Movable zone begins in each node. Kernel memory
4613 * is spread evenly between nodes as long as the nodes have enough 4618 * is spread evenly between nodes as long as the nodes have enough
4614 * memory. When they don't, some nodes will have more kernelcore than 4619 * memory. When they don't, some nodes will have more kernelcore than
4615 * others 4620 * others
4616 */ 4621 */
4617 static void __init find_zone_movable_pfns_for_nodes(void) 4622 static void __init find_zone_movable_pfns_for_nodes(void)
4618 { 4623 {
4619 int i, nid; 4624 int i, nid;
4620 unsigned long usable_startpfn; 4625 unsigned long usable_startpfn;
4621 unsigned long kernelcore_node, kernelcore_remaining; 4626 unsigned long kernelcore_node, kernelcore_remaining;
4622 /* save the state before borrowing the nodemask */ 4627 /* save the state before borrowing the nodemask */
4623 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; 4628 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
4624 unsigned long totalpages = early_calculate_totalpages(); 4629 unsigned long totalpages = early_calculate_totalpages();
4625 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 4630 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
4626 4631
4627 /* 4632 /*
4628 * If movablecore was specified, calculate what size of 4633 * If movablecore was specified, calculate what size of
4629 * kernelcore that corresponds so that memory usable for 4634 * kernelcore that corresponds so that memory usable for
4630 * any allocation type is evenly spread. If both kernelcore 4635 * any allocation type is evenly spread. If both kernelcore
4631 * and movablecore are specified, then the value of kernelcore 4636 * and movablecore are specified, then the value of kernelcore
4632 * will be used for required_kernelcore if it's greater than 4637 * will be used for required_kernelcore if it's greater than
4633 * what movablecore would have allowed. 4638 * what movablecore would have allowed.
4634 */ 4639 */
4635 if (required_movablecore) { 4640 if (required_movablecore) {
4636 unsigned long corepages; 4641 unsigned long corepages;
4637 4642
4638 /* 4643 /*
4639 * Round-up so that ZONE_MOVABLE is at least as large as what 4644 * Round-up so that ZONE_MOVABLE is at least as large as what
4640 * was requested by the user 4645 * was requested by the user
4641 */ 4646 */
4642 required_movablecore = 4647 required_movablecore =
4643 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 4648 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
4644 corepages = totalpages - required_movablecore; 4649 corepages = totalpages - required_movablecore;
4645 4650
4646 required_kernelcore = max(required_kernelcore, corepages); 4651 required_kernelcore = max(required_kernelcore, corepages);
4647 } 4652 }
4648 4653
4649 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 4654 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
4650 if (!required_kernelcore) 4655 if (!required_kernelcore)
4651 goto out; 4656 goto out;
4652 4657
4653 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 4658 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
4654 find_usable_zone_for_movable(); 4659 find_usable_zone_for_movable();
4655 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 4660 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
4656 4661
4657 restart: 4662 restart:
4658 /* Spread kernelcore memory as evenly as possible throughout nodes */ 4663 /* Spread kernelcore memory as evenly as possible throughout nodes */
4659 kernelcore_node = required_kernelcore / usable_nodes; 4664 kernelcore_node = required_kernelcore / usable_nodes;
4660 for_each_node_state(nid, N_HIGH_MEMORY) { 4665 for_each_node_state(nid, N_HIGH_MEMORY) {
4661 unsigned long start_pfn, end_pfn; 4666 unsigned long start_pfn, end_pfn;
4662 4667
4663 /* 4668 /*
4664 * Recalculate kernelcore_node if the division per node 4669 * Recalculate kernelcore_node if the division per node
4665 * now exceeds what is necessary to satisfy the requested 4670 * now exceeds what is necessary to satisfy the requested
4666 * amount of memory for the kernel 4671 * amount of memory for the kernel
4667 */ 4672 */
4668 if (required_kernelcore < kernelcore_node) 4673 if (required_kernelcore < kernelcore_node)
4669 kernelcore_node = required_kernelcore / usable_nodes; 4674 kernelcore_node = required_kernelcore / usable_nodes;
4670 4675
4671 /* 4676 /*
4672 * As the map is walked, we track how much memory is usable 4677 * As the map is walked, we track how much memory is usable
4673 * by the kernel using kernelcore_remaining. When it is 4678 * by the kernel using kernelcore_remaining. When it is
4674 * 0, the rest of the node is usable by ZONE_MOVABLE 4679 * 0, the rest of the node is usable by ZONE_MOVABLE
4675 */ 4680 */
4676 kernelcore_remaining = kernelcore_node; 4681 kernelcore_remaining = kernelcore_node;
4677 4682
4678 /* Go through each range of PFNs within this node */ 4683 /* Go through each range of PFNs within this node */
4679 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4684 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4680 unsigned long size_pages; 4685 unsigned long size_pages;
4681 4686
4682 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 4687 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
4683 if (start_pfn >= end_pfn) 4688 if (start_pfn >= end_pfn)
4684 continue; 4689 continue;
4685 4690
4686 /* Account for what is only usable for kernelcore */ 4691 /* Account for what is only usable for kernelcore */
4687 if (start_pfn < usable_startpfn) { 4692 if (start_pfn < usable_startpfn) {
4688 unsigned long kernel_pages; 4693 unsigned long kernel_pages;
4689 kernel_pages = min(end_pfn, usable_startpfn) 4694 kernel_pages = min(end_pfn, usable_startpfn)
4690 - start_pfn; 4695 - start_pfn;
4691 4696
4692 kernelcore_remaining -= min(kernel_pages, 4697 kernelcore_remaining -= min(kernel_pages,
4693 kernelcore_remaining); 4698 kernelcore_remaining);
4694 required_kernelcore -= min(kernel_pages, 4699 required_kernelcore -= min(kernel_pages,
4695 required_kernelcore); 4700 required_kernelcore);
4696 4701
4697 /* Continue if range is now fully accounted */ 4702 /* Continue if range is now fully accounted */
4698 if (end_pfn <= usable_startpfn) { 4703 if (end_pfn <= usable_startpfn) {
4699 4704
4700 /* 4705 /*
4701 * Push zone_movable_pfn to the end so 4706 * Push zone_movable_pfn to the end so
4702 * that if we have to rebalance 4707 * that if we have to rebalance
4703 * kernelcore across nodes, we will 4708 * kernelcore across nodes, we will
4704 * not double account here 4709 * not double account here
4705 */ 4710 */
4706 zone_movable_pfn[nid] = end_pfn; 4711 zone_movable_pfn[nid] = end_pfn;
4707 continue; 4712 continue;
4708 } 4713 }
4709 start_pfn = usable_startpfn; 4714 start_pfn = usable_startpfn;
4710 } 4715 }
4711 4716
4712 /* 4717 /*
4713 * The usable PFN range for ZONE_MOVABLE is from 4718 * The usable PFN range for ZONE_MOVABLE is from
4714 * start_pfn->end_pfn. Calculate size_pages as the 4719 * start_pfn->end_pfn. Calculate size_pages as the
4715 * number of pages used as kernelcore 4720 * number of pages used as kernelcore
4716 */ 4721 */
4717 size_pages = end_pfn - start_pfn; 4722 size_pages = end_pfn - start_pfn;
4718 if (size_pages > kernelcore_remaining) 4723 if (size_pages > kernelcore_remaining)
4719 size_pages = kernelcore_remaining; 4724 size_pages = kernelcore_remaining;
4720 zone_movable_pfn[nid] = start_pfn + size_pages; 4725 zone_movable_pfn[nid] = start_pfn + size_pages;
4721 4726
4722 /* 4727 /*
4723 * Some kernelcore has been met, update counts and 4728 * Some kernelcore has been met, update counts and
4724 * break if the kernelcore for this node has been 4729 * break if the kernelcore for this node has been
4725 * satisfied 4730 * satisfied
4726 */ 4731 */
4727 required_kernelcore -= min(required_kernelcore, 4732 required_kernelcore -= min(required_kernelcore,
4728 size_pages); 4733 size_pages);
4729 kernelcore_remaining -= size_pages; 4734 kernelcore_remaining -= size_pages;
4730 if (!kernelcore_remaining) 4735 if (!kernelcore_remaining)
4731 break; 4736 break;
4732 } 4737 }
4733 } 4738 }
4734 4739
4735 /* 4740 /*
4736 * If there is still required_kernelcore, we do another pass with one 4741 * If there is still required_kernelcore, we do another pass with one
4737 * less node in the count. This will push zone_movable_pfn[nid] further 4742 * less node in the count. This will push zone_movable_pfn[nid] further
4738 * along on the nodes that still have memory until kernelcore is 4743 * along on the nodes that still have memory until kernelcore is
4739 * satisfied 4744 * satisfied
4740 */ 4745 */
4741 usable_nodes--; 4746 usable_nodes--;
4742 if (usable_nodes && required_kernelcore > usable_nodes) 4747 if (usable_nodes && required_kernelcore > usable_nodes)
4743 goto restart; 4748 goto restart;
4744 4749
4745 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 4750 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
4746 for (nid = 0; nid < MAX_NUMNODES; nid++) 4751 for (nid = 0; nid < MAX_NUMNODES; nid++)
4747 zone_movable_pfn[nid] = 4752 zone_movable_pfn[nid] =
4748 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 4753 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4749 4754
4750 out: 4755 out:
4751 /* restore the node_state */ 4756 /* restore the node_state */
4752 node_states[N_HIGH_MEMORY] = saved_node_state; 4757 node_states[N_HIGH_MEMORY] = saved_node_state;
4753 } 4758 }
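As a rough worked example (all numbers are invented, not taken from this commit): with kernelcore=2G on a two-node machine with 4GB per node, usable_nodes is 2 and kernelcore_node starts out at 1GB per node. The first pass walks each node's PFN ranges, charges roughly 1GB per node to the kernel, and leaves zone_movable_pfn[nid] pointing about 1GB into each node (later rounded up to MAX_ORDER_NR_PAGES), so the remaining ~3GB per node becomes ZONE_MOVABLE. Only when a node is too small to take its full share does the restart label run another pass with one fewer usable node.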
4754 4759
4755 /* Any regular memory on that node? */ 4760 /* Any regular memory on that node? */
4756 static void check_for_regular_memory(pg_data_t *pgdat) 4761 static void check_for_regular_memory(pg_data_t *pgdat)
4757 { 4762 {
4758 #ifdef CONFIG_HIGHMEM 4763 #ifdef CONFIG_HIGHMEM
4759 enum zone_type zone_type; 4764 enum zone_type zone_type;
4760 4765
4761 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { 4766 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
4762 struct zone *zone = &pgdat->node_zones[zone_type]; 4767 struct zone *zone = &pgdat->node_zones[zone_type];
4763 if (zone->present_pages) { 4768 if (zone->present_pages) {
4764 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); 4769 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
4765 break; 4770 break;
4766 } 4771 }
4767 } 4772 }
4768 #endif 4773 #endif
4769 } 4774 }
4770 4775
4771 /** 4776 /**
4772 * free_area_init_nodes - Initialise all pg_data_t and zone data 4777 * free_area_init_nodes - Initialise all pg_data_t and zone data
4773 * @max_zone_pfn: an array of max PFNs for each zone 4778 * @max_zone_pfn: an array of max PFNs for each zone
4774 * 4779 *
4775 * This will call free_area_init_node() for each active node in the system. 4780 * This will call free_area_init_node() for each active node in the system.
4776 * Using the page ranges provided by add_active_range(), the size of each 4781 * Using the page ranges provided by add_active_range(), the size of each
4777 * zone in each node and their holes is calculated. If the maximum PFN 4782 * zone in each node and their holes is calculated. If the maximum PFN
4778 * between two adjacent zones match, it is assumed that the zone is empty. 4783 * between two adjacent zones match, it is assumed that the zone is empty.
4779 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 4784 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
4780 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 4785 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
4781 * starts where the previous one ended. For example, ZONE_DMA32 starts 4786 * starts where the previous one ended. For example, ZONE_DMA32 starts
4782 * at arch_max_dma_pfn. 4787 * at arch_max_dma_pfn.
4783 */ 4788 */
4784 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 4789 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4785 { 4790 {
4786 unsigned long start_pfn, end_pfn; 4791 unsigned long start_pfn, end_pfn;
4787 int i, nid; 4792 int i, nid;
4788 4793
4789 /* Record where the zone boundaries are */ 4794 /* Record where the zone boundaries are */
4790 memset(arch_zone_lowest_possible_pfn, 0, 4795 memset(arch_zone_lowest_possible_pfn, 0,
4791 sizeof(arch_zone_lowest_possible_pfn)); 4796 sizeof(arch_zone_lowest_possible_pfn));
4792 memset(arch_zone_highest_possible_pfn, 0, 4797 memset(arch_zone_highest_possible_pfn, 0,
4793 sizeof(arch_zone_highest_possible_pfn)); 4798 sizeof(arch_zone_highest_possible_pfn));
4794 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 4799 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
4795 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 4800 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
4796 for (i = 1; i < MAX_NR_ZONES; i++) { 4801 for (i = 1; i < MAX_NR_ZONES; i++) {
4797 if (i == ZONE_MOVABLE) 4802 if (i == ZONE_MOVABLE)
4798 continue; 4803 continue;
4799 arch_zone_lowest_possible_pfn[i] = 4804 arch_zone_lowest_possible_pfn[i] =
4800 arch_zone_highest_possible_pfn[i-1]; 4805 arch_zone_highest_possible_pfn[i-1];
4801 arch_zone_highest_possible_pfn[i] = 4806 arch_zone_highest_possible_pfn[i] =
4802 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 4807 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
4803 } 4808 }
4804 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 4809 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
4805 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 4810 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
4806 4811
4807 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 4812 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
4808 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 4813 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
4809 find_zone_movable_pfns_for_nodes(); 4814 find_zone_movable_pfns_for_nodes();
4810 4815
4811 /* Print out the zone ranges */ 4816 /* Print out the zone ranges */
4812 printk("Zone ranges:\n"); 4817 printk("Zone ranges:\n");
4813 for (i = 0; i < MAX_NR_ZONES; i++) { 4818 for (i = 0; i < MAX_NR_ZONES; i++) {
4814 if (i == ZONE_MOVABLE) 4819 if (i == ZONE_MOVABLE)
4815 continue; 4820 continue;
4816 printk(KERN_CONT " %-8s ", zone_names[i]); 4821 printk(KERN_CONT " %-8s ", zone_names[i]);
4817 if (arch_zone_lowest_possible_pfn[i] == 4822 if (arch_zone_lowest_possible_pfn[i] ==
4818 arch_zone_highest_possible_pfn[i]) 4823 arch_zone_highest_possible_pfn[i])
4819 printk(KERN_CONT "empty\n"); 4824 printk(KERN_CONT "empty\n");
4820 else 4825 else
4821 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 4826 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
4822 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 4827 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
4823 (arch_zone_highest_possible_pfn[i] 4828 (arch_zone_highest_possible_pfn[i]
4824 << PAGE_SHIFT) - 1); 4829 << PAGE_SHIFT) - 1);
4825 } 4830 }
4826 4831
4827 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 4832 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
4828 printk("Movable zone start for each node\n"); 4833 printk("Movable zone start for each node\n");
4829 for (i = 0; i < MAX_NUMNODES; i++) { 4834 for (i = 0; i < MAX_NUMNODES; i++) {
4830 if (zone_movable_pfn[i]) 4835 if (zone_movable_pfn[i])
4831 printk(" Node %d: %#010lx\n", i, 4836 printk(" Node %d: %#010lx\n", i,
4832 zone_movable_pfn[i] << PAGE_SHIFT); 4837 zone_movable_pfn[i] << PAGE_SHIFT);
4833 } 4838 }
4834 4839
4835 /* Print out the early_node_map[] */ 4840 /* Print out the early_node_map[] */
4836 printk("Early memory node ranges\n"); 4841 printk("Early memory node ranges\n");
4837 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4842 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4838 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 4843 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
4839 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 4844 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
4840 4845
4841 /* Initialise every node */ 4846 /* Initialise every node */
4842 mminit_verify_pageflags_layout(); 4847 mminit_verify_pageflags_layout();
4843 setup_nr_node_ids(); 4848 setup_nr_node_ids();
4844 for_each_online_node(nid) { 4849 for_each_online_node(nid) {
4845 pg_data_t *pgdat = NODE_DATA(nid); 4850 pg_data_t *pgdat = NODE_DATA(nid);
4846 free_area_init_node(nid, NULL, 4851 free_area_init_node(nid, NULL,
4847 find_min_pfn_for_node(nid), NULL); 4852 find_min_pfn_for_node(nid), NULL);
4848 4853
4849 /* Any memory on that node */ 4854 /* Any memory on that node */
4850 if (pgdat->node_present_pages) 4855 if (pgdat->node_present_pages)
4851 node_set_state(nid, N_HIGH_MEMORY); 4856 node_set_state(nid, N_HIGH_MEMORY);
4852 check_for_regular_memory(pgdat); 4857 check_for_regular_memory(pgdat);
4853 } 4858 }
4854 } 4859 }
4855 4860
4856 static int __init cmdline_parse_core(char *p, unsigned long *core) 4861 static int __init cmdline_parse_core(char *p, unsigned long *core)
4857 { 4862 {
4858 unsigned long long coremem; 4863 unsigned long long coremem;
4859 if (!p) 4864 if (!p)
4860 return -EINVAL; 4865 return -EINVAL;
4861 4866
4862 coremem = memparse(p, &p); 4867 coremem = memparse(p, &p);
4863 *core = coremem >> PAGE_SHIFT; 4868 *core = coremem >> PAGE_SHIFT;
4864 4869
4865 /* Paranoid check that UL is enough for the coremem value */ 4870 /* Paranoid check that UL is enough for the coremem value */
4866 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 4871 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
4867 4872
4868 return 0; 4873 return 0;
4869 } 4874 }
4870 4875
4871 /* 4876 /*
4872 * kernelcore=size sets the amount of memory for use for allocations that 4877 * kernelcore=size sets the amount of memory for use for allocations that
4873 * cannot be reclaimed or migrated. 4878 * cannot be reclaimed or migrated.
4874 */ 4879 */
4875 static int __init cmdline_parse_kernelcore(char *p) 4880 static int __init cmdline_parse_kernelcore(char *p)
4876 { 4881 {
4877 return cmdline_parse_core(p, &required_kernelcore); 4882 return cmdline_parse_core(p, &required_kernelcore);
4878 } 4883 }
4879 4884
4880 /* 4885 /*
4881 * movablecore=size sets the amount of memory for use for allocations that 4886 * movablecore=size sets the amount of memory for use for allocations that
4882 * can be reclaimed or migrated. 4887 * can be reclaimed or migrated.
4883 */ 4888 */
4884 static int __init cmdline_parse_movablecore(char *p) 4889 static int __init cmdline_parse_movablecore(char *p)
4885 { 4890 {
4886 return cmdline_parse_core(p, &required_movablecore); 4891 return cmdline_parse_core(p, &required_movablecore);
4887 } 4892 }
4888 4893
4889 early_param("kernelcore", cmdline_parse_kernelcore); 4894 early_param("kernelcore", cmdline_parse_kernelcore);
4890 early_param("movablecore", cmdline_parse_movablecore); 4895 early_param("movablecore", cmdline_parse_movablecore);
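For illustration only (the sizes are arbitrary): booting with kernelcore=512M asks that roughly 512MB, spread across the nodes, remain usable for unmovable kernel allocations, with the rest of each node ending up in ZONE_MOVABLE; booting with movablecore=2G instead asks for at least 2GB of ZONE_MOVABLE and lets find_zone_movable_pfns_for_nodes() derive the implied kernelcore from total memory. memparse() accepts the usual K/M/G suffixes, and cmdline_parse_core() converts the byte count to pages.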
4891 4896
4892 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4897 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4893 4898
4894 /** 4899 /**
4895 * set_dma_reserve - set the specified number of pages reserved in the first zone 4900 * set_dma_reserve - set the specified number of pages reserved in the first zone
4896 * @new_dma_reserve: The number of pages to mark reserved 4901 * @new_dma_reserve: The number of pages to mark reserved
4897 * 4902 *
4898 * The per-cpu batchsize and zone watermarks are determined by present_pages. 4903 * The per-cpu batchsize and zone watermarks are determined by present_pages.
4899 * In the DMA zone, a significant percentage may be consumed by kernel image 4904 * In the DMA zone, a significant percentage may be consumed by kernel image
4900 * and other unfreeable allocations which can skew the watermarks badly. This 4905 * and other unfreeable allocations which can skew the watermarks badly. This
4901 * function may optionally be used to account for unfreeable pages in the 4906 * function may optionally be used to account for unfreeable pages in the
4902 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 4907 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
4903 * smaller per-cpu batchsize. 4908 * smaller per-cpu batchsize.
4904 */ 4909 */
4905 void __init set_dma_reserve(unsigned long new_dma_reserve) 4910 void __init set_dma_reserve(unsigned long new_dma_reserve)
4906 { 4911 {
4907 dma_reserve = new_dma_reserve; 4912 dma_reserve = new_dma_reserve;
4908 } 4913 }
4909 4914
4910 void __init free_area_init(unsigned long *zones_size) 4915 void __init free_area_init(unsigned long *zones_size)
4911 { 4916 {
4912 free_area_init_node(0, zones_size, 4917 free_area_init_node(0, zones_size,
4913 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 4918 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
4914 } 4919 }
4915 4920
4916 static int page_alloc_cpu_notify(struct notifier_block *self, 4921 static int page_alloc_cpu_notify(struct notifier_block *self,
4917 unsigned long action, void *hcpu) 4922 unsigned long action, void *hcpu)
4918 { 4923 {
4919 int cpu = (unsigned long)hcpu; 4924 int cpu = (unsigned long)hcpu;
4920 4925
4921 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 4926 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
4922 lru_add_drain_cpu(cpu); 4927 lru_add_drain_cpu(cpu);
4923 drain_pages(cpu); 4928 drain_pages(cpu);
4924 4929
4925 /* 4930 /*
4926 * Spill the event counters of the dead processor 4931 * Spill the event counters of the dead processor
4927 * into the current processors event counters. 4932 * into the current processors event counters.
4928 * This artificially elevates the count of the current 4933 * This artificially elevates the count of the current
4929 * processor. 4934 * processor.
4930 */ 4935 */
4931 vm_events_fold_cpu(cpu); 4936 vm_events_fold_cpu(cpu);
4932 4937
4933 /* 4938 /*
4934 * Zero the differential counters of the dead processor 4939 * Zero the differential counters of the dead processor
4935 * so that the vm statistics are consistent. 4940 * so that the vm statistics are consistent.
4936 * 4941 *
4937 * This is only okay since the processor is dead and cannot 4942 * This is only okay since the processor is dead and cannot
4938 * race with what we are doing. 4943 * race with what we are doing.
4939 */ 4944 */
4940 refresh_cpu_vm_stats(cpu); 4945 refresh_cpu_vm_stats(cpu);
4941 } 4946 }
4942 return NOTIFY_OK; 4947 return NOTIFY_OK;
4943 } 4948 }
4944 4949
4945 void __init page_alloc_init(void) 4950 void __init page_alloc_init(void)
4946 { 4951 {
4947 hotcpu_notifier(page_alloc_cpu_notify, 0); 4952 hotcpu_notifier(page_alloc_cpu_notify, 0);
4948 } 4953 }
4949 4954
4950 /* 4955 /*
4951 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 4956 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
4952 * or min_free_kbytes changes. 4957 * or min_free_kbytes changes.
4953 */ 4958 */
4954 static void calculate_totalreserve_pages(void) 4959 static void calculate_totalreserve_pages(void)
4955 { 4960 {
4956 struct pglist_data *pgdat; 4961 struct pglist_data *pgdat;
4957 unsigned long reserve_pages = 0; 4962 unsigned long reserve_pages = 0;
4958 enum zone_type i, j; 4963 enum zone_type i, j;
4959 4964
4960 for_each_online_pgdat(pgdat) { 4965 for_each_online_pgdat(pgdat) {
4961 for (i = 0; i < MAX_NR_ZONES; i++) { 4966 for (i = 0; i < MAX_NR_ZONES; i++) {
4962 struct zone *zone = pgdat->node_zones + i; 4967 struct zone *zone = pgdat->node_zones + i;
4963 unsigned long max = 0; 4968 unsigned long max = 0;
4964 4969
4965 /* Find valid and maximum lowmem_reserve in the zone */ 4970 /* Find valid and maximum lowmem_reserve in the zone */
4966 for (j = i; j < MAX_NR_ZONES; j++) { 4971 for (j = i; j < MAX_NR_ZONES; j++) {
4967 if (zone->lowmem_reserve[j] > max) 4972 if (zone->lowmem_reserve[j] > max)
4968 max = zone->lowmem_reserve[j]; 4973 max = zone->lowmem_reserve[j];
4969 } 4974 }
4970 4975
4971 /* we treat the high watermark as reserved pages. */ 4976 /* we treat the high watermark as reserved pages. */
4972 max += high_wmark_pages(zone); 4977 max += high_wmark_pages(zone);
4973 4978
4974 if (max > zone->present_pages) 4979 if (max > zone->present_pages)
4975 max = zone->present_pages; 4980 max = zone->present_pages;
4976 reserve_pages += max; 4981 reserve_pages += max;
4977 /* 4982 /*
4978 * Lowmem reserves are not available to 4983 * Lowmem reserves are not available to
4979 * GFP_HIGHUSER page cache allocations and 4984 * GFP_HIGHUSER page cache allocations and
4980 * kswapd tries to balance zones to their high 4985 * kswapd tries to balance zones to their high
4981 * watermark. As a result, neither should be 4986 * watermark. As a result, neither should be
4982 * regarded as dirtyable memory, to prevent a 4987 * regarded as dirtyable memory, to prevent a
4983 * situation where reclaim has to clean pages 4988 * situation where reclaim has to clean pages
4984 * in order to balance the zones. 4989 * in order to balance the zones.
4985 */ 4990 */
4986 zone->dirty_balance_reserve = max; 4991 zone->dirty_balance_reserve = max;
4987 } 4992 }
4988 } 4993 }
4989 dirty_balance_reserve = reserve_pages; 4994 dirty_balance_reserve = reserve_pages;
4990 totalreserve_pages = reserve_pages; 4995 totalreserve_pages = reserve_pages;
4991 } 4996 }
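A quick worked example with invented numbers: if a zone's largest lowmem_reserve[] entry is 16240 pages and its high watermark is 2304 pages, and its present_pages is larger than their sum, the zone contributes 16240 + 2304 = 18544 pages to totalreserve_pages, and that same figure becomes its dirty_balance_reserve.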
4992 4997
4993 /* 4998 /*
4994 * setup_per_zone_lowmem_reserve - called whenever 4999 * setup_per_zone_lowmem_reserve - called whenever
4995 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 5000 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
4996 * has a correct pages reserved value, so an adequate number of 5001 * has a correct pages reserved value, so an adequate number of
4997 * pages are left in the zone after a successful __alloc_pages(). 5002 * pages are left in the zone after a successful __alloc_pages().
4998 */ 5003 */
4999 static void setup_per_zone_lowmem_reserve(void) 5004 static void setup_per_zone_lowmem_reserve(void)
5000 { 5005 {
5001 struct pglist_data *pgdat; 5006 struct pglist_data *pgdat;
5002 enum zone_type j, idx; 5007 enum zone_type j, idx;
5003 5008
5004 for_each_online_pgdat(pgdat) { 5009 for_each_online_pgdat(pgdat) {
5005 for (j = 0; j < MAX_NR_ZONES; j++) { 5010 for (j = 0; j < MAX_NR_ZONES; j++) {
5006 struct zone *zone = pgdat->node_zones + j; 5011 struct zone *zone = pgdat->node_zones + j;
5007 unsigned long present_pages = zone->present_pages; 5012 unsigned long present_pages = zone->present_pages;
5008 5013
5009 zone->lowmem_reserve[j] = 0; 5014 zone->lowmem_reserve[j] = 0;
5010 5015
5011 idx = j; 5016 idx = j;
5012 while (idx) { 5017 while (idx) {
5013 struct zone *lower_zone; 5018 struct zone *lower_zone;
5014 5019
5015 idx--; 5020 idx--;
5016 5021
5017 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5022 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5018 sysctl_lowmem_reserve_ratio[idx] = 1; 5023 sysctl_lowmem_reserve_ratio[idx] = 1;
5019 5024
5020 lower_zone = pgdat->node_zones + idx; 5025 lower_zone = pgdat->node_zones + idx;
5021 lower_zone->lowmem_reserve[j] = present_pages / 5026 lower_zone->lowmem_reserve[j] = present_pages /
5022 sysctl_lowmem_reserve_ratio[idx]; 5027 sysctl_lowmem_reserve_ratio[idx];
5023 present_pages += lower_zone->present_pages; 5028 present_pages += lower_zone->present_pages;
5024 } 5029 }
5025 } 5030 }
5026 } 5031 }
5027 5032
5028 /* update totalreserve_pages */ 5033 /* update totalreserve_pages */
5029 calculate_totalreserve_pages(); 5034 calculate_totalreserve_pages();
5030 } 5035 }
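For readers who want to see the reserve arithmetic in isolation, the following standalone user-space sketch (not kernel code; the zone sizes and ratios below are invented for illustration) mirrors the nested loop in setup_per_zone_lowmem_reserve():

#include <stdio.h>

int main(void)
{
	/* invented example zones: DMA, DMA32, Normal present pages */
	unsigned long present[3] = { 3998, 487424, 3670016 };
	/* example sysctl_lowmem_reserve_ratio values */
	long ratio[3] = { 256, 256, 32 };
	const char *name[3] = { "DMA", "DMA32", "Normal" };

	for (int j = 0; j < 3; j++) {
		unsigned long pages = present[j];	/* zone j's own pages */

		for (int idx = j - 1; idx >= 0; idx--) {
			/* lower_zone->lowmem_reserve[j] = present_pages / ratio[idx] */
			printf("%-6s reserves %7lu pages against %s allocations\n",
			       name[idx], pages / ratio[idx], name[j]);
			pages += present[idx];	/* accumulate, as the kernel loop does */
		}
	}
	return 0;
}

Run as an ordinary C program, it prints, for each higher zone, how many pages every lower zone keeps back from allocations that could also have been satisfied from the higher zone.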
5031 5036
5032 static void __setup_per_zone_wmarks(void) 5037 static void __setup_per_zone_wmarks(void)
5033 { 5038 {
5034 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5039 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5035 unsigned long lowmem_pages = 0; 5040 unsigned long lowmem_pages = 0;
5036 struct zone *zone; 5041 struct zone *zone;
5037 unsigned long flags; 5042 unsigned long flags;
5038 5043
5039 /* Calculate total number of !ZONE_HIGHMEM pages */ 5044 /* Calculate total number of !ZONE_HIGHMEM pages */
5040 for_each_zone(zone) { 5045 for_each_zone(zone) {
5041 if (!is_highmem(zone)) 5046 if (!is_highmem(zone))
5042 lowmem_pages += zone->present_pages; 5047 lowmem_pages += zone->present_pages;
5043 } 5048 }
5044 5049
5045 for_each_zone(zone) { 5050 for_each_zone(zone) {
5046 u64 tmp; 5051 u64 tmp;
5047 5052
5048 spin_lock_irqsave(&zone->lock, flags); 5053 spin_lock_irqsave(&zone->lock, flags);
5049 tmp = (u64)pages_min * zone->present_pages; 5054 tmp = (u64)pages_min * zone->present_pages;
5050 do_div(tmp, lowmem_pages); 5055 do_div(tmp, lowmem_pages);
5051 if (is_highmem(zone)) { 5056 if (is_highmem(zone)) {
5052 /* 5057 /*
5053 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5058 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5054 * need highmem pages, so cap pages_min to a small 5059 * need highmem pages, so cap pages_min to a small
5055 * value here. 5060 * value here.
5056 * 5061 *
5057 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5062 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5058 * deltas control asynchronous page reclaim, and so should 5063 * deltas control asynchronous page reclaim, and so should
5059 * not be capped for highmem. 5064 * not be capped for highmem.
5060 */ 5065 */
5061 int min_pages; 5066 int min_pages;
5062 5067
5063 min_pages = zone->present_pages / 1024; 5068 min_pages = zone->present_pages / 1024;
5064 if (min_pages < SWAP_CLUSTER_MAX) 5069 if (min_pages < SWAP_CLUSTER_MAX)
5065 min_pages = SWAP_CLUSTER_MAX; 5070 min_pages = SWAP_CLUSTER_MAX;
5066 if (min_pages > 128) 5071 if (min_pages > 128)
5067 min_pages = 128; 5072 min_pages = 128;
5068 zone->watermark[WMARK_MIN] = min_pages; 5073 zone->watermark[WMARK_MIN] = min_pages;
5069 } else { 5074 } else {
5070 /* 5075 /*
5071 * If it's a lowmem zone, reserve a number of pages 5076 * If it's a lowmem zone, reserve a number of pages
5072 * proportionate to the zone's size. 5077 * proportionate to the zone's size.
5073 */ 5078 */
5074 zone->watermark[WMARK_MIN] = tmp; 5079 zone->watermark[WMARK_MIN] = tmp;
5075 } 5080 }
5076 5081
5077 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5082 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5078 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5083 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5079 5084
5080 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); 5085 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
5081 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); 5086 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
5082 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); 5087 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
5083 5088
5084 setup_zone_migrate_reserve(zone); 5089 setup_zone_migrate_reserve(zone);
5085 spin_unlock_irqrestore(&zone->lock, flags); 5090 spin_unlock_irqrestore(&zone->lock, flags);
5086 } 5091 }
5087 5092
5088 /* update totalreserve_pages */ 5093 /* update totalreserve_pages */
5089 calculate_totalreserve_pages(); 5094 calculate_totalreserve_pages();
5090 } 5095 }
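A minimal user-space sketch of the lowmem branch of the calculation above (min_free_kbytes and the zone sizes are invented; the highmem capping and cma_wmark_pages() adjustments are ignored):

#include <stdio.h>

int main(void)
{
	unsigned long min_free_kbytes = 8192;	/* example sysctl value */
	unsigned long page_shift = 12;		/* 4KB pages */
	unsigned long pages_min = min_free_kbytes >> (page_shift - 10);
	unsigned long lowmem_pages = 1048576;	/* all !highmem pages, ~4GB */
	unsigned long zone_present = 786432;	/* this zone's pages, ~3GB */

	/* tmp = pages_min * zone->present_pages / lowmem_pages */
	unsigned long long tmp = (unsigned long long)pages_min * zone_present / lowmem_pages;

	printf("WMARK_MIN  = %llu pages\n", tmp);		/* 1536 */
	printf("WMARK_LOW  = %llu pages\n", tmp + (tmp >> 2));	/* min + min/4 = 1920 */
	printf("WMARK_HIGH = %llu pages\n", tmp + (tmp >> 1));	/* min + min/2 = 2304 */
	return 0;
}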
5091 5096
5092 /** 5097 /**
5093 * setup_per_zone_wmarks - called when min_free_kbytes changes 5098 * setup_per_zone_wmarks - called when min_free_kbytes changes
5094 * or when memory is hot-{added|removed} 5099 * or when memory is hot-{added|removed}
5095 * 5100 *
5096 * Ensures that the watermark[min,low,high] values for each zone are set 5101 * Ensures that the watermark[min,low,high] values for each zone are set
5097 * correctly with respect to min_free_kbytes. 5102 * correctly with respect to min_free_kbytes.
5098 */ 5103 */
5099 void setup_per_zone_wmarks(void) 5104 void setup_per_zone_wmarks(void)
5100 { 5105 {
5101 mutex_lock(&zonelists_mutex); 5106 mutex_lock(&zonelists_mutex);
5102 __setup_per_zone_wmarks(); 5107 __setup_per_zone_wmarks();
5103 mutex_unlock(&zonelists_mutex); 5108 mutex_unlock(&zonelists_mutex);
5104 } 5109 }
5105 5110
5106 /* 5111 /*
5107 * The inactive anon list should be small enough that the VM never has to 5112 * The inactive anon list should be small enough that the VM never has to
5108 * do too much work, but large enough that each inactive page has a chance 5113 * do too much work, but large enough that each inactive page has a chance
5109 * to be referenced again before it is swapped out. 5114 * to be referenced again before it is swapped out.
5110 * 5115 *
5111 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5116 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5112 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5117 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5113 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5118 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5114 * the anonymous pages are kept on the inactive list. 5119 * the anonymous pages are kept on the inactive list.
5115 * 5120 *
5116 * total target max 5121 * total target max
5117 * memory ratio inactive anon 5122 * memory ratio inactive anon
5118 * ------------------------------------- 5123 * -------------------------------------
5119 * 10MB 1 5MB 5124 * 10MB 1 5MB
5120 * 100MB 1 50MB 5125 * 100MB 1 50MB
5121 * 1GB 3 250MB 5126 * 1GB 3 250MB
5122 * 10GB 10 0.9GB 5127 * 10GB 10 0.9GB
5123 * 100GB 31 3GB 5128 * 100GB 31 3GB
5124 * 1TB 101 10GB 5129 * 1TB 101 10GB
5125 * 10TB 320 32GB 5130 * 10TB 320 32GB
5126 */ 5131 */
5127 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5132 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5128 { 5133 {
5129 unsigned int gb, ratio; 5134 unsigned int gb, ratio;
5130 5135
5131 /* Zone size in gigabytes */ 5136 /* Zone size in gigabytes */
5132 gb = zone->present_pages >> (30 - PAGE_SHIFT); 5137 gb = zone->present_pages >> (30 - PAGE_SHIFT);
5133 if (gb) 5138 if (gb)
5134 ratio = int_sqrt(10 * gb); 5139 ratio = int_sqrt(10 * gb);
5135 else 5140 else
5136 ratio = 1; 5141 ratio = 1;
5137 5142
5138 zone->inactive_ratio = ratio; 5143 zone->inactive_ratio = ratio;
5139 } 5144 }
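For instance, a 16GB zone gives gb = 16 and ratio = int_sqrt(160) = 12, so roughly one anonymous page in thirteen is kept on the inactive list; the 1TB row of the table above works the same way, int_sqrt(10 * 1024) = 101.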
5140 5145
5141 static void __meminit setup_per_zone_inactive_ratio(void) 5146 static void __meminit setup_per_zone_inactive_ratio(void)
5142 { 5147 {
5143 struct zone *zone; 5148 struct zone *zone;
5144 5149
5145 for_each_zone(zone) 5150 for_each_zone(zone)
5146 calculate_zone_inactive_ratio(zone); 5151 calculate_zone_inactive_ratio(zone);
5147 } 5152 }
5148 5153
5149 /* 5154 /*
5150 * Initialise min_free_kbytes. 5155 * Initialise min_free_kbytes.
5151 * 5156 *
5152 * For small machines we want it small (128k min). For large machines 5157 * For small machines we want it small (128k min). For large machines
5153 * we want it large (64MB max). But it is not linear, because network 5158 * we want it large (64MB max). But it is not linear, because network
5154 * bandwidth does not increase linearly with machine size. We use 5159 * bandwidth does not increase linearly with machine size. We use
5155 * 5160 *
5156 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5161 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5157 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5162 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5158 * 5163 *
5159 * which yields 5164 * which yields
5160 * 5165 *
5161 * 16MB: 512k 5166 * 16MB: 512k
5162 * 32MB: 724k 5167 * 32MB: 724k
5163 * 64MB: 1024k 5168 * 64MB: 1024k
5164 * 128MB: 1448k 5169 * 128MB: 1448k
5165 * 256MB: 2048k 5170 * 256MB: 2048k
5166 * 512MB: 2896k 5171 * 512MB: 2896k
5167 * 1024MB: 4096k 5172 * 1024MB: 4096k
5168 * 2048MB: 5792k 5173 * 2048MB: 5792k
5169 * 4096MB: 8192k 5174 * 4096MB: 8192k
5170 * 8192MB: 11584k 5175 * 8192MB: 11584k
5171 * 16384MB: 16384k 5176 * 16384MB: 16384k
5172 */ 5177 */
5173 int __meminit init_per_zone_wmark_min(void) 5178 int __meminit init_per_zone_wmark_min(void)
5174 { 5179 {
5175 unsigned long lowmem_kbytes; 5180 unsigned long lowmem_kbytes;
5176 5181
5177 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5182 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5178 5183
5179 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5184 min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5180 if (min_free_kbytes < 128) 5185 if (min_free_kbytes < 128)
5181 min_free_kbytes = 128; 5186 min_free_kbytes = 128;
5182 if (min_free_kbytes > 65536) 5187 if (min_free_kbytes > 65536)
5183 min_free_kbytes = 65536; 5188 min_free_kbytes = 65536;
5184 setup_per_zone_wmarks(); 5189 setup_per_zone_wmarks();
5185 refresh_zone_stat_thresholds(); 5190 refresh_zone_stat_thresholds();
5186 setup_per_zone_lowmem_reserve(); 5191 setup_per_zone_lowmem_reserve();
5187 setup_per_zone_inactive_ratio(); 5192 setup_per_zone_inactive_ratio();
5188 return 0; 5193 return 0;
5189 } 5194 }
5190 module_init(init_per_zone_wmark_min) 5195 module_init(init_per_zone_wmark_min)
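Plugging in one of the rows above: with 4GB of lowmem, lowmem_kbytes = 4 * 1024 * 1024 = 4194304 and int_sqrt(4194304 * 16) = int_sqrt(67108864) = 8192, i.e. min_free_kbytes = 8192k, matching the 4096MB row. The result is then clamped to the [128, 65536] range before the watermarks, per-cpu stat thresholds, lowmem reserves and inactive ratios are recomputed.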
5191 5196
5192 /* 5197 /*
5193 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5198 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5194 * that we can call two helper functions whenever min_free_kbytes 5199 * that we can call two helper functions whenever min_free_kbytes
5195 * changes. 5200 * changes.
5196 */ 5201 */
5197 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5202 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5198 void __user *buffer, size_t *length, loff_t *ppos) 5203 void __user *buffer, size_t *length, loff_t *ppos)
5199 { 5204 {
5200 proc_dointvec(table, write, buffer, length, ppos); 5205 proc_dointvec(table, write, buffer, length, ppos);
5201 if (write) 5206 if (write)
5202 setup_per_zone_wmarks(); 5207 setup_per_zone_wmarks();
5203 return 0; 5208 return 0;
5204 } 5209 }
5205 5210
5206 #ifdef CONFIG_NUMA 5211 #ifdef CONFIG_NUMA
5207 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5212 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5208 void __user *buffer, size_t *length, loff_t *ppos) 5213 void __user *buffer, size_t *length, loff_t *ppos)
5209 { 5214 {
5210 struct zone *zone; 5215 struct zone *zone;
5211 int rc; 5216 int rc;
5212 5217
5213 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5218 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5214 if (rc) 5219 if (rc)
5215 return rc; 5220 return rc;
5216 5221
5217 for_each_zone(zone) 5222 for_each_zone(zone)
5218 zone->min_unmapped_pages = (zone->present_pages * 5223 zone->min_unmapped_pages = (zone->present_pages *
5219 sysctl_min_unmapped_ratio) / 100; 5224 sysctl_min_unmapped_ratio) / 100;
5220 return 0; 5225 return 0;
5221 } 5226 }
5222 5227
5223 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5228 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5224 void __user *buffer, size_t *length, loff_t *ppos) 5229 void __user *buffer, size_t *length, loff_t *ppos)
5225 { 5230 {
5226 struct zone *zone; 5231 struct zone *zone;
5227 int rc; 5232 int rc;
5228 5233
5229 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5234 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5230 if (rc) 5235 if (rc)
5231 return rc; 5236 return rc;
5232 5237
5233 for_each_zone(zone) 5238 for_each_zone(zone)
5234 zone->min_slab_pages = (zone->present_pages * 5239 zone->min_slab_pages = (zone->present_pages *
5235 sysctl_min_slab_ratio) / 100; 5240 sysctl_min_slab_ratio) / 100;
5236 return 0; 5241 return 0;
5237 } 5242 }
5238 #endif 5243 #endif
5239 5244
5240 /* 5245 /*
5241 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5246 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5242 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5247 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5243 * whenever sysctl_lowmem_reserve_ratio changes. 5248 * whenever sysctl_lowmem_reserve_ratio changes.
5244 * 5249 *
5245 * The reserve ratio obviously has absolutely no relation with the 5250 * The reserve ratio obviously has absolutely no relation with the
5246 * minimum watermarks. The lowmem reserve ratio can only make sense 5251 * minimum watermarks. The lowmem reserve ratio can only make sense
5247 * as a function of the boot-time zone sizes. 5252 * as a function of the boot-time zone sizes.
5248 */ 5253 */
5249 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5254 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5250 void __user *buffer, size_t *length, loff_t *ppos) 5255 void __user *buffer, size_t *length, loff_t *ppos)
5251 { 5256 {
5252 proc_dointvec_minmax(table, write, buffer, length, ppos); 5257 proc_dointvec_minmax(table, write, buffer, length, ppos);
5253 setup_per_zone_lowmem_reserve(); 5258 setup_per_zone_lowmem_reserve();
5254 return 0; 5259 return 0;
5255 } 5260 }
5256 5261
5257 /* 5262 /*
5258 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5263 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5259 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist 5264 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
5260 * can have before it gets flushed back to the buddy allocator. 5265 * can have before it gets flushed back to the buddy allocator.
5261 */ 5266 */
5262 5267
5263 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5268 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5264 void __user *buffer, size_t *length, loff_t *ppos) 5269 void __user *buffer, size_t *length, loff_t *ppos)
5265 { 5270 {
5266 struct zone *zone; 5271 struct zone *zone;
5267 unsigned int cpu; 5272 unsigned int cpu;
5268 int ret; 5273 int ret;
5269 5274
5270 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5275 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5271 if (!write || (ret < 0)) 5276 if (!write || (ret < 0))
5272 return ret; 5277 return ret;
5273 for_each_populated_zone(zone) { 5278 for_each_populated_zone(zone) {
5274 for_each_possible_cpu(cpu) { 5279 for_each_possible_cpu(cpu) {
5275 unsigned long high; 5280 unsigned long high;
5276 high = zone->present_pages / percpu_pagelist_fraction; 5281 high = zone->present_pages / percpu_pagelist_fraction;
5277 setup_pagelist_highmark( 5282 setup_pagelist_highmark(
5278 per_cpu_ptr(zone->pageset, cpu), high); 5283 per_cpu_ptr(zone->pageset, cpu), high);
5279 } 5284 }
5280 } 5285 }
5281 return 0; 5286 return 0;
5282 } 5287 }
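As a worked example (the fraction and zone size are invented): writing 100 to /proc/sys/vm/percpu_pagelist_fraction on a zone with 262144 present pages (1GB with 4KB pages) makes this handler call setup_pagelist_highmark() with high = 262144 / 100 = 2621 pages, roughly 10MB, for every possible CPU's hot pagelist in that zone.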
5283 5288
5284 int hashdist = HASHDIST_DEFAULT; 5289 int hashdist = HASHDIST_DEFAULT;
5285 5290
5286 #ifdef CONFIG_NUMA 5291 #ifdef CONFIG_NUMA
5287 static int __init set_hashdist(char *str) 5292 static int __init set_hashdist(char *str)
5288 { 5293 {
5289 if (!str) 5294 if (!str)
5290 return 0; 5295 return 0;
5291 hashdist = simple_strtoul(str, &str, 0); 5296 hashdist = simple_strtoul(str, &str, 0);
5292 return 1; 5297 return 1;
5293 } 5298 }
5294 __setup("hashdist=", set_hashdist); 5299 __setup("hashdist=", set_hashdist);
5295 #endif 5300 #endif
5296 5301
5297 /* 5302 /*
5298 * allocate a large system hash table from bootmem 5303 * allocate a large system hash table from bootmem
5299 * - it is assumed that the hash table must contain an exact power-of-2 5304 * - it is assumed that the hash table must contain an exact power-of-2
5300 * quantity of entries 5305 * quantity of entries
5301 * - limit is the number of hash buckets, not the total allocation size 5306 * - limit is the number of hash buckets, not the total allocation size
5302 */ 5307 */
5303 void *__init alloc_large_system_hash(const char *tablename, 5308 void *__init alloc_large_system_hash(const char *tablename,
5304 unsigned long bucketsize, 5309 unsigned long bucketsize,
5305 unsigned long numentries, 5310 unsigned long numentries,
5306 int scale, 5311 int scale,
5307 int flags, 5312 int flags,
5308 unsigned int *_hash_shift, 5313 unsigned int *_hash_shift,
5309 unsigned int *_hash_mask, 5314 unsigned int *_hash_mask,
5310 unsigned long low_limit, 5315 unsigned long low_limit,
5311 unsigned long high_limit) 5316 unsigned long high_limit)
5312 { 5317 {
5313 unsigned long long max = high_limit; 5318 unsigned long long max = high_limit;
5314 unsigned long log2qty, size; 5319 unsigned long log2qty, size;
5315 void *table = NULL; 5320 void *table = NULL;
5316 5321
5317 /* allow the kernel cmdline to have a say */ 5322 /* allow the kernel cmdline to have a say */
5318 if (!numentries) { 5323 if (!numentries) {
5319 /* round applicable memory size up to nearest megabyte */ 5324 /* round applicable memory size up to nearest megabyte */
5320 numentries = nr_kernel_pages; 5325 numentries = nr_kernel_pages;
5321 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 5326 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
5322 numentries >>= 20 - PAGE_SHIFT; 5327 numentries >>= 20 - PAGE_SHIFT;
5323 numentries <<= 20 - PAGE_SHIFT; 5328 numentries <<= 20 - PAGE_SHIFT;
5324 5329
5325 /* limit to 1 bucket per 2^scale bytes of low memory */ 5330 /* limit to 1 bucket per 2^scale bytes of low memory */
5326 if (scale > PAGE_SHIFT) 5331 if (scale > PAGE_SHIFT)
5327 numentries >>= (scale - PAGE_SHIFT); 5332 numentries >>= (scale - PAGE_SHIFT);
5328 else 5333 else
5329 numentries <<= (PAGE_SHIFT - scale); 5334 numentries <<= (PAGE_SHIFT - scale);
5330 5335
5331 /* Make sure we've got at least a 0-order allocation.. */ 5336 /* Make sure we've got at least a 0-order allocation.. */
5332 if (unlikely(flags & HASH_SMALL)) { 5337 if (unlikely(flags & HASH_SMALL)) {
5333 /* Makes no sense without HASH_EARLY */ 5338 /* Makes no sense without HASH_EARLY */
5334 WARN_ON(!(flags & HASH_EARLY)); 5339 WARN_ON(!(flags & HASH_EARLY));
5335 if (!(numentries >> *_hash_shift)) { 5340 if (!(numentries >> *_hash_shift)) {
5336 numentries = 1UL << *_hash_shift; 5341 numentries = 1UL << *_hash_shift;
5337 BUG_ON(!numentries); 5342 BUG_ON(!numentries);
5338 } 5343 }
5339 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5344 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5340 numentries = PAGE_SIZE / bucketsize; 5345 numentries = PAGE_SIZE / bucketsize;
5341 } 5346 }
5342 numentries = roundup_pow_of_two(numentries); 5347 numentries = roundup_pow_of_two(numentries);
5343 5348
5344 /* limit allocation size to 1/16 total memory by default */ 5349 /* limit allocation size to 1/16 total memory by default */
5345 if (max == 0) { 5350 if (max == 0) {
5346 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5351 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5347 do_div(max, bucketsize); 5352 do_div(max, bucketsize);
5348 } 5353 }
5349 max = min(max, 0x80000000ULL); 5354 max = min(max, 0x80000000ULL);
5350 5355
5351 if (numentries < low_limit) 5356 if (numentries < low_limit)
5352 numentries = low_limit; 5357 numentries = low_limit;
5353 if (numentries > max) 5358 if (numentries > max)
5354 numentries = max; 5359 numentries = max;
5355 5360
5356 log2qty = ilog2(numentries); 5361 log2qty = ilog2(numentries);
5357 5362
5358 do { 5363 do {
5359 size = bucketsize << log2qty; 5364 size = bucketsize << log2qty;
5360 if (flags & HASH_EARLY) 5365 if (flags & HASH_EARLY)
5361 table = alloc_bootmem_nopanic(size); 5366 table = alloc_bootmem_nopanic(size);
5362 else if (hashdist) 5367 else if (hashdist)
5363 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5368 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5364 else { 5369 else {
5365 /* 5370 /*
5366 * If bucketsize is not a power-of-two, we may free 5371 * If bucketsize is not a power-of-two, we may free
5367 * some pages at the end of hash table which 5372 * some pages at the end of hash table which
5368 * alloc_pages_exact() automatically does 5373 * alloc_pages_exact() automatically does
5369 */ 5374 */
5370 if (get_order(size) < MAX_ORDER) { 5375 if (get_order(size) < MAX_ORDER) {
5371 table = alloc_pages_exact(size, GFP_ATOMIC); 5376 table = alloc_pages_exact(size, GFP_ATOMIC);
5372 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5377 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5373 } 5378 }
5374 } 5379 }
5375 } while (!table && size > PAGE_SIZE && --log2qty); 5380 } while (!table && size > PAGE_SIZE && --log2qty);
5376 5381
5377 if (!table) 5382 if (!table)
5378 panic("Failed to allocate %s hash table\n", tablename); 5383 panic("Failed to allocate %s hash table\n", tablename);
5379 5384
5380 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5385 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5381 tablename, 5386 tablename,
5382 (1UL << log2qty), 5387 (1UL << log2qty),
5383 ilog2(size) - PAGE_SHIFT, 5388 ilog2(size) - PAGE_SHIFT,
5384 size); 5389 size);
5385 5390
5386 if (_hash_shift) 5391 if (_hash_shift)
5387 *_hash_shift = log2qty; 5392 *_hash_shift = log2qty;
5388 if (_hash_mask) 5393 if (_hash_mask)
5389 *_hash_mask = (1 << log2qty) - 1; 5394 *_hash_mask = (1 << log2qty) - 1;
5390 5395
5391 return table; 5396 return table;
5392 } 5397 }
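To see how the sizing above plays out, here is a standalone sketch of the arithmetic for a hypothetical table (every input value is invented, and this is a simplification rather than the kernel function itself; the HASH_SMALL case, the low_limit/max clamping and the retry-on-failure loop are skipped):

#include <stdio.h>

/* local helper for the sketch: round up to the next power of two */
static unsigned long round_up_pow2(unsigned long x)
{
	unsigned long r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned long page_shift = 12;			/* 4KB pages */
	unsigned long nr_kernel_pages = 1UL << 20;	/* ~4GB of lowmem, invented */
	unsigned long bucketsize = 64;			/* bytes per bucket, invented */
	int scale = 14;					/* 1 bucket per 16KB, invented */

	/* round applicable memory size up to the nearest megabyte */
	unsigned long numentries = nr_kernel_pages;
	numentries += (1UL << (20 - page_shift)) - 1;
	numentries >>= 20 - page_shift;
	numentries <<= 20 - page_shift;

	/* limit to 1 bucket per 2^scale bytes of low memory */
	if (scale > (int)page_shift)
		numentries >>= (scale - page_shift);
	else
		numentries <<= (page_shift - scale);

	numentries = round_up_pow2(numentries);

	unsigned long log2qty = 0;
	while ((1UL << (log2qty + 1)) <= numentries)
		log2qty++;

	printf("entries: %lu (2^%lu), table size: %lu bytes\n",
	       numentries, log2qty, bucketsize << log2qty);	/* 262144, 2^18, 16MB */
	return 0;
}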
5393 5398
5394 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5399 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5395 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5400 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5396 unsigned long pfn) 5401 unsigned long pfn)
5397 { 5402 {
5398 #ifdef CONFIG_SPARSEMEM 5403 #ifdef CONFIG_SPARSEMEM
5399 return __pfn_to_section(pfn)->pageblock_flags; 5404 return __pfn_to_section(pfn)->pageblock_flags;
5400 #else 5405 #else
5401 return zone->pageblock_flags; 5406 return zone->pageblock_flags;
5402 #endif /* CONFIG_SPARSEMEM */ 5407 #endif /* CONFIG_SPARSEMEM */
5403 } 5408 }
5404 5409
5405 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 5410 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5406 { 5411 {
5407 #ifdef CONFIG_SPARSEMEM 5412 #ifdef CONFIG_SPARSEMEM
5408 pfn &= (PAGES_PER_SECTION-1); 5413 pfn &= (PAGES_PER_SECTION-1);
5409 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5414 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5410 #else 5415 #else
5411 pfn = pfn - zone->zone_start_pfn; 5416 pfn = pfn - zone->zone_start_pfn;
5412 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5417 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5413 #endif /* CONFIG_SPARSEMEM */ 5418 #endif /* CONFIG_SPARSEMEM */
5414 } 5419 }
5415 5420
5416 /** 5421 /**
5417 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages 5422 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
5418 * @page: The page within the block of interest 5423 * @page: The page within the block of interest
5419 * @start_bitidx: The first bit of interest to retrieve 5424 * @start_bitidx: The first bit of interest to retrieve
5420 * @end_bitidx: The last bit of interest 5425 * @end_bitidx: The last bit of interest
5421 * returns pageblock_bits flags 5426 * returns pageblock_bits flags
5422 */ 5427 */
5423 unsigned long get_pageblock_flags_group(struct page *page, 5428 unsigned long get_pageblock_flags_group(struct page *page,
5424 int start_bitidx, int end_bitidx) 5429 int start_bitidx, int end_bitidx)
5425 { 5430 {
5426 struct zone *zone; 5431 struct zone *zone;
5427 unsigned long *bitmap; 5432 unsigned long *bitmap;
5428 unsigned long pfn, bitidx; 5433 unsigned long pfn, bitidx;
5429 unsigned long flags = 0; 5434 unsigned long flags = 0;
5430 unsigned long value = 1; 5435 unsigned long value = 1;
5431 5436
5432 zone = page_zone(page); 5437 zone = page_zone(page);
5433 pfn = page_to_pfn(page); 5438 pfn = page_to_pfn(page);
5434 bitmap = get_pageblock_bitmap(zone, pfn); 5439 bitmap = get_pageblock_bitmap(zone, pfn);
5435 bitidx = pfn_to_bitidx(zone, pfn); 5440 bitidx = pfn_to_bitidx(zone, pfn);
5436 5441
5437 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5442 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5438 if (test_bit(bitidx + start_bitidx, bitmap)) 5443 if (test_bit(bitidx + start_bitidx, bitmap))
5439 flags |= value; 5444 flags |= value;
5440 5445
5441 return flags; 5446 return flags;
5442 } 5447 }
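To make the indexing concrete (the pageblock_order is illustrative; order 9 corresponds to 2MB pageblocks with 4KB pages): under CONFIG_SPARSEMEM the pfn is first masked down to its offset within its memory section, and pfn_to_bitidx() returns (pfn >> 9) * NR_PAGEBLOCK_BITS, the position of that pageblock's group of flag bits inside the section's pageblock_flags bitmap. get_pageblock_flags_group() then reads the requested bits of that group one by one, and set_pageblock_flags_group() below writes them with the non-atomic __set_bit()/__clear_bit() helpers.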
5443 5448
5444 /** 5449 /**
5445 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages 5450 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
5446 * @page: The page within the block of interest 5451 * @page: The page within the block of interest
5447 * @start_bitidx: The first bit of interest 5452 * @start_bitidx: The first bit of interest
5448 * @end_bitidx: The last bit of interest 5453 * @end_bitidx: The last bit of interest
5449 * @flags: The flags to set 5454 * @flags: The flags to set
5450 */ 5455 */
5451 void set_pageblock_flags_group(struct page *page, unsigned long flags, 5456 void set_pageblock_flags_group(struct page *page, unsigned long flags,
5452 int start_bitidx, int end_bitidx) 5457 int start_bitidx, int end_bitidx)
5453 { 5458 {
5454 struct zone *zone; 5459 struct zone *zone;
5455 unsigned long *bitmap; 5460 unsigned long *bitmap;
5456 unsigned long pfn, bitidx; 5461 unsigned long pfn, bitidx;
5457 unsigned long value = 1; 5462 unsigned long value = 1;
5458 5463
5459 zone = page_zone(page); 5464 zone = page_zone(page);
5460 pfn = page_to_pfn(page); 5465 pfn = page_to_pfn(page);
5461 bitmap = get_pageblock_bitmap(zone, pfn); 5466 bitmap = get_pageblock_bitmap(zone, pfn);
5462 bitidx = pfn_to_bitidx(zone, pfn); 5467 bitidx = pfn_to_bitidx(zone, pfn);
5463 VM_BUG_ON(pfn < zone->zone_start_pfn); 5468 VM_BUG_ON(pfn < zone->zone_start_pfn);
5464 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); 5469 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
5465 5470
5466 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5471 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5467 if (flags & value) 5472 if (flags & value)
5468 __set_bit(bitidx + start_bitidx, bitmap); 5473 __set_bit(bitidx + start_bitidx, bitmap);
5469 else 5474 else
5470 __clear_bit(bitidx + start_bitidx, bitmap); 5475 __clear_bit(bitidx + start_bitidx, bitmap);
5471 } 5476 }
5472 5477
5473 /* 5478 /*
5474 * This function checks whether pageblock includes unmovable pages or not. 5479 * This function checks whether pageblock includes unmovable pages or not.
5475 * If @count is not zero, it is okay to include up to @count unmovable pages 5480 * If @count is not zero, it is okay to include up to @count unmovable pages
5476 * 5481 *
5477 * A PageLRU check without isolation or the lru_lock could race, so a 5482 * A PageLRU check without isolation or the lru_lock could race, so a
5478 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 5483 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5479 * expect this function to be exact. 5484 * expect this function to be exact.
5480 */ 5485 */
5481 static bool 5486 static bool
5482 __has_unmovable_pages(struct zone *zone, struct page *page, int count) 5487 __has_unmovable_pages(struct zone *zone, struct page *page, int count)
5483 { 5488 {
5484 unsigned long pfn, iter, found; 5489 unsigned long pfn, iter, found;
5485 int mt; 5490 int mt;
5486 5491
5487 /* 5492 /*
5488 * To avoid noisy data, lru_add_drain_all() should be called. 5493 * To avoid noisy data, lru_add_drain_all() should be called.
5489 * If the zone is ZONE_MOVABLE, it never contains unmovable pages. 5494 * If the zone is ZONE_MOVABLE, it never contains unmovable pages.
5490 */ 5495 */
5491 if (zone_idx(zone) == ZONE_MOVABLE) 5496 if (zone_idx(zone) == ZONE_MOVABLE)
5492 return false; 5497 return false;
5493 mt = get_pageblock_migratetype(page); 5498 mt = get_pageblock_migratetype(page);
5494 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 5499 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5495 return false; 5500 return false;
5496 5501
5497 pfn = page_to_pfn(page); 5502 pfn = page_to_pfn(page);
5498 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5503 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5499 unsigned long check = pfn + iter; 5504 unsigned long check = pfn + iter;
5500 5505
5501 if (!pfn_valid_within(check)) 5506 if (!pfn_valid_within(check))
5502 continue; 5507 continue;
5503 5508
5504 page = pfn_to_page(check); 5509 page = pfn_to_page(check);
5505 /* 5510 /*
5506 * We can't use page_count without pinning the page 5511 * We can't use page_count without pinning the page
5507 * because another CPU can free the compound page. 5512 * because another CPU can free the compound page.
5508 * This check already skips compound tails of THP 5513 * This check already skips compound tails of THP
5509 * because their page->_count is zero at all times. 5514 * because their page->_count is zero at all times.
5510 */ 5515 */
5511 if (!atomic_read(&page->_count)) { 5516 if (!atomic_read(&page->_count)) {
5512 if (PageBuddy(page)) 5517 if (PageBuddy(page))
5513 iter += (1 << page_order(page)) - 1; 5518 iter += (1 << page_order(page)) - 1;
5514 continue; 5519 continue;
5515 } 5520 }
5516 5521
5517 if (!PageLRU(page)) 5522 if (!PageLRU(page))
5518 found++; 5523 found++;
5519 /* 5524 /*
5520 * If there are RECLAIMABLE pages, we need to check them too. 5525 * If there are RECLAIMABLE pages, we need to check them too.
5521 * But for now, memory offline itself doesn't call shrink_slab() 5526 * But for now, memory offline itself doesn't call shrink_slab()
5522 * and this still needs to be fixed. 5527 * and this still needs to be fixed.
5523 */ 5528 */
5524 /* 5529 /*
5525 * If the page is not RAM, page_count() should be 0, so 5530 * If the page is not RAM, page_count() should be 0, so
5526 * we don't need any further checks. This is a _used_ non-movable page. 5531 * we don't need any further checks. This is a _used_ non-movable page.
5527 * 5532 *
5528 * The problematic case here is PG_reserved pages. PG_reserved 5533 * The problematic case here is PG_reserved pages. PG_reserved
5529 * is set on both memory hole pages and _used_ kernel 5534 * is set on both memory hole pages and _used_ kernel
5530 * pages at boot. 5535 * pages at boot.
5531 */ 5536 */
5532 if (found > count) 5537 if (found > count)
5533 return true; 5538 return true;
5534 } 5539 }
5535 return false; 5540 return false;
5536 } 5541 }
5537 5542
5538 bool is_pageblock_removable_nolock(struct page *page) 5543 bool is_pageblock_removable_nolock(struct page *page)
5539 { 5544 {
5540 struct zone *zone; 5545 struct zone *zone;
5541 unsigned long pfn; 5546 unsigned long pfn;
5542 5547
5543 /* 5548 /*
5544 * We have to be careful here because we are iterating over memory 5549 * We have to be careful here because we are iterating over memory
5545 * sections which are not zone aware so we might end up outside of 5550 * sections which are not zone aware so we might end up outside of
5546 * the zone but still within the section. 5551 * the zone but still within the section.
5547 * We have to take care about the node as well. If the node is offline 5552 * We have to take care about the node as well. If the node is offline
5548 * its NODE_DATA will be NULL - see page_zone. 5553 * its NODE_DATA will be NULL - see page_zone.
5549 */ 5554 */
5550 if (!node_online(page_to_nid(page))) 5555 if (!node_online(page_to_nid(page)))
5551 return false; 5556 return false;
5552 5557
5553 zone = page_zone(page); 5558 zone = page_zone(page);
5554 pfn = page_to_pfn(page); 5559 pfn = page_to_pfn(page);
5555 if (zone->zone_start_pfn > pfn || 5560 if (zone->zone_start_pfn > pfn ||
5556 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5561 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5557 return false; 5562 return false;
5558 5563
5559 return !__has_unmovable_pages(zone, page, 0); 5564 return !__has_unmovable_pages(zone, page, 0);
5560 } 5565 }
5561 5566
5562 int set_migratetype_isolate(struct page *page) 5567 int set_migratetype_isolate(struct page *page)
5563 { 5568 {
5564 struct zone *zone; 5569 struct zone *zone;
5565 unsigned long flags, pfn; 5570 unsigned long flags, pfn;
5566 struct memory_isolate_notify arg; 5571 struct memory_isolate_notify arg;
5567 int notifier_ret; 5572 int notifier_ret;
5568 int ret = -EBUSY; 5573 int ret = -EBUSY;
5569 5574
5570 zone = page_zone(page); 5575 zone = page_zone(page);
5571 5576
5572 spin_lock_irqsave(&zone->lock, flags); 5577 spin_lock_irqsave(&zone->lock, flags);
5573 5578
5574 pfn = page_to_pfn(page); 5579 pfn = page_to_pfn(page);
5575 arg.start_pfn = pfn; 5580 arg.start_pfn = pfn;
5576 arg.nr_pages = pageblock_nr_pages; 5581 arg.nr_pages = pageblock_nr_pages;
5577 arg.pages_found = 0; 5582 arg.pages_found = 0;
5578 5583
5579 /* 5584 /*
5580 * It may be possible to isolate a pageblock even if the 5585 * It may be possible to isolate a pageblock even if the
5581 * migratetype is not MIGRATE_MOVABLE. The memory isolation 5586 * migratetype is not MIGRATE_MOVABLE. The memory isolation
5582 * notifier chain is used by balloon drivers to return the 5587 * notifier chain is used by balloon drivers to return the
5583 * number of pages in a range that are held by the balloon 5588 * number of pages in a range that are held by the balloon
5584 * driver to shrink memory. If all the pages are accounted for 5589 * driver to shrink memory. If all the pages are accounted for
5585 * by balloons, are free, or on the LRU, isolation can continue. 5590 * by balloons, are free, or on the LRU, isolation can continue.
5586 * Later, for example, when memory hotplug notifier runs, these 5591 * Later, for example, when memory hotplug notifier runs, these
5587 * pages reported as "can be isolated" should be isolated (freed) 5592 * pages reported as "can be isolated" should be isolated (freed)
5588 * by the balloon driver through the memory notifier chain. 5593 * by the balloon driver through the memory notifier chain.
5589 */ 5594 */
5590 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); 5595 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5591 notifier_ret = notifier_to_errno(notifier_ret); 5596 notifier_ret = notifier_to_errno(notifier_ret);
5592 if (notifier_ret) 5597 if (notifier_ret)
5593 goto out; 5598 goto out;
5594 /* 5599 /*
5595 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. 5600 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
5596 * We just check MOVABLE pages. 5601 * We just check MOVABLE pages.
5597 */ 5602 */
5598 if (!__has_unmovable_pages(zone, page, arg.pages_found)) 5603 if (!__has_unmovable_pages(zone, page, arg.pages_found))
5599 ret = 0; 5604 ret = 0;
5600 /* 5605 /*
5601 * Unmovable means "not-on-LRU" pages. If there are more unmovable 5606 * Unmovable means "not-on-LRU" pages. If there are more unmovable
5602 * pages than removable-by-driver pages reported by the notifier, 5607 * pages than removable-by-driver pages reported by the notifier,
5603 * we fail. 5608 * we fail.
5604 */ 5609 */
5605 5610
5606 out: 5611 out:
5607 if (!ret) { 5612 if (!ret) {
5608 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5613 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
5609 move_freepages_block(zone, page, MIGRATE_ISOLATE); 5614 move_freepages_block(zone, page, MIGRATE_ISOLATE);
5610 } 5615 }
5611 5616
5612 spin_unlock_irqrestore(&zone->lock, flags); 5617 spin_unlock_irqrestore(&zone->lock, flags);
5613 if (!ret) 5618 if (!ret)
5614 drain_all_pages(); 5619 drain_all_pages();
5615 return ret; 5620 return ret;
5616 } 5621 }
5617 5622
5618 void unset_migratetype_isolate(struct page *page, unsigned migratetype) 5623 void unset_migratetype_isolate(struct page *page, unsigned migratetype)
5619 { 5624 {
5620 struct zone *zone; 5625 struct zone *zone;
5621 unsigned long flags; 5626 unsigned long flags;
5622 zone = page_zone(page); 5627 zone = page_zone(page);
5623 spin_lock_irqsave(&zone->lock, flags); 5628 spin_lock_irqsave(&zone->lock, flags);
5624 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 5629 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
5625 goto out; 5630 goto out;
5626 set_pageblock_migratetype(page, migratetype); 5631 set_pageblock_migratetype(page, migratetype);
5627 move_freepages_block(zone, page, migratetype); 5632 move_freepages_block(zone, page, migratetype);
5628 out: 5633 out:
5629 spin_unlock_irqrestore(&zone->lock, flags); 5634 spin_unlock_irqrestore(&zone->lock, flags);
5630 } 5635 }
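Editor's note: set_migratetype_isolate() and unset_migratetype_isolate() are meant to be used as a pair around whatever work needs a pageblock kept away from the allocator. The sketch below is a minimal, hypothetical caller, not code from this commit; the function name and the MIGRATE_MOVABLE restore type are assumptions for illustration.

/* Hypothetical sketch: isolate one pageblock, do some work, restore it. */
static int with_isolated_pageblock(struct page *page)
{
        int ret;

        /* Fails with -EBUSY if the block holds too many unmovable pages. */
        ret = set_migratetype_isolate(page);
        if (ret)
                return ret;

        /* ... migrate, offline or otherwise drain the pageblock here ... */

        /* Put the block back; MIGRATE_MOVABLE is an assumed original type. */
        unset_migratetype_isolate(page, MIGRATE_MOVABLE);
        return 0;
}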
5631 5636
5632 #ifdef CONFIG_CMA 5637 #ifdef CONFIG_CMA
5633 5638
5634 static unsigned long pfn_max_align_down(unsigned long pfn) 5639 static unsigned long pfn_max_align_down(unsigned long pfn)
5635 { 5640 {
5636 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 5641 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
5637 pageblock_nr_pages) - 1); 5642 pageblock_nr_pages) - 1);
5638 } 5643 }
5639 5644
5640 static unsigned long pfn_max_align_up(unsigned long pfn) 5645 static unsigned long pfn_max_align_up(unsigned long pfn)
5641 { 5646 {
5642 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 5647 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
5643 pageblock_nr_pages)); 5648 pageblock_nr_pages));
5644 } 5649 }
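Editor's note: the two helpers above simply round a PFN down or up to the larger of MAX_ORDER_NR_PAGES and pageblock_nr_pages. The self-contained userspace sketch below re-expresses that arithmetic on plain integers; the example step sizes (1024 and 512 pages) are assumptions for illustration and depend on the kernel configuration in practice.

#include <stdio.h>

#define EXAMPLE_MAX_ORDER_NR_PAGES      1024UL  /* assumed, e.g. MAX_ORDER 11 */
#define EXAMPLE_PAGEBLOCK_NR_PAGES       512UL  /* assumed pageblock size */

static unsigned long step(void)
{
        return EXAMPLE_MAX_ORDER_NR_PAGES > EXAMPLE_PAGEBLOCK_NR_PAGES ?
               EXAMPLE_MAX_ORDER_NR_PAGES : EXAMPLE_PAGEBLOCK_NR_PAGES;
}

/* Mirrors pfn_max_align_down(): clear the low bits covered by the step mask. */
static unsigned long max_align_down(unsigned long pfn)
{
        return pfn & ~(step() - 1);
}

/* Mirrors pfn_max_align_up() / ALIGN(): round up to the next step boundary. */
static unsigned long max_align_up(unsigned long pfn)
{
        return (pfn + step() - 1) & ~(step() - 1);
}

int main(void)
{
        /* With a 1024-page step, PFN 3000 rounds down to 2048 and up to 3072. */
        printf("%lu %lu\n", max_align_down(3000), max_align_up(3000));
        return 0;
}

This aligned-down/aligned-up pair is exactly the range that start_isolate_page_range() and undo_isolate_page_range() are handed in alloc_contig_range() below, even when the requested [start, end) is unaligned.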
5645 5650
5646 static struct page * 5651 static struct page *
5647 __alloc_contig_migrate_alloc(struct page *page, unsigned long private, 5652 __alloc_contig_migrate_alloc(struct page *page, unsigned long private,
5648 int **resultp) 5653 int **resultp)
5649 { 5654 {
5650 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; 5655 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
5651 5656
5652 if (PageHighMem(page)) 5657 if (PageHighMem(page))
5653 gfp_mask |= __GFP_HIGHMEM; 5658 gfp_mask |= __GFP_HIGHMEM;
5654 5659
5655 return alloc_page(gfp_mask); 5660 return alloc_page(gfp_mask);
5656 } 5661 }
5657 5662
5658 /* [start, end) must belong to a single zone. */ 5663 /* [start, end) must belong to a single zone. */
5659 static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) 5664 static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
5660 { 5665 {
5661 /* This function is based on compact_zone() from compaction.c. */ 5666 /* This function is based on compact_zone() from compaction.c. */
5662 5667
5663 unsigned long pfn = start; 5668 unsigned long pfn = start;
5664 unsigned int tries = 0; 5669 unsigned int tries = 0;
5665 int ret = 0; 5670 int ret = 0;
5666 5671
5667 struct compact_control cc = { 5672 struct compact_control cc = {
5668 .nr_migratepages = 0, 5673 .nr_migratepages = 0,
5669 .order = -1, 5674 .order = -1,
5670 .zone = page_zone(pfn_to_page(start)), 5675 .zone = page_zone(pfn_to_page(start)),
5671 .sync = true, 5676 .sync = true,
5672 }; 5677 };
5673 INIT_LIST_HEAD(&cc.migratepages); 5678 INIT_LIST_HEAD(&cc.migratepages);
5674 5679
5675 migrate_prep_local(); 5680 migrate_prep_local();
5676 5681
5677 while (pfn < end || !list_empty(&cc.migratepages)) { 5682 while (pfn < end || !list_empty(&cc.migratepages)) {
5678 if (fatal_signal_pending(current)) { 5683 if (fatal_signal_pending(current)) {
5679 ret = -EINTR; 5684 ret = -EINTR;
5680 break; 5685 break;
5681 } 5686 }
5682 5687
5683 if (list_empty(&cc.migratepages)) { 5688 if (list_empty(&cc.migratepages)) {
5684 cc.nr_migratepages = 0; 5689 cc.nr_migratepages = 0;
5685 pfn = isolate_migratepages_range(cc.zone, &cc, 5690 pfn = isolate_migratepages_range(cc.zone, &cc,
5686 pfn, end); 5691 pfn, end);
5687 if (!pfn) { 5692 if (!pfn) {
5688 ret = -EINTR; 5693 ret = -EINTR;
5689 break; 5694 break;
5690 } 5695 }
5691 tries = 0; 5696 tries = 0;
5692 } else if (++tries == 5) { 5697 } else if (++tries == 5) {
5693 ret = ret < 0 ? ret : -EBUSY; 5698 ret = ret < 0 ? ret : -EBUSY;
5694 break; 5699 break;
5695 } 5700 }
5696 5701
5697 ret = migrate_pages(&cc.migratepages, 5702 ret = migrate_pages(&cc.migratepages,
5698 __alloc_contig_migrate_alloc, 5703 __alloc_contig_migrate_alloc,
5699 0, false, MIGRATE_SYNC); 5704 0, false, MIGRATE_SYNC);
5700 } 5705 }
5701 5706
5702 putback_lru_pages(&cc.migratepages); 5707 putback_lru_pages(&cc.migratepages);
5703 return ret > 0 ? 0 : ret; 5708 return ret > 0 ? 0 : ret;
5704 } 5709 }
5705 5710
5706 /* 5711 /*
5707 * Update zone's cma pages counter used for watermark level calculation. 5712 * Update zone's cma pages counter used for watermark level calculation.
5708 */ 5713 */
5709 static inline void __update_cma_watermarks(struct zone *zone, int count) 5714 static inline void __update_cma_watermarks(struct zone *zone, int count)
5710 { 5715 {
5711 unsigned long flags; 5716 unsigned long flags;
5712 spin_lock_irqsave(&zone->lock, flags); 5717 spin_lock_irqsave(&zone->lock, flags);
5713 zone->min_cma_pages += count; 5718 zone->min_cma_pages += count;
5714 spin_unlock_irqrestore(&zone->lock, flags); 5719 spin_unlock_irqrestore(&zone->lock, flags);
5715 setup_per_zone_wmarks(); 5720 setup_per_zone_wmarks();
5716 } 5721 }
5717 5722
5718 /* 5723 /*
5719 * Trigger memory pressure bump to reclaim some pages in order to be able to 5724 * Trigger memory pressure bump to reclaim some pages in order to be able to
5720 * allocate 'count' pages in single page units. Does work similar to 5725 * allocate 'count' pages in single page units. Does work similar to
5721 * the __alloc_pages_slowpath() function. 5726 * the __alloc_pages_slowpath() function.
5722 */ 5727 */
5723 static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) 5728 static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5724 { 5729 {
5725 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 5730 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5726 struct zonelist *zonelist = node_zonelist(0, gfp_mask); 5731 struct zonelist *zonelist = node_zonelist(0, gfp_mask);
5727 int did_some_progress = 0; 5732 int did_some_progress = 0;
5728 int order = 1; 5733 int order = 1;
5729 5734
5730 /* 5735 /*
5731 * Raise the watermarks to force kswapd to do its job and 5736 * Raise the watermarks to force kswapd to do its job and
5732 * stabilise at the new watermark level. 5737 * stabilise at the new watermark level.
5733 */ 5738 */
5734 __update_cma_watermarks(zone, count); 5739 __update_cma_watermarks(zone, count);
5735 5740
5736 /* Obey watermarks as if the page was being allocated */ 5741 /* Obey watermarks as if the page was being allocated */
5737 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { 5742 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
5738 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); 5743 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
5739 5744
5740 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 5745 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
5741 NULL); 5746 NULL);
5742 if (!did_some_progress) { 5747 if (!did_some_progress) {
5743 /* Exhausted what can be done so it's blamo time */ 5748 /* Exhausted what can be done so it's blamo time */
5744 out_of_memory(zonelist, gfp_mask, order, NULL, false); 5749 out_of_memory(zonelist, gfp_mask, order, NULL, false);
5745 } 5750 }
5746 } 5751 }
5747 5752
5748 /* Restore original watermark levels. */ 5753 /* Restore original watermark levels. */
5749 __update_cma_watermarks(zone, -count); 5754 __update_cma_watermarks(zone, -count);
5750 5755
5751 return count; 5756 return count;
5752 } 5757 }
5753 5758
5754 /** 5759 /**
5755 * alloc_contig_range() -- tries to allocate given range of pages 5760 * alloc_contig_range() -- tries to allocate given range of pages
5756 * @start: start PFN to allocate 5761 * @start: start PFN to allocate
5757 * @end: one-past-the-last PFN to allocate 5762 * @end: one-past-the-last PFN to allocate
5758 * @migratetype: migratetype of the underlying pageblocks (either 5763 * @migratetype: migratetype of the underlying pageblocks (either
5759 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 5764 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
5760 * in range must have the same migratetype and it must 5765 * in range must have the same migratetype and it must
5761 * be either of the two. 5766 * be either of the two.
5762 * 5767 *
5763 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 5768 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
5764 * aligned, however it's the caller's responsibility to guarantee that 5769 * aligned, however it's the caller's responsibility to guarantee that
5765 * we are the only thread that changes migrate type of pageblocks the 5770 * we are the only thread that changes migrate type of pageblocks the
5766 * pages fall in. 5771 * pages fall in.
5767 * 5772 *
5768 * The PFN range must belong to a single zone. 5773 * The PFN range must belong to a single zone.
5769 * 5774 *
5770 * Returns zero on success or negative error code. On success all 5775 * Returns zero on success or negative error code. On success all
5771 * pages which PFN is in [start, end) are allocated for the caller and 5776 * pages which PFN is in [start, end) are allocated for the caller and
5772 * need to be freed with free_contig_range(). 5777 * need to be freed with free_contig_range().
5773 */ 5778 */
5774 int alloc_contig_range(unsigned long start, unsigned long end, 5779 int alloc_contig_range(unsigned long start, unsigned long end,
5775 unsigned migratetype) 5780 unsigned migratetype)
5776 { 5781 {
5777 struct zone *zone = page_zone(pfn_to_page(start)); 5782 struct zone *zone = page_zone(pfn_to_page(start));
5778 unsigned long outer_start, outer_end; 5783 unsigned long outer_start, outer_end;
5779 int ret = 0, order; 5784 int ret = 0, order;
5780 5785
5781 /* 5786 /*
5782 * What we do here is we mark all pageblocks in range as 5787 * What we do here is we mark all pageblocks in range as
5783 * MIGRATE_ISOLATE. Because pageblock and max order pages may 5788 * MIGRATE_ISOLATE. Because pageblock and max order pages may
5784 * have different sizes, and due to the way the page allocator 5789 * have different sizes, and due to the way the page allocator
5785 * works, we align the range to the larger of the two so 5790 * works, we align the range to the larger of the two so
5786 * that the page allocator won't try to merge buddies from 5791 * that the page allocator won't try to merge buddies from
5787 * different pageblocks and change MIGRATE_ISOLATE to some 5792 * different pageblocks and change MIGRATE_ISOLATE to some
5788 * other migration type. 5793 * other migration type.
5789 * 5794 *
5790 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 5795 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
5791 * migrate the pages from an unaligned range (ie. pages that 5796 * migrate the pages from an unaligned range (ie. pages that
5792 * we are interested in). This will put all the pages in 5797 * we are interested in). This will put all the pages in
5793 * range back to page allocator as MIGRATE_ISOLATE. 5798 * range back to page allocator as MIGRATE_ISOLATE.
5794 * 5799 *
5795 * When this is done, we take the pages in range from page 5800 * When this is done, we take the pages in range from page
5796 * allocator removing them from the buddy system. This way 5801 * allocator removing them from the buddy system. This way
5797 * page allocator will never consider using them. 5802 * page allocator will never consider using them.
5798 * 5803 *
5799 * This lets us mark the pageblocks back as 5804 * This lets us mark the pageblocks back as
5800 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 5805 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
5801 * aligned range but not in the unaligned, original range are 5806 * aligned range but not in the unaligned, original range are
5802 * put back to page allocator so that buddy can use them. 5807 * put back to page allocator so that buddy can use them.
5803 */ 5808 */
5804 5809
5805 ret = start_isolate_page_range(pfn_max_align_down(start), 5810 ret = start_isolate_page_range(pfn_max_align_down(start),
5806 pfn_max_align_up(end), migratetype); 5811 pfn_max_align_up(end), migratetype);
5807 if (ret) 5812 if (ret)
5808 goto done; 5813 goto done;
5809 5814
5810 ret = __alloc_contig_migrate_range(start, end); 5815 ret = __alloc_contig_migrate_range(start, end);
5811 if (ret) 5816 if (ret)
5812 goto done; 5817 goto done;
5813 5818
5814 /* 5819 /*
5815 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 5820 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
5816 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 5821 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
5817 * more, all pages in [start, end) are free in page allocator. 5822 * more, all pages in [start, end) are free in page allocator.
5818 * What we are going to do is to allocate all pages from 5823 * What we are going to do is to allocate all pages from
5819 * [start, end) (that is remove them from page allocator). 5824 * [start, end) (that is remove them from page allocator).
5820 * 5825 *
5821 * The only problem is that pages at the beginning and at the 5826 * The only problem is that pages at the beginning and at the
5822 * end of the range of interest may not be aligned with the pages the 5827 * end of the range of interest may not be aligned with the pages the
5823 * page allocator holds, i.e. they can be part of higher-order 5828 * page allocator holds, i.e. they can be part of higher-order
5824 * pages. Because of this, we reserve the bigger range and 5829 * pages. Because of this, we reserve the bigger range and
5825 * once this is done free the pages we are not interested in. 5830 * once this is done free the pages we are not interested in.
5826 * 5831 *
5827 * We don't have to hold zone->lock here because the pages are 5832 * We don't have to hold zone->lock here because the pages are
5828 * isolated thus they won't get removed from buddy. 5833 * isolated thus they won't get removed from buddy.
5829 */ 5834 */
5830 5835
5831 lru_add_drain_all(); 5836 lru_add_drain_all();
5832 drain_all_pages(); 5837 drain_all_pages();
5833 5838
5834 order = 0; 5839 order = 0;
5835 outer_start = start; 5840 outer_start = start;
5836 while (!PageBuddy(pfn_to_page(outer_start))) { 5841 while (!PageBuddy(pfn_to_page(outer_start))) {
5837 if (++order >= MAX_ORDER) { 5842 if (++order >= MAX_ORDER) {
5838 ret = -EBUSY; 5843 ret = -EBUSY;
5839 goto done; 5844 goto done;
5840 } 5845 }
5841 outer_start &= ~0UL << order; 5846 outer_start &= ~0UL << order;
5842 } 5847 }
5843 5848
5844 /* Make sure the range is really isolated. */ 5849 /* Make sure the range is really isolated. */
5845 if (test_pages_isolated(outer_start, end)) { 5850 if (test_pages_isolated(outer_start, end)) {
5846 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 5851 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
5847 outer_start, end); 5852 outer_start, end);
5848 ret = -EBUSY; 5853 ret = -EBUSY;
5849 goto done; 5854 goto done;
5850 } 5855 }
5851 5856
5852 /* 5857 /*
5853 * Reclaim enough pages to make sure that contiguous allocation 5858 * Reclaim enough pages to make sure that contiguous allocation
5854 * will not starve the system. 5859 * will not starve the system.
5855 */ 5860 */
5856 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); 5861 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5857 5862
5858 /* Grab isolated pages from freelists. */ 5863 /* Grab isolated pages from freelists. */
5859 outer_end = isolate_freepages_range(outer_start, end); 5864 outer_end = isolate_freepages_range(outer_start, end);
5860 if (!outer_end) { 5865 if (!outer_end) {
5861 ret = -EBUSY; 5866 ret = -EBUSY;
5862 goto done; 5867 goto done;
5863 } 5868 }
5864 5869
5865 /* Free head and tail (if any) */ 5870 /* Free head and tail (if any) */
5866 if (start != outer_start) 5871 if (start != outer_start)
5867 free_contig_range(outer_start, start - outer_start); 5872 free_contig_range(outer_start, start - outer_start);
5868 if (end != outer_end) 5873 if (end != outer_end)
5869 free_contig_range(end, outer_end - end); 5874 free_contig_range(end, outer_end - end);
5870 5875
5871 done: 5876 done:
5872 undo_isolate_page_range(pfn_max_align_down(start), 5877 undo_isolate_page_range(pfn_max_align_down(start),
5873 pfn_max_align_up(end), migratetype); 5878 pfn_max_align_up(end), migratetype);
5874 return ret; 5879 return ret;
5875 } 5880 }
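Editor's note: the docblock above spells out the contract; for readability, here is a minimal, hypothetical caller pairing alloc_contig_range() with free_contig_range(). The helper names, the PFN values and the choice of MIGRATE_CMA are assumptions for illustration only, not code from this commit.

/* Hypothetical sketch: grab and release 1024 physically contiguous pages. */
static struct page *grab_contig_block(unsigned long base_pfn)
{
        unsigned long nr = 1024;

        /* All pageblocks in [base_pfn, base_pfn + nr) are assumed MIGRATE_CMA. */
        if (alloc_contig_range(base_pfn, base_pfn + nr, MIGRATE_CMA))
                return NULL;

        return pfn_to_page(base_pfn);
}

static void release_contig_block(unsigned long base_pfn)
{
        /* Must mirror the allocation exactly. */
        free_contig_range(base_pfn, 1024);
}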
5876 5881
5877 void free_contig_range(unsigned long pfn, unsigned nr_pages) 5882 void free_contig_range(unsigned long pfn, unsigned nr_pages)
5878 { 5883 {
5879 for (; nr_pages--; ++pfn) 5884 for (; nr_pages--; ++pfn)
5880 __free_page(pfn_to_page(pfn)); 5885 __free_page(pfn_to_page(pfn));
5881 } 5886 }
5882 #endif 5887 #endif
5883 5888
5884 #ifdef CONFIG_MEMORY_HOTREMOVE 5889 #ifdef CONFIG_MEMORY_HOTREMOVE
5885 /* 5890 /*
5886 * All pages in the range must be isolated before calling this. 5891 * All pages in the range must be isolated before calling this.
5887 */ 5892 */
5888 void 5893 void
5889 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 5894 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5890 { 5895 {
5891 struct page *page; 5896 struct page *page;
5892 struct zone *zone; 5897 struct zone *zone;
5893 int order, i; 5898 int order, i;
5894 unsigned long pfn; 5899 unsigned long pfn;
5895 unsigned long flags; 5900 unsigned long flags;
5896 /* find the first valid pfn */ 5901 /* find the first valid pfn */
5897 for (pfn = start_pfn; pfn < end_pfn; pfn++) 5902 for (pfn = start_pfn; pfn < end_pfn; pfn++)
5898 if (pfn_valid(pfn)) 5903 if (pfn_valid(pfn))
5899 break; 5904 break;
5900 if (pfn == end_pfn) 5905 if (pfn == end_pfn)
5901 return; 5906 return;
5902 zone = page_zone(pfn_to_page(pfn)); 5907 zone = page_zone(pfn_to_page(pfn));
5903 spin_lock_irqsave(&zone->lock, flags); 5908 spin_lock_irqsave(&zone->lock, flags);
5904 pfn = start_pfn; 5909 pfn = start_pfn;
5905 while (pfn < end_pfn) { 5910 while (pfn < end_pfn) {
5906 if (!pfn_valid(pfn)) { 5911 if (!pfn_valid(pfn)) {
5907 pfn++; 5912 pfn++;
5908 continue; 5913 continue;
5909 } 5914 }
5910 page = pfn_to_page(pfn); 5915 page = pfn_to_page(pfn);
5911 BUG_ON(page_count(page)); 5916 BUG_ON(page_count(page));
5912 BUG_ON(!PageBuddy(page)); 5917 BUG_ON(!PageBuddy(page));
5913 order = page_order(page); 5918 order = page_order(page);
5914 #ifdef CONFIG_DEBUG_VM 5919 #ifdef CONFIG_DEBUG_VM
5915 printk(KERN_INFO "remove from free list %lx %d %lx\n", 5920 printk(KERN_INFO "remove from free list %lx %d %lx\n",
5916 pfn, 1 << order, end_pfn); 5921 pfn, 1 << order, end_pfn);
5917 #endif 5922 #endif
5918 list_del(&page->lru); 5923 list_del(&page->lru);
5919 rmv_page_order(page); 5924 rmv_page_order(page);
5920 zone->free_area[order].nr_free--; 5925 zone->free_area[order].nr_free--;
5921 __mod_zone_page_state(zone, NR_FREE_PAGES, 5926 __mod_zone_page_state(zone, NR_FREE_PAGES,
5922 - (1UL << order)); 5927 - (1UL << order));
5923 for (i = 0; i < (1 << order); i++) 5928 for (i = 0; i < (1 << order); i++)
5924 SetPageReserved((page+i)); 5929 SetPageReserved((page+i));
5925 pfn += (1 << order); 5930 pfn += (1 << order);
5926 } 5931 }
5927 spin_unlock_irqrestore(&zone->lock, flags); 5932 spin_unlock_irqrestore(&zone->lock, flags);
5928 } 5933 }
5929 #endif 5934 #endif
5930 5935
5931 #ifdef CONFIG_MEMORY_FAILURE 5936 #ifdef CONFIG_MEMORY_FAILURE
5932 bool is_free_buddy_page(struct page *page) 5937 bool is_free_buddy_page(struct page *page)
5933 { 5938 {
5934 struct zone *zone = page_zone(page); 5939 struct zone *zone = page_zone(page);
5935 unsigned long pfn = page_to_pfn(page); 5940 unsigned long pfn = page_to_pfn(page);
5936 unsigned long flags; 5941 unsigned long flags;
5937 int order; 5942 int order;
5938 5943
5939 spin_lock_irqsave(&zone->lock, flags); 5944 spin_lock_irqsave(&zone->lock, flags);
5940 for (order = 0; order < MAX_ORDER; order++) { 5945 for (order = 0; order < MAX_ORDER; order++) {
5941 struct page *page_head = page - (pfn & ((1 << order) - 1)); 5946 struct page *page_head = page - (pfn & ((1 << order) - 1));
5942 5947
5943 if (PageBuddy(page_head) && page_order(page_head) >= order) 5948 if (PageBuddy(page_head) && page_order(page_head) >= order)
5944 break; 5949 break;
5945 } 5950 }
5946 spin_unlock_irqrestore(&zone->lock, flags); 5951 spin_unlock_irqrestore(&zone->lock, flags);
5947 5952
5948 return order < MAX_ORDER; 5953 return order < MAX_ORDER;
5949 } 5954 }
5950 #endif 5955 #endif
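Editor's note: is_free_buddy_page() relies on free buddy blocks starting at order-aligned PFNs, so the candidate block head is found by clearing the low `order` bits of the PFN. The self-contained sketch below shows that arithmetic in isolation; the sample PFN is an arbitrary illustration.

#include <stdio.h>

/* Mirrors "page - (pfn & ((1 << order) - 1))" above, on raw PFNs
 * instead of struct page pointers. */
static unsigned long buddy_head_pfn(unsigned long pfn, int order)
{
        return pfn & ~((1UL << order) - 1);
}

int main(void)
{
        unsigned long pfn = 0x12345;    /* arbitrary example PFN */
        int order;

        /* Candidate heads for orders 0..3: 0x12345, 0x12344, 0x12344, 0x12340 */
        for (order = 0; order <= 3; order++)
                printf("order %d -> head pfn %#lx\n",
                       order, buddy_head_pfn(pfn, order));
        return 0;
}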
5951 5956
5952 static const struct trace_print_flags pageflag_names[] = { 5957 static const struct trace_print_flags pageflag_names[] = {
5953 {1UL << PG_locked, "locked" }, 5958 {1UL << PG_locked, "locked" },
5954 {1UL << PG_error, "error" }, 5959 {1UL << PG_error, "error" },
5955 {1UL << PG_referenced, "referenced" }, 5960 {1UL << PG_referenced, "referenced" },
5956 {1UL << PG_uptodate, "uptodate" }, 5961 {1UL << PG_uptodate, "uptodate" },
5957 {1UL << PG_dirty, "dirty" }, 5962 {1UL << PG_dirty, "dirty" },
5958 {1UL << PG_lru, "lru" }, 5963 {1UL << PG_lru, "lru" },
5959 {1UL << PG_active, "active" }, 5964 {1UL << PG_active, "active" },
5960 {1UL << PG_slab, "slab" }, 5965 {1UL << PG_slab, "slab" },
5961 {1UL << PG_owner_priv_1, "owner_priv_1" }, 5966 {1UL << PG_owner_priv_1, "owner_priv_1" },
5962 {1UL << PG_arch_1, "arch_1" }, 5967 {1UL << PG_arch_1, "arch_1" },
5963 {1UL << PG_reserved, "reserved" }, 5968 {1UL << PG_reserved, "reserved" },
5964 {1UL << PG_private, "private" }, 5969 {1UL << PG_private, "private" },
5965 {1UL << PG_private_2, "private_2" }, 5970 {1UL << PG_private_2, "private_2" },
5966 {1UL << PG_writeback, "writeback" }, 5971 {1UL << PG_writeback, "writeback" },
5967 #ifdef CONFIG_PAGEFLAGS_EXTENDED 5972 #ifdef CONFIG_PAGEFLAGS_EXTENDED
5968 {1UL << PG_head, "head" }, 5973 {1UL << PG_head, "head" },
5969 {1UL << PG_tail, "tail" }, 5974 {1UL << PG_tail, "tail" },
5970 #else 5975 #else
5971 {1UL << PG_compound, "compound" }, 5976 {1UL << PG_compound, "compound" },
5972 #endif 5977 #endif
5973 {1UL << PG_swapcache, "swapcache" }, 5978 {1UL << PG_swapcache, "swapcache" },
5974 {1UL << PG_mappedtodisk, "mappedtodisk" }, 5979 {1UL << PG_mappedtodisk, "mappedtodisk" },
5975 {1UL << PG_reclaim, "reclaim" }, 5980 {1UL << PG_reclaim, "reclaim" },
5976 {1UL << PG_swapbacked, "swapbacked" }, 5981 {1UL << PG_swapbacked, "swapbacked" },
5977 {1UL << PG_unevictable, "unevictable" }, 5982 {1UL << PG_unevictable, "unevictable" },
5978 #ifdef CONFIG_MMU 5983 #ifdef CONFIG_MMU
5979 {1UL << PG_mlocked, "mlocked" }, 5984 {1UL << PG_mlocked, "mlocked" },
5980 #endif 5985 #endif
5981 #ifdef CONFIG_ARCH_USES_PG_UNCACHED 5986 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
5982 {1UL << PG_uncached, "uncached" }, 5987 {1UL << PG_uncached, "uncached" },
5983 #endif 5988 #endif
5984 #ifdef CONFIG_MEMORY_FAILURE 5989 #ifdef CONFIG_MEMORY_FAILURE
5985 {1UL << PG_hwpoison, "hwpoison" }, 5990 {1UL << PG_hwpoison, "hwpoison" },
5986 #endif 5991 #endif
5987 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5992 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5988 {1UL << PG_compound_lock, "compound_lock" }, 5993 {1UL << PG_compound_lock, "compound_lock" },
5989 #endif 5994 #endif
5990 }; 5995 };
5991 5996
5992 static void dump_page_flags(unsigned long flags) 5997 static void dump_page_flags(unsigned long flags)
5993 { 5998 {
5994 const char *delim = ""; 5999 const char *delim = "";
5995 unsigned long mask; 6000 unsigned long mask;
5996 int i; 6001 int i;
5997 6002
5998 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 6003 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
5999 6004
6000 printk(KERN_ALERT "page flags: %#lx(", flags); 6005 printk(KERN_ALERT "page flags: %#lx(", flags);
6001 6006
6002 /* remove zone id */ 6007 /* remove zone id */
6003 flags &= (1UL << NR_PAGEFLAGS) - 1; 6008 flags &= (1UL << NR_PAGEFLAGS) - 1;
6004 6009
6005 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { 6010 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6006 6011
6007 mask = pageflag_names[i].mask; 6012 mask = pageflag_names[i].mask;
6008 if ((flags & mask) != mask) 6013 if ((flags & mask) != mask)
6009 continue; 6014 continue;
6010 6015
6011 flags &= ~mask; 6016 flags &= ~mask;
6012 printk("%s%s", delim, pageflag_names[i].name); 6017 printk("%s%s", delim, pageflag_names[i].name);
6013 delim = "|"; 6018 delim = "|";
6014 } 6019 }
6015 6020
6016 /* check for left over flags */ 6021 /* check for left over flags */
6017 if (flags) 6022 if (flags)
6018 printk("%s%#lx", delim, flags); 6023 printk("%s%#lx", delim, flags);
6019 6024
6020 printk(")\n"); 6025 printk(")\n");
6021 } 6026 }
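Editor's note: the mask-plus-name-table pattern used by dump_page_flags() is easy to lift out of the kernel. The self-contained userspace sketch below applies the same technique to a made-up flags word; the flag names and bit positions are illustrative stand-ins, not the real PG_* layout.

#include <stdio.h>

struct flag_name {
        unsigned long mask;
        const char *name;
};

/* Illustrative table only; the kernel's comes from enum pageflags. */
static const struct flag_name names[] = {
        { 1UL << 0, "locked" },
        { 1UL << 1, "dirty"  },
        { 1UL << 2, "lru"    },
};

static void dump_flags(unsigned long flags)
{
        const char *delim = "";
        size_t i;

        printf("flags: %#lx(", flags);
        for (i = 0; i < sizeof(names) / sizeof(names[0]) && flags; i++) {
                if ((flags & names[i].mask) != names[i].mask)
                        continue;
                flags &= ~names[i].mask;
                printf("%s%s", delim, names[i].name);
                delim = "|";
        }
        if (flags)                      /* anything not in the table */
                printf("%s%#lx", delim, flags);
        printf(")\n");
}

int main(void)
{
        dump_flags((1UL << 0) | (1UL << 2) | (1UL << 7));
        /* prints: flags: 0x85(locked|lru|0x80) */
        return 0;
}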
6022 6027
6023 void dump_page(struct page *page) 6028 void dump_page(struct page *page)
6024 { 6029 {
6025 printk(KERN_ALERT 6030 printk(KERN_ALERT
6026 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6031 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6027 page, atomic_read(&page->_count), page_mapcount(page), 6032 page, atomic_read(&page->_count), page_mapcount(page),
6028 page->mapping, page->index); 6033 page->mapping, page->index);
6029 dump_page_flags(page->flags); 6034 dump_page_flags(page->flags);
6030 mem_cgroup_print_bad_page(page); 6035 mem_cgroup_print_bad_page(page);
6031 } 6036 }
6032 6037