Commit 0cbef29a782162a3896487901eca4550bfa397ef

Authored by KOSAKI Motohiro
Committed by Linus Torvalds
1 parent 52c8f6a5ae

mm: __rmqueue_fallback() should respect pageblock type

When __rmqueue_fallback() doesn't find a free block of the required size,
it splits a larger page and puts the rest of the page back onto the free
list.

But it has one serious mistake.  When putting the remainder back,
__rmqueue_fallback() always uses start_migratetype if the type is not CMA.
However, __rmqueue_fallback() is only called when all of the
start_migratetype free lists are empty.  In other words,
__rmqueue_fallback() always puts the remainder back on the wrong list
unless try_to_steal_freepages() has changed the pageblock type, i.e.
whenever the requested size is smaller than half of a pageblock.  The end
result is that the anti-fragmentation framework increases fragmentation
instead of decreasing it.

Mel's original anti-fragmentation code did the right thing, but commit
47118af076f6 ("mm: mmzone: MIGRATE_CMA migration type added") broke it.

This patch restores the sane, old behavior.  It also removes an incorrect
comment which was introduced by commit fef903efcf0c ("mm/page_alloc.c:
restructure free-page stealing code and fix a bug").

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Nazarewicz <mina86@mina86.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

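To make the description above concrete, here is a minimal, self-contained toy model of the remainder placement.  This is an illustration only, not kernel code: the file name toy_fallback.c, the rmqueue_fallback()/dump() helpers, the three migratetypes, the 1024-page "pageblock" and the half-a-block threshold are all invented stand-ins; the real logic lives in __rmqueue_fallback() and try_to_steal_freepages() in mm/page_alloc.c.

/*
 * toy_fallback.c - toy model of the free-list placement described above.
 * NOT kernel code.  Build and run:  cc toy_fallback.c && ./a.out
 */
#include <stdio.h>
#include <string.h>

enum mt { UNMOVABLE, RECLAIMABLE, MOVABLE, NR_TYPES };

static const char * const mt_name[NR_TYPES] = {
	"UNMOVABLE", "RECLAIMABLE", "MOVABLE"
};

/* pages sitting on each migratetype's free list (stand-in for free_area) */
static long freelist[NR_TYPES];

static void dump(const char *tag)
{
	printf("%-7s UNMOVABLE=%-5ld RECLAIMABLE=%-5ld MOVABLE=%-5ld\n",
	       tag, freelist[UNMOVABLE], freelist[RECLAIMABLE],
	       freelist[MOVABLE]);
}

/*
 * A fallback allocation: start_type's own lists are empty, so 'block' pages
 * are taken from a pageblock of fallback_type, 'want' pages go to the
 * caller, and the remainder goes back on some free list.  'buggy' selects
 * the pre-fix placement (always start_type unless the block is CMA).
 */
static void rmqueue_fallback(int start_type, int fallback_type,
			     long block, long want, int buggy)
{
	long remainder = block - want;
	int owner;

	freelist[fallback_type] -= block;

	if (want > block / 2) {
		/* big request: the whole pageblock is converted, so
		 * start_type really does own the remainder */
		owner = start_type;
	} else if (buggy) {
		/* pre-fix: remainder dumped on start_type's list even
		 * though the pageblock keeps fallback_type */
		owner = start_type;
	} else {
		/* fixed: remainder stays with the pageblock's type */
		owner = fallback_type;
	}
	freelist[owner] += remainder;
	printf("%s asks for %ld pages; remainder %ld goes to the %s list\n",
	       mt_name[start_type], want, remainder, mt_name[owner]);
}

int main(void)
{
	int buggy;

	for (buggy = 1; buggy >= 0; buggy--) {
		memset(freelist, 0, sizeof(freelist));
		freelist[MOVABLE] = 1024;	/* one free MOVABLE pageblock */

		printf("%s behaviour:\n", buggy ? "pre-fix" : "fixed");
		dump("before");
		/* a small UNMOVABLE request falls back into the MOVABLE block */
		rmqueue_fallback(UNMOVABLE, MOVABLE, 1024, 8, buggy);
		dump("after");
		printf("\n");
	}
	return 0;
}

In the pre-fix pass, 1016 pages of a still-MOVABLE pageblock end up on the UNMOVABLE free list, so later unmovable allocations get sprinkled into that movable block; in the fixed pass the remainder stays on the MOVABLE list and only the 8 pages actually handed out come from the movable pageblock.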
Showing 1 changed file (mm/page_alloc.c) with 5 additions and 10 deletions

1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/compaction.h> 54 #include <linux/compaction.h>
55 #include <trace/events/kmem.h> 55 #include <trace/events/kmem.h>
56 #include <linux/ftrace_event.h> 56 #include <linux/ftrace_event.h>
57 #include <linux/memcontrol.h> 57 #include <linux/memcontrol.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/mm_inline.h> 59 #include <linux/mm_inline.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/page-debug-flags.h> 61 #include <linux/page-debug-flags.h>
62 #include <linux/hugetlb.h> 62 #include <linux/hugetlb.h>
63 #include <linux/sched/rt.h> 63 #include <linux/sched/rt.h>
64 64
65 #include <asm/sections.h> 65 #include <asm/sections.h>
66 #include <asm/tlbflush.h> 66 #include <asm/tlbflush.h>
67 #include <asm/div64.h> 67 #include <asm/div64.h>
68 #include "internal.h" 68 #include "internal.h"
69 69
70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
71 static DEFINE_MUTEX(pcp_batch_high_lock); 71 static DEFINE_MUTEX(pcp_batch_high_lock);
72 72
73 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 73 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
74 DEFINE_PER_CPU(int, numa_node); 74 DEFINE_PER_CPU(int, numa_node);
75 EXPORT_PER_CPU_SYMBOL(numa_node); 75 EXPORT_PER_CPU_SYMBOL(numa_node);
76 #endif 76 #endif
77 77
78 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 78 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
79 /* 79 /*
80 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 80 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
81 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 81 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
82 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 82 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
83 * defined in <linux/topology.h>. 83 * defined in <linux/topology.h>.
84 */ 84 */
85 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 85 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
86 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 86 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
87 #endif 87 #endif
88 88
89 /* 89 /*
90 * Array of node states. 90 * Array of node states.
91 */ 91 */
92 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 92 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
93 [N_POSSIBLE] = NODE_MASK_ALL, 93 [N_POSSIBLE] = NODE_MASK_ALL,
94 [N_ONLINE] = { { [0] = 1UL } }, 94 [N_ONLINE] = { { [0] = 1UL } },
95 #ifndef CONFIG_NUMA 95 #ifndef CONFIG_NUMA
96 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 96 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
97 #ifdef CONFIG_HIGHMEM 97 #ifdef CONFIG_HIGHMEM
98 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 98 [N_HIGH_MEMORY] = { { [0] = 1UL } },
99 #endif 99 #endif
100 #ifdef CONFIG_MOVABLE_NODE 100 #ifdef CONFIG_MOVABLE_NODE
101 [N_MEMORY] = { { [0] = 1UL } }, 101 [N_MEMORY] = { { [0] = 1UL } },
102 #endif 102 #endif
103 [N_CPU] = { { [0] = 1UL } }, 103 [N_CPU] = { { [0] = 1UL } },
104 #endif /* NUMA */ 104 #endif /* NUMA */
105 }; 105 };
106 EXPORT_SYMBOL(node_states); 106 EXPORT_SYMBOL(node_states);
107 107
108 /* Protect totalram_pages and zone->managed_pages */ 108 /* Protect totalram_pages and zone->managed_pages */
109 static DEFINE_SPINLOCK(managed_page_count_lock); 109 static DEFINE_SPINLOCK(managed_page_count_lock);
110 110
111 unsigned long totalram_pages __read_mostly; 111 unsigned long totalram_pages __read_mostly;
112 unsigned long totalreserve_pages __read_mostly; 112 unsigned long totalreserve_pages __read_mostly;
113 /* 113 /*
114 * When calculating the number of globally allowed dirty pages, there 114 * When calculating the number of globally allowed dirty pages, there
115 * is a certain number of per-zone reserves that should not be 115 * is a certain number of per-zone reserves that should not be
116 * considered dirtyable memory. This is the sum of those reserves 116 * considered dirtyable memory. This is the sum of those reserves
117 * over all existing zones that contribute dirtyable memory. 117 * over all existing zones that contribute dirtyable memory.
118 */ 118 */
119 unsigned long dirty_balance_reserve __read_mostly; 119 unsigned long dirty_balance_reserve __read_mostly;
120 120
121 int percpu_pagelist_fraction; 121 int percpu_pagelist_fraction;
122 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 122 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
123 123
124 #ifdef CONFIG_PM_SLEEP 124 #ifdef CONFIG_PM_SLEEP
125 /* 125 /*
126 * The following functions are used by the suspend/hibernate code to temporarily 126 * The following functions are used by the suspend/hibernate code to temporarily
127 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 127 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
128 * while devices are suspended. To avoid races with the suspend/hibernate code, 128 * while devices are suspended. To avoid races with the suspend/hibernate code,
129 * they should always be called with pm_mutex held (gfp_allowed_mask also should 129 * they should always be called with pm_mutex held (gfp_allowed_mask also should
130 * only be modified with pm_mutex held, unless the suspend/hibernate code is 130 * only be modified with pm_mutex held, unless the suspend/hibernate code is
131 * guaranteed not to run in parallel with that modification). 131 * guaranteed not to run in parallel with that modification).
132 */ 132 */
133 133
134 static gfp_t saved_gfp_mask; 134 static gfp_t saved_gfp_mask;
135 135
136 void pm_restore_gfp_mask(void) 136 void pm_restore_gfp_mask(void)
137 { 137 {
138 WARN_ON(!mutex_is_locked(&pm_mutex)); 138 WARN_ON(!mutex_is_locked(&pm_mutex));
139 if (saved_gfp_mask) { 139 if (saved_gfp_mask) {
140 gfp_allowed_mask = saved_gfp_mask; 140 gfp_allowed_mask = saved_gfp_mask;
141 saved_gfp_mask = 0; 141 saved_gfp_mask = 0;
142 } 142 }
143 } 143 }
144 144
145 void pm_restrict_gfp_mask(void) 145 void pm_restrict_gfp_mask(void)
146 { 146 {
147 WARN_ON(!mutex_is_locked(&pm_mutex)); 147 WARN_ON(!mutex_is_locked(&pm_mutex));
148 WARN_ON(saved_gfp_mask); 148 WARN_ON(saved_gfp_mask);
149 saved_gfp_mask = gfp_allowed_mask; 149 saved_gfp_mask = gfp_allowed_mask;
150 gfp_allowed_mask &= ~GFP_IOFS; 150 gfp_allowed_mask &= ~GFP_IOFS;
151 } 151 }
152 152
153 bool pm_suspended_storage(void) 153 bool pm_suspended_storage(void)
154 { 154 {
155 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 155 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
156 return false; 156 return false;
157 return true; 157 return true;
158 } 158 }
159 #endif /* CONFIG_PM_SLEEP */ 159 #endif /* CONFIG_PM_SLEEP */
160 160
161 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 161 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
162 int pageblock_order __read_mostly; 162 int pageblock_order __read_mostly;
163 #endif 163 #endif
164 164
165 static void __free_pages_ok(struct page *page, unsigned int order); 165 static void __free_pages_ok(struct page *page, unsigned int order);
166 166
167 /* 167 /*
168 * results with 256, 32 in the lowmem_reserve sysctl: 168 * results with 256, 32 in the lowmem_reserve sysctl:
169 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 169 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
170 * 1G machine -> (16M dma, 784M normal, 224M high) 170 * 1G machine -> (16M dma, 784M normal, 224M high)
171 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 171 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
172 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 172 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
173 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 173 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
174 * 174 *
175 * TBD: should special case ZONE_DMA32 machines here - in those we normally 175 * TBD: should special case ZONE_DMA32 machines here - in those we normally
176 * don't need any ZONE_NORMAL reservation 176 * don't need any ZONE_NORMAL reservation
177 */ 177 */
178 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 178 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
179 #ifdef CONFIG_ZONE_DMA 179 #ifdef CONFIG_ZONE_DMA
180 256, 180 256,
181 #endif 181 #endif
182 #ifdef CONFIG_ZONE_DMA32 182 #ifdef CONFIG_ZONE_DMA32
183 256, 183 256,
184 #endif 184 #endif
185 #ifdef CONFIG_HIGHMEM 185 #ifdef CONFIG_HIGHMEM
186 32, 186 32,
187 #endif 187 #endif
188 32, 188 32,
189 }; 189 };
190 190
191 EXPORT_SYMBOL(totalram_pages); 191 EXPORT_SYMBOL(totalram_pages);
192 192
193 static char * const zone_names[MAX_NR_ZONES] = { 193 static char * const zone_names[MAX_NR_ZONES] = {
194 #ifdef CONFIG_ZONE_DMA 194 #ifdef CONFIG_ZONE_DMA
195 "DMA", 195 "DMA",
196 #endif 196 #endif
197 #ifdef CONFIG_ZONE_DMA32 197 #ifdef CONFIG_ZONE_DMA32
198 "DMA32", 198 "DMA32",
199 #endif 199 #endif
200 "Normal", 200 "Normal",
201 #ifdef CONFIG_HIGHMEM 201 #ifdef CONFIG_HIGHMEM
202 "HighMem", 202 "HighMem",
203 #endif 203 #endif
204 "Movable", 204 "Movable",
205 }; 205 };
206 206
207 int min_free_kbytes = 1024; 207 int min_free_kbytes = 1024;
208 int user_min_free_kbytes; 208 int user_min_free_kbytes;
209 209
210 static unsigned long __meminitdata nr_kernel_pages; 210 static unsigned long __meminitdata nr_kernel_pages;
211 static unsigned long __meminitdata nr_all_pages; 211 static unsigned long __meminitdata nr_all_pages;
212 static unsigned long __meminitdata dma_reserve; 212 static unsigned long __meminitdata dma_reserve;
213 213
214 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 214 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
215 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 215 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
216 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 216 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
217 static unsigned long __initdata required_kernelcore; 217 static unsigned long __initdata required_kernelcore;
218 static unsigned long __initdata required_movablecore; 218 static unsigned long __initdata required_movablecore;
219 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 219 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
220 220
221 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 221 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
222 int movable_zone; 222 int movable_zone;
223 EXPORT_SYMBOL(movable_zone); 223 EXPORT_SYMBOL(movable_zone);
224 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 224 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
225 225
226 #if MAX_NUMNODES > 1 226 #if MAX_NUMNODES > 1
227 int nr_node_ids __read_mostly = MAX_NUMNODES; 227 int nr_node_ids __read_mostly = MAX_NUMNODES;
228 int nr_online_nodes __read_mostly = 1; 228 int nr_online_nodes __read_mostly = 1;
229 EXPORT_SYMBOL(nr_node_ids); 229 EXPORT_SYMBOL(nr_node_ids);
230 EXPORT_SYMBOL(nr_online_nodes); 230 EXPORT_SYMBOL(nr_online_nodes);
231 #endif 231 #endif
232 232
233 int page_group_by_mobility_disabled __read_mostly; 233 int page_group_by_mobility_disabled __read_mostly;
234 234
235 void set_pageblock_migratetype(struct page *page, int migratetype) 235 void set_pageblock_migratetype(struct page *page, int migratetype)
236 { 236 {
237 if (unlikely(page_group_by_mobility_disabled && 237 if (unlikely(page_group_by_mobility_disabled &&
238 migratetype < MIGRATE_PCPTYPES)) 238 migratetype < MIGRATE_PCPTYPES))
239 migratetype = MIGRATE_UNMOVABLE; 239 migratetype = MIGRATE_UNMOVABLE;
240 240
241 set_pageblock_flags_group(page, (unsigned long)migratetype, 241 set_pageblock_flags_group(page, (unsigned long)migratetype,
242 PB_migrate, PB_migrate_end); 242 PB_migrate, PB_migrate_end);
243 } 243 }
244 244
245 bool oom_killer_disabled __read_mostly; 245 bool oom_killer_disabled __read_mostly;
246 246
247 #ifdef CONFIG_DEBUG_VM 247 #ifdef CONFIG_DEBUG_VM
248 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 248 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
249 { 249 {
250 int ret = 0; 250 int ret = 0;
251 unsigned seq; 251 unsigned seq;
252 unsigned long pfn = page_to_pfn(page); 252 unsigned long pfn = page_to_pfn(page);
253 unsigned long sp, start_pfn; 253 unsigned long sp, start_pfn;
254 254
255 do { 255 do {
256 seq = zone_span_seqbegin(zone); 256 seq = zone_span_seqbegin(zone);
257 start_pfn = zone->zone_start_pfn; 257 start_pfn = zone->zone_start_pfn;
258 sp = zone->spanned_pages; 258 sp = zone->spanned_pages;
259 if (!zone_spans_pfn(zone, pfn)) 259 if (!zone_spans_pfn(zone, pfn))
260 ret = 1; 260 ret = 1;
261 } while (zone_span_seqretry(zone, seq)); 261 } while (zone_span_seqretry(zone, seq));
262 262
263 if (ret) 263 if (ret)
264 pr_err("page %lu outside zone [ %lu - %lu ]\n", 264 pr_err("page %lu outside zone [ %lu - %lu ]\n",
265 pfn, start_pfn, start_pfn + sp); 265 pfn, start_pfn, start_pfn + sp);
266 266
267 return ret; 267 return ret;
268 } 268 }
269 269
270 static int page_is_consistent(struct zone *zone, struct page *page) 270 static int page_is_consistent(struct zone *zone, struct page *page)
271 { 271 {
272 if (!pfn_valid_within(page_to_pfn(page))) 272 if (!pfn_valid_within(page_to_pfn(page)))
273 return 0; 273 return 0;
274 if (zone != page_zone(page)) 274 if (zone != page_zone(page))
275 return 0; 275 return 0;
276 276
277 return 1; 277 return 1;
278 } 278 }
279 /* 279 /*
280 * Temporary debugging check for pages not lying within a given zone. 280 * Temporary debugging check for pages not lying within a given zone.
281 */ 281 */
282 static int bad_range(struct zone *zone, struct page *page) 282 static int bad_range(struct zone *zone, struct page *page)
283 { 283 {
284 if (page_outside_zone_boundaries(zone, page)) 284 if (page_outside_zone_boundaries(zone, page))
285 return 1; 285 return 1;
286 if (!page_is_consistent(zone, page)) 286 if (!page_is_consistent(zone, page))
287 return 1; 287 return 1;
288 288
289 return 0; 289 return 0;
290 } 290 }
291 #else 291 #else
292 static inline int bad_range(struct zone *zone, struct page *page) 292 static inline int bad_range(struct zone *zone, struct page *page)
293 { 293 {
294 return 0; 294 return 0;
295 } 295 }
296 #endif 296 #endif
297 297
298 static void bad_page(struct page *page) 298 static void bad_page(struct page *page)
299 { 299 {
300 static unsigned long resume; 300 static unsigned long resume;
301 static unsigned long nr_shown; 301 static unsigned long nr_shown;
302 static unsigned long nr_unshown; 302 static unsigned long nr_unshown;
303 303
304 /* Don't complain about poisoned pages */ 304 /* Don't complain about poisoned pages */
305 if (PageHWPoison(page)) { 305 if (PageHWPoison(page)) {
306 page_mapcount_reset(page); /* remove PageBuddy */ 306 page_mapcount_reset(page); /* remove PageBuddy */
307 return; 307 return;
308 } 308 }
309 309
310 /* 310 /*
311 * Allow a burst of 60 reports, then keep quiet for that minute; 311 * Allow a burst of 60 reports, then keep quiet for that minute;
312 * or allow a steady drip of one report per second. 312 * or allow a steady drip of one report per second.
313 */ 313 */
314 if (nr_shown == 60) { 314 if (nr_shown == 60) {
315 if (time_before(jiffies, resume)) { 315 if (time_before(jiffies, resume)) {
316 nr_unshown++; 316 nr_unshown++;
317 goto out; 317 goto out;
318 } 318 }
319 if (nr_unshown) { 319 if (nr_unshown) {
320 printk(KERN_ALERT 320 printk(KERN_ALERT
321 "BUG: Bad page state: %lu messages suppressed\n", 321 "BUG: Bad page state: %lu messages suppressed\n",
322 nr_unshown); 322 nr_unshown);
323 nr_unshown = 0; 323 nr_unshown = 0;
324 } 324 }
325 nr_shown = 0; 325 nr_shown = 0;
326 } 326 }
327 if (nr_shown++ == 0) 327 if (nr_shown++ == 0)
328 resume = jiffies + 60 * HZ; 328 resume = jiffies + 60 * HZ;
329 329
330 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 330 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
331 current->comm, page_to_pfn(page)); 331 current->comm, page_to_pfn(page));
332 dump_page(page); 332 dump_page(page);
333 333
334 print_modules(); 334 print_modules();
335 dump_stack(); 335 dump_stack();
336 out: 336 out:
337 /* Leave bad fields for debug, except PageBuddy could make trouble */ 337 /* Leave bad fields for debug, except PageBuddy could make trouble */
338 page_mapcount_reset(page); /* remove PageBuddy */ 338 page_mapcount_reset(page); /* remove PageBuddy */
339 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 339 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
340 } 340 }
341 341
342 /* 342 /*
343 * Higher-order pages are called "compound pages". They are structured thusly: 343 * Higher-order pages are called "compound pages". They are structured thusly:
344 * 344 *
345 * The first PAGE_SIZE page is called the "head page". 345 * The first PAGE_SIZE page is called the "head page".
346 * 346 *
347 * The remaining PAGE_SIZE pages are called "tail pages". 347 * The remaining PAGE_SIZE pages are called "tail pages".
348 * 348 *
349 * All pages have PG_compound set. All tail pages have their ->first_page 349 * All pages have PG_compound set. All tail pages have their ->first_page
350 * pointing at the head page. 350 * pointing at the head page.
351 * 351 *
352 * The first tail page's ->lru.next holds the address of the compound page's 352 * The first tail page's ->lru.next holds the address of the compound page's
353 * put_page() function. Its ->lru.prev holds the order of allocation. 353 * put_page() function. Its ->lru.prev holds the order of allocation.
354 * This usage means that zero-order pages may not be compound. 354 * This usage means that zero-order pages may not be compound.
355 */ 355 */
356 356
357 static void free_compound_page(struct page *page) 357 static void free_compound_page(struct page *page)
358 { 358 {
359 __free_pages_ok(page, compound_order(page)); 359 __free_pages_ok(page, compound_order(page));
360 } 360 }
361 361
362 void prep_compound_page(struct page *page, unsigned long order) 362 void prep_compound_page(struct page *page, unsigned long order)
363 { 363 {
364 int i; 364 int i;
365 int nr_pages = 1 << order; 365 int nr_pages = 1 << order;
366 366
367 set_compound_page_dtor(page, free_compound_page); 367 set_compound_page_dtor(page, free_compound_page);
368 set_compound_order(page, order); 368 set_compound_order(page, order);
369 __SetPageHead(page); 369 __SetPageHead(page);
370 for (i = 1; i < nr_pages; i++) { 370 for (i = 1; i < nr_pages; i++) {
371 struct page *p = page + i; 371 struct page *p = page + i;
372 __SetPageTail(p); 372 __SetPageTail(p);
373 set_page_count(p, 0); 373 set_page_count(p, 0);
374 p->first_page = page; 374 p->first_page = page;
375 } 375 }
376 } 376 }
377 377
378 /* update __split_huge_page_refcount if you change this function */ 378 /* update __split_huge_page_refcount if you change this function */
379 static int destroy_compound_page(struct page *page, unsigned long order) 379 static int destroy_compound_page(struct page *page, unsigned long order)
380 { 380 {
381 int i; 381 int i;
382 int nr_pages = 1 << order; 382 int nr_pages = 1 << order;
383 int bad = 0; 383 int bad = 0;
384 384
385 if (unlikely(compound_order(page) != order)) { 385 if (unlikely(compound_order(page) != order)) {
386 bad_page(page); 386 bad_page(page);
387 bad++; 387 bad++;
388 } 388 }
389 389
390 __ClearPageHead(page); 390 __ClearPageHead(page);
391 391
392 for (i = 1; i < nr_pages; i++) { 392 for (i = 1; i < nr_pages; i++) {
393 struct page *p = page + i; 393 struct page *p = page + i;
394 394
395 if (unlikely(!PageTail(p) || (p->first_page != page))) { 395 if (unlikely(!PageTail(p) || (p->first_page != page))) {
396 bad_page(page); 396 bad_page(page);
397 bad++; 397 bad++;
398 } 398 }
399 __ClearPageTail(p); 399 __ClearPageTail(p);
400 } 400 }
401 401
402 return bad; 402 return bad;
403 } 403 }
404 404
405 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 405 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
406 { 406 {
407 int i; 407 int i;
408 408
409 /* 409 /*
410 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 410 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
411 * and __GFP_HIGHMEM from hard or soft interrupt context. 411 * and __GFP_HIGHMEM from hard or soft interrupt context.
412 */ 412 */
413 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 413 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
414 for (i = 0; i < (1 << order); i++) 414 for (i = 0; i < (1 << order); i++)
415 clear_highpage(page + i); 415 clear_highpage(page + i);
416 } 416 }
417 417
418 #ifdef CONFIG_DEBUG_PAGEALLOC 418 #ifdef CONFIG_DEBUG_PAGEALLOC
419 unsigned int _debug_guardpage_minorder; 419 unsigned int _debug_guardpage_minorder;
420 420
421 static int __init debug_guardpage_minorder_setup(char *buf) 421 static int __init debug_guardpage_minorder_setup(char *buf)
422 { 422 {
423 unsigned long res; 423 unsigned long res;
424 424
425 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 425 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
426 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 426 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
427 return 0; 427 return 0;
428 } 428 }
429 _debug_guardpage_minorder = res; 429 _debug_guardpage_minorder = res;
430 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 430 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
431 return 0; 431 return 0;
432 } 432 }
433 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 433 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
434 434
435 static inline void set_page_guard_flag(struct page *page) 435 static inline void set_page_guard_flag(struct page *page)
436 { 436 {
437 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 437 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
438 } 438 }
439 439
440 static inline void clear_page_guard_flag(struct page *page) 440 static inline void clear_page_guard_flag(struct page *page)
441 { 441 {
442 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 442 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
443 } 443 }
444 #else 444 #else
445 static inline void set_page_guard_flag(struct page *page) { } 445 static inline void set_page_guard_flag(struct page *page) { }
446 static inline void clear_page_guard_flag(struct page *page) { } 446 static inline void clear_page_guard_flag(struct page *page) { }
447 #endif 447 #endif
448 448
449 static inline void set_page_order(struct page *page, int order) 449 static inline void set_page_order(struct page *page, int order)
450 { 450 {
451 set_page_private(page, order); 451 set_page_private(page, order);
452 __SetPageBuddy(page); 452 __SetPageBuddy(page);
453 } 453 }
454 454
455 static inline void rmv_page_order(struct page *page) 455 static inline void rmv_page_order(struct page *page)
456 { 456 {
457 __ClearPageBuddy(page); 457 __ClearPageBuddy(page);
458 set_page_private(page, 0); 458 set_page_private(page, 0);
459 } 459 }
460 460
461 /* 461 /*
462 * Locate the struct page for both the matching buddy in our 462 * Locate the struct page for both the matching buddy in our
463 * pair (buddy1) and the combined O(n+1) page they form (page). 463 * pair (buddy1) and the combined O(n+1) page they form (page).
464 * 464 *
465 * 1) Any buddy B1 will have an order O twin B2 which satisfies 465 * 1) Any buddy B1 will have an order O twin B2 which satisfies
466 * the following equation: 466 * the following equation:
467 * B2 = B1 ^ (1 << O) 467 * B2 = B1 ^ (1 << O)
468 * For example, if the starting buddy (buddy2) is #8 its order 468 * For example, if the starting buddy (buddy2) is #8 its order
469 * 1 buddy is #10: 469 * 1 buddy is #10:
470 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 470 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
471 * 471 *
472 * 2) Any buddy B will have an order O+1 parent P which 472 * 2) Any buddy B will have an order O+1 parent P which
473 * satisfies the following equation: 473 * satisfies the following equation:
474 * P = B & ~(1 << O) 474 * P = B & ~(1 << O)
475 * 475 *
476 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 476 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
477 */ 477 */
478 static inline unsigned long 478 static inline unsigned long
479 __find_buddy_index(unsigned long page_idx, unsigned int order) 479 __find_buddy_index(unsigned long page_idx, unsigned int order)
480 { 480 {
481 return page_idx ^ (1 << order); 481 return page_idx ^ (1 << order);
482 } 482 }
483 483
484 /* 484 /*
485 * This function checks whether a page is free && is the buddy 485 * This function checks whether a page is free && is the buddy
486 * we can do coalesce a page and its buddy if 486 * we can do coalesce a page and its buddy if
487 * (a) the buddy is not in a hole && 487 * (a) the buddy is not in a hole &&
488 * (b) the buddy is in the buddy system && 488 * (b) the buddy is in the buddy system &&
489 * (c) a page and its buddy have the same order && 489 * (c) a page and its buddy have the same order &&
490 * (d) a page and its buddy are in the same zone. 490 * (d) a page and its buddy are in the same zone.
491 * 491 *
492 * For recording whether a page is in the buddy system, we set ->_mapcount 492 * For recording whether a page is in the buddy system, we set ->_mapcount
493 * PAGE_BUDDY_MAPCOUNT_VALUE. 493 * PAGE_BUDDY_MAPCOUNT_VALUE.
494 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is 494 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
495 * serialized by zone->lock. 495 * serialized by zone->lock.
496 * 496 *
497 * For recording page's order, we use page_private(page). 497 * For recording page's order, we use page_private(page).
498 */ 498 */
499 static inline int page_is_buddy(struct page *page, struct page *buddy, 499 static inline int page_is_buddy(struct page *page, struct page *buddy,
500 int order) 500 int order)
501 { 501 {
502 if (!pfn_valid_within(page_to_pfn(buddy))) 502 if (!pfn_valid_within(page_to_pfn(buddy)))
503 return 0; 503 return 0;
504 504
505 if (page_zone_id(page) != page_zone_id(buddy)) 505 if (page_zone_id(page) != page_zone_id(buddy))
506 return 0; 506 return 0;
507 507
508 if (page_is_guard(buddy) && page_order(buddy) == order) { 508 if (page_is_guard(buddy) && page_order(buddy) == order) {
509 VM_BUG_ON(page_count(buddy) != 0); 509 VM_BUG_ON(page_count(buddy) != 0);
510 return 1; 510 return 1;
511 } 511 }
512 512
513 if (PageBuddy(buddy) && page_order(buddy) == order) { 513 if (PageBuddy(buddy) && page_order(buddy) == order) {
514 VM_BUG_ON(page_count(buddy) != 0); 514 VM_BUG_ON(page_count(buddy) != 0);
515 return 1; 515 return 1;
516 } 516 }
517 return 0; 517 return 0;
518 } 518 }
519 519
520 /* 520 /*
521 * Freeing function for a buddy system allocator. 521 * Freeing function for a buddy system allocator.
522 * 522 *
523 * The concept of a buddy system is to maintain direct-mapped table 523 * The concept of a buddy system is to maintain direct-mapped table
524 * (containing bit values) for memory blocks of various "orders". 524 * (containing bit values) for memory blocks of various "orders".
525 * The bottom level table contains the map for the smallest allocatable 525 * The bottom level table contains the map for the smallest allocatable
526 * units of memory (here, pages), and each level above it describes 526 * units of memory (here, pages), and each level above it describes
527 * pairs of units from the levels below, hence, "buddies". 527 * pairs of units from the levels below, hence, "buddies".
528 * At a high level, all that happens here is marking the table entry 528 * At a high level, all that happens here is marking the table entry
529 * at the bottom level available, and propagating the changes upward 529 * at the bottom level available, and propagating the changes upward
530 * as necessary, plus some accounting needed to play nicely with other 530 * as necessary, plus some accounting needed to play nicely with other
531 * parts of the VM system. 531 * parts of the VM system.
532 * At each level, we keep a list of pages, which are heads of continuous 532 * At each level, we keep a list of pages, which are heads of continuous
533 * free pages of length of (1 << order) and marked with _mapcount 533 * free pages of length of (1 << order) and marked with _mapcount
534 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) 534 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
535 * field. 535 * field.
536 * So when we are allocating or freeing one, we can derive the state of the 536 * So when we are allocating or freeing one, we can derive the state of the
537 * other. That is, if we allocate a small block, and both were 537 * other. That is, if we allocate a small block, and both were
538 * free, the remainder of the region must be split into blocks. 538 * free, the remainder of the region must be split into blocks.
539 * If a block is freed, and its buddy is also free, then this 539 * If a block is freed, and its buddy is also free, then this
540 * triggers coalescing into a block of larger size. 540 * triggers coalescing into a block of larger size.
541 * 541 *
542 * -- nyc 542 * -- nyc
543 */ 543 */
544 544
545 static inline void __free_one_page(struct page *page, 545 static inline void __free_one_page(struct page *page,
546 struct zone *zone, unsigned int order, 546 struct zone *zone, unsigned int order,
547 int migratetype) 547 int migratetype)
548 { 548 {
549 unsigned long page_idx; 549 unsigned long page_idx;
550 unsigned long combined_idx; 550 unsigned long combined_idx;
551 unsigned long uninitialized_var(buddy_idx); 551 unsigned long uninitialized_var(buddy_idx);
552 struct page *buddy; 552 struct page *buddy;
553 553
554 VM_BUG_ON(!zone_is_initialized(zone)); 554 VM_BUG_ON(!zone_is_initialized(zone));
555 555
556 if (unlikely(PageCompound(page))) 556 if (unlikely(PageCompound(page)))
557 if (unlikely(destroy_compound_page(page, order))) 557 if (unlikely(destroy_compound_page(page, order)))
558 return; 558 return;
559 559
560 VM_BUG_ON(migratetype == -1); 560 VM_BUG_ON(migratetype == -1);
561 561
562 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 562 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
563 563
564 VM_BUG_ON(page_idx & ((1 << order) - 1)); 564 VM_BUG_ON(page_idx & ((1 << order) - 1));
565 VM_BUG_ON(bad_range(zone, page)); 565 VM_BUG_ON(bad_range(zone, page));
566 566
567 while (order < MAX_ORDER-1) { 567 while (order < MAX_ORDER-1) {
568 buddy_idx = __find_buddy_index(page_idx, order); 568 buddy_idx = __find_buddy_index(page_idx, order);
569 buddy = page + (buddy_idx - page_idx); 569 buddy = page + (buddy_idx - page_idx);
570 if (!page_is_buddy(page, buddy, order)) 570 if (!page_is_buddy(page, buddy, order))
571 break; 571 break;
572 /* 572 /*
573 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 573 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
574 * merge with it and move up one order. 574 * merge with it and move up one order.
575 */ 575 */
576 if (page_is_guard(buddy)) { 576 if (page_is_guard(buddy)) {
577 clear_page_guard_flag(buddy); 577 clear_page_guard_flag(buddy);
578 set_page_private(page, 0); 578 set_page_private(page, 0);
579 __mod_zone_freepage_state(zone, 1 << order, 579 __mod_zone_freepage_state(zone, 1 << order,
580 migratetype); 580 migratetype);
581 } else { 581 } else {
582 list_del(&buddy->lru); 582 list_del(&buddy->lru);
583 zone->free_area[order].nr_free--; 583 zone->free_area[order].nr_free--;
584 rmv_page_order(buddy); 584 rmv_page_order(buddy);
585 } 585 }
586 combined_idx = buddy_idx & page_idx; 586 combined_idx = buddy_idx & page_idx;
587 page = page + (combined_idx - page_idx); 587 page = page + (combined_idx - page_idx);
588 page_idx = combined_idx; 588 page_idx = combined_idx;
589 order++; 589 order++;
590 } 590 }
591 set_page_order(page, order); 591 set_page_order(page, order);
592 592
593 /* 593 /*
594 * If this is not the largest possible page, check if the buddy 594 * If this is not the largest possible page, check if the buddy
595 * of the next-highest order is free. If it is, it's possible 595 * of the next-highest order is free. If it is, it's possible
596 * that pages are being freed that will coalesce soon. In case, 596 * that pages are being freed that will coalesce soon. In case,
597 * that is happening, add the free page to the tail of the list 597 * that is happening, add the free page to the tail of the list
598 * so it's less likely to be used soon and more likely to be merged 598 * so it's less likely to be used soon and more likely to be merged
599 * as a higher order page 599 * as a higher order page
600 */ 600 */
601 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 601 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
602 struct page *higher_page, *higher_buddy; 602 struct page *higher_page, *higher_buddy;
603 combined_idx = buddy_idx & page_idx; 603 combined_idx = buddy_idx & page_idx;
604 higher_page = page + (combined_idx - page_idx); 604 higher_page = page + (combined_idx - page_idx);
605 buddy_idx = __find_buddy_index(combined_idx, order + 1); 605 buddy_idx = __find_buddy_index(combined_idx, order + 1);
606 higher_buddy = higher_page + (buddy_idx - combined_idx); 606 higher_buddy = higher_page + (buddy_idx - combined_idx);
607 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 607 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
608 list_add_tail(&page->lru, 608 list_add_tail(&page->lru,
609 &zone->free_area[order].free_list[migratetype]); 609 &zone->free_area[order].free_list[migratetype]);
610 goto out; 610 goto out;
611 } 611 }
612 } 612 }
613 613
614 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 614 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
615 out: 615 out:
616 zone->free_area[order].nr_free++; 616 zone->free_area[order].nr_free++;
617 } 617 }
618 618
619 static inline int free_pages_check(struct page *page) 619 static inline int free_pages_check(struct page *page)
620 { 620 {
621 if (unlikely(page_mapcount(page) | 621 if (unlikely(page_mapcount(page) |
622 (page->mapping != NULL) | 622 (page->mapping != NULL) |
623 (atomic_read(&page->_count) != 0) | 623 (atomic_read(&page->_count) != 0) |
624 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 624 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
625 (mem_cgroup_bad_page_check(page)))) { 625 (mem_cgroup_bad_page_check(page)))) {
626 bad_page(page); 626 bad_page(page);
627 return 1; 627 return 1;
628 } 628 }
629 page_cpupid_reset_last(page); 629 page_cpupid_reset_last(page);
630 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 630 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
631 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 631 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
632 return 0; 632 return 0;
633 } 633 }
634 634
635 /* 635 /*
636 * Frees a number of pages from the PCP lists 636 * Frees a number of pages from the PCP lists
637 * Assumes all pages on list are in same zone, and of same order. 637 * Assumes all pages on list are in same zone, and of same order.
638 * count is the number of pages to free. 638 * count is the number of pages to free.
639 * 639 *
640 * If the zone was previously in an "all pages pinned" state then look to 640 * If the zone was previously in an "all pages pinned" state then look to
641 * see if this freeing clears that state. 641 * see if this freeing clears that state.
642 * 642 *
643 * And clear the zone's pages_scanned counter, to hold off the "all pages are 643 * And clear the zone's pages_scanned counter, to hold off the "all pages are
644 * pinned" detection logic. 644 * pinned" detection logic.
645 */ 645 */
646 static void free_pcppages_bulk(struct zone *zone, int count, 646 static void free_pcppages_bulk(struct zone *zone, int count,
647 struct per_cpu_pages *pcp) 647 struct per_cpu_pages *pcp)
648 { 648 {
649 int migratetype = 0; 649 int migratetype = 0;
650 int batch_free = 0; 650 int batch_free = 0;
651 int to_free = count; 651 int to_free = count;
652 652
653 spin_lock(&zone->lock); 653 spin_lock(&zone->lock);
654 zone->pages_scanned = 0; 654 zone->pages_scanned = 0;
655 655
656 while (to_free) { 656 while (to_free) {
657 struct page *page; 657 struct page *page;
658 struct list_head *list; 658 struct list_head *list;
659 659
660 /* 660 /*
661 * Remove pages from lists in a round-robin fashion. A 661 * Remove pages from lists in a round-robin fashion. A
662 * batch_free count is maintained that is incremented when an 662 * batch_free count is maintained that is incremented when an
663 * empty list is encountered. This is so more pages are freed 663 * empty list is encountered. This is so more pages are freed
664 * off fuller lists instead of spinning excessively around empty 664 * off fuller lists instead of spinning excessively around empty
665 * lists 665 * lists
666 */ 666 */
667 do { 667 do {
668 batch_free++; 668 batch_free++;
669 if (++migratetype == MIGRATE_PCPTYPES) 669 if (++migratetype == MIGRATE_PCPTYPES)
670 migratetype = 0; 670 migratetype = 0;
671 list = &pcp->lists[migratetype]; 671 list = &pcp->lists[migratetype];
672 } while (list_empty(list)); 672 } while (list_empty(list));
673 673
674 /* This is the only non-empty list. Free them all. */ 674 /* This is the only non-empty list. Free them all. */
675 if (batch_free == MIGRATE_PCPTYPES) 675 if (batch_free == MIGRATE_PCPTYPES)
676 batch_free = to_free; 676 batch_free = to_free;
677 677
678 do { 678 do {
679 int mt; /* migratetype of the to-be-freed page */ 679 int mt; /* migratetype of the to-be-freed page */
680 680
681 page = list_entry(list->prev, struct page, lru); 681 page = list_entry(list->prev, struct page, lru);
682 /* must delete as __free_one_page list manipulates */ 682 /* must delete as __free_one_page list manipulates */
683 list_del(&page->lru); 683 list_del(&page->lru);
684 mt = get_freepage_migratetype(page); 684 mt = get_freepage_migratetype(page);
685 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 685 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
686 __free_one_page(page, zone, 0, mt); 686 __free_one_page(page, zone, 0, mt);
687 trace_mm_page_pcpu_drain(page, 0, mt); 687 trace_mm_page_pcpu_drain(page, 0, mt);
688 if (likely(!is_migrate_isolate_page(page))) { 688 if (likely(!is_migrate_isolate_page(page))) {
689 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 689 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
690 if (is_migrate_cma(mt)) 690 if (is_migrate_cma(mt))
691 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 691 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
692 } 692 }
693 } while (--to_free && --batch_free && !list_empty(list)); 693 } while (--to_free && --batch_free && !list_empty(list));
694 } 694 }
695 spin_unlock(&zone->lock); 695 spin_unlock(&zone->lock);
696 } 696 }
697 697
698 static void free_one_page(struct zone *zone, struct page *page, int order, 698 static void free_one_page(struct zone *zone, struct page *page, int order,
699 int migratetype) 699 int migratetype)
700 { 700 {
701 spin_lock(&zone->lock); 701 spin_lock(&zone->lock);
702 zone->pages_scanned = 0; 702 zone->pages_scanned = 0;
703 703
704 __free_one_page(page, zone, order, migratetype); 704 __free_one_page(page, zone, order, migratetype);
705 if (unlikely(!is_migrate_isolate(migratetype))) 705 if (unlikely(!is_migrate_isolate(migratetype)))
706 __mod_zone_freepage_state(zone, 1 << order, migratetype); 706 __mod_zone_freepage_state(zone, 1 << order, migratetype);
707 spin_unlock(&zone->lock); 707 spin_unlock(&zone->lock);
708 } 708 }
709 709
710 static bool free_pages_prepare(struct page *page, unsigned int order) 710 static bool free_pages_prepare(struct page *page, unsigned int order)
711 { 711 {
712 int i; 712 int i;
713 int bad = 0; 713 int bad = 0;
714 714
715 trace_mm_page_free(page, order); 715 trace_mm_page_free(page, order);
716 kmemcheck_free_shadow(page, order); 716 kmemcheck_free_shadow(page, order);
717 717
718 if (PageAnon(page)) 718 if (PageAnon(page))
719 page->mapping = NULL; 719 page->mapping = NULL;
720 for (i = 0; i < (1 << order); i++) 720 for (i = 0; i < (1 << order); i++)
721 bad += free_pages_check(page + i); 721 bad += free_pages_check(page + i);
722 if (bad) 722 if (bad)
723 return false; 723 return false;
724 724
725 if (!PageHighMem(page)) { 725 if (!PageHighMem(page)) {
726 debug_check_no_locks_freed(page_address(page), 726 debug_check_no_locks_freed(page_address(page),
727 PAGE_SIZE << order); 727 PAGE_SIZE << order);
728 debug_check_no_obj_freed(page_address(page), 728 debug_check_no_obj_freed(page_address(page),
729 PAGE_SIZE << order); 729 PAGE_SIZE << order);
730 } 730 }
731 arch_free_page(page, order); 731 arch_free_page(page, order);
732 kernel_map_pages(page, 1 << order, 0); 732 kernel_map_pages(page, 1 << order, 0);
733 733
734 return true; 734 return true;
735 } 735 }
736 736
737 static void __free_pages_ok(struct page *page, unsigned int order) 737 static void __free_pages_ok(struct page *page, unsigned int order)
738 { 738 {
739 unsigned long flags; 739 unsigned long flags;
740 int migratetype; 740 int migratetype;
741 741
742 if (!free_pages_prepare(page, order)) 742 if (!free_pages_prepare(page, order))
743 return; 743 return;
744 744
745 local_irq_save(flags); 745 local_irq_save(flags);
746 __count_vm_events(PGFREE, 1 << order); 746 __count_vm_events(PGFREE, 1 << order);
747 migratetype = get_pageblock_migratetype(page); 747 migratetype = get_pageblock_migratetype(page);
748 set_freepage_migratetype(page, migratetype); 748 set_freepage_migratetype(page, migratetype);
749 free_one_page(page_zone(page), page, order, migratetype); 749 free_one_page(page_zone(page), page, order, migratetype);
750 local_irq_restore(flags); 750 local_irq_restore(flags);
751 } 751 }
752 752
753 void __init __free_pages_bootmem(struct page *page, unsigned int order) 753 void __init __free_pages_bootmem(struct page *page, unsigned int order)
754 { 754 {
755 unsigned int nr_pages = 1 << order; 755 unsigned int nr_pages = 1 << order;
756 struct page *p = page; 756 struct page *p = page;
757 unsigned int loop; 757 unsigned int loop;
758 758
759 prefetchw(p); 759 prefetchw(p);
760 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 760 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
761 prefetchw(p + 1); 761 prefetchw(p + 1);
762 __ClearPageReserved(p); 762 __ClearPageReserved(p);
763 set_page_count(p, 0); 763 set_page_count(p, 0);
764 } 764 }
765 __ClearPageReserved(p); 765 __ClearPageReserved(p);
766 set_page_count(p, 0); 766 set_page_count(p, 0);
767 767
768 page_zone(page)->managed_pages += nr_pages; 768 page_zone(page)->managed_pages += nr_pages;
769 set_page_refcounted(page); 769 set_page_refcounted(page);
770 __free_pages(page, order); 770 __free_pages(page, order);
771 } 771 }
772 772
773 #ifdef CONFIG_CMA 773 #ifdef CONFIG_CMA
774 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 774 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
775 void __init init_cma_reserved_pageblock(struct page *page) 775 void __init init_cma_reserved_pageblock(struct page *page)
776 { 776 {
777 unsigned i = pageblock_nr_pages; 777 unsigned i = pageblock_nr_pages;
778 struct page *p = page; 778 struct page *p = page;
779 779
780 do { 780 do {
781 __ClearPageReserved(p); 781 __ClearPageReserved(p);
782 set_page_count(p, 0); 782 set_page_count(p, 0);
783 } while (++p, --i); 783 } while (++p, --i);
784 784
785 set_page_refcounted(page); 785 set_page_refcounted(page);
786 set_pageblock_migratetype(page, MIGRATE_CMA); 786 set_pageblock_migratetype(page, MIGRATE_CMA);
787 __free_pages(page, pageblock_order); 787 __free_pages(page, pageblock_order);
788 adjust_managed_page_count(page, pageblock_nr_pages); 788 adjust_managed_page_count(page, pageblock_nr_pages);
789 } 789 }
790 #endif 790 #endif
791 791
792 /* 792 /*
793 * The order of subdivision here is critical for the IO subsystem. 793 * The order of subdivision here is critical for the IO subsystem.
794 * Please do not alter this order without good reasons and regression 794 * Please do not alter this order without good reasons and regression
795 * testing. Specifically, as large blocks of memory are subdivided, 795 * testing. Specifically, as large blocks of memory are subdivided,
796 * the order in which smaller blocks are delivered depends on the order 796 * the order in which smaller blocks are delivered depends on the order
797 * they're subdivided in this function. This is the primary factor 797 * they're subdivided in this function. This is the primary factor
798 * influencing the order in which pages are delivered to the IO 798 * influencing the order in which pages are delivered to the IO
799 * subsystem according to empirical testing, and this is also justified 799 * subsystem according to empirical testing, and this is also justified
800 * by considering the behavior of a buddy system containing a single 800 * by considering the behavior of a buddy system containing a single
801 * large block of memory acted on by a series of small allocations. 801 * large block of memory acted on by a series of small allocations.
802 * This behavior is a critical factor in sglist merging's success. 802 * This behavior is a critical factor in sglist merging's success.
803 * 803 *
804 * -- nyc 804 * -- nyc
805 */ 805 */
806 static inline void expand(struct zone *zone, struct page *page, 806 static inline void expand(struct zone *zone, struct page *page,
807 int low, int high, struct free_area *area, 807 int low, int high, struct free_area *area,
808 int migratetype) 808 int migratetype)
809 { 809 {
810 unsigned long size = 1 << high; 810 unsigned long size = 1 << high;
811 811
812 while (high > low) { 812 while (high > low) {
813 area--; 813 area--;
814 high--; 814 high--;
815 size >>= 1; 815 size >>= 1;
816 VM_BUG_ON(bad_range(zone, &page[size])); 816 VM_BUG_ON(bad_range(zone, &page[size]));
817 817
818 #ifdef CONFIG_DEBUG_PAGEALLOC 818 #ifdef CONFIG_DEBUG_PAGEALLOC
819 if (high < debug_guardpage_minorder()) { 819 if (high < debug_guardpage_minorder()) {
820 /* 820 /*
821 * Mark as guard pages (or page), that will allow to 821 * Mark as guard pages (or page), that will allow to
822 * merge back to allocator when buddy will be freed. 822 * merge back to allocator when buddy will be freed.
823 * Corresponding page table entries will not be touched, 823 * Corresponding page table entries will not be touched,
824 * pages will stay not present in virtual address space 824 * pages will stay not present in virtual address space
825 */ 825 */
826 INIT_LIST_HEAD(&page[size].lru); 826 INIT_LIST_HEAD(&page[size].lru);
827 set_page_guard_flag(&page[size]); 827 set_page_guard_flag(&page[size]);
828 set_page_private(&page[size], high); 828 set_page_private(&page[size], high);
829 /* Guard pages are not available for any usage */ 829 /* Guard pages are not available for any usage */
830 __mod_zone_freepage_state(zone, -(1 << high), 830 __mod_zone_freepage_state(zone, -(1 << high),
831 migratetype); 831 migratetype);
832 continue; 832 continue;
833 } 833 }
834 #endif 834 #endif
835 list_add(&page[size].lru, &area->free_list[migratetype]); 835 list_add(&page[size].lru, &area->free_list[migratetype]);
836 area->nr_free++; 836 area->nr_free++;
837 set_page_order(&page[size], high); 837 set_page_order(&page[size], high);
838 } 838 }
839 } 839 }
840 840
841 /* 841 /*
842 * This page is about to be returned from the page allocator 842 * This page is about to be returned from the page allocator
843 */ 843 */
844 static inline int check_new_page(struct page *page) 844 static inline int check_new_page(struct page *page)
845 { 845 {
846 if (unlikely(page_mapcount(page) | 846 if (unlikely(page_mapcount(page) |
847 (page->mapping != NULL) | 847 (page->mapping != NULL) |
848 (atomic_read(&page->_count) != 0) | 848 (atomic_read(&page->_count) != 0) |
849 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 849 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
850 (mem_cgroup_bad_page_check(page)))) { 850 (mem_cgroup_bad_page_check(page)))) {
851 bad_page(page); 851 bad_page(page);
852 return 1; 852 return 1;
853 } 853 }
854 return 0; 854 return 0;
855 } 855 }
856 856
857 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 857 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
858 { 858 {
859 int i; 859 int i;
860 860
861 for (i = 0; i < (1 << order); i++) { 861 for (i = 0; i < (1 << order); i++) {
862 struct page *p = page + i; 862 struct page *p = page + i;
863 if (unlikely(check_new_page(p))) 863 if (unlikely(check_new_page(p)))
864 return 1; 864 return 1;
865 } 865 }
866 866
867 set_page_private(page, 0); 867 set_page_private(page, 0);
868 set_page_refcounted(page); 868 set_page_refcounted(page);
869 869
870 arch_alloc_page(page, order); 870 arch_alloc_page(page, order);
871 kernel_map_pages(page, 1 << order, 1); 871 kernel_map_pages(page, 1 << order, 1);
872 872
873 if (gfp_flags & __GFP_ZERO) 873 if (gfp_flags & __GFP_ZERO)
874 prep_zero_page(page, order, gfp_flags); 874 prep_zero_page(page, order, gfp_flags);
875 875
876 if (order && (gfp_flags & __GFP_COMP)) 876 if (order && (gfp_flags & __GFP_COMP))
877 prep_compound_page(page, order); 877 prep_compound_page(page, order);
878 878
879 return 0; 879 return 0;
880 } 880 }
881 881
882 /* 882 /*
883 * Go through the free lists for the given migratetype and remove 883 * Go through the free lists for the given migratetype and remove
884 * the smallest available page from the freelists 884 * the smallest available page from the freelists
885 */ 885 */
886 static inline 886 static inline
887 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 887 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
888 int migratetype) 888 int migratetype)
889 { 889 {
890 unsigned int current_order; 890 unsigned int current_order;
891 struct free_area *area; 891 struct free_area *area;
892 struct page *page; 892 struct page *page;
893 893
894 /* Find a page of the appropriate size in the preferred list */ 894 /* Find a page of the appropriate size in the preferred list */
895 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 895 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
896 area = &(zone->free_area[current_order]); 896 area = &(zone->free_area[current_order]);
897 if (list_empty(&area->free_list[migratetype])) 897 if (list_empty(&area->free_list[migratetype]))
898 continue; 898 continue;
899 899
900 page = list_entry(area->free_list[migratetype].next, 900 page = list_entry(area->free_list[migratetype].next,
901 struct page, lru); 901 struct page, lru);
902 list_del(&page->lru); 902 list_del(&page->lru);
903 rmv_page_order(page); 903 rmv_page_order(page);
904 area->nr_free--; 904 area->nr_free--;
905 expand(zone, page, order, current_order, area, migratetype); 905 expand(zone, page, order, current_order, area, migratetype);
906 return page; 906 return page;
907 } 907 }
908 908
909 return NULL; 909 return NULL;
910 } 910 }
911 911
912 912
913 /* 913 /*
914 * This array describes the order lists are fallen back to when 914 * This array describes the order lists are fallen back to when
915 * the free lists for the desirable migrate type are depleted 915 * the free lists for the desirable migrate type are depleted
916 */ 916 */
917 static int fallbacks[MIGRATE_TYPES][4] = { 917 static int fallbacks[MIGRATE_TYPES][4] = {
918 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 918 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
919 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 919 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
920 #ifdef CONFIG_CMA 920 #ifdef CONFIG_CMA
921 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 921 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
922 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 922 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
923 #else 923 #else
924 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 924 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
925 #endif 925 #endif
926 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 926 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
927 #ifdef CONFIG_MEMORY_ISOLATION 927 #ifdef CONFIG_MEMORY_ISOLATION
928 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 928 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
929 #endif 929 #endif
930 }; 930 };
931 931
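The table above is consumed by the for (i = 0;; i++) scan in __rmqueue_fallback() further down, which stops as soon as it sees MIGRATE_RESERVE. A minimal stand-alone illustration of that walk (the enum is a simplified stand-in without CMA or ISOLATE; the authoritative definition lives in include/linux/mmzone.h):

/* Stand-alone illustration; simplified, not kernel code. */
#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,
       MIGRATE_RESERVE, MIGRATE_TYPES };

static const char *names[MIGRATE_TYPES] = {
	"UNMOVABLE", "RECLAIMABLE", "MOVABLE", "RESERVE"
};

static const int fallbacks[MIGRATE_TYPES][4] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE },	/* never used */
};

int main(void)
{
	int start = MIGRATE_UNMOVABLE, i, mt;

	printf("fallback order for %s:", names[start]);
	for (i = 0; (mt = fallbacks[start][i]) != MIGRATE_RESERVE; i++)
		printf(" %s", names[mt]);
	printf(" (MIGRATE_RESERVE itself is handled separately)\n");
	return 0;
}

MIGRATE_RESERVE terminates every row, which is why the walk can simply break when it reaches it and leave reserve pages to __rmqueue().
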
932 /* 932 /*
933 * Move the free pages in a range to the free lists of the requested type. 933 * Move the free pages in a range to the free lists of the requested type.
934 * Note that start_page and end_page are not necessarily aligned on a pageblock 934 * Note that start_page and end_page are not necessarily aligned on a pageblock
935 * boundary. If alignment is required, use move_freepages_block() 935 * boundary. If alignment is required, use move_freepages_block()
936 */ 936 */
937 int move_freepages(struct zone *zone, 937 int move_freepages(struct zone *zone,
938 struct page *start_page, struct page *end_page, 938 struct page *start_page, struct page *end_page,
939 int migratetype) 939 int migratetype)
940 { 940 {
941 struct page *page; 941 struct page *page;
942 unsigned long order; 942 unsigned long order;
943 int pages_moved = 0; 943 int pages_moved = 0;
944 944
945 #ifndef CONFIG_HOLES_IN_ZONE 945 #ifndef CONFIG_HOLES_IN_ZONE
946 /* 946 /*
947 * page_zone is not safe to call in this context when 947 * page_zone is not safe to call in this context when
948 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 948 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
949 * anyway as we check zone boundaries in move_freepages_block(). 949 * anyway as we check zone boundaries in move_freepages_block().
950 * Remove at a later date when no bug reports exist related to 950 * Remove at a later date when no bug reports exist related to
951 * grouping pages by mobility 951 * grouping pages by mobility
952 */ 952 */
953 BUG_ON(page_zone(start_page) != page_zone(end_page)); 953 BUG_ON(page_zone(start_page) != page_zone(end_page));
954 #endif 954 #endif
955 955
956 for (page = start_page; page <= end_page;) { 956 for (page = start_page; page <= end_page;) {
957 /* Make sure we are not inadvertently changing nodes */ 957 /* Make sure we are not inadvertently changing nodes */
958 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 958 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
959 959
960 if (!pfn_valid_within(page_to_pfn(page))) { 960 if (!pfn_valid_within(page_to_pfn(page))) {
961 page++; 961 page++;
962 continue; 962 continue;
963 } 963 }
964 964
965 if (!PageBuddy(page)) { 965 if (!PageBuddy(page)) {
966 page++; 966 page++;
967 continue; 967 continue;
968 } 968 }
969 969
970 order = page_order(page); 970 order = page_order(page);
971 list_move(&page->lru, 971 list_move(&page->lru,
972 &zone->free_area[order].free_list[migratetype]); 972 &zone->free_area[order].free_list[migratetype]);
973 set_freepage_migratetype(page, migratetype); 973 set_freepage_migratetype(page, migratetype);
974 page += 1 << order; 974 page += 1 << order;
975 pages_moved += 1 << order; 975 pages_moved += 1 << order;
976 } 976 }
977 977
978 return pages_moved; 978 return pages_moved;
979 } 979 }
980 980
981 int move_freepages_block(struct zone *zone, struct page *page, 981 int move_freepages_block(struct zone *zone, struct page *page,
982 int migratetype) 982 int migratetype)
983 { 983 {
984 unsigned long start_pfn, end_pfn; 984 unsigned long start_pfn, end_pfn;
985 struct page *start_page, *end_page; 985 struct page *start_page, *end_page;
986 986
987 start_pfn = page_to_pfn(page); 987 start_pfn = page_to_pfn(page);
988 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 988 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
989 start_page = pfn_to_page(start_pfn); 989 start_page = pfn_to_page(start_pfn);
990 end_page = start_page + pageblock_nr_pages - 1; 990 end_page = start_page + pageblock_nr_pages - 1;
991 end_pfn = start_pfn + pageblock_nr_pages - 1; 991 end_pfn = start_pfn + pageblock_nr_pages - 1;
992 992
993 /* Do not cross zone boundaries */ 993 /* Do not cross zone boundaries */
994 if (!zone_spans_pfn(zone, start_pfn)) 994 if (!zone_spans_pfn(zone, start_pfn))
995 start_page = page; 995 start_page = page;
996 if (!zone_spans_pfn(zone, end_pfn)) 996 if (!zone_spans_pfn(zone, end_pfn))
997 return 0; 997 return 0;
998 998
999 return move_freepages(zone, start_page, end_page, migratetype); 999 return move_freepages(zone, start_page, end_page, migratetype);
1000 } 1000 }
1001 1001
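The start_pfn/end_pfn computation above is plain round-down-to-a-pageblock arithmetic. A quick stand-alone check, assuming pageblock_nr_pages = 512 (pageblock_order = 9, one common configuration) and a made-up pfn:

/* Illustrative user-space arithmetic only; not kernel code. */
#include <stdio.h>

int main(void)
{
	unsigned long pageblock_nr_pages = 512;	/* assumes pageblock_order == 9 */
	unsigned long pfn = 262733;		/* arbitrary example pfn */
	unsigned long start_pfn = pfn & ~(pageblock_nr_pages - 1);
	unsigned long end_pfn = start_pfn + pageblock_nr_pages - 1;

	/* 262733 & ~511 == 262656, so the block is [262656, 263167] */
	printf("pfn %lu lies in pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
	return 0;
}
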
1002 static void change_pageblock_range(struct page *pageblock_page, 1002 static void change_pageblock_range(struct page *pageblock_page,
1003 int start_order, int migratetype) 1003 int start_order, int migratetype)
1004 { 1004 {
1005 int nr_pageblocks = 1 << (start_order - pageblock_order); 1005 int nr_pageblocks = 1 << (start_order - pageblock_order);
1006 1006
1007 while (nr_pageblocks--) { 1007 while (nr_pageblocks--) {
1008 set_pageblock_migratetype(pageblock_page, migratetype); 1008 set_pageblock_migratetype(pageblock_page, migratetype);
1009 pageblock_page += pageblock_nr_pages; 1009 pageblock_page += pageblock_nr_pages;
1010 } 1010 }
1011 } 1011 }
1012 1012
1013 /* 1013 /*
1014 * If breaking a large block of pages, move all free pages to the preferred 1014 * If breaking a large block of pages, move all free pages to the preferred
1015 * allocation list. If falling back for a reclaimable kernel allocation, be 1015 * allocation list. If falling back for a reclaimable kernel allocation, be
1016 * more aggressive about taking ownership of free pages. 1016 * more aggressive about taking ownership of free pages.
1017 * 1017 *
1018 * On the other hand, never change migration type of MIGRATE_CMA pageblocks 1018 * On the other hand, never change migration type of MIGRATE_CMA pageblocks
1019 * nor move CMA pages to different free lists. We don't want unmovable pages 1019 * nor move CMA pages to different free lists. We don't want unmovable pages
1020 * to be allocated from MIGRATE_CMA areas. 1020 * to be allocated from MIGRATE_CMA areas.
1021 * 1021 *
1022 * Returns the new migratetype of the pageblock (or the same old migratetype 1022 * Returns the new migratetype of the pageblock (or the same old migratetype
1023 * if it was unchanged). 1023 * if it was unchanged).
1024 */ 1024 */
1025 static int try_to_steal_freepages(struct zone *zone, struct page *page, 1025 static int try_to_steal_freepages(struct zone *zone, struct page *page,
1026 int start_type, int fallback_type) 1026 int start_type, int fallback_type)
1027 { 1027 {
1028 int current_order = page_order(page); 1028 int current_order = page_order(page);
1029 1029
1030 /*
1031 * When borrowing from MIGRATE_CMA, we need to release the excess
1032 * buddy pages to CMA itself.
1033 */
1030 if (is_migrate_cma(fallback_type)) 1034 if (is_migrate_cma(fallback_type))
1031 return fallback_type; 1035 return fallback_type;
1032 1036
1033 /* Take ownership for orders >= pageblock_order */ 1037 /* Take ownership for orders >= pageblock_order */
1034 if (current_order >= pageblock_order) { 1038 if (current_order >= pageblock_order) {
1035 change_pageblock_range(page, current_order, start_type); 1039 change_pageblock_range(page, current_order, start_type);
1036 return start_type; 1040 return start_type;
1037 } 1041 }
1038 1042
1039 if (current_order >= pageblock_order / 2 || 1043 if (current_order >= pageblock_order / 2 ||
1040 start_type == MIGRATE_RECLAIMABLE || 1044 start_type == MIGRATE_RECLAIMABLE ||
1041 page_group_by_mobility_disabled) { 1045 page_group_by_mobility_disabled) {
1042 int pages; 1046 int pages;
1043 1047
1044 pages = move_freepages_block(zone, page, start_type); 1048 pages = move_freepages_block(zone, page, start_type);
1045 1049
1046 /* Claim the whole block if over half of it is free */ 1050 /* Claim the whole block if over half of it is free */
1047 if (pages >= (1 << (pageblock_order-1)) || 1051 if (pages >= (1 << (pageblock_order-1)) ||
1048 page_group_by_mobility_disabled) { 1052 page_group_by_mobility_disabled) {
1049 1053
1050 set_pageblock_migratetype(page, start_type); 1054 set_pageblock_migratetype(page, start_type);
1051 return start_type; 1055 return start_type;
1052 } 1056 }
1053 1057
1054 } 1058 }
1055 1059
1056 return fallback_type; 1060 return fallback_type;
1057 } 1061 }
1058 1062
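A hedged stand-alone restatement of the policy above may help when reading the caller below. The thresholds are copied from the code; the MIGRATE_CMA early return and the page_group_by_mobility_disabled override are omitted, pageblock_order = 9 is an assumption, and free_pages_in_block stands in for the move_freepages_block() return value.

/* Stand-alone model of the stealing policy; not kernel code. */
#include <stdio.h>

#define PAGEBLOCK_ORDER 9	/* assumption; the real value is configuration dependent */

enum { UNMOVABLE, RECLAIMABLE, MOVABLE };

/*
 * Returns the migratetype the block (and hence the leftover buddies) should
 * end up with: start_type when the pageblock is claimed, otherwise the
 * fallback type it came from.
 */
static int steal_decision(int current_order, int start_type, int fallback_type,
			  int free_pages_in_block)
{
	if (current_order >= PAGEBLOCK_ORDER)
		return start_type;	/* whole pageblock(s): take ownership */

	if (current_order >= PAGEBLOCK_ORDER / 2 || start_type == RECLAIMABLE) {
		/* free pages were moved; claim the block if over half is free */
		if (free_pages_in_block >= (1 << (PAGEBLOCK_ORDER - 1)))
			return start_type;
	}
	return fallback_type;
}

int main(void)
{
	printf("order-8 UNMOVABLE fallback into MOVABLE, 300 pages free: %s\n",
	       steal_decision(8, UNMOVABLE, MOVABLE, 300) == UNMOVABLE ?
	       "block claimed as UNMOVABLE" : "block stays MOVABLE");
	printf("order-3 UNMOVABLE fallback into MOVABLE: %s\n",
	       steal_decision(3, UNMOVABLE, MOVABLE, 0) == UNMOVABLE ?
	       "block claimed as UNMOVABLE" : "block stays MOVABLE");
	return 0;
}

The value this returns is the new_type the caller now passes to expand(), which is the substance of the fix below.
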
1059 /* Remove an element from the buddy allocator from the fallback list */ 1063 /* Remove an element from the buddy allocator from the fallback list */
1060 static inline struct page * 1064 static inline struct page *
1061 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 1065 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1062 { 1066 {
1063 struct free_area *area; 1067 struct free_area *area;
1064 int current_order; 1068 int current_order;
1065 struct page *page; 1069 struct page *page;
1066 int migratetype, new_type, i; 1070 int migratetype, new_type, i;
1067 1071
1068 /* Find the largest possible block of pages in the other list */ 1072 /* Find the largest possible block of pages in the other list */
1069 for (current_order = MAX_ORDER-1; current_order >= order; 1073 for (current_order = MAX_ORDER-1; current_order >= order;
1070 --current_order) { 1074 --current_order) {
1071 for (i = 0;; i++) { 1075 for (i = 0;; i++) {
1072 migratetype = fallbacks[start_migratetype][i]; 1076 migratetype = fallbacks[start_migratetype][i];
1073 1077
1074 /* MIGRATE_RESERVE handled later if necessary */ 1078 /* MIGRATE_RESERVE handled later if necessary */
1075 if (migratetype == MIGRATE_RESERVE) 1079 if (migratetype == MIGRATE_RESERVE)
1076 break; 1080 break;
1077 1081
1078 area = &(zone->free_area[current_order]); 1082 area = &(zone->free_area[current_order]);
1079 if (list_empty(&area->free_list[migratetype])) 1083 if (list_empty(&area->free_list[migratetype]))
1080 continue; 1084 continue;
1081 1085
1082 page = list_entry(area->free_list[migratetype].next, 1086 page = list_entry(area->free_list[migratetype].next,
1083 struct page, lru); 1087 struct page, lru);
1084 area->nr_free--; 1088 area->nr_free--;
1085 1089
1086 new_type = try_to_steal_freepages(zone, page, 1090 new_type = try_to_steal_freepages(zone, page,
1087 start_migratetype, 1091 start_migratetype,
1088 migratetype); 1092 migratetype);
1089 1093
1090 /* Remove the page from the freelists */ 1094 /* Remove the page from the freelists */
1091 list_del(&page->lru); 1095 list_del(&page->lru);
1092 rmv_page_order(page); 1096 rmv_page_order(page);
1093 1097
1094 /*
1095 * Borrow the excess buddy pages as well, irrespective
1096 * of whether we stole freepages, or took ownership of
1097 * the pageblock or not.
1098 *
1099 * Exception: When borrowing from MIGRATE_CMA, release
1100 * the excess buddy pages to CMA itself.
1101 */
1102 expand(zone, page, order, current_order, area, 1098 expand(zone, page, order, current_order, area,
1103 is_migrate_cma(migratetype) 1099 new_type);
1104 ? migratetype : start_migratetype);
1105 1100
1106 trace_mm_page_alloc_extfrag(page, order, current_order, 1101 trace_mm_page_alloc_extfrag(page, order, current_order,
1107 start_migratetype, migratetype, new_type); 1102 start_migratetype, migratetype, new_type);
1108 1103
1109 return page; 1104 return page;
1110 } 1105 }
1111 } 1106 }
1112 1107
1113 return NULL; 1108 return NULL;
1114 } 1109 }
1115 1110
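To make the one functional change above concrete, consider an illustrative scenario (pageblock_order = 9, grouping by mobility enabled, numbers made up): an order-1 MIGRATE_UNMOVABLE request falls back to an order-3 MIGRATE_MOVABLE block. try_to_steal_freepages() takes neither stealing branch (3 is below pageblock_order / 2 and the request is not RECLAIMABLE), so new_type remains MIGRATE_MOVABLE.

	/*
	 * Old call: the leftover order-1 and order-2 buddies always went to
	 * start_migratetype (UNMOVABLE here) unless the fallback was CMA.
	 */
	expand(zone, page, order, current_order, area,
	       is_migrate_cma(migratetype)
	     ? migratetype : start_migratetype);

	/*
	 * New call: they go back to whichever list new_type names, i.e. the
	 * MOVABLE free lists in this scenario.
	 */
	expand(zone, page, order, current_order, area,
	       new_type);

Returning the remainder to the list it was taken from keeps free pages in a movable pageblock available for movable allocations, rather than feeding them to unmovable ones.
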
1116 /* 1111 /*
1117 * Do the hard work of removing an element from the buddy allocator. 1112 * Do the hard work of removing an element from the buddy allocator.
1118 * Call me with the zone->lock already held. 1113 * Call me with the zone->lock already held.
1119 */ 1114 */
1120 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1115 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1121 int migratetype) 1116 int migratetype)
1122 { 1117 {
1123 struct page *page; 1118 struct page *page;
1124 1119
1125 retry_reserve: 1120 retry_reserve:
1126 page = __rmqueue_smallest(zone, order, migratetype); 1121 page = __rmqueue_smallest(zone, order, migratetype);
1127 1122
1128 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1123 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1129 page = __rmqueue_fallback(zone, order, migratetype); 1124 page = __rmqueue_fallback(zone, order, migratetype);
1130 1125
1131 /* 1126 /*
1132 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1127 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1133 * is used because __rmqueue_smallest is an inline function 1128 * is used because __rmqueue_smallest is an inline function
1134 * and we want just one call site 1129 * and we want just one call site
1135 */ 1130 */
1136 if (!page) { 1131 if (!page) {
1137 migratetype = MIGRATE_RESERVE; 1132 migratetype = MIGRATE_RESERVE;
1138 goto retry_reserve; 1133 goto retry_reserve;
1139 } 1134 }
1140 } 1135 }
1141 1136
1142 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1137 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1143 return page; 1138 return page;
1144 } 1139 }
1145 1140
1146 /* 1141 /*
1147 * Obtain a specified number of elements from the buddy allocator, all under 1142 * Obtain a specified number of elements from the buddy allocator, all under
1148 * a single hold of the lock, for efficiency. Add them to the supplied list. 1143 * a single hold of the lock, for efficiency. Add them to the supplied list.
1149 * Returns the number of new pages which were placed at *list. 1144 * Returns the number of new pages which were placed at *list.
1150 */ 1145 */
1151 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1146 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1152 unsigned long count, struct list_head *list, 1147 unsigned long count, struct list_head *list,
1153 int migratetype, int cold) 1148 int migratetype, int cold)
1154 { 1149 {
1155 int mt = migratetype, i; 1150 int mt = migratetype, i;
1156 1151
1157 spin_lock(&zone->lock); 1152 spin_lock(&zone->lock);
1158 for (i = 0; i < count; ++i) { 1153 for (i = 0; i < count; ++i) {
1159 struct page *page = __rmqueue(zone, order, migratetype); 1154 struct page *page = __rmqueue(zone, order, migratetype);
1160 if (unlikely(page == NULL)) 1155 if (unlikely(page == NULL))
1161 break; 1156 break;
1162 1157
1163 /* 1158 /*
1164 * Split buddy pages returned by expand() are received here 1159 * Split buddy pages returned by expand() are received here
1165 * in physical page order. The page is added to the caller's 1160 * in physical page order. The page is added to the caller's
1166 * list and the list head then moves forward. From the caller's 1161 * list and the list head then moves forward. From the caller's
1167 * perspective, the linked list is ordered by page number in 1162 * perspective, the linked list is ordered by page number in
1168 * some conditions. This is useful for IO devices that can 1163 * some conditions. This is useful for IO devices that can
1169 * merge IO requests if the physical pages are ordered 1164 * merge IO requests if the physical pages are ordered
1170 * properly. 1165 * properly.
1171 */ 1166 */
1172 if (likely(cold == 0)) 1167 if (likely(cold == 0))
1173 list_add(&page->lru, list); 1168 list_add(&page->lru, list);
1174 else 1169 else
1175 list_add_tail(&page->lru, list); 1170 list_add_tail(&page->lru, list);
1176 if (IS_ENABLED(CONFIG_CMA)) { 1171 if (IS_ENABLED(CONFIG_CMA)) {
1177 mt = get_pageblock_migratetype(page); 1172 mt = get_pageblock_migratetype(page);
1178 if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) 1173 if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
1179 mt = migratetype; 1174 mt = migratetype;
1180 } 1175 }
1181 set_freepage_migratetype(page, mt); 1176 set_freepage_migratetype(page, mt);
1182 list = &page->lru; 1177 list = &page->lru;
1183 if (is_migrate_cma(mt)) 1178 if (is_migrate_cma(mt))
1184 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1179 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1185 -(1 << order)); 1180 -(1 << order));
1186 } 1181 }
1187 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1182 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1188 spin_unlock(&zone->lock); 1183 spin_unlock(&zone->lock);
1189 return i; 1184 return i;
1190 } 1185 }
1191 1186
1192 #ifdef CONFIG_NUMA 1187 #ifdef CONFIG_NUMA
1193 /* 1188 /*
1194 * Called from the vmstat counter updater to drain pagesets of this 1189 * Called from the vmstat counter updater to drain pagesets of this
1195 * currently executing processor on remote nodes after they have 1190 * currently executing processor on remote nodes after they have
1196 * expired. 1191 * expired.
1197 * 1192 *
1198 * Note that this function must be called with the thread pinned to 1193 * Note that this function must be called with the thread pinned to
1199 * a single processor. 1194 * a single processor.
1200 */ 1195 */
1201 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1196 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1202 { 1197 {
1203 unsigned long flags; 1198 unsigned long flags;
1204 int to_drain; 1199 int to_drain;
1205 unsigned long batch; 1200 unsigned long batch;
1206 1201
1207 local_irq_save(flags); 1202 local_irq_save(flags);
1208 batch = ACCESS_ONCE(pcp->batch); 1203 batch = ACCESS_ONCE(pcp->batch);
1209 if (pcp->count >= batch) 1204 if (pcp->count >= batch)
1210 to_drain = batch; 1205 to_drain = batch;
1211 else 1206 else
1212 to_drain = pcp->count; 1207 to_drain = pcp->count;
1213 if (to_drain > 0) { 1208 if (to_drain > 0) {
1214 free_pcppages_bulk(zone, to_drain, pcp); 1209 free_pcppages_bulk(zone, to_drain, pcp);
1215 pcp->count -= to_drain; 1210 pcp->count -= to_drain;
1216 } 1211 }
1217 local_irq_restore(flags); 1212 local_irq_restore(flags);
1218 } 1213 }
1219 #endif 1214 #endif
1220 1215
1221 /* 1216 /*
1222 * Drain pages of the indicated processor. 1217 * Drain pages of the indicated processor.
1223 * 1218 *
1224 * The processor must either be the current processor and the 1219 * The processor must either be the current processor and the
1225 * thread pinned to the current processor or a processor that 1220 * thread pinned to the current processor or a processor that
1226 * is not online. 1221 * is not online.
1227 */ 1222 */
1228 static void drain_pages(unsigned int cpu) 1223 static void drain_pages(unsigned int cpu)
1229 { 1224 {
1230 unsigned long flags; 1225 unsigned long flags;
1231 struct zone *zone; 1226 struct zone *zone;
1232 1227
1233 for_each_populated_zone(zone) { 1228 for_each_populated_zone(zone) {
1234 struct per_cpu_pageset *pset; 1229 struct per_cpu_pageset *pset;
1235 struct per_cpu_pages *pcp; 1230 struct per_cpu_pages *pcp;
1236 1231
1237 local_irq_save(flags); 1232 local_irq_save(flags);
1238 pset = per_cpu_ptr(zone->pageset, cpu); 1233 pset = per_cpu_ptr(zone->pageset, cpu);
1239 1234
1240 pcp = &pset->pcp; 1235 pcp = &pset->pcp;
1241 if (pcp->count) { 1236 if (pcp->count) {
1242 free_pcppages_bulk(zone, pcp->count, pcp); 1237 free_pcppages_bulk(zone, pcp->count, pcp);
1243 pcp->count = 0; 1238 pcp->count = 0;
1244 } 1239 }
1245 local_irq_restore(flags); 1240 local_irq_restore(flags);
1246 } 1241 }
1247 } 1242 }
1248 1243
1249 /* 1244 /*
1250 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1245 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1251 */ 1246 */
1252 void drain_local_pages(void *arg) 1247 void drain_local_pages(void *arg)
1253 { 1248 {
1254 drain_pages(smp_processor_id()); 1249 drain_pages(smp_processor_id());
1255 } 1250 }
1256 1251
1257 /* 1252 /*
1258 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1253 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1259 * 1254 *
1260 * Note that this code is protected against sending an IPI to an offline 1255 * Note that this code is protected against sending an IPI to an offline
1261 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1256 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1262 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1257 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1263 * nothing keeps CPUs from showing up after we populated the cpumask and 1258 * nothing keeps CPUs from showing up after we populated the cpumask and
1264 * before the call to on_each_cpu_mask(). 1259 * before the call to on_each_cpu_mask().
1265 */ 1260 */
1266 void drain_all_pages(void) 1261 void drain_all_pages(void)
1267 { 1262 {
1268 int cpu; 1263 int cpu;
1269 struct per_cpu_pageset *pcp; 1264 struct per_cpu_pageset *pcp;
1270 struct zone *zone; 1265 struct zone *zone;
1271 1266
1272 /* 1267 /*
1273 * Allocate in the BSS so we won't require allocation in 1268 * Allocate in the BSS so we won't require allocation in
1274 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1269 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1275 */ 1270 */
1276 static cpumask_t cpus_with_pcps; 1271 static cpumask_t cpus_with_pcps;
1277 1272
1278 /* 1273 /*
1279 * We don't care about racing with CPU hotplug events 1274 * We don't care about racing with CPU hotplug events
1280 * as offline notification will cause the notified 1275 * as offline notification will cause the notified
1281 * cpu to drain that CPU's pcps and on_each_cpu_mask 1276 * cpu to drain that CPU's pcps and on_each_cpu_mask
1282 * disables preemption as part of its processing 1277 * disables preemption as part of its processing
1283 */ 1278 */
1284 for_each_online_cpu(cpu) { 1279 for_each_online_cpu(cpu) {
1285 bool has_pcps = false; 1280 bool has_pcps = false;
1286 for_each_populated_zone(zone) { 1281 for_each_populated_zone(zone) {
1287 pcp = per_cpu_ptr(zone->pageset, cpu); 1282 pcp = per_cpu_ptr(zone->pageset, cpu);
1288 if (pcp->pcp.count) { 1283 if (pcp->pcp.count) {
1289 has_pcps = true; 1284 has_pcps = true;
1290 break; 1285 break;
1291 } 1286 }
1292 } 1287 }
1293 if (has_pcps) 1288 if (has_pcps)
1294 cpumask_set_cpu(cpu, &cpus_with_pcps); 1289 cpumask_set_cpu(cpu, &cpus_with_pcps);
1295 else 1290 else
1296 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1291 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1297 } 1292 }
1298 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1293 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1299 } 1294 }
1300 1295
1301 #ifdef CONFIG_HIBERNATION 1296 #ifdef CONFIG_HIBERNATION
1302 1297
1303 void mark_free_pages(struct zone *zone) 1298 void mark_free_pages(struct zone *zone)
1304 { 1299 {
1305 unsigned long pfn, max_zone_pfn; 1300 unsigned long pfn, max_zone_pfn;
1306 unsigned long flags; 1301 unsigned long flags;
1307 int order, t; 1302 int order, t;
1308 struct list_head *curr; 1303 struct list_head *curr;
1309 1304
1310 if (zone_is_empty(zone)) 1305 if (zone_is_empty(zone))
1311 return; 1306 return;
1312 1307
1313 spin_lock_irqsave(&zone->lock, flags); 1308 spin_lock_irqsave(&zone->lock, flags);
1314 1309
1315 max_zone_pfn = zone_end_pfn(zone); 1310 max_zone_pfn = zone_end_pfn(zone);
1316 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1311 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1317 if (pfn_valid(pfn)) { 1312 if (pfn_valid(pfn)) {
1318 struct page *page = pfn_to_page(pfn); 1313 struct page *page = pfn_to_page(pfn);
1319 1314
1320 if (!swsusp_page_is_forbidden(page)) 1315 if (!swsusp_page_is_forbidden(page))
1321 swsusp_unset_page_free(page); 1316 swsusp_unset_page_free(page);
1322 } 1317 }
1323 1318
1324 for_each_migratetype_order(order, t) { 1319 for_each_migratetype_order(order, t) {
1325 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1320 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1326 unsigned long i; 1321 unsigned long i;
1327 1322
1328 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1323 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1329 for (i = 0; i < (1UL << order); i++) 1324 for (i = 0; i < (1UL << order); i++)
1330 swsusp_set_page_free(pfn_to_page(pfn + i)); 1325 swsusp_set_page_free(pfn_to_page(pfn + i));
1331 } 1326 }
1332 } 1327 }
1333 spin_unlock_irqrestore(&zone->lock, flags); 1328 spin_unlock_irqrestore(&zone->lock, flags);
1334 } 1329 }
1335 #endif /* CONFIG_PM */ 1330 #endif /* CONFIG_PM */
1336 1331
1337 /* 1332 /*
1338 * Free a 0-order page 1333 * Free a 0-order page
1339 * cold == 1 ? free a cold page : free a hot page 1334 * cold == 1 ? free a cold page : free a hot page
1340 */ 1335 */
1341 void free_hot_cold_page(struct page *page, int cold) 1336 void free_hot_cold_page(struct page *page, int cold)
1342 { 1337 {
1343 struct zone *zone = page_zone(page); 1338 struct zone *zone = page_zone(page);
1344 struct per_cpu_pages *pcp; 1339 struct per_cpu_pages *pcp;
1345 unsigned long flags; 1340 unsigned long flags;
1346 int migratetype; 1341 int migratetype;
1347 1342
1348 if (!free_pages_prepare(page, 0)) 1343 if (!free_pages_prepare(page, 0))
1349 return; 1344 return;
1350 1345
1351 migratetype = get_pageblock_migratetype(page); 1346 migratetype = get_pageblock_migratetype(page);
1352 set_freepage_migratetype(page, migratetype); 1347 set_freepage_migratetype(page, migratetype);
1353 local_irq_save(flags); 1348 local_irq_save(flags);
1354 __count_vm_event(PGFREE); 1349 __count_vm_event(PGFREE);
1355 1350
1356 /* 1351 /*
1357 * We only track unmovable, reclaimable and movable on pcp lists. 1352 * We only track unmovable, reclaimable and movable on pcp lists.
1358 * Free ISOLATE pages back to the allocator because they are being 1353 * Free ISOLATE pages back to the allocator because they are being
1359 * offlined but treat RESERVE as movable pages so we can get those 1354 * offlined but treat RESERVE as movable pages so we can get those
1360 * areas back if necessary. Otherwise, we may have to free 1355 * areas back if necessary. Otherwise, we may have to free
1361 * excessively into the page allocator 1356 * excessively into the page allocator
1362 */ 1357 */
1363 if (migratetype >= MIGRATE_PCPTYPES) { 1358 if (migratetype >= MIGRATE_PCPTYPES) {
1364 if (unlikely(is_migrate_isolate(migratetype))) { 1359 if (unlikely(is_migrate_isolate(migratetype))) {
1365 free_one_page(zone, page, 0, migratetype); 1360 free_one_page(zone, page, 0, migratetype);
1366 goto out; 1361 goto out;
1367 } 1362 }
1368 migratetype = MIGRATE_MOVABLE; 1363 migratetype = MIGRATE_MOVABLE;
1369 } 1364 }
1370 1365
1371 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1366 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1372 if (cold) 1367 if (cold)
1373 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1368 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1374 else 1369 else
1375 list_add(&page->lru, &pcp->lists[migratetype]); 1370 list_add(&page->lru, &pcp->lists[migratetype]);
1376 pcp->count++; 1371 pcp->count++;
1377 if (pcp->count >= pcp->high) { 1372 if (pcp->count >= pcp->high) {
1378 unsigned long batch = ACCESS_ONCE(pcp->batch); 1373 unsigned long batch = ACCESS_ONCE(pcp->batch);
1379 free_pcppages_bulk(zone, batch, pcp); 1374 free_pcppages_bulk(zone, batch, pcp);
1380 pcp->count -= batch; 1375 pcp->count -= batch;
1381 } 1376 }
1382 1377
1383 out: 1378 out:
1384 local_irq_restore(flags); 1379 local_irq_restore(flags);
1385 } 1380 }
1386 1381
1387 /* 1382 /*
1388 * Free a list of 0-order pages 1383 * Free a list of 0-order pages
1389 */ 1384 */
1390 void free_hot_cold_page_list(struct list_head *list, int cold) 1385 void free_hot_cold_page_list(struct list_head *list, int cold)
1391 { 1386 {
1392 struct page *page, *next; 1387 struct page *page, *next;
1393 1388
1394 list_for_each_entry_safe(page, next, list, lru) { 1389 list_for_each_entry_safe(page, next, list, lru) {
1395 trace_mm_page_free_batched(page, cold); 1390 trace_mm_page_free_batched(page, cold);
1396 free_hot_cold_page(page, cold); 1391 free_hot_cold_page(page, cold);
1397 } 1392 }
1398 } 1393 }
1399 1394
1400 /* 1395 /*
1401 * split_page takes a non-compound higher-order page, and splits it into 1396 * split_page takes a non-compound higher-order page, and splits it into
1402 * n (1<<order) sub-pages: page[0..n-1] 1397 * n (1<<order) sub-pages: page[0..n-1]
1403 * Each sub-page must be freed individually. 1398 * Each sub-page must be freed individually.
1404 * 1399 *
1405 * Note: this is probably too low level an operation for use in drivers. 1400 * Note: this is probably too low level an operation for use in drivers.
1406 * Please consult with lkml before using this in your driver. 1401 * Please consult with lkml before using this in your driver.
1407 */ 1402 */
1408 void split_page(struct page *page, unsigned int order) 1403 void split_page(struct page *page, unsigned int order)
1409 { 1404 {
1410 int i; 1405 int i;
1411 1406
1412 VM_BUG_ON(PageCompound(page)); 1407 VM_BUG_ON(PageCompound(page));
1413 VM_BUG_ON(!page_count(page)); 1408 VM_BUG_ON(!page_count(page));
1414 1409
1415 #ifdef CONFIG_KMEMCHECK 1410 #ifdef CONFIG_KMEMCHECK
1416 /* 1411 /*
1417 * Split shadow pages too, because free(page[0]) would 1412 * Split shadow pages too, because free(page[0]) would
1418 * otherwise free the whole shadow. 1413 * otherwise free the whole shadow.
1419 */ 1414 */
1420 if (kmemcheck_page_is_tracked(page)) 1415 if (kmemcheck_page_is_tracked(page))
1421 split_page(virt_to_page(page[0].shadow), order); 1416 split_page(virt_to_page(page[0].shadow), order);
1422 #endif 1417 #endif
1423 1418
1424 for (i = 1; i < (1 << order); i++) 1419 for (i = 1; i < (1 << order); i++)
1425 set_page_refcounted(page + i); 1420 set_page_refcounted(page + i);
1426 } 1421 }
1427 EXPORT_SYMBOL_GPL(split_page); 1422 EXPORT_SYMBOL_GPL(split_page);
1428 1423
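A hedged sketch of the calling convention the comment above describes (kernel-context code; example_use_split_page is a made-up name, error handling is minimal, and this is illustrative rather than a recommended driver pattern): allocate a non-compound higher-order page, split it, and later free every sub-page on its own.

/* Illustrative sketch, not part of mm/page_alloc.c. */
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static int example_use_split_page(void)
{
	unsigned int order = 2;			/* four contiguous pages */
	struct page *page = alloc_pages(GFP_KERNEL, order);	/* no __GFP_COMP */
	int i;

	if (!page)
		return -ENOMEM;

	split_page(page, order);	/* page[0..3] now have independent refcounts */

	/* ... hand the sub-pages out; eventually each one is freed individually ... */
	for (i = 0; i < (1 << order); i++)
		__free_page(page + i);
	return 0;
}
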
1429 static int __isolate_free_page(struct page *page, unsigned int order) 1424 static int __isolate_free_page(struct page *page, unsigned int order)
1430 { 1425 {
1431 unsigned long watermark; 1426 unsigned long watermark;
1432 struct zone *zone; 1427 struct zone *zone;
1433 int mt; 1428 int mt;
1434 1429
1435 BUG_ON(!PageBuddy(page)); 1430 BUG_ON(!PageBuddy(page));
1436 1431
1437 zone = page_zone(page); 1432 zone = page_zone(page);
1438 mt = get_pageblock_migratetype(page); 1433 mt = get_pageblock_migratetype(page);
1439 1434
1440 if (!is_migrate_isolate(mt)) { 1435 if (!is_migrate_isolate(mt)) {
1441 /* Obey watermarks as if the page was being allocated */ 1436 /* Obey watermarks as if the page was being allocated */
1442 watermark = low_wmark_pages(zone) + (1 << order); 1437 watermark = low_wmark_pages(zone) + (1 << order);
1443 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1438 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1444 return 0; 1439 return 0;
1445 1440
1446 __mod_zone_freepage_state(zone, -(1UL << order), mt); 1441 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1447 } 1442 }
1448 1443
1449 /* Remove page from free list */ 1444 /* Remove page from free list */
1450 list_del(&page->lru); 1445 list_del(&page->lru);
1451 zone->free_area[order].nr_free--; 1446 zone->free_area[order].nr_free--;
1452 rmv_page_order(page); 1447 rmv_page_order(page);
1453 1448
1454 /* Set the pageblock if the isolated page is at least a pageblock */ 1449 /* Set the pageblock if the isolated page is at least a pageblock */
1455 if (order >= pageblock_order - 1) { 1450 if (order >= pageblock_order - 1) {
1456 struct page *endpage = page + (1 << order) - 1; 1451 struct page *endpage = page + (1 << order) - 1;
1457 for (; page < endpage; page += pageblock_nr_pages) { 1452 for (; page < endpage; page += pageblock_nr_pages) {
1458 int mt = get_pageblock_migratetype(page); 1453 int mt = get_pageblock_migratetype(page);
1459 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) 1454 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
1460 set_pageblock_migratetype(page, 1455 set_pageblock_migratetype(page,
1461 MIGRATE_MOVABLE); 1456 MIGRATE_MOVABLE);
1462 } 1457 }
1463 } 1458 }
1464 1459
1465 return 1UL << order; 1460 return 1UL << order;
1466 } 1461 }
1467 1462
1468 /* 1463 /*
1469 * Similar to split_page except the page is already free. As this is only 1464 * Similar to split_page except the page is already free. As this is only
1470 * being used for migration, the migratetype of the block also changes. 1465 * being used for migration, the migratetype of the block also changes.
1471 * As this is called with interrupts disabled, the caller is responsible 1466 * As this is called with interrupts disabled, the caller is responsible
1472 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1467 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1473 * are enabled. 1468 * are enabled.
1474 * 1469 *
1475 * Note: this is probably too low level an operation for use in drivers. 1470 * Note: this is probably too low level an operation for use in drivers.
1476 * Please consult with lkml before using this in your driver. 1471 * Please consult with lkml before using this in your driver.
1477 */ 1472 */
1478 int split_free_page(struct page *page) 1473 int split_free_page(struct page *page)
1479 { 1474 {
1480 unsigned int order; 1475 unsigned int order;
1481 int nr_pages; 1476 int nr_pages;
1482 1477
1483 order = page_order(page); 1478 order = page_order(page);
1484 1479
1485 nr_pages = __isolate_free_page(page, order); 1480 nr_pages = __isolate_free_page(page, order);
1486 if (!nr_pages) 1481 if (!nr_pages)
1487 return 0; 1482 return 0;
1488 1483
1489 /* Split into individual pages */ 1484 /* Split into individual pages */
1490 set_page_refcounted(page); 1485 set_page_refcounted(page);
1491 split_page(page, order); 1486 split_page(page, order);
1492 return nr_pages; 1487 return nr_pages;
1493 } 1488 }
1494 1489
1495 /* 1490 /*
1496 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1491 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1497 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1492 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1498 * or two. 1493 * or two.
1499 */ 1494 */
1500 static inline 1495 static inline
1501 struct page *buffered_rmqueue(struct zone *preferred_zone, 1496 struct page *buffered_rmqueue(struct zone *preferred_zone,
1502 struct zone *zone, int order, gfp_t gfp_flags, 1497 struct zone *zone, int order, gfp_t gfp_flags,
1503 int migratetype) 1498 int migratetype)
1504 { 1499 {
1505 unsigned long flags; 1500 unsigned long flags;
1506 struct page *page; 1501 struct page *page;
1507 int cold = !!(gfp_flags & __GFP_COLD); 1502 int cold = !!(gfp_flags & __GFP_COLD);
1508 1503
1509 again: 1504 again:
1510 if (likely(order == 0)) { 1505 if (likely(order == 0)) {
1511 struct per_cpu_pages *pcp; 1506 struct per_cpu_pages *pcp;
1512 struct list_head *list; 1507 struct list_head *list;
1513 1508
1514 local_irq_save(flags); 1509 local_irq_save(flags);
1515 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1510 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1516 list = &pcp->lists[migratetype]; 1511 list = &pcp->lists[migratetype];
1517 if (list_empty(list)) { 1512 if (list_empty(list)) {
1518 pcp->count += rmqueue_bulk(zone, 0, 1513 pcp->count += rmqueue_bulk(zone, 0,
1519 pcp->batch, list, 1514 pcp->batch, list,
1520 migratetype, cold); 1515 migratetype, cold);
1521 if (unlikely(list_empty(list))) 1516 if (unlikely(list_empty(list)))
1522 goto failed; 1517 goto failed;
1523 } 1518 }
1524 1519
1525 if (cold) 1520 if (cold)
1526 page = list_entry(list->prev, struct page, lru); 1521 page = list_entry(list->prev, struct page, lru);
1527 else 1522 else
1528 page = list_entry(list->next, struct page, lru); 1523 page = list_entry(list->next, struct page, lru);
1529 1524
1530 list_del(&page->lru); 1525 list_del(&page->lru);
1531 pcp->count--; 1526 pcp->count--;
1532 } else { 1527 } else {
1533 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1528 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1534 /* 1529 /*
1535 * __GFP_NOFAIL is not to be used in new code. 1530 * __GFP_NOFAIL is not to be used in new code.
1536 * 1531 *
1537 * All __GFP_NOFAIL callers should be fixed so that they 1532 * All __GFP_NOFAIL callers should be fixed so that they
1538 * properly detect and handle allocation failures. 1533 * properly detect and handle allocation failures.
1539 * 1534 *
1540 * We most definitely don't want callers attempting to 1535 * We most definitely don't want callers attempting to
1541 * allocate greater than order-1 page units with 1536 * allocate greater than order-1 page units with
1542 * __GFP_NOFAIL. 1537 * __GFP_NOFAIL.
1543 */ 1538 */
1544 WARN_ON_ONCE(order > 1); 1539 WARN_ON_ONCE(order > 1);
1545 } 1540 }
1546 spin_lock_irqsave(&zone->lock, flags); 1541 spin_lock_irqsave(&zone->lock, flags);
1547 page = __rmqueue(zone, order, migratetype); 1542 page = __rmqueue(zone, order, migratetype);
1548 spin_unlock(&zone->lock); 1543 spin_unlock(&zone->lock);
1549 if (!page) 1544 if (!page)
1550 goto failed; 1545 goto failed;
1551 __mod_zone_freepage_state(zone, -(1 << order), 1546 __mod_zone_freepage_state(zone, -(1 << order),
1552 get_pageblock_migratetype(page)); 1547 get_pageblock_migratetype(page));
1553 } 1548 }
1554 1549
1555 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1550 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1556 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1551 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1557 zone_statistics(preferred_zone, zone, gfp_flags); 1552 zone_statistics(preferred_zone, zone, gfp_flags);
1558 local_irq_restore(flags); 1553 local_irq_restore(flags);
1559 1554
1560 VM_BUG_ON(bad_range(zone, page)); 1555 VM_BUG_ON(bad_range(zone, page));
1561 if (prep_new_page(page, order, gfp_flags)) 1556 if (prep_new_page(page, order, gfp_flags))
1562 goto again; 1557 goto again;
1563 return page; 1558 return page;
1564 1559
1565 failed: 1560 failed:
1566 local_irq_restore(flags); 1561 local_irq_restore(flags);
1567 return NULL; 1562 return NULL;
1568 } 1563 }
1569 1564
1570 #ifdef CONFIG_FAIL_PAGE_ALLOC 1565 #ifdef CONFIG_FAIL_PAGE_ALLOC
1571 1566
1572 static struct { 1567 static struct {
1573 struct fault_attr attr; 1568 struct fault_attr attr;
1574 1569
1575 u32 ignore_gfp_highmem; 1570 u32 ignore_gfp_highmem;
1576 u32 ignore_gfp_wait; 1571 u32 ignore_gfp_wait;
1577 u32 min_order; 1572 u32 min_order;
1578 } fail_page_alloc = { 1573 } fail_page_alloc = {
1579 .attr = FAULT_ATTR_INITIALIZER, 1574 .attr = FAULT_ATTR_INITIALIZER,
1580 .ignore_gfp_wait = 1, 1575 .ignore_gfp_wait = 1,
1581 .ignore_gfp_highmem = 1, 1576 .ignore_gfp_highmem = 1,
1582 .min_order = 1, 1577 .min_order = 1,
1583 }; 1578 };
1584 1579
1585 static int __init setup_fail_page_alloc(char *str) 1580 static int __init setup_fail_page_alloc(char *str)
1586 { 1581 {
1587 return setup_fault_attr(&fail_page_alloc.attr, str); 1582 return setup_fault_attr(&fail_page_alloc.attr, str);
1588 } 1583 }
1589 __setup("fail_page_alloc=", setup_fail_page_alloc); 1584 __setup("fail_page_alloc=", setup_fail_page_alloc);
1590 1585
1591 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1586 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1592 { 1587 {
1593 if (order < fail_page_alloc.min_order) 1588 if (order < fail_page_alloc.min_order)
1594 return false; 1589 return false;
1595 if (gfp_mask & __GFP_NOFAIL) 1590 if (gfp_mask & __GFP_NOFAIL)
1596 return false; 1591 return false;
1597 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1592 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1598 return false; 1593 return false;
1599 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1594 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1600 return false; 1595 return false;
1601 1596
1602 return should_fail(&fail_page_alloc.attr, 1 << order); 1597 return should_fail(&fail_page_alloc.attr, 1 << order);
1603 } 1598 }
1604 1599
1605 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1600 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1606 1601
1607 static int __init fail_page_alloc_debugfs(void) 1602 static int __init fail_page_alloc_debugfs(void)
1608 { 1603 {
1609 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1604 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1610 struct dentry *dir; 1605 struct dentry *dir;
1611 1606
1612 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1607 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1613 &fail_page_alloc.attr); 1608 &fail_page_alloc.attr);
1614 if (IS_ERR(dir)) 1609 if (IS_ERR(dir))
1615 return PTR_ERR(dir); 1610 return PTR_ERR(dir);
1616 1611
1617 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1612 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1618 &fail_page_alloc.ignore_gfp_wait)) 1613 &fail_page_alloc.ignore_gfp_wait))
1619 goto fail; 1614 goto fail;
1620 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1615 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1621 &fail_page_alloc.ignore_gfp_highmem)) 1616 &fail_page_alloc.ignore_gfp_highmem))
1622 goto fail; 1617 goto fail;
1623 if (!debugfs_create_u32("min-order", mode, dir, 1618 if (!debugfs_create_u32("min-order", mode, dir,
1624 &fail_page_alloc.min_order)) 1619 &fail_page_alloc.min_order))
1625 goto fail; 1620 goto fail;
1626 1621
1627 return 0; 1622 return 0;
1628 fail: 1623 fail:
1629 debugfs_remove_recursive(dir); 1624 debugfs_remove_recursive(dir);
1630 1625
1631 return -ENOMEM; 1626 return -ENOMEM;
1632 } 1627 }
1633 1628
1634 late_initcall(fail_page_alloc_debugfs); 1629 late_initcall(fail_page_alloc_debugfs);
1635 1630
1636 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1631 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1637 1632
1638 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1633 #else /* CONFIG_FAIL_PAGE_ALLOC */
1639 1634
1640 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1635 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1641 { 1636 {
1642 return false; 1637 return false;
1643 } 1638 }
1644 1639
1645 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1640 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1646 1641
1647 /* 1642 /*
1648 * Return true if free pages are above 'mark'. This takes into account the order 1643 * Return true if free pages are above 'mark'. This takes into account the order
1649 * of the allocation. 1644 * of the allocation.
1650 */ 1645 */
1651 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1646 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1652 int classzone_idx, int alloc_flags, long free_pages) 1647 int classzone_idx, int alloc_flags, long free_pages)
1653 { 1648 {
1654 /* free_pages may go negative - that's OK */ 1649 /* free_pages may go negative - that's OK */
1655 long min = mark; 1650 long min = mark;
1656 long lowmem_reserve = z->lowmem_reserve[classzone_idx]; 1651 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1657 int o; 1652 int o;
1658 long free_cma = 0; 1653 long free_cma = 0;
1659 1654
1660 free_pages -= (1 << order) - 1; 1655 free_pages -= (1 << order) - 1;
1661 if (alloc_flags & ALLOC_HIGH) 1656 if (alloc_flags & ALLOC_HIGH)
1662 min -= min / 2; 1657 min -= min / 2;
1663 if (alloc_flags & ALLOC_HARDER) 1658 if (alloc_flags & ALLOC_HARDER)
1664 min -= min / 4; 1659 min -= min / 4;
1665 #ifdef CONFIG_CMA 1660 #ifdef CONFIG_CMA
1666 /* If allocation can't use CMA areas don't use free CMA pages */ 1661 /* If allocation can't use CMA areas don't use free CMA pages */
1667 if (!(alloc_flags & ALLOC_CMA)) 1662 if (!(alloc_flags & ALLOC_CMA))
1668 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); 1663 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
1669 #endif 1664 #endif
1670 1665
1671 if (free_pages - free_cma <= min + lowmem_reserve) 1666 if (free_pages - free_cma <= min + lowmem_reserve)
1672 return false; 1667 return false;
1673 for (o = 0; o < order; o++) { 1668 for (o = 0; o < order; o++) {
1674 /* At the next order, this order's pages become unavailable */ 1669 /* At the next order, this order's pages become unavailable */
1675 free_pages -= z->free_area[o].nr_free << o; 1670 free_pages -= z->free_area[o].nr_free << o;
1676 1671
1677 /* Require fewer higher order pages to be free */ 1672 /* Require fewer higher order pages to be free */
1678 min >>= 1; 1673 min >>= 1;
1679 1674
1680 if (free_pages <= min) 1675 if (free_pages <= min)
1681 return false; 1676 return false;
1682 } 1677 }
1683 return true; 1678 return true;
1684 } 1679 }
1685 1680
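A stand-alone model of the walk above, evaluated on made-up numbers (the ALLOC_HIGH/ALLOC_HARDER reductions and the CMA correction are left out; nr_free, the mark and the lowmem reserve are illustrative):

/* Illustrative user-space model of the watermark walk; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

static bool watermark_ok(int order, long mark, long lowmem_reserve,
			 long free_pages, const long *nr_free)
{
	long min = mark;
	int o;

	free_pages -= (1 << order) - 1;
	if (free_pages <= min + lowmem_reserve)
		return false;
	for (o = 0; o < order; o++) {
		/* blocks of order o cannot satisfy a request of this order */
		free_pages -= nr_free[o] << o;
		/* but fewer free pages are required at the higher orders */
		min >>= 1;
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	/* made-up per-order free block counts for orders 0..3 */
	const long nr_free[4] = { 600, 100, 30, 5 };
	long free_pages = 600 * 1 + 100 * 2 + 30 * 4 + 5 * 8;	/* 960 pages */

	printf("order-3 request, mark 128: %s\n",
	       watermark_ok(3, 128, 0, free_pages, nr_free) ? "ok" : "below watermark");
	return 0;
}

The per-order subtraction is why a zone can pass an order-0 check yet fail a higher-order one with the same total number of free pages.
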
1686 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1681 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1687 int classzone_idx, int alloc_flags) 1682 int classzone_idx, int alloc_flags)
1688 { 1683 {
1689 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1684 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1690 zone_page_state(z, NR_FREE_PAGES)); 1685 zone_page_state(z, NR_FREE_PAGES));
1691 } 1686 }
1692 1687
1693 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 1688 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1694 int classzone_idx, int alloc_flags) 1689 int classzone_idx, int alloc_flags)
1695 { 1690 {
1696 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1691 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1697 1692
1698 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1693 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1699 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1694 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1700 1695
1701 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1696 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1702 free_pages); 1697 free_pages);
1703 } 1698 }
1704 1699
1705 #ifdef CONFIG_NUMA 1700 #ifdef CONFIG_NUMA
1706 /* 1701 /*
1707 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1702 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1708 * skip over zones that are not allowed by the cpuset, or that have 1703 * skip over zones that are not allowed by the cpuset, or that have
1709 * been recently (in last second) found to be nearly full. See further 1704 * been recently (in last second) found to be nearly full. See further
1710 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1705 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1711 * that have to skip over a lot of full or unallowed zones. 1706 * that have to skip over a lot of full or unallowed zones.
1712 * 1707 *
1713 * If the zonelist cache is present in the passed in zonelist, then 1708 * If the zonelist cache is present in the passed in zonelist, then
1714 * returns a pointer to the allowed node mask (either the current 1709 * returns a pointer to the allowed node mask (either the current
1715 * task's mems_allowed, or node_states[N_MEMORY].) 1710 * task's mems_allowed, or node_states[N_MEMORY].)
1716 * 1711 *
1717 * If the zonelist cache is not available for this zonelist, does 1712 * If the zonelist cache is not available for this zonelist, does
1718 * nothing and returns NULL. 1713 * nothing and returns NULL.
1719 * 1714 *
1720 * If the fullzones BITMAP in the zonelist cache is stale (more than 1715 * If the fullzones BITMAP in the zonelist cache is stale (more than
1721 * a second since last zap'd) then we zap it out (clear its bits.) 1716 * a second since last zap'd) then we zap it out (clear its bits.)
1722 * 1717 *
1723 * We hold off even calling zlc_setup, until after we've checked the 1718 * We hold off even calling zlc_setup, until after we've checked the
1724 * first zone in the zonelist, on the theory that most allocations will 1719 * first zone in the zonelist, on the theory that most allocations will
1725 * be satisfied from that first zone, so best to examine that zone as 1720 * be satisfied from that first zone, so best to examine that zone as
1726 * quickly as we can. 1721 * quickly as we can.
1727 */ 1722 */
1728 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1723 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1729 { 1724 {
1730 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1725 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1731 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1726 nodemask_t *allowednodes; /* zonelist_cache approximation */
1732 1727
1733 zlc = zonelist->zlcache_ptr; 1728 zlc = zonelist->zlcache_ptr;
1734 if (!zlc) 1729 if (!zlc)
1735 return NULL; 1730 return NULL;
1736 1731
1737 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1732 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1738 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1733 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1739 zlc->last_full_zap = jiffies; 1734 zlc->last_full_zap = jiffies;
1740 } 1735 }
1741 1736
1742 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1737 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1743 &cpuset_current_mems_allowed : 1738 &cpuset_current_mems_allowed :
1744 &node_states[N_MEMORY]; 1739 &node_states[N_MEMORY];
1745 return allowednodes; 1740 return allowednodes;
1746 } 1741 }
1747 1742
1748 /* 1743 /*
1749 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1744 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1750 * if it is worth looking at further for free memory: 1745 * if it is worth looking at further for free memory:
1751 * 1) Check that the zone isn't thought to be full (doesn't have its 1746 * 1) Check that the zone isn't thought to be full (doesn't have its
1752 * bit set in the zonelist_cache fullzones BITMAP). 1747 * bit set in the zonelist_cache fullzones BITMAP).
1753 * 2) Check that the zones node (obtained from the zonelist_cache 1748 * 2) Check that the zones node (obtained from the zonelist_cache
1754 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1749 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1755 * Return true (non-zero) if zone is worth looking at further, or 1750 * Return true (non-zero) if zone is worth looking at further, or
1756 * else return false (zero) if it is not. 1751 * else return false (zero) if it is not.
1757 * 1752 *
1758 * This check -ignores- the distinction between various watermarks, 1753 * This check -ignores- the distinction between various watermarks,
1759 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1754 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1760 * found to be full for any variation of these watermarks, it will 1755 * found to be full for any variation of these watermarks, it will
1761 * be considered full for up to one second by all requests, unless 1756 * be considered full for up to one second by all requests, unless
1762 * we are so low on memory on all allowed nodes that we are forced 1757 * we are so low on memory on all allowed nodes that we are forced
1763 * into the second scan of the zonelist. 1758 * into the second scan of the zonelist.
1764 * 1759 *
1765 * In the second scan we ignore this zonelist cache and exactly 1760 * In the second scan we ignore this zonelist cache and exactly
1766 * apply the watermarks to all zones, even if it is slower to do so. 1761 * apply the watermarks to all zones, even if it is slower to do so.
1767 * We are low on memory in the second scan, and should leave no stone 1762 * We are low on memory in the second scan, and should leave no stone
1768 * unturned looking for a free page. 1763 * unturned looking for a free page.
1769 */ 1764 */
1770 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1765 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1771 nodemask_t *allowednodes) 1766 nodemask_t *allowednodes)
1772 { 1767 {
1773 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1768 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1774 int i; /* index of *z in zonelist zones */ 1769 int i; /* index of *z in zonelist zones */
1775 int n; /* node that zone *z is on */ 1770 int n; /* node that zone *z is on */
1776 1771
1777 zlc = zonelist->zlcache_ptr; 1772 zlc = zonelist->zlcache_ptr;
1778 if (!zlc) 1773 if (!zlc)
1779 return 1; 1774 return 1;
1780 1775
1781 i = z - zonelist->_zonerefs; 1776 i = z - zonelist->_zonerefs;
1782 n = zlc->z_to_n[i]; 1777 n = zlc->z_to_n[i];
1783 1778
1784 /* This zone is worth trying if it is allowed but not full */ 1779 /* This zone is worth trying if it is allowed but not full */
1785 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1780 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1786 } 1781 }
1787 1782
1788 /* 1783 /*
1789 * Given 'z' scanning a zonelist, set the corresponding bit in 1784 * Given 'z' scanning a zonelist, set the corresponding bit in
1790 * zlc->fullzones, so that subsequent attempts to allocate a page 1785 * zlc->fullzones, so that subsequent attempts to allocate a page
1791 * from that zone don't waste time re-examining it. 1786 * from that zone don't waste time re-examining it.
1792 */ 1787 */
1793 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1788 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1794 { 1789 {
1795 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1790 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1796 int i; /* index of *z in zonelist zones */ 1791 int i; /* index of *z in zonelist zones */
1797 1792
1798 zlc = zonelist->zlcache_ptr; 1793 zlc = zonelist->zlcache_ptr;
1799 if (!zlc) 1794 if (!zlc)
1800 return; 1795 return;
1801 1796
1802 i = z - zonelist->_zonerefs; 1797 i = z - zonelist->_zonerefs;
1803 1798
1804 set_bit(i, zlc->fullzones); 1799 set_bit(i, zlc->fullzones);
1805 } 1800 }
1806 1801
1807 /* 1802 /*
1808 * clear all zones full, called after direct reclaim makes progress so that 1803 * clear all zones full, called after direct reclaim makes progress so that
1809 * a zone that was recently full is not skipped over for up to a second 1804 * a zone that was recently full is not skipped over for up to a second
1810 */ 1805 */
1811 static void zlc_clear_zones_full(struct zonelist *zonelist) 1806 static void zlc_clear_zones_full(struct zonelist *zonelist)
1812 { 1807 {
1813 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1808 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1814 1809
1815 zlc = zonelist->zlcache_ptr; 1810 zlc = zonelist->zlcache_ptr;
1816 if (!zlc) 1811 if (!zlc)
1817 return; 1812 return;
1818 1813
1819 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1814 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1820 } 1815 }
1821 1816
1822 static bool zone_local(struct zone *local_zone, struct zone *zone) 1817 static bool zone_local(struct zone *local_zone, struct zone *zone)
1823 { 1818 {
1824 return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE; 1819 return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE;
1825 } 1820 }
1826 1821
1827 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1822 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1828 { 1823 {
1829 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1824 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1830 } 1825 }
1831 1826
1832 static void __paginginit init_zone_allows_reclaim(int nid) 1827 static void __paginginit init_zone_allows_reclaim(int nid)
1833 { 1828 {
1834 int i; 1829 int i;
1835 1830
1836 for_each_online_node(i) 1831 for_each_online_node(i)
1837 if (node_distance(nid, i) <= RECLAIM_DISTANCE) 1832 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1838 node_set(i, NODE_DATA(nid)->reclaim_nodes); 1833 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1839 else 1834 else
1840 zone_reclaim_mode = 1; 1835 zone_reclaim_mode = 1;
1841 } 1836 }
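
init_zone_allows_reclaim() records, for each node, which other nodes are close enough (within RECLAIM_DISTANCE) to be acceptable zone_reclaim() targets, and switches zone_reclaim_mode on as soon as any node is farther away. A hedged sketch with an invented distance table:

/* Userspace sketch of the reclaim_nodes computation.  The distance matrix
 * and the RECLAIM_DISTANCE value below are made up for illustration. */
#include <stdbool.h>
#include <stdio.h>

#define NODES            3
#define RECLAIM_DISTANCE 30

static const int node_distance[NODES][NODES] = {
	{ 10, 20, 40 },
	{ 20, 10, 40 },
	{ 40, 40, 10 },
};

int main(void)
{
	int nid = 0;
	bool reclaim_nodes[NODES] = { false };
	int zone_reclaim_mode = 0;

	for (int i = 0; i < NODES; i++) {
		if (node_distance[nid][i] <= RECLAIM_DISTANCE)
			reclaim_nodes[i] = true;   /* node_set(i, ...)        */
		else
			zone_reclaim_mode = 1;     /* a remote node exists    */
	}

	for (int i = 0; i < NODES; i++)
		printf("node %d reclaim target: %d\n", i, reclaim_nodes[i]);
	printf("zone_reclaim_mode = %d\n", zone_reclaim_mode); /* 1: node 2 is far */
	return 0;
}
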
1842 1837
1843 #else /* CONFIG_NUMA */ 1838 #else /* CONFIG_NUMA */
1844 1839
1845 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1840 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1846 { 1841 {
1847 return NULL; 1842 return NULL;
1848 } 1843 }
1849 1844
1850 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1845 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1851 nodemask_t *allowednodes) 1846 nodemask_t *allowednodes)
1852 { 1847 {
1853 return 1; 1848 return 1;
1854 } 1849 }
1855 1850
1856 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1851 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1857 { 1852 {
1858 } 1853 }
1859 1854
1860 static void zlc_clear_zones_full(struct zonelist *zonelist) 1855 static void zlc_clear_zones_full(struct zonelist *zonelist)
1861 { 1856 {
1862 } 1857 }
1863 1858
1864 static bool zone_local(struct zone *local_zone, struct zone *zone) 1859 static bool zone_local(struct zone *local_zone, struct zone *zone)
1865 { 1860 {
1866 return true; 1861 return true;
1867 } 1862 }
1868 1863
1869 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1864 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1870 { 1865 {
1871 return true; 1866 return true;
1872 } 1867 }
1873 1868
1874 static inline void init_zone_allows_reclaim(int nid) 1869 static inline void init_zone_allows_reclaim(int nid)
1875 { 1870 {
1876 } 1871 }
1877 #endif /* CONFIG_NUMA */ 1872 #endif /* CONFIG_NUMA */
1878 1873
1879 /* 1874 /*
1880 * get_page_from_freelist goes through the zonelist trying to allocate 1875 * get_page_from_freelist goes through the zonelist trying to allocate
1881 * a page. 1876 * a page.
1882 */ 1877 */
1883 static struct page * 1878 static struct page *
1884 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1879 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1885 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1880 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1886 struct zone *preferred_zone, int migratetype) 1881 struct zone *preferred_zone, int migratetype)
1887 { 1882 {
1888 struct zoneref *z; 1883 struct zoneref *z;
1889 struct page *page = NULL; 1884 struct page *page = NULL;
1890 int classzone_idx; 1885 int classzone_idx;
1891 struct zone *zone; 1886 struct zone *zone;
1892 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1887 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1893 int zlc_active = 0; /* set if using zonelist_cache */ 1888 int zlc_active = 0; /* set if using zonelist_cache */
1894 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1889 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1895 1890
1896 classzone_idx = zone_idx(preferred_zone); 1891 classzone_idx = zone_idx(preferred_zone);
1897 zonelist_scan: 1892 zonelist_scan:
1898 /* 1893 /*
1899 * Scan zonelist, looking for a zone with enough free. 1894 * Scan zonelist, looking for a zone with enough free.
1900 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 1895 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
1901 */ 1896 */
1902 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1897 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1903 high_zoneidx, nodemask) { 1898 high_zoneidx, nodemask) {
1904 unsigned long mark; 1899 unsigned long mark;
1905 1900
1906 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1901 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1907 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1902 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1908 continue; 1903 continue;
1909 if ((alloc_flags & ALLOC_CPUSET) && 1904 if ((alloc_flags & ALLOC_CPUSET) &&
1910 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1905 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1911 continue; 1906 continue;
1912 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1907 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1913 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) 1908 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
1914 goto try_this_zone; 1909 goto try_this_zone;
1915 /* 1910 /*
1916 * Distribute pages in proportion to the individual 1911 * Distribute pages in proportion to the individual
1917 * zone size to ensure fair page aging. The zone a 1912 * zone size to ensure fair page aging. The zone a
1918 * page was allocated in should have no effect on the 1913 * page was allocated in should have no effect on the
1919 * time the page has in memory before being reclaimed. 1914 * time the page has in memory before being reclaimed.
1920 * 1915 *
1921 * When zone_reclaim_mode is enabled, try to stay in 1916 * When zone_reclaim_mode is enabled, try to stay in
1922 * local zones in the fastpath. If that fails, the 1917 * local zones in the fastpath. If that fails, the
1923 * slowpath is entered, which will do another pass 1918 * slowpath is entered, which will do another pass
1924 * starting with the local zones, but ultimately fall 1919 * starting with the local zones, but ultimately fall
1925 * back to remote zones that do not partake in the 1920 * back to remote zones that do not partake in the
1926 * fairness round-robin cycle of this zonelist. 1921 * fairness round-robin cycle of this zonelist.
1927 */ 1922 */
1928 if (alloc_flags & ALLOC_WMARK_LOW) { 1923 if (alloc_flags & ALLOC_WMARK_LOW) {
1929 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) 1924 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1930 continue; 1925 continue;
1931 if (zone_reclaim_mode && 1926 if (zone_reclaim_mode &&
1932 !zone_local(preferred_zone, zone)) 1927 !zone_local(preferred_zone, zone))
1933 continue; 1928 continue;
1934 } 1929 }
1935 /* 1930 /*
1936 * When allocating a page cache page for writing, we 1931 * When allocating a page cache page for writing, we
1937 * want to get it from a zone that is within its dirty 1932 * want to get it from a zone that is within its dirty
1938 * limit, such that no single zone holds more than its 1933 * limit, such that no single zone holds more than its
1939 * proportional share of globally allowed dirty pages. 1934 * proportional share of globally allowed dirty pages.
1940 * The dirty limits take into account the zone's 1935 * The dirty limits take into account the zone's
1941 * lowmem reserves and high watermark so that kswapd 1936 * lowmem reserves and high watermark so that kswapd
1942 * should be able to balance it without having to 1937 * should be able to balance it without having to
1943 * write pages from its LRU list. 1938 * write pages from its LRU list.
1944 * 1939 *
1945 * This may look like it could increase pressure on 1940 * This may look like it could increase pressure on
1946 * lower zones by failing allocations in higher zones 1941 * lower zones by failing allocations in higher zones
1947 * before they are full. But the pages that do spill 1942 * before they are full. But the pages that do spill
1948 * over are limited as the lower zones are protected 1943 * over are limited as the lower zones are protected
1949 * by this very same mechanism. It should not become 1944 * by this very same mechanism. It should not become
1950 * a practical burden to them. 1945 * a practical burden to them.
1951 * 1946 *
1952 * XXX: For now, allow allocations to potentially 1947 * XXX: For now, allow allocations to potentially
1953 * exceed the per-zone dirty limit in the slowpath 1948 * exceed the per-zone dirty limit in the slowpath
1954 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1949 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1955 * which is important when on a NUMA setup the allowed 1950 * which is important when on a NUMA setup the allowed
1956 * zones are together not big enough to reach the 1951 * zones are together not big enough to reach the
1957 * global limit. The proper fix for these situations 1952 * global limit. The proper fix for these situations
1958 * will require awareness of zones in the 1953 * will require awareness of zones in the
1959 * dirty-throttling and the flusher threads. 1954 * dirty-throttling and the flusher threads.
1960 */ 1955 */
1961 if ((alloc_flags & ALLOC_WMARK_LOW) && 1956 if ((alloc_flags & ALLOC_WMARK_LOW) &&
1962 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 1957 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1963 goto this_zone_full; 1958 goto this_zone_full;
1964 1959
1965 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1960 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1966 if (!zone_watermark_ok(zone, order, mark, 1961 if (!zone_watermark_ok(zone, order, mark,
1967 classzone_idx, alloc_flags)) { 1962 classzone_idx, alloc_flags)) {
1968 int ret; 1963 int ret;
1969 1964
1970 if (IS_ENABLED(CONFIG_NUMA) && 1965 if (IS_ENABLED(CONFIG_NUMA) &&
1971 !did_zlc_setup && nr_online_nodes > 1) { 1966 !did_zlc_setup && nr_online_nodes > 1) {
1972 /* 1967 /*
1973 * we do zlc_setup if there are multiple nodes 1968 * we do zlc_setup if there are multiple nodes
1974 * and before considering the first zone allowed 1969 * and before considering the first zone allowed
1975 * by the cpuset. 1970 * by the cpuset.
1976 */ 1971 */
1977 allowednodes = zlc_setup(zonelist, alloc_flags); 1972 allowednodes = zlc_setup(zonelist, alloc_flags);
1978 zlc_active = 1; 1973 zlc_active = 1;
1979 did_zlc_setup = 1; 1974 did_zlc_setup = 1;
1980 } 1975 }
1981 1976
1982 if (zone_reclaim_mode == 0 || 1977 if (zone_reclaim_mode == 0 ||
1983 !zone_allows_reclaim(preferred_zone, zone)) 1978 !zone_allows_reclaim(preferred_zone, zone))
1984 goto this_zone_full; 1979 goto this_zone_full;
1985 1980
1986 /* 1981 /*
1987 * As we may have just activated ZLC, check if the first 1982 * As we may have just activated ZLC, check if the first
1988 * eligible zone has failed zone_reclaim recently. 1983 * eligible zone has failed zone_reclaim recently.
1989 */ 1984 */
1990 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1985 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1991 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1986 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1992 continue; 1987 continue;
1993 1988
1994 ret = zone_reclaim(zone, gfp_mask, order); 1989 ret = zone_reclaim(zone, gfp_mask, order);
1995 switch (ret) { 1990 switch (ret) {
1996 case ZONE_RECLAIM_NOSCAN: 1991 case ZONE_RECLAIM_NOSCAN:
1997 /* did not scan */ 1992 /* did not scan */
1998 continue; 1993 continue;
1999 case ZONE_RECLAIM_FULL: 1994 case ZONE_RECLAIM_FULL:
2000 /* scanned but unreclaimable */ 1995 /* scanned but unreclaimable */
2001 continue; 1996 continue;
2002 default: 1997 default:
2003 /* did we reclaim enough */ 1998 /* did we reclaim enough */
2004 if (zone_watermark_ok(zone, order, mark, 1999 if (zone_watermark_ok(zone, order, mark,
2005 classzone_idx, alloc_flags)) 2000 classzone_idx, alloc_flags))
2006 goto try_this_zone; 2001 goto try_this_zone;
2007 2002
2008 /* 2003 /*
2009 * Failed to reclaim enough to meet watermark. 2004 * Failed to reclaim enough to meet watermark.
2010 * Only mark the zone full if checking the min 2005 * Only mark the zone full if checking the min
2011 * watermark or if we failed to reclaim just 2006 * watermark or if we failed to reclaim just
2012 * 1<<order pages or else the page allocator 2007 * 1<<order pages or else the page allocator
2013 * fastpath will prematurely mark zones full 2008 * fastpath will prematurely mark zones full
2014 * when the watermark is between the low and 2009 * when the watermark is between the low and
2015 * min watermarks. 2010 * min watermarks.
2016 */ 2011 */
2017 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || 2012 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
2018 ret == ZONE_RECLAIM_SOME) 2013 ret == ZONE_RECLAIM_SOME)
2019 goto this_zone_full; 2014 goto this_zone_full;
2020 2015
2021 continue; 2016 continue;
2022 } 2017 }
2023 } 2018 }
2024 2019
2025 try_this_zone: 2020 try_this_zone:
2026 page = buffered_rmqueue(preferred_zone, zone, order, 2021 page = buffered_rmqueue(preferred_zone, zone, order,
2027 gfp_mask, migratetype); 2022 gfp_mask, migratetype);
2028 if (page) 2023 if (page)
2029 break; 2024 break;
2030 this_zone_full: 2025 this_zone_full:
2031 if (IS_ENABLED(CONFIG_NUMA)) 2026 if (IS_ENABLED(CONFIG_NUMA))
2032 zlc_mark_zone_full(zonelist, z); 2027 zlc_mark_zone_full(zonelist, z);
2033 } 2028 }
2034 2029
2035 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { 2030 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
2036 /* Disable zlc cache for second zonelist scan */ 2031 /* Disable zlc cache for second zonelist scan */
2037 zlc_active = 0; 2032 zlc_active = 0;
2038 goto zonelist_scan; 2033 goto zonelist_scan;
2039 } 2034 }
2040 2035
2041 if (page) 2036 if (page)
2042 /* 2037 /*
2043 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was 2038 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2044 * necessary to allocate the page. The expectation is 2039 * necessary to allocate the page. The expectation is
2045 * that the caller is taking steps that will free more 2040 * that the caller is taking steps that will free more
2046 * memory. The caller should avoid the page being used 2041 * memory. The caller should avoid the page being used
2047 * for !PFMEMALLOC purposes. 2042 * for !PFMEMALLOC purposes.
2048 */ 2043 */
2049 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 2044 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2050 2045
2051 return page; 2046 return page;
2052 } 2047 }
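
Inside the loop above, every zone is run through an ordered series of filters before buffered_rmqueue() is attempted: the zlc skip, the cpuset check, the ALLOC_NO_WATERMARKS bypass, the NR_ALLOC_BATCH fairness and dirty-limit checks (fast path only), and finally the watermark check with its zone_reclaim() fallback. The sketch below only models that ordering; the predicate names and the zone struct are invented.

/* Ordering sketch of the per-zone checks in get_page_from_freelist().
 * All predicate names and the zone struct are illustrative stand-ins. */
#include <stdbool.h>
#include <stdio.h>

struct fake_zone {
	bool zlc_full;        /* zlc says "recently full"                    */
	bool cpuset_allowed;  /* cpuset_zone_allowed_softwall()              */
	bool no_watermarks;   /* ALLOC_NO_WATERMARKS requested               */
	int  alloc_batch;     /* NR_ALLOC_BATCH credit (ALLOC_WMARK_LOW only)*/
	bool dirty_ok;        /* zone_dirty_ok() (__GFP_WRITE fast path)     */
	bool watermark_ok;    /* zone_watermark_ok()                         */
};

static bool zone_usable(const struct fake_zone *z)
{
	if (z->zlc_full)		return false;  /* skip: marked full       */
	if (!z->cpuset_allowed)		return false;  /* skip: not in cpuset     */
	if (z->no_watermarks)		return true;   /* bypass all other checks */
	if (z->alloc_batch <= 0)	return false;  /* fairness credit used up */
	if (!z->dirty_ok)		return false;  /* over its dirty share    */
	return z->watermark_ok;                        /* else zone_reclaim() path */
}

int main(void)
{
	struct fake_zone zones[] = {
		{ .cpuset_allowed = true, .alloc_batch = 0, .dirty_ok = true,
		  .watermark_ok = true },              /* skipped: batch exhausted */
		{ .cpuset_allowed = true, .alloc_batch = 5, .dirty_ok = true,
		  .watermark_ok = true },              /* first usable zone        */
	};

	for (int i = 0; i < 2; i++)
		printf("zone %d usable: %d\n", i, zone_usable(&zones[i]));
	return 0;
}
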
2053 2048
2054 /* 2049 /*
2055 * Large machines with many possible nodes should not always dump per-node 2050 * Large machines with many possible nodes should not always dump per-node
2056 * meminfo in irq context. 2051 * meminfo in irq context.
2057 */ 2052 */
2058 static inline bool should_suppress_show_mem(void) 2053 static inline bool should_suppress_show_mem(void)
2059 { 2054 {
2060 bool ret = false; 2055 bool ret = false;
2061 2056
2062 #if NODES_SHIFT > 8 2057 #if NODES_SHIFT > 8
2063 ret = in_interrupt(); 2058 ret = in_interrupt();
2064 #endif 2059 #endif
2065 return ret; 2060 return ret;
2066 } 2061 }
2067 2062
2068 static DEFINE_RATELIMIT_STATE(nopage_rs, 2063 static DEFINE_RATELIMIT_STATE(nopage_rs,
2069 DEFAULT_RATELIMIT_INTERVAL, 2064 DEFAULT_RATELIMIT_INTERVAL,
2070 DEFAULT_RATELIMIT_BURST); 2065 DEFAULT_RATELIMIT_BURST);
2071 2066
2072 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 2067 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2073 { 2068 {
2074 unsigned int filter = SHOW_MEM_FILTER_NODES; 2069 unsigned int filter = SHOW_MEM_FILTER_NODES;
2075 2070
2076 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 2071 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
2077 debug_guardpage_minorder() > 0) 2072 debug_guardpage_minorder() > 0)
2078 return; 2073 return;
2079 2074
2080 /* 2075 /*
2081 * Walking all memory to count page types is very expensive and should 2076 * Walking all memory to count page types is very expensive and should
2082 * be inhibited in non-blockable contexts. 2077 * be inhibited in non-blockable contexts.
2083 */ 2078 */
2084 if (!(gfp_mask & __GFP_WAIT)) 2079 if (!(gfp_mask & __GFP_WAIT))
2085 filter |= SHOW_MEM_FILTER_PAGE_COUNT; 2080 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2086 2081
2087 /* 2082 /*
2088 * This documents exceptions given to allocations in certain 2083 * This documents exceptions given to allocations in certain
2089 * contexts that are allowed to allocate outside current's set 2084 * contexts that are allowed to allocate outside current's set
2090 * of allowed nodes. 2085 * of allowed nodes.
2091 */ 2086 */
2092 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2087 if (!(gfp_mask & __GFP_NOMEMALLOC))
2093 if (test_thread_flag(TIF_MEMDIE) || 2088 if (test_thread_flag(TIF_MEMDIE) ||
2094 (current->flags & (PF_MEMALLOC | PF_EXITING))) 2089 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2095 filter &= ~SHOW_MEM_FILTER_NODES; 2090 filter &= ~SHOW_MEM_FILTER_NODES;
2096 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 2091 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
2097 filter &= ~SHOW_MEM_FILTER_NODES; 2092 filter &= ~SHOW_MEM_FILTER_NODES;
2098 2093
2099 if (fmt) { 2094 if (fmt) {
2100 struct va_format vaf; 2095 struct va_format vaf;
2101 va_list args; 2096 va_list args;
2102 2097
2103 va_start(args, fmt); 2098 va_start(args, fmt);
2104 2099
2105 vaf.fmt = fmt; 2100 vaf.fmt = fmt;
2106 vaf.va = &args; 2101 vaf.va = &args;
2107 2102
2108 pr_warn("%pV", &vaf); 2103 pr_warn("%pV", &vaf);
2109 2104
2110 va_end(args); 2105 va_end(args);
2111 } 2106 }
2112 2107
2113 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 2108 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
2114 current->comm, order, gfp_mask); 2109 current->comm, order, gfp_mask);
2115 2110
2116 dump_stack(); 2111 dump_stack();
2117 if (!should_suppress_show_mem()) 2112 if (!should_suppress_show_mem())
2118 show_mem(filter); 2113 show_mem(filter);
2119 } 2114 }
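
warn_alloc_failed() rate-limits itself and then decides how much of show_mem()'s output to suppress. A small userspace model of just the filter decision follows; the flag values are arbitrary and only the decision flow mirrors the function above.

/* Userspace sketch of how warn_alloc_failed() chooses its show_mem filter. */
#include <stdbool.h>
#include <stdio.h>

#define SHOW_MEM_FILTER_NODES      0x1
#define SHOW_MEM_FILTER_PAGE_COUNT 0x2

static unsigned int choose_filter(bool can_wait, bool memdie_or_memalloc,
				  bool in_irq)
{
	unsigned int filter = SHOW_MEM_FILTER_NODES;

	if (!can_wait)              /* non-blockable: counting pages is too costly */
		filter |= SHOW_MEM_FILTER_PAGE_COUNT;
	if (memdie_or_memalloc)     /* task may allocate outside its nodes */
		filter &= ~SHOW_MEM_FILTER_NODES;
	if (in_irq || !can_wait)
		filter &= ~SHOW_MEM_FILTER_NODES;
	return filter;
}

int main(void)
{
	printf("GFP_ATOMIC in irq: 0x%x\n", choose_filter(false, false, true));
	printf("GFP_KERNEL:        0x%x\n", choose_filter(true, false, false));
	return 0;
}
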
2120 2115
2121 static inline int 2116 static inline int
2122 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 2117 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2123 unsigned long did_some_progress, 2118 unsigned long did_some_progress,
2124 unsigned long pages_reclaimed) 2119 unsigned long pages_reclaimed)
2125 { 2120 {
2126 /* Do not loop if specifically requested */ 2121 /* Do not loop if specifically requested */
2127 if (gfp_mask & __GFP_NORETRY) 2122 if (gfp_mask & __GFP_NORETRY)
2128 return 0; 2123 return 0;
2129 2124
2130 /* Always retry if specifically requested */ 2125 /* Always retry if specifically requested */
2131 if (gfp_mask & __GFP_NOFAIL) 2126 if (gfp_mask & __GFP_NOFAIL)
2132 return 1; 2127 return 1;
2133 2128
2134 /* 2129 /*
2135 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 2130 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2136 * making forward progress without invoking OOM. Suspend also disables 2131 * making forward progress without invoking OOM. Suspend also disables
2137 * storage devices so kswapd will not help. Bail if we are suspending. 2132 * storage devices so kswapd will not help. Bail if we are suspending.
2138 */ 2133 */
2139 if (!did_some_progress && pm_suspended_storage()) 2134 if (!did_some_progress && pm_suspended_storage())
2140 return 0; 2135 return 0;
2141 2136
2142 /* 2137 /*
2143 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 2138 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2144 * means __GFP_NOFAIL, but that may not be true in other 2139 * means __GFP_NOFAIL, but that may not be true in other
2145 * implementations. 2140 * implementations.
2146 */ 2141 */
2147 if (order <= PAGE_ALLOC_COSTLY_ORDER) 2142 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2148 return 1; 2143 return 1;
2149 2144
2150 /* 2145 /*
2151 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 2146 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2152 * specified, then we retry until we no longer reclaim any pages 2147 * specified, then we retry until we no longer reclaim any pages
2153 * (above), or we've reclaimed an order of pages at least as 2148 * (above), or we've reclaimed an order of pages at least as
2154 * large as the allocation's order. In both cases, if the 2149 * large as the allocation's order. In both cases, if the
2155 * allocation still fails, we stop retrying. 2150 * allocation still fails, we stop retrying.
2156 */ 2151 */
2157 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2152 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2158 return 1; 2153 return 1;
2159 2154
2160 return 0; 2155 return 0;
2161 } 2156 }
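
should_alloc_retry() is a pure decision function, which makes it easy to model in isolation. Below is a standalone sketch with mocked GFP bits that reproduces the same ordering of checks; it is an illustration, not the kernel's definitions.

/* Standalone model of should_alloc_retry(); the GFP bit values are mocked,
 * only the decision order matches the kernel function above. */
#include <stdbool.h>
#include <stdio.h>

#define M_GFP_NORETRY  0x1
#define M_GFP_NOFAIL   0x2
#define M_GFP_REPEAT   0x4
#define M_COSTLY_ORDER 3

static int model_should_retry(unsigned gfp, unsigned order,
			      unsigned long progress,
			      unsigned long pages_reclaimed,
			      bool pm_suspended)
{
	if (gfp & M_GFP_NORETRY)
		return 0;                       /* caller opted out of looping  */
	if (gfp & M_GFP_NOFAIL)
		return 1;                       /* caller demands success       */
	if (!progress && pm_suspended)
		return 0;                       /* reclaim cannot make progress */
	if (order <= M_COSTLY_ORDER)
		return 1;                       /* small orders: keep trying    */
	if ((gfp & M_GFP_REPEAT) && pages_reclaimed < (1UL << order))
		return 1;                       /* big order, still making headway */
	return 0;
}

int main(void)
{
	printf("order 0, plain:   %d\n", model_should_retry(0, 0, 1, 1, false));
	printf("order 5, NORETRY: %d\n", model_should_retry(M_GFP_NORETRY, 5, 1, 1, false));
	printf("order 5, REPEAT:  %d\n", model_should_retry(M_GFP_REPEAT, 5, 1, 8, false));
	return 0;
}
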
2162 2157
2163 static inline struct page * 2158 static inline struct page *
2164 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2159 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2165 struct zonelist *zonelist, enum zone_type high_zoneidx, 2160 struct zonelist *zonelist, enum zone_type high_zoneidx,
2166 nodemask_t *nodemask, struct zone *preferred_zone, 2161 nodemask_t *nodemask, struct zone *preferred_zone,
2167 int migratetype) 2162 int migratetype)
2168 { 2163 {
2169 struct page *page; 2164 struct page *page;
2170 2165
2171 /* Acquire the OOM killer lock for the zones in zonelist */ 2166 /* Acquire the OOM killer lock for the zones in zonelist */
2172 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2167 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2173 schedule_timeout_uninterruptible(1); 2168 schedule_timeout_uninterruptible(1);
2174 return NULL; 2169 return NULL;
2175 } 2170 }
2176 2171
2177 /* 2172 /*
2178 * Go through the zonelist yet one more time, keep very high watermark 2173 * Go through the zonelist yet one more time, keep very high watermark
2179 * here, this is only to catch a parallel oom killing, we must fail if 2174 * here, this is only to catch a parallel oom killing, we must fail if
2180 * we're still under heavy pressure. 2175 * we're still under heavy pressure.
2181 */ 2176 */
2182 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2177 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2183 order, zonelist, high_zoneidx, 2178 order, zonelist, high_zoneidx,
2184 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2179 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2185 preferred_zone, migratetype); 2180 preferred_zone, migratetype);
2186 if (page) 2181 if (page)
2187 goto out; 2182 goto out;
2188 2183
2189 if (!(gfp_mask & __GFP_NOFAIL)) { 2184 if (!(gfp_mask & __GFP_NOFAIL)) {
2190 /* The OOM killer will not help higher order allocs */ 2185 /* The OOM killer will not help higher order allocs */
2191 if (order > PAGE_ALLOC_COSTLY_ORDER) 2186 if (order > PAGE_ALLOC_COSTLY_ORDER)
2192 goto out; 2187 goto out;
2193 /* The OOM killer does not needlessly kill tasks for lowmem */ 2188 /* The OOM killer does not needlessly kill tasks for lowmem */
2194 if (high_zoneidx < ZONE_NORMAL) 2189 if (high_zoneidx < ZONE_NORMAL)
2195 goto out; 2190 goto out;
2196 /* 2191 /*
2197 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2192 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2198 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2193 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2199 * The caller should handle page allocation failure by itself if 2194 * The caller should handle page allocation failure by itself if
2200 * it specifies __GFP_THISNODE. 2195 * it specifies __GFP_THISNODE.
2201 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2196 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2202 */ 2197 */
2203 if (gfp_mask & __GFP_THISNODE) 2198 if (gfp_mask & __GFP_THISNODE)
2204 goto out; 2199 goto out;
2205 } 2200 }
2206 /* Exhausted what can be done so it's blamo time */ 2201 /* Exhausted what can be done so it's blamo time */
2207 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2202 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2208 2203
2209 out: 2204 out:
2210 clear_zonelist_oom(zonelist, gfp_mask); 2205 clear_zonelist_oom(zonelist, gfp_mask);
2211 return page; 2206 return page;
2212 } 2207 }
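
Before invoking out_of_memory(), the function above refuses to kill tasks in cases where that would not help: costly orders, lowmem-only requests, and __GFP_THISNODE callers, unless __GFP_NOFAIL forces the issue. A compact model of that gate (flag values invented):

/* Sketch of the "is the OOM killer appropriate?" gating in
 * __alloc_pages_may_oom(); names and flag values are illustrative. */
#include <stdbool.h>
#include <stdio.h>

#define M_GFP_NOFAIL   0x1
#define M_GFP_THISNODE 0x2
#define M_COSTLY_ORDER 3
#define M_ZONE_NORMAL  2

static bool may_invoke_oom(unsigned gfp, unsigned order, int high_zoneidx)
{
	if (gfp & M_GFP_NOFAIL)
		return true;                 /* must not fail: go all the way     */
	if (order > M_COSTLY_ORDER)
		return false;                /* a kill won't create huge blocks   */
	if (high_zoneidx < M_ZONE_NORMAL)
		return false;                /* don't kill tasks for lowmem       */
	if (gfp & M_GFP_THISNODE)
		return false;                /* node-bound caller handles failure */
	return true;
}

int main(void)
{
	printf("order 0 GFP_KERNEL-ish: %d\n", may_invoke_oom(0, 0, M_ZONE_NORMAL));
	printf("order 9 hugepage-ish:   %d\n", may_invoke_oom(0, 9, M_ZONE_NORMAL));
	printf("DMA request:            %d\n", may_invoke_oom(0, 0, 0));
	return 0;
}
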
2213 2208
2214 #ifdef CONFIG_COMPACTION 2209 #ifdef CONFIG_COMPACTION
2215 /* Try memory compaction for high-order allocations before reclaim */ 2210 /* Try memory compaction for high-order allocations before reclaim */
2216 static struct page * 2211 static struct page *
2217 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2212 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2218 struct zonelist *zonelist, enum zone_type high_zoneidx, 2213 struct zonelist *zonelist, enum zone_type high_zoneidx,
2219 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2214 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2220 int migratetype, bool sync_migration, 2215 int migratetype, bool sync_migration,
2221 bool *contended_compaction, bool *deferred_compaction, 2216 bool *contended_compaction, bool *deferred_compaction,
2222 unsigned long *did_some_progress) 2217 unsigned long *did_some_progress)
2223 { 2218 {
2224 if (!order) 2219 if (!order)
2225 return NULL; 2220 return NULL;
2226 2221
2227 if (compaction_deferred(preferred_zone, order)) { 2222 if (compaction_deferred(preferred_zone, order)) {
2228 *deferred_compaction = true; 2223 *deferred_compaction = true;
2229 return NULL; 2224 return NULL;
2230 } 2225 }
2231 2226
2232 current->flags |= PF_MEMALLOC; 2227 current->flags |= PF_MEMALLOC;
2233 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2228 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2234 nodemask, sync_migration, 2229 nodemask, sync_migration,
2235 contended_compaction); 2230 contended_compaction);
2236 current->flags &= ~PF_MEMALLOC; 2231 current->flags &= ~PF_MEMALLOC;
2237 2232
2238 if (*did_some_progress != COMPACT_SKIPPED) { 2233 if (*did_some_progress != COMPACT_SKIPPED) {
2239 struct page *page; 2234 struct page *page;
2240 2235
2241 /* Page migration frees to the PCP lists but we want merging */ 2236 /* Page migration frees to the PCP lists but we want merging */
2242 drain_pages(get_cpu()); 2237 drain_pages(get_cpu());
2243 put_cpu(); 2238 put_cpu();
2244 2239
2245 page = get_page_from_freelist(gfp_mask, nodemask, 2240 page = get_page_from_freelist(gfp_mask, nodemask,
2246 order, zonelist, high_zoneidx, 2241 order, zonelist, high_zoneidx,
2247 alloc_flags & ~ALLOC_NO_WATERMARKS, 2242 alloc_flags & ~ALLOC_NO_WATERMARKS,
2248 preferred_zone, migratetype); 2243 preferred_zone, migratetype);
2249 if (page) { 2244 if (page) {
2250 preferred_zone->compact_blockskip_flush = false; 2245 preferred_zone->compact_blockskip_flush = false;
2251 preferred_zone->compact_considered = 0; 2246 preferred_zone->compact_considered = 0;
2252 preferred_zone->compact_defer_shift = 0; 2247 preferred_zone->compact_defer_shift = 0;
2253 if (order >= preferred_zone->compact_order_failed) 2248 if (order >= preferred_zone->compact_order_failed)
2254 preferred_zone->compact_order_failed = order + 1; 2249 preferred_zone->compact_order_failed = order + 1;
2255 count_vm_event(COMPACTSUCCESS); 2250 count_vm_event(COMPACTSUCCESS);
2256 return page; 2251 return page;
2257 } 2252 }
2258 2253
2259 /* 2254 /*
2260 * It's bad if a compaction run occurs and fails. 2255 * It's bad if a compaction run occurs and fails.
2261 * The most likely reason is that pages exist, 2256 * The most likely reason is that pages exist,
2262 * but not enough to satisfy watermarks. 2257 * but not enough to satisfy watermarks.
2263 */ 2258 */
2264 count_vm_event(COMPACTFAIL); 2259 count_vm_event(COMPACTFAIL);
2265 2260
2266 /* 2261 /*
2267 * As async compaction considers a subset of pageblocks, only 2262 * As async compaction considers a subset of pageblocks, only
2268 * defer if the failure was a sync compaction failure. 2263 * defer if the failure was a sync compaction failure.
2269 */ 2264 */
2270 if (sync_migration) 2265 if (sync_migration)
2271 defer_compaction(preferred_zone, order); 2266 defer_compaction(preferred_zone, order);
2272 2267
2273 cond_resched(); 2268 cond_resched();
2274 } 2269 }
2275 2270
2276 return NULL; 2271 return NULL;
2277 } 2272 }
2278 #else 2273 #else
2279 static inline struct page * 2274 static inline struct page *
2280 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2275 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2281 struct zonelist *zonelist, enum zone_type high_zoneidx, 2276 struct zonelist *zonelist, enum zone_type high_zoneidx,
2282 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2277 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2283 int migratetype, bool sync_migration, 2278 int migratetype, bool sync_migration,
2284 bool *contended_compaction, bool *deferred_compaction, 2279 bool *contended_compaction, bool *deferred_compaction,
2285 unsigned long *did_some_progress) 2280 unsigned long *did_some_progress)
2286 { 2281 {
2287 return NULL; 2282 return NULL;
2288 } 2283 }
2289 #endif /* CONFIG_COMPACTION */ 2284 #endif /* CONFIG_COMPACTION */
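
On a synchronous compaction failure the zone is deferred via defer_compaction(), and the compact_considered/compact_defer_shift fields that the success path resets above drive an exponential backoff. The sketch below is a simplified illustration of that backoff idea, not the kernel's exact implementation.

/* Simplified illustration of compaction deferral as an exponential backoff.
 * Field handling is approximate; only the doubling backoff is the point. */
#include <stdbool.h>
#include <stdio.h>

struct zone_model {
	unsigned considered;   /* attempts skipped since the last failure     */
	unsigned defer_shift;  /* backoff exponent, grows on repeated failure */
};

static void defer_compaction(struct zone_model *z)
{
	z->considered = 0;
	if (z->defer_shift < 6)
		z->defer_shift++;          /* wait twice as long next time */
}

static bool compaction_deferred(struct zone_model *z)
{
	unsigned limit = 1u << z->defer_shift;

	if (z->considered < limit) {
		z->considered++;           /* still backing off: skip this attempt */
		return true;
	}
	return false;                      /* backoff expired: worth trying again  */
}

int main(void)
{
	struct zone_model z = { 0 };

	defer_compaction(&z);              /* sync compaction just failed */
	for (int i = 0; i < 4; i++)
		printf("attempt %d deferred: %d\n", i, compaction_deferred(&z));
	/* After a failed retry the caller would call defer_compaction() again. */
	return 0;
}
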
2290 2285
2291 /* Perform direct synchronous page reclaim */ 2286 /* Perform direct synchronous page reclaim */
2292 static int 2287 static int
2293 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2288 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2294 nodemask_t *nodemask) 2289 nodemask_t *nodemask)
2295 { 2290 {
2296 struct reclaim_state reclaim_state; 2291 struct reclaim_state reclaim_state;
2297 int progress; 2292 int progress;
2298 2293
2299 cond_resched(); 2294 cond_resched();
2300 2295
2301 /* We now go into synchronous reclaim */ 2296 /* We now go into synchronous reclaim */
2302 cpuset_memory_pressure_bump(); 2297 cpuset_memory_pressure_bump();
2303 current->flags |= PF_MEMALLOC; 2298 current->flags |= PF_MEMALLOC;
2304 lockdep_set_current_reclaim_state(gfp_mask); 2299 lockdep_set_current_reclaim_state(gfp_mask);
2305 reclaim_state.reclaimed_slab = 0; 2300 reclaim_state.reclaimed_slab = 0;
2306 current->reclaim_state = &reclaim_state; 2301 current->reclaim_state = &reclaim_state;
2307 2302
2308 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2303 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2309 2304
2310 current->reclaim_state = NULL; 2305 current->reclaim_state = NULL;
2311 lockdep_clear_current_reclaim_state(); 2306 lockdep_clear_current_reclaim_state();
2312 current->flags &= ~PF_MEMALLOC; 2307 current->flags &= ~PF_MEMALLOC;
2313 2308
2314 cond_resched(); 2309 cond_resched();
2315 2310
2316 return progress; 2311 return progress;
2317 } 2312 }
2318 2313
2319 /* The really slow allocator path where we enter direct reclaim */ 2314 /* The really slow allocator path where we enter direct reclaim */
2320 static inline struct page * 2315 static inline struct page *
2321 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2316 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2322 struct zonelist *zonelist, enum zone_type high_zoneidx, 2317 struct zonelist *zonelist, enum zone_type high_zoneidx,
2323 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2318 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2324 int migratetype, unsigned long *did_some_progress) 2319 int migratetype, unsigned long *did_some_progress)
2325 { 2320 {
2326 struct page *page = NULL; 2321 struct page *page = NULL;
2327 bool drained = false; 2322 bool drained = false;
2328 2323
2329 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2324 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2330 nodemask); 2325 nodemask);
2331 if (unlikely(!(*did_some_progress))) 2326 if (unlikely(!(*did_some_progress)))
2332 return NULL; 2327 return NULL;
2333 2328
2334 /* After successful reclaim, reconsider all zones for allocation */ 2329 /* After successful reclaim, reconsider all zones for allocation */
2335 if (IS_ENABLED(CONFIG_NUMA)) 2330 if (IS_ENABLED(CONFIG_NUMA))
2336 zlc_clear_zones_full(zonelist); 2331 zlc_clear_zones_full(zonelist);
2337 2332
2338 retry: 2333 retry:
2339 page = get_page_from_freelist(gfp_mask, nodemask, order, 2334 page = get_page_from_freelist(gfp_mask, nodemask, order,
2340 zonelist, high_zoneidx, 2335 zonelist, high_zoneidx,
2341 alloc_flags & ~ALLOC_NO_WATERMARKS, 2336 alloc_flags & ~ALLOC_NO_WATERMARKS,
2342 preferred_zone, migratetype); 2337 preferred_zone, migratetype);
2343 2338
2344 /* 2339 /*
2345 * If an allocation failed after direct reclaim, it could be because 2340 * If an allocation failed after direct reclaim, it could be because
2346 * pages are pinned on the per-cpu lists. Drain them and try again 2341 * pages are pinned on the per-cpu lists. Drain them and try again
2347 */ 2342 */
2348 if (!page && !drained) { 2343 if (!page && !drained) {
2349 drain_all_pages(); 2344 drain_all_pages();
2350 drained = true; 2345 drained = true;
2351 goto retry; 2346 goto retry;
2352 } 2347 }
2353 2348
2354 return page; 2349 return page;
2355 } 2350 }
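
The retry after drain_all_pages() exists because pages freed by reclaim may still be parked on per-CPU free lists rather than the buddy lists the allocator scans. Reduced to its skeleton, the pattern is "try, flush the caches once, try again"; all names in the sketch are invented.

/* The "reclaim, then retry once after draining per-CPU lists" pattern,
 * reduced to a generic try/flush/retry-once skeleton. */
#include <stdbool.h>
#include <stdio.h>

static int attempts;

static bool try_alloc(void)
{
	/* Pretend the first attempt misses because pages sit on pcp lists. */
	return ++attempts > 1;
}

static void drain_percpu_lists(void)
{
	printf("draining per-cpu free lists\n");
}

static bool alloc_after_reclaim(void)
{
	bool drained = false;
	bool ok;
retry:
	ok = try_alloc();
	if (!ok && !drained) {
		drain_percpu_lists();   /* drain_all_pages() in the kernel */
		drained = true;
		goto retry;             /* retry exactly once */
	}
	return ok;
}

int main(void)
{
	printf("allocation %s\n", alloc_after_reclaim() ? "succeeded" : "failed");
	return 0;
}
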
2356 2351
2357 /* 2352 /*
2358 * This is called in the allocator slow-path if the allocation request is of 2353 * This is called in the allocator slow-path if the allocation request is of
2359 * sufficient urgency to ignore watermarks and take other desperate measures 2354 * sufficient urgency to ignore watermarks and take other desperate measures
2360 */ 2355 */
2361 static inline struct page * 2356 static inline struct page *
2362 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2357 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2363 struct zonelist *zonelist, enum zone_type high_zoneidx, 2358 struct zonelist *zonelist, enum zone_type high_zoneidx,
2364 nodemask_t *nodemask, struct zone *preferred_zone, 2359 nodemask_t *nodemask, struct zone *preferred_zone,
2365 int migratetype) 2360 int migratetype)
2366 { 2361 {
2367 struct page *page; 2362 struct page *page;
2368 2363
2369 do { 2364 do {
2370 page = get_page_from_freelist(gfp_mask, nodemask, order, 2365 page = get_page_from_freelist(gfp_mask, nodemask, order,
2371 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2366 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2372 preferred_zone, migratetype); 2367 preferred_zone, migratetype);
2373 2368
2374 if (!page && gfp_mask & __GFP_NOFAIL) 2369 if (!page && gfp_mask & __GFP_NOFAIL)
2375 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2370 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2376 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2371 } while (!page && (gfp_mask & __GFP_NOFAIL));
2377 2372
2378 return page; 2373 return page;
2379 } 2374 }
2380 2375
2381 static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, 2376 static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
2382 struct zonelist *zonelist, 2377 struct zonelist *zonelist,
2383 enum zone_type high_zoneidx, 2378 enum zone_type high_zoneidx,
2384 struct zone *preferred_zone) 2379 struct zone *preferred_zone)
2385 { 2380 {
2386 struct zoneref *z; 2381 struct zoneref *z;
2387 struct zone *zone; 2382 struct zone *zone;
2388 2383
2389 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 2384 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2390 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2385 if (!(gfp_mask & __GFP_NO_KSWAPD))
2391 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2386 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2392 /* 2387 /*
2393 * Only reset the batches of zones that were actually 2388 * Only reset the batches of zones that were actually
2394 * considered in the fast path, we don't want to 2389 * considered in the fast path, we don't want to
2395 * thrash fairness information for zones that are not 2390 * thrash fairness information for zones that are not
2396 * actually part of this zonelist's round-robin cycle. 2391 * actually part of this zonelist's round-robin cycle.
2397 */ 2392 */
2398 if (zone_reclaim_mode && !zone_local(preferred_zone, zone)) 2393 if (zone_reclaim_mode && !zone_local(preferred_zone, zone))
2399 continue; 2394 continue;
2400 mod_zone_page_state(zone, NR_ALLOC_BATCH, 2395 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2401 high_wmark_pages(zone) - 2396 high_wmark_pages(zone) -
2402 low_wmark_pages(zone) - 2397 low_wmark_pages(zone) -
2403 zone_page_state(zone, NR_ALLOC_BATCH)); 2398 zone_page_state(zone, NR_ALLOC_BATCH));
2404 } 2399 }
2405 } 2400 }
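
The mod_zone_page_state() call in prepare_slowpath() tops the fairness counter back up to its full budget of high_wmark - low_wmark pages, expressed as a delta against the current (possibly negative) value. Worked with invented numbers:

/* The NR_ALLOC_BATCH reset is "set the counter back to
 * high_wmark - low_wmark" expressed as a delta; the sample numbers are invented. */
#include <stdio.h>

int main(void)
{
	long high_wmark = 12000, low_wmark = 10000;  /* pages, illustrative     */
	long batch = -350;                           /* credit already overdrawn */

	long delta = (high_wmark - low_wmark) - batch;
	batch += delta;                              /* mod_zone_page_state(...) */

	printf("batch reset to %ld pages\n", batch); /* 2000 */
	return 0;
}
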
2406 2401
2407 static inline int 2402 static inline int
2408 gfp_to_alloc_flags(gfp_t gfp_mask) 2403 gfp_to_alloc_flags(gfp_t gfp_mask)
2409 { 2404 {
2410 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2405 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2411 const gfp_t wait = gfp_mask & __GFP_WAIT; 2406 const gfp_t wait = gfp_mask & __GFP_WAIT;
2412 2407
2413 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2408 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2414 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2409 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2415 2410
2416 /* 2411 /*
2417 * The caller may dip into page reserves a bit more if the caller 2412 * The caller may dip into page reserves a bit more if the caller
2418 * cannot run direct reclaim, or if the caller has realtime scheduling 2413 * cannot run direct reclaim, or if the caller has realtime scheduling
2419 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2414 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2420 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 2415 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
2421 */ 2416 */
2422 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2417 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2423 2418
2424 if (!wait) { 2419 if (!wait) {
2425 /* 2420 /*
2426 * Not worth trying to allocate harder for 2421 * Not worth trying to allocate harder for
2427 * __GFP_NOMEMALLOC even if it can't schedule. 2422 * __GFP_NOMEMALLOC even if it can't schedule.
2428 */ 2423 */
2429 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2424 if (!(gfp_mask & __GFP_NOMEMALLOC))
2430 alloc_flags |= ALLOC_HARDER; 2425 alloc_flags |= ALLOC_HARDER;
2431 /* 2426 /*
2432 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 2427 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
2433 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 2428 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
2434 */ 2429 */
2435 alloc_flags &= ~ALLOC_CPUSET; 2430 alloc_flags &= ~ALLOC_CPUSET;
2436 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2431 } else if (unlikely(rt_task(current)) && !in_interrupt())
2437 alloc_flags |= ALLOC_HARDER; 2432 alloc_flags |= ALLOC_HARDER;
2438 2433
2439 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2434 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2440 if (gfp_mask & __GFP_MEMALLOC) 2435 if (gfp_mask & __GFP_MEMALLOC)
2441 alloc_flags |= ALLOC_NO_WATERMARKS; 2436 alloc_flags |= ALLOC_NO_WATERMARKS;
2442 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 2437 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2443 alloc_flags |= ALLOC_NO_WATERMARKS; 2438 alloc_flags |= ALLOC_NO_WATERMARKS;
2444 else if (!in_interrupt() && 2439 else if (!in_interrupt() &&
2445 ((current->flags & PF_MEMALLOC) || 2440 ((current->flags & PF_MEMALLOC) ||
2446 unlikely(test_thread_flag(TIF_MEMDIE)))) 2441 unlikely(test_thread_flag(TIF_MEMDIE))))
2447 alloc_flags |= ALLOC_NO_WATERMARKS; 2442 alloc_flags |= ALLOC_NO_WATERMARKS;
2448 } 2443 }
2449 #ifdef CONFIG_CMA 2444 #ifdef CONFIG_CMA
2450 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2445 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2451 alloc_flags |= ALLOC_CMA; 2446 alloc_flags |= ALLOC_CMA;
2452 #endif 2447 #endif
2453 return alloc_flags; 2448 return alloc_flags;
2454 } 2449 }
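
gfp_to_alloc_flags() condenses the gfp mask and the calling context into the ALLOC_* bits used for the remaining attempts. The sketch below mirrors the decision structure with mocked bit values; the in_interrupt()/softirq distinctions are collapsed into a single task_memalloc flag for brevity.

/* Mock translation in the spirit of gfp_to_alloc_flags(); all bit values
 * are invented, only the decision structure mirrors the function above. */
#include <stdbool.h>
#include <stdio.h>

#define M_GFP_WAIT       0x01
#define M_GFP_HIGH       0x02
#define M_GFP_NOMEMALLOC 0x04
#define M_GFP_MEMALLOC   0x08

#define A_WMARK_MIN      0x01
#define A_CPUSET         0x02
#define A_HIGH           0x04
#define A_HARDER         0x08
#define A_NO_WATERMARKS  0x10

static unsigned gfp_to_flags(unsigned gfp, bool rt_task, bool task_memalloc)
{
	unsigned flags = A_WMARK_MIN | A_CPUSET;

	if (gfp & M_GFP_HIGH)
		flags |= A_HIGH;              /* may dip deeper into reserves */

	if (!(gfp & M_GFP_WAIT)) {            /* atomic context */
		if (!(gfp & M_GFP_NOMEMALLOC))
			flags |= A_HARDER;
		flags &= ~A_CPUSET;           /* don't fail on cpuset limits  */
	} else if (rt_task) {
		flags |= A_HARDER;
	}

	if (!(gfp & M_GFP_NOMEMALLOC) &&
	    ((gfp & M_GFP_MEMALLOC) || task_memalloc))
		flags |= A_NO_WATERMARKS;     /* reserves fully available     */

	return flags;
}

int main(void)
{
	printf("GFP_ATOMIC-ish: 0x%x\n", gfp_to_flags(M_GFP_HIGH, false, false));
	printf("GFP_KERNEL-ish: 0x%x\n", gfp_to_flags(M_GFP_WAIT, false, false));
	return 0;
}
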
2455 2450
2456 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 2451 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2457 { 2452 {
2458 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2453 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2459 } 2454 }
2460 2455
2461 static inline struct page * 2456 static inline struct page *
2462 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2457 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2463 struct zonelist *zonelist, enum zone_type high_zoneidx, 2458 struct zonelist *zonelist, enum zone_type high_zoneidx,
2464 nodemask_t *nodemask, struct zone *preferred_zone, 2459 nodemask_t *nodemask, struct zone *preferred_zone,
2465 int migratetype) 2460 int migratetype)
2466 { 2461 {
2467 const gfp_t wait = gfp_mask & __GFP_WAIT; 2462 const gfp_t wait = gfp_mask & __GFP_WAIT;
2468 struct page *page = NULL; 2463 struct page *page = NULL;
2469 int alloc_flags; 2464 int alloc_flags;
2470 unsigned long pages_reclaimed = 0; 2465 unsigned long pages_reclaimed = 0;
2471 unsigned long did_some_progress; 2466 unsigned long did_some_progress;
2472 bool sync_migration = false; 2467 bool sync_migration = false;
2473 bool deferred_compaction = false; 2468 bool deferred_compaction = false;
2474 bool contended_compaction = false; 2469 bool contended_compaction = false;
2475 2470
2476 /* 2471 /*
2477 * In the slowpath, we sanity check order to avoid ever trying to 2472 * In the slowpath, we sanity check order to avoid ever trying to
2478 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2473 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2479 * be using allocators in order of preference for an area that is 2474 * be using allocators in order of preference for an area that is
2480 * too large. 2475 * too large.
2481 */ 2476 */
2482 if (order >= MAX_ORDER) { 2477 if (order >= MAX_ORDER) {
2483 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2478 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2484 return NULL; 2479 return NULL;
2485 } 2480 }
2486 2481
2487 /* 2482 /*
2488 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2483 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2489 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2484 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2490 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2485 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
2491 * using a larger set of nodes after it has established that the 2486 * using a larger set of nodes after it has established that the
2492 * allowed per node queues are empty and that nodes are 2487 * allowed per node queues are empty and that nodes are
2493 * over allocated. 2488 * over allocated.
2494 */ 2489 */
2495 if (IS_ENABLED(CONFIG_NUMA) && 2490 if (IS_ENABLED(CONFIG_NUMA) &&
2496 (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2491 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2497 goto nopage; 2492 goto nopage;
2498 2493
2499 restart: 2494 restart:
2500 prepare_slowpath(gfp_mask, order, zonelist, 2495 prepare_slowpath(gfp_mask, order, zonelist,
2501 high_zoneidx, preferred_zone); 2496 high_zoneidx, preferred_zone);
2502 2497
2503 /* 2498 /*
2504 * OK, we're below the kswapd watermark and have kicked background 2499 * OK, we're below the kswapd watermark and have kicked background
2505 * reclaim. Now things get more complex, so set up alloc_flags according 2500 * reclaim. Now things get more complex, so set up alloc_flags according
2506 * to how we want to proceed. 2501 * to how we want to proceed.
2507 */ 2502 */
2508 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2503 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2509 2504
2510 /* 2505 /*
2511 * Find the true preferred zone if the allocation is unconstrained by 2506 * Find the true preferred zone if the allocation is unconstrained by
2512 * cpusets. 2507 * cpusets.
2513 */ 2508 */
2514 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) 2509 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2515 first_zones_zonelist(zonelist, high_zoneidx, NULL, 2510 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2516 &preferred_zone); 2511 &preferred_zone);
2517 2512
2518 rebalance: 2513 rebalance:
2519 /* This is the last chance, in general, before the goto nopage. */ 2514 /* This is the last chance, in general, before the goto nopage. */
2520 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2515 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2521 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2516 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2522 preferred_zone, migratetype); 2517 preferred_zone, migratetype);
2523 if (page) 2518 if (page)
2524 goto got_pg; 2519 goto got_pg;
2525 2520
2526 /* Allocate without watermarks if the context allows */ 2521 /* Allocate without watermarks if the context allows */
2527 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2522 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2528 /* 2523 /*
2529 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds 2524 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2530 * the allocation is high priority and these type of 2525 * the allocation is high priority and these type of
2531 * allocations are system rather than user orientated 2526 * allocations are system rather than user orientated
2532 */ 2527 */
2533 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2528 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2534 2529
2535 page = __alloc_pages_high_priority(gfp_mask, order, 2530 page = __alloc_pages_high_priority(gfp_mask, order,
2536 zonelist, high_zoneidx, nodemask, 2531 zonelist, high_zoneidx, nodemask,
2537 preferred_zone, migratetype); 2532 preferred_zone, migratetype);
2538 if (page) { 2533 if (page) {
2539 goto got_pg; 2534 goto got_pg;
2540 } 2535 }
2541 } 2536 }
2542 2537
2543 /* Atomic allocations - we can't balance anything */ 2538 /* Atomic allocations - we can't balance anything */
2544 if (!wait) 2539 if (!wait)
2545 goto nopage; 2540 goto nopage;
2546 2541
2547 /* Avoid recursion of direct reclaim */ 2542 /* Avoid recursion of direct reclaim */
2548 if (current->flags & PF_MEMALLOC) 2543 if (current->flags & PF_MEMALLOC)
2549 goto nopage; 2544 goto nopage;
2550 2545
2551 /* Avoid allocations with no watermarks from looping endlessly */ 2546 /* Avoid allocations with no watermarks from looping endlessly */
2552 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2547 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2553 goto nopage; 2548 goto nopage;
2554 2549
2555 /* 2550 /*
2556 * Try direct compaction. The first pass is asynchronous. Subsequent 2551 * Try direct compaction. The first pass is asynchronous. Subsequent
2557 * attempts after direct reclaim are synchronous 2552 * attempts after direct reclaim are synchronous
2558 */ 2553 */
2559 page = __alloc_pages_direct_compact(gfp_mask, order, 2554 page = __alloc_pages_direct_compact(gfp_mask, order,
2560 zonelist, high_zoneidx, 2555 zonelist, high_zoneidx,
2561 nodemask, 2556 nodemask,
2562 alloc_flags, preferred_zone, 2557 alloc_flags, preferred_zone,
2563 migratetype, sync_migration, 2558 migratetype, sync_migration,
2564 &contended_compaction, 2559 &contended_compaction,
2565 &deferred_compaction, 2560 &deferred_compaction,
2566 &did_some_progress); 2561 &did_some_progress);
2567 if (page) 2562 if (page)
2568 goto got_pg; 2563 goto got_pg;
2569 sync_migration = true; 2564 sync_migration = true;
2570 2565
2571 /* 2566 /*
2572 * If compaction is deferred for high-order allocations, it is because 2567 * If compaction is deferred for high-order allocations, it is because
2573 * sync compaction recently failed. If this is the case and the caller 2568 * sync compaction recently failed. If this is the case and the caller
2574 * requested a movable allocation that does not heavily disrupt the 2569 * requested a movable allocation that does not heavily disrupt the
2575 * system then fail the allocation instead of entering direct reclaim. 2570 * system then fail the allocation instead of entering direct reclaim.
2576 */ 2571 */
2577 if ((deferred_compaction || contended_compaction) && 2572 if ((deferred_compaction || contended_compaction) &&
2578 (gfp_mask & __GFP_NO_KSWAPD)) 2573 (gfp_mask & __GFP_NO_KSWAPD))
2579 goto nopage; 2574 goto nopage;
2580 2575
2581 /* Try direct reclaim and then allocating */ 2576 /* Try direct reclaim and then allocating */
2582 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2577 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2583 zonelist, high_zoneidx, 2578 zonelist, high_zoneidx,
2584 nodemask, 2579 nodemask,
2585 alloc_flags, preferred_zone, 2580 alloc_flags, preferred_zone,
2586 migratetype, &did_some_progress); 2581 migratetype, &did_some_progress);
2587 if (page) 2582 if (page)
2588 goto got_pg; 2583 goto got_pg;
2589 2584
2590 /* 2585 /*
2591 * If we failed to make any progress reclaiming, then we are 2586 * If we failed to make any progress reclaiming, then we are
2592 * running out of options and have to consider going OOM 2587 * running out of options and have to consider going OOM
2593 */ 2588 */
2594 if (!did_some_progress) { 2589 if (!did_some_progress) {
2595 if (oom_gfp_allowed(gfp_mask)) { 2590 if (oom_gfp_allowed(gfp_mask)) {
2596 if (oom_killer_disabled) 2591 if (oom_killer_disabled)
2597 goto nopage; 2592 goto nopage;
2598 /* Coredumps can quickly deplete all memory reserves */ 2593 /* Coredumps can quickly deplete all memory reserves */
2599 if ((current->flags & PF_DUMPCORE) && 2594 if ((current->flags & PF_DUMPCORE) &&
2600 !(gfp_mask & __GFP_NOFAIL)) 2595 !(gfp_mask & __GFP_NOFAIL))
2601 goto nopage; 2596 goto nopage;
2602 page = __alloc_pages_may_oom(gfp_mask, order, 2597 page = __alloc_pages_may_oom(gfp_mask, order,
2603 zonelist, high_zoneidx, 2598 zonelist, high_zoneidx,
2604 nodemask, preferred_zone, 2599 nodemask, preferred_zone,
2605 migratetype); 2600 migratetype);
2606 if (page) 2601 if (page)
2607 goto got_pg; 2602 goto got_pg;
2608 2603
2609 if (!(gfp_mask & __GFP_NOFAIL)) { 2604 if (!(gfp_mask & __GFP_NOFAIL)) {
2610 /* 2605 /*
2611 * The oom killer is not called for high-order 2606 * The oom killer is not called for high-order
2612 * allocations that may fail, so if no progress 2607 * allocations that may fail, so if no progress
2613 * is being made, there are no other options and 2608 * is being made, there are no other options and
2614 * retrying is unlikely to help. 2609 * retrying is unlikely to help.
2615 */ 2610 */
2616 if (order > PAGE_ALLOC_COSTLY_ORDER) 2611 if (order > PAGE_ALLOC_COSTLY_ORDER)
2617 goto nopage; 2612 goto nopage;
2618 /* 2613 /*
2619 * The oom killer is not called for lowmem 2614 * The oom killer is not called for lowmem
2620 * allocations to prevent needlessly killing 2615 * allocations to prevent needlessly killing
2621 * innocent tasks. 2616 * innocent tasks.
2622 */ 2617 */
2623 if (high_zoneidx < ZONE_NORMAL) 2618 if (high_zoneidx < ZONE_NORMAL)
2624 goto nopage; 2619 goto nopage;
2625 } 2620 }
2626 2621
2627 goto restart; 2622 goto restart;
2628 } 2623 }
2629 } 2624 }
2630 2625
2631 /* Check if we should retry the allocation */ 2626 /* Check if we should retry the allocation */
2632 pages_reclaimed += did_some_progress; 2627 pages_reclaimed += did_some_progress;
2633 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2628 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2634 pages_reclaimed)) { 2629 pages_reclaimed)) {
2635 /* Wait for some write requests to complete then retry */ 2630 /* Wait for some write requests to complete then retry */
2636 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2631 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2637 goto rebalance; 2632 goto rebalance;
2638 } else { 2633 } else {
2639 /* 2634 /*
2640 * High-order allocations do not necessarily loop after 2635 * High-order allocations do not necessarily loop after
2641 * direct reclaim and reclaim/compaction depends on compaction 2636 * direct reclaim and reclaim/compaction depends on compaction
2642 * being called after reclaim so call directly if necessary 2637 * being called after reclaim so call directly if necessary
2643 */ 2638 */
2644 page = __alloc_pages_direct_compact(gfp_mask, order, 2639 page = __alloc_pages_direct_compact(gfp_mask, order,
2645 zonelist, high_zoneidx, 2640 zonelist, high_zoneidx,
2646 nodemask, 2641 nodemask,
2647 alloc_flags, preferred_zone, 2642 alloc_flags, preferred_zone,
2648 migratetype, sync_migration, 2643 migratetype, sync_migration,
2649 &contended_compaction, 2644 &contended_compaction,
2650 &deferred_compaction, 2645 &deferred_compaction,
2651 &did_some_progress); 2646 &did_some_progress);
2652 if (page) 2647 if (page)
2653 goto got_pg; 2648 goto got_pg;
2654 } 2649 }
2655 2650
2656 nopage: 2651 nopage:
2657 warn_alloc_failed(gfp_mask, order, NULL); 2652 warn_alloc_failed(gfp_mask, order, NULL);
2658 return page; 2653 return page;
2659 got_pg: 2654 got_pg:
2660 if (kmemcheck_enabled) 2655 if (kmemcheck_enabled)
2661 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2656 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2662 2657
2663 return page; 2658 return page;
2664 } 2659 }
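
Stripped of its details, __alloc_pages_slowpath() is a pair of retry loops: "restart" re-runs the whole sequence after the OOM killer makes progress, while "rebalance" loops back after partial reclaim progress. The skeleton below keeps only that control flow; every try_* helper is a stub and the thresholds are invented.

/* Skeleton of the restart/rebalance control flow in the slow path.
 * All the try_* helpers are stubs; only the loop structure is the point. */
#include <stdbool.h>
#include <stdio.h>

static bool try_with_min_watermark(void)   { return false; }
static bool try_without_watermarks(void)   { return false; }
static bool try_async_compaction(void)     { return false; }
static bool try_direct_reclaim(long *prog) { *prog = 64; return false; }
static bool try_oom_kill(void)             { return false; }
static bool should_retry(long reclaimed)   { return reclaimed < 256; }

static bool slowpath(void)
{
	long pages_reclaimed = 0, progress = 0;

restart:
	/* prepare_slowpath(): wake kswapd, reset the fairness batches */
rebalance:
	if (try_with_min_watermark())      return true;
	if (try_without_watermarks())      return true;
	if (try_async_compaction())        return true;
	if (try_direct_reclaim(&progress)) return true;

	if (!progress) {
		if (try_oom_kill())        return true;
		goto restart;              /* OOM killer freed something: start over */
	}

	pages_reclaimed += progress;
	if (should_retry(pages_reclaimed))
		goto rebalance;            /* wait_iff_congested() then try again */

	return false;                      /* nopage: warn and fail */
}

int main(void)
{
	printf("slowpath result: %s\n", slowpath() ? "page" : "NULL");
	return 0;
}
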
2665 2660
2666 /* 2661 /*
2667 * This is the 'heart' of the zoned buddy allocator. 2662 * This is the 'heart' of the zoned buddy allocator.
2668 */ 2663 */
2669 struct page * 2664 struct page *
2670 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2665 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2671 struct zonelist *zonelist, nodemask_t *nodemask) 2666 struct zonelist *zonelist, nodemask_t *nodemask)
2672 { 2667 {
2673 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2668 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        struct zone *preferred_zone;
        struct page *page = NULL;
        int migratetype = allocflags_to_migratetype(gfp_mask);
        unsigned int cpuset_mems_cookie;
        int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
        struct mem_cgroup *memcg = NULL;

        gfp_mask &= gfp_allowed_mask;

        lockdep_trace_alloc(gfp_mask);

        might_sleep_if(gfp_mask & __GFP_WAIT);

        if (should_fail_alloc_page(gfp_mask, order))
                return NULL;

        /*
         * Check the zones suitable for the gfp_mask contain at least one
         * valid zone. It's possible to have an empty zonelist as a result
         * of GFP_THISNODE and a memoryless node
         */
        if (unlikely(!zonelist->_zonerefs->zone))
                return NULL;

        /*
         * Will only have any effect when __GFP_KMEMCG is set. This is
         * verified in the (always inline) callee
         */
        if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
                return NULL;

retry_cpuset:
        cpuset_mems_cookie = get_mems_allowed();

        /* The preferred zone is used for statistics later */
        first_zones_zonelist(zonelist, high_zoneidx,
                                nodemask ? : &cpuset_current_mems_allowed,
                                &preferred_zone);
        if (!preferred_zone)
                goto out;

#ifdef CONFIG_CMA
        if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
                alloc_flags |= ALLOC_CMA;
#endif
        /* First allocation attempt */
        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                        zonelist, high_zoneidx, alloc_flags,
                        preferred_zone, migratetype);
        if (unlikely(!page)) {
                /*
                 * Runtime PM, block IO and its error handling path
                 * can deadlock because I/O on the device might not
                 * complete.
                 */
                gfp_mask = memalloc_noio_flags(gfp_mask);
                page = __alloc_pages_slowpath(gfp_mask, order,
                                zonelist, high_zoneidx, nodemask,
                                preferred_zone, migratetype);
        }

        trace_mm_page_alloc(page, order, gfp_mask, migratetype);

out:
        /*
         * When updating a task's mems_allowed, it is possible to race with
         * parallel threads in such a way that an allocation can fail while
         * the mask is being updated. If a page allocation is about to fail,
         * check if the cpuset changed during allocation and if so, retry.
         */
        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
                goto retry_cpuset;

        memcg_kmem_commit_charge(page, memcg, order);

        return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);

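/*
 * [Editor's illustration, not part of mm/page_alloc.c] __alloc_pages_nodemask()
 * is normally reached through the alloc_pages()/alloc_pages_node() wrappers
 * from <linux/gfp.h>. A minimal sketch of a typical caller, which allocates
 * an order-2 block (4 contiguous pages) and frees it at the same order:
 *
 *      struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 2);
 *
 *      if (!page)
 *              return -ENOMEM;
 *      ...
 *      __free_pages(page, 2);
 */
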
/*
 * Common helper functions.
 */
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
        struct page *page;

        /*
         * __get_free_pages() returns a 32-bit address, which cannot represent
         * a highmem page
         */
        VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

        page = alloc_pages(gfp_mask, order);
        if (!page)
                return 0;
        return (unsigned long) page_address(page);
}
EXPORT_SYMBOL(__get_free_pages);

unsigned long get_zeroed_page(gfp_t gfp_mask)
{
        return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
}
EXPORT_SYMBOL(get_zeroed_page);

void __free_pages(struct page *page, unsigned int order)
{
        if (put_page_testzero(page)) {
                if (order == 0)
                        free_hot_cold_page(page, 0);
                else
                        __free_pages_ok(page, order);
        }
}

EXPORT_SYMBOL(__free_pages);

void free_pages(unsigned long addr, unsigned int order)
{
        if (addr != 0) {
                VM_BUG_ON(!virt_addr_valid((void *)addr));
                __free_pages(virt_to_page((void *)addr), order);
        }
}

EXPORT_SYMBOL(free_pages);

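/*
 * [Editor's illustration, not part of mm/page_alloc.c] The address-based
 * helpers above are paired the same way: memory obtained with
 * __get_free_pages() or get_zeroed_page() is returned with free_pages() or
 * free_page() at the same order. A minimal sketch:
 *
 *      unsigned long buf = __get_free_pages(GFP_KERNEL, 1);    (2 pages)
 *      unsigned long zeroed = get_zeroed_page(GFP_KERNEL);     (1 zeroed page)
 *
 *      if (!buf || !zeroed)
 *              goto fail;
 *      ...
 *      free_pages(buf, 1);
 *      free_page(zeroed);
 */
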
/*
 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
 * pages allocated with __GFP_KMEMCG.
 *
 * Those pages are accounted to a particular memcg, embedded in the
 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
 * for that information only to find out that it is NULL for users who have no
 * interest in that whatsoever, we provide these functions.
 *
 * The caller knows better which flags it relies on.
 */
void __free_memcg_kmem_pages(struct page *page, unsigned int order)
{
        memcg_kmem_uncharge_pages(page, order);
        __free_pages(page, order);
}

void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
{
        if (addr != 0) {
                VM_BUG_ON(!virt_addr_valid((void *)addr));
                __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
        }
}

static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
{
        if (addr) {
                unsigned long alloc_end = addr + (PAGE_SIZE << order);
                unsigned long used = addr + PAGE_ALIGN(size);

                split_page(virt_to_page((void *)addr), order);
                while (used < alloc_end) {
                        free_page(used);
                        used += PAGE_SIZE;
                }
        }
        return (void *)addr;
}

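/*
 * [Editor's note, not part of mm/page_alloc.c] Worked example of
 * make_alloc_exact(): for size = 5 * PAGE_SIZE the caller passes
 * order = get_order(size) = 3, i.e. an 8-page block. alloc_end is then
 * addr + 8 pages and used is addr + 5 pages (PAGE_ALIGN rounds up), so
 * split_page() breaks the order-3 block into order-0 pages and the loop
 * frees the trailing 3 pages; the caller keeps exactly the 5 pages asked for.
 */
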
/**
 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
 * @size: the number of bytes to allocate
 * @gfp_mask: GFP flags for the allocation
 *
 * This function is similar to alloc_pages(), except that it allocates the
 * minimum number of pages to satisfy the request. alloc_pages() can only
 * allocate memory in power-of-two pages.
 *
 * This function is also limited by MAX_ORDER.
 *
 * Memory allocated by this function must be released by free_pages_exact().
 */
void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
{
        unsigned int order = get_order(size);
        unsigned long addr;

        addr = __get_free_pages(gfp_mask, order);
        return make_alloc_exact(addr, order, size);
}
EXPORT_SYMBOL(alloc_pages_exact);

/**
 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
 *                         pages on a node.
 * @nid: the preferred node ID where memory should be allocated
 * @size: the number of bytes to allocate
 * @gfp_mask: GFP flags for the allocation
 *
 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
 * back.
 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
 * but is not exact.
 */
void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
        unsigned order = get_order(size);
        struct page *p = alloc_pages_node(nid, gfp_mask, order);
        if (!p)
                return NULL;
        return make_alloc_exact((unsigned long)page_address(p), order, size);
}
EXPORT_SYMBOL(alloc_pages_exact_nid);

/**
 * free_pages_exact - release memory allocated via alloc_pages_exact()
 * @virt: the value returned by alloc_pages_exact.
 * @size: size of allocation, same value as passed to alloc_pages_exact().
 *
 * Release the memory allocated by a previous call to alloc_pages_exact.
 */
void free_pages_exact(void *virt, size_t size)
{
        unsigned long addr = (unsigned long)virt;
        unsigned long end = addr + PAGE_ALIGN(size);

        while (addr < end) {
                free_page(addr);
                addr += PAGE_SIZE;
        }
}
EXPORT_SYMBOL(free_pages_exact);

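/*
 * [Editor's illustration, not part of mm/page_alloc.c] A minimal sketch of the
 * alloc_pages_exact()/free_pages_exact() pairing; note that the same @size
 * must be passed to both calls so that exactly the kept page range is freed:
 *
 *      void *tbl = alloc_pages_exact(5 * PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
 *
 *      if (!tbl)
 *              return -ENOMEM;
 *      ...
 *      free_pages_exact(tbl, 5 * PAGE_SIZE);
 */
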
/**
 * nr_free_zone_pages - count number of pages beyond high watermark
 * @offset: The zone index of the highest zone
 *
 * nr_free_zone_pages() counts the number of pages which are beyond the
 * high watermark within all zones at or below a given zone index. For each
 * zone, the number of pages is calculated as:
 *     managed_pages - high_pages
 */
static unsigned long nr_free_zone_pages(int offset)
{
        struct zoneref *z;
        struct zone *zone;

        /* Just pick one node, since fallback list is circular */
        unsigned long sum = 0;

        struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);

        for_each_zone_zonelist(zone, z, zonelist, offset) {
                unsigned long size = zone->managed_pages;
                unsigned long high = high_wmark_pages(zone);
                if (size > high)
                        sum += size - high;
        }

        return sum;
}

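/*
 * [Editor's note, not part of mm/page_alloc.c] Example of the calculation
 * above: a zone with managed_pages = 1,000,000 and a high watermark of
 * 12,000 pages contributes 988,000 pages to the sum; a zone already at or
 * below its high watermark contributes nothing.
 */
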
/**
 * nr_free_buffer_pages - count number of pages beyond high watermark
 *
 * nr_free_buffer_pages() counts the number of pages which are beyond the high
 * watermark within ZONE_DMA and ZONE_NORMAL.
 */
unsigned long nr_free_buffer_pages(void)
{
        return nr_free_zone_pages(gfp_zone(GFP_USER));
}
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);

/**
 * nr_free_pagecache_pages - count number of pages beyond high watermark
 *
 * nr_free_pagecache_pages() counts the number of pages which are beyond the
 * high watermark within all zones.
 */
unsigned long nr_free_pagecache_pages(void)
{
        return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
}

static inline void show_node(struct zone *zone)
{
        if (IS_ENABLED(CONFIG_NUMA))
                printk("Node %d ", zone_to_nid(zone));
}

void si_meminfo(struct sysinfo *val)
{
        val->totalram = totalram_pages;
        val->sharedram = 0;
        val->freeram = global_page_state(NR_FREE_PAGES);
        val->bufferram = nr_blockdev_pages();
        val->totalhigh = totalhigh_pages;
        val->freehigh = nr_free_highpages();
        val->mem_unit = PAGE_SIZE;
}

EXPORT_SYMBOL(si_meminfo);

#ifdef CONFIG_NUMA
void si_meminfo_node(struct sysinfo *val, int nid)
{
        int zone_type;          /* needs to be signed */
        unsigned long managed_pages = 0;
        pg_data_t *pgdat = NODE_DATA(nid);

        for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
                managed_pages += pgdat->node_zones[zone_type].managed_pages;
        val->totalram = managed_pages;
        val->freeram = node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM
        val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
        val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
                        NR_FREE_PAGES);
#else
        val->totalhigh = 0;
        val->freehigh = 0;
#endif
        val->mem_unit = PAGE_SIZE;
}
#endif

/*
 * Determine whether the node should be displayed or not, depending on whether
 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
 */
bool skip_free_areas_node(unsigned int flags, int nid)
{
        bool ret = false;
        unsigned int cpuset_mems_cookie;

        if (!(flags & SHOW_MEM_FILTER_NODES))
                goto out;

        do {
                cpuset_mems_cookie = get_mems_allowed();
                ret = !node_isset(nid, cpuset_current_mems_allowed);
        } while (!put_mems_allowed(cpuset_mems_cookie));
out:
        return ret;
}

#define K(x) ((x) << (PAGE_SHIFT-10))

static void show_migration_types(unsigned char type)
{
        static const char types[MIGRATE_TYPES] = {
                [MIGRATE_UNMOVABLE]     = 'U',
                [MIGRATE_RECLAIMABLE]   = 'E',
                [MIGRATE_MOVABLE]       = 'M',
                [MIGRATE_RESERVE]       = 'R',
#ifdef CONFIG_CMA
                [MIGRATE_CMA]           = 'C',
#endif
#ifdef CONFIG_MEMORY_ISOLATION
                [MIGRATE_ISOLATE]       = 'I',
#endif
        };
        char tmp[MIGRATE_TYPES + 1];
        char *p = tmp;
        int i;

        for (i = 0; i < MIGRATE_TYPES; i++) {
                if (type & (1 << i))
                        *p++ = types[i];
        }

        *p = '\0';
        printk("(%s) ", tmp);
}

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 * Suppresses nodes that are not allowed by current's cpuset if
 * SHOW_MEM_FILTER_NODES is passed.
 */
void show_free_areas(unsigned int filter)
{
        int cpu;
        struct zone *zone;

        for_each_populated_zone(zone) {
                if (skip_free_areas_node(filter, zone_to_nid(zone)))
                        continue;
                show_node(zone);
                printk("%s per-cpu:\n", zone->name);

                for_each_online_cpu(cpu) {
                        struct per_cpu_pageset *pageset;

                        pageset = per_cpu_ptr(zone->pageset, cpu);

                        printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
                               cpu, pageset->pcp.high,
                               pageset->pcp.batch, pageset->pcp.count);
                }
        }

        printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
                " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
                " unevictable:%lu"
                " dirty:%lu writeback:%lu unstable:%lu\n"
                " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
                " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
                " free_cma:%lu\n",
                global_page_state(NR_ACTIVE_ANON),
                global_page_state(NR_INACTIVE_ANON),
                global_page_state(NR_ISOLATED_ANON),
                global_page_state(NR_ACTIVE_FILE),
                global_page_state(NR_INACTIVE_FILE),
                global_page_state(NR_ISOLATED_FILE),
                global_page_state(NR_UNEVICTABLE),
                global_page_state(NR_FILE_DIRTY),
                global_page_state(NR_WRITEBACK),
                global_page_state(NR_UNSTABLE_NFS),
                global_page_state(NR_FREE_PAGES),
                global_page_state(NR_SLAB_RECLAIMABLE),
                global_page_state(NR_SLAB_UNRECLAIMABLE),
                global_page_state(NR_FILE_MAPPED),
                global_page_state(NR_SHMEM),
                global_page_state(NR_PAGETABLE),
                global_page_state(NR_BOUNCE),
                global_page_state(NR_FREE_CMA_PAGES));

        for_each_populated_zone(zone) {
                int i;

                if (skip_free_areas_node(filter, zone_to_nid(zone)))
                        continue;
                show_node(zone);
                printk("%s"
                        " free:%lukB"
                        " min:%lukB"
                        " low:%lukB"
                        " high:%lukB"
                        " active_anon:%lukB"
                        " inactive_anon:%lukB"
                        " active_file:%lukB"
                        " inactive_file:%lukB"
                        " unevictable:%lukB"
                        " isolated(anon):%lukB"
                        " isolated(file):%lukB"
                        " present:%lukB"
                        " managed:%lukB"
                        " mlocked:%lukB"
                        " dirty:%lukB"
                        " writeback:%lukB"
                        " mapped:%lukB"
                        " shmem:%lukB"
                        " slab_reclaimable:%lukB"
                        " slab_unreclaimable:%lukB"
                        " kernel_stack:%lukB"
                        " pagetables:%lukB"
                        " unstable:%lukB"
                        " bounce:%lukB"
                        " free_cma:%lukB"
                        " writeback_tmp:%lukB"
                        " pages_scanned:%lu"
                        " all_unreclaimable? %s"
                        "\n",
                        zone->name,
                        K(zone_page_state(zone, NR_FREE_PAGES)),
                        K(min_wmark_pages(zone)),
                        K(low_wmark_pages(zone)),
                        K(high_wmark_pages(zone)),
                        K(zone_page_state(zone, NR_ACTIVE_ANON)),
                        K(zone_page_state(zone, NR_INACTIVE_ANON)),
                        K(zone_page_state(zone, NR_ACTIVE_FILE)),
                        K(zone_page_state(zone, NR_INACTIVE_FILE)),
                        K(zone_page_state(zone, NR_UNEVICTABLE)),
                        K(zone_page_state(zone, NR_ISOLATED_ANON)),
                        K(zone_page_state(zone, NR_ISOLATED_FILE)),
                        K(zone->present_pages),
                        K(zone->managed_pages),
                        K(zone_page_state(zone, NR_MLOCK)),
                        K(zone_page_state(zone, NR_FILE_DIRTY)),
                        K(zone_page_state(zone, NR_WRITEBACK)),
                        K(zone_page_state(zone, NR_FILE_MAPPED)),
                        K(zone_page_state(zone, NR_SHMEM)),
                        K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
                        K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
                        zone_page_state(zone, NR_KERNEL_STACK) *
                                THREAD_SIZE / 1024,
                        K(zone_page_state(zone, NR_PAGETABLE)),
                        K(zone_page_state(zone, NR_UNSTABLE_NFS)),
                        K(zone_page_state(zone, NR_BOUNCE)),
                        K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
                        K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
                        zone->pages_scanned,
                        (!zone_reclaimable(zone) ? "yes" : "no")
                        );
                printk("lowmem_reserve[]:");
                for (i = 0; i < MAX_NR_ZONES; i++)
                        printk(" %lu", zone->lowmem_reserve[i]);
                printk("\n");
        }

        for_each_populated_zone(zone) {
                unsigned long nr[MAX_ORDER], flags, order, total = 0;
                unsigned char types[MAX_ORDER];

                if (skip_free_areas_node(filter, zone_to_nid(zone)))
                        continue;
                show_node(zone);
                printk("%s: ", zone->name);

                spin_lock_irqsave(&zone->lock, flags);
                for (order = 0; order < MAX_ORDER; order++) {
                        struct free_area *area = &zone->free_area[order];
                        int type;

                        nr[order] = area->nr_free;
                        total += nr[order] << order;

                        types[order] = 0;
                        for (type = 0; type < MIGRATE_TYPES; type++) {
                                if (!list_empty(&area->free_list[type]))
                                        types[order] |= 1 << type;
                        }
                }
                spin_unlock_irqrestore(&zone->lock, flags);
                for (order = 0; order < MAX_ORDER; order++) {
                        printk("%lu*%lukB ", nr[order], K(1UL) << order);
                        if (nr[order])
                                show_migration_types(types[order]);
                }
                printk("= %lukB\n", K(total));
        }

        hugetlb_show_meminfo();

        printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));

        show_swap_cache_info();
}

static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
        zoneref->zone = zone;
        zoneref->zone_idx = zone_idx(zone);
}

/*
 * Builds allocation fallback zone lists.
 *
 * Add all populated zones of a node to the zonelist.
 */
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
                                int nr_zones)
{
        struct zone *zone;
        enum zone_type zone_type = MAX_NR_ZONES;

        do {
                zone_type--;
                zone = pgdat->node_zones + zone_type;
                if (populated_zone(zone)) {
                        zoneref_set_zone(zone,
                                &zonelist->_zonerefs[nr_zones++]);
                        check_highest_zone(zone_type);
                }
        } while (zone_type);

        return nr_zones;
}


/*
 *  zonelist_order:
 *  0 = automatic detection of better ordering.
 *  1 = order by ([node] distance, -zonetype)
 *  2 = order by (-zonetype, [node] distance)
 *
 *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
 *  the same zonelist. So only NUMA can configure this param.
 */
#define ZONELIST_ORDER_DEFAULT  0
#define ZONELIST_ORDER_NODE     1
#define ZONELIST_ORDER_ZONE     2

/* zonelist order in the kernel.
 * set_zonelist_order() will set this to NODE or ZONE.
 */
static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};


#ifdef CONFIG_NUMA
/* The value user specified ....changed by config */
static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
/* string for sysctl */
#define NUMA_ZONELIST_ORDER_LEN 16
char numa_zonelist_order[16] = "default";

/*
 * interface for configuring zonelist ordering.
 * command line option "numa_zonelist_order"
 *      = "[dD]efault   - default, automatic configuration.
 *      = "[nN]ode      - order by node locality, then by zone within node
 *      = "[zZ]one      - order by zone, then by locality within zone
 */

static int __parse_numa_zonelist_order(char *s)
{
        if (*s == 'd' || *s == 'D') {
                user_zonelist_order = ZONELIST_ORDER_DEFAULT;
        } else if (*s == 'n' || *s == 'N') {
                user_zonelist_order = ZONELIST_ORDER_NODE;
        } else if (*s == 'z' || *s == 'Z') {
                user_zonelist_order = ZONELIST_ORDER_ZONE;
        } else {
                printk(KERN_WARNING
                        "Ignoring invalid numa_zonelist_order value: "
                        "%s\n", s);
                return -EINVAL;
        }
        return 0;
}

static __init int setup_numa_zonelist_order(char *s)
{
        int ret;

        if (!s)
                return 0;

        ret = __parse_numa_zonelist_order(s);
        if (ret == 0)
                strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);

        return ret;
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);

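/*
 * [Editor's illustration, not part of mm/page_alloc.c] The ordering can be
 * chosen either on the kernel command line, e.g.
 *
 *      numa_zonelist_order=zone
 *
 * or at runtime through the sysctl handled below, e.g.
 *
 *      echo node > /proc/sys/vm/numa_zonelist_order
 *
 * Both paths end up in __parse_numa_zonelist_order(), which only inspects
 * the first character ([dD]/[nN]/[zZ]).
 */
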
/*
 * sysctl handler for numa_zonelist_order
 */
int numa_zonelist_order_handler(ctl_table *table, int write,
                void __user *buffer, size_t *length,
                loff_t *ppos)
{
        char saved_string[NUMA_ZONELIST_ORDER_LEN];
        int ret;
        static DEFINE_MUTEX(zl_order_mutex);

        mutex_lock(&zl_order_mutex);
        if (write) {
                if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
                        ret = -EINVAL;
                        goto out;
                }
                strcpy(saved_string, (char *)table->data);
        }
        ret = proc_dostring(table, write, buffer, length, ppos);
        if (ret)
                goto out;
        if (write) {
                int oldval = user_zonelist_order;

                ret = __parse_numa_zonelist_order((char *)table->data);
                if (ret) {
                        /*
                         * bogus value.  restore saved string
                         */
                        strncpy((char *)table->data, saved_string,
                                NUMA_ZONELIST_ORDER_LEN);
                        user_zonelist_order = oldval;
                } else if (oldval != user_zonelist_order) {
                        mutex_lock(&zonelists_mutex);
                        build_all_zonelists(NULL, NULL);
                        mutex_unlock(&zonelists_mutex);
                }
        }
out:
        mutex_unlock(&zl_order_mutex);
        return ret;
}


#define MAX_NODE_LOAD (nr_online_nodes)
static int node_load[MAX_NUMNODES];

/**
 * find_next_best_node - find the next node that should appear in a given node's fallback list
 * @node: node whose fallback list we're appending
 * @used_node_mask: nodemask_t of already used nodes
 *
 * We use a number of factors to determine which is the next node that should
 * appear on a given node's fallback list.  The node should not have appeared
 * already in @node's fallback list, and it should be the next closest node
 * according to the distance array (which contains arbitrary distance values
 * from each node to each node in the system), and should also prefer nodes
 * with no CPUs, since presumably they'll have very little allocation pressure
 * on them otherwise.
 * It returns -1 if no node is found.
 */
static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
        int n, val;
        int min_val = INT_MAX;
        int best_node = NUMA_NO_NODE;
        const struct cpumask *tmp = cpumask_of_node(0);

        /* Use the local node if we haven't already */
        if (!node_isset(node, *used_node_mask)) {
                node_set(node, *used_node_mask);
                return node;
        }

        for_each_node_state(n, N_MEMORY) {

                /* Don't want a node to appear more than once */
                if (node_isset(n, *used_node_mask))
                        continue;

                /* Use the distance array to find the distance */
                val = node_distance(node, n);

                /* Penalize nodes under us ("prefer the next node") */
                val += (n < node);

                /* Give preference to headless and unused nodes */
                tmp = cpumask_of_node(n);
                if (!cpumask_empty(tmp))
                        val += PENALTY_FOR_NODE_WITH_CPUS;

                /* Slight preference for less loaded node */
                val *= (MAX_NODE_LOAD*MAX_NUMNODES);
                val += node_load[n];

                if (val < min_val) {
                        min_val = val;
                        best_node = n;
                }
        }

        if (best_node >= 0)
                node_set(best_node, *used_node_mask);

        return best_node;
}

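/*
 * [Editor's note, not part of mm/page_alloc.c] Example of the scoring in
 * find_next_best_node(): for a candidate node n the base score is
 * node_distance(node, n), plus 1 if n < node, plus
 * PENALTY_FOR_NODE_WITH_CPUS if n has CPUs. That sum is then scaled by
 * MAX_NODE_LOAD * MAX_NUMNODES and node_load[n] is added as a tie-breaker,
 * so load only decides between nodes whose base score is equal; the lowest
 * score wins.
 */
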

/*
 * Build zonelists ordered by node and zones within node.
 * This results in maximum locality--normal zone overflows into local
 * DMA zone, if any--but risks exhausting DMA zone.
 */
static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
{
        int j;
        struct zonelist *zonelist;

        zonelist = &pgdat->node_zonelists[0];
        for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
                ;
        j = build_zonelists_node(NODE_DATA(node), zonelist, j);
        zonelist->_zonerefs[j].zone = NULL;
        zonelist->_zonerefs[j].zone_idx = 0;
}

/*
 * Build gfp_thisnode zonelists
 */
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
        int j;
        struct zonelist *zonelist;

        zonelist = &pgdat->node_zonelists[1];
        j = build_zonelists_node(pgdat, zonelist, 0);
        zonelist->_zonerefs[j].zone = NULL;
        zonelist->_zonerefs[j].zone_idx = 0;
}

/*
 * Build zonelists ordered by zone and nodes within zones.
 * This results in conserving DMA zone[s] until all Normal memory is
 * exhausted, but results in overflowing to remote node while memory
 * may still exist in local DMA zone.
 */
static int node_order[MAX_NUMNODES];

static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
{
        int pos, j, node;
        int zone_type;          /* needs to be signed */
        struct zone *z;
        struct zonelist *zonelist;

        zonelist = &pgdat->node_zonelists[0];
        pos = 0;
        for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
                for (j = 0; j < nr_nodes; j++) {
                        node = node_order[j];
                        z = &NODE_DATA(node)->node_zones[zone_type];
                        if (populated_zone(z)) {
                                zoneref_set_zone(z,
                                        &zonelist->_zonerefs[pos++]);
                                check_highest_zone(zone_type);
                        }
                }
        }
        zonelist->_zonerefs[pos].zone = NULL;
        zonelist->_zonerefs[pos].zone_idx = 0;
}

static int default_zonelist_order(void)
{
        int nid, zone_type;
        unsigned long low_kmem_size, total_size;
        struct zone *z;
        int average_size;
        /*
         * ZONE_DMA and ZONE_DMA32 can be a very small area in the system.
         * If they are really small and used heavily, the system can fall
         * into OOM very easily.
         * This function detects ZONE_DMA/DMA32 size and configures zone order.
         */
        /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
        low_kmem_size = 0;
        total_size = 0;
        for_each_online_node(nid) {
                for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
                        z = &NODE_DATA(nid)->node_zones[zone_type];
                        if (populated_zone(z)) {
                                if (zone_type < ZONE_NORMAL)
                                        low_kmem_size += z->managed_pages;
                                total_size += z->managed_pages;
                        } else if (zone_type == ZONE_NORMAL) {
                                /*
                                 * If any node has only lowmem, then node order
                                 * is preferred to allow kernel allocations
                                 * locally; otherwise, they can easily infringe
                                 * on other nodes when there is an abundance of
                                 * lowmem available to allocate from.
                                 */
                                return ZONELIST_ORDER_NODE;
                        }
                }
        }
        if (!low_kmem_size ||  /* there is no DMA area. */
            low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
                return ZONELIST_ORDER_NODE;
        /*
         * look into each node's config.
         * If there is a node whose DMA/DMA32 memory is very big area on
         * local memory, NODE_ORDER may be suitable.
         */
        average_size = total_size /
                        (nodes_weight(node_states[N_MEMORY]) + 1);
        for_each_online_node(nid) {
                low_kmem_size = 0;
                total_size = 0;
                for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
                        z = &NODE_DATA(nid)->node_zones[zone_type];
                        if (populated_zone(z)) {
                                if (zone_type < ZONE_NORMAL)
                                        low_kmem_size += z->present_pages;
                                total_size += z->present_pages;
                        }
                }
                if (low_kmem_size &&
                    total_size > average_size && /* ignore small node */
                    low_kmem_size > total_size * 70/100)
                        return ZONELIST_ORDER_NODE;
        }
        return ZONELIST_ORDER_ZONE;
}

static void set_zonelist_order(void)
{
        if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
                current_zonelist_order = default_zonelist_order();
        else
                current_zonelist_order = user_zonelist_order;
}

static void build_zonelists(pg_data_t *pgdat)
{
        int j, node, load;
        enum zone_type i;
        nodemask_t used_mask;
        int local_node, prev_node;
        struct zonelist *zonelist;
        int order = current_zonelist_order;

        /* initialize zonelists */
        for (i = 0; i < MAX_ZONELISTS; i++) {
                zonelist = pgdat->node_zonelists + i;
                zonelist->_zonerefs[0].zone = NULL;
                zonelist->_zonerefs[0].zone_idx = 0;
        }

        /* NUMA-aware ordering of nodes */
        local_node = pgdat->node_id;
        load = nr_online_nodes;
        prev_node = local_node;
        nodes_clear(used_mask);

        memset(node_order, 0, sizeof(node_order));
        j = 0;

        while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
                /*
                 * We don't want to pressure a particular node.
                 * So we add a penalty to the first node in the same
                 * distance group to make it round-robin.
                 */
                if (node_distance(local_node, node) !=
                    node_distance(local_node, prev_node))
                        node_load[node] = load;

                prev_node = node;
                load--;
                if (order == ZONELIST_ORDER_NODE)
                        build_zonelists_in_node_order(pgdat, node);
                else
                        node_order[j++] = node; /* remember order */
        }

        if (order == ZONELIST_ORDER_ZONE) {
                /* calculate node order -- i.e., DMA last! */
                build_zonelists_in_zone_order(pgdat, j);
        }

        build_thisnode_zonelists(pgdat);
}

/* Construct the zonelist performance cache - see further mmzone.h */
static void build_zonelist_cache(pg_data_t *pgdat)
{
        struct zonelist *zonelist;
        struct zonelist_cache *zlc;
        struct zoneref *z;

        zonelist = &pgdat->node_zonelists[0];
        zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
        bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
        for (z = zonelist->_zonerefs; z->zone; z++)
                zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
}

3622 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3617 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3623 /* 3618 /*
3624 * Return node id of node used for "local" allocations. 3619 * Return node id of node used for "local" allocations.
3625 * I.e., first node id of first zone in arg node's generic zonelist. 3620 * I.e., first node id of first zone in arg node's generic zonelist.
3626 * Used for initializing percpu 'numa_mem', which is used primarily 3621 * Used for initializing percpu 'numa_mem', which is used primarily
3627 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3622 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3628 */ 3623 */
3629 int local_memory_node(int node) 3624 int local_memory_node(int node)
3630 { 3625 {
3631 struct zone *zone; 3626 struct zone *zone;
3632 3627
3633 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3628 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3634 gfp_zone(GFP_KERNEL), 3629 gfp_zone(GFP_KERNEL),
3635 NULL, 3630 NULL,
3636 &zone); 3631 &zone);
3637 return zone->node; 3632 return zone->node;
3638 } 3633 }
3639 #endif 3634 #endif
3640 3635
3641 #else /* CONFIG_NUMA */ 3636 #else /* CONFIG_NUMA */
3642 3637
3643 static void set_zonelist_order(void) 3638 static void set_zonelist_order(void)
3644 { 3639 {
3645 current_zonelist_order = ZONELIST_ORDER_ZONE; 3640 current_zonelist_order = ZONELIST_ORDER_ZONE;
3646 } 3641 }
3647 3642
3648 static void build_zonelists(pg_data_t *pgdat) 3643 static void build_zonelists(pg_data_t *pgdat)
3649 { 3644 {
3650 int node, local_node; 3645 int node, local_node;
3651 enum zone_type j; 3646 enum zone_type j;
3652 struct zonelist *zonelist; 3647 struct zonelist *zonelist;
3653 3648
3654 local_node = pgdat->node_id; 3649 local_node = pgdat->node_id;
3655 3650
3656 zonelist = &pgdat->node_zonelists[0]; 3651 zonelist = &pgdat->node_zonelists[0];
3657 j = build_zonelists_node(pgdat, zonelist, 0); 3652 j = build_zonelists_node(pgdat, zonelist, 0);
3658 3653
3659 /* 3654 /*
3660 * Now we build the zonelist so that it contains the zones 3655 * Now we build the zonelist so that it contains the zones
3661 * of all the other nodes. 3656 * of all the other nodes.
3662 * We don't want to pressure a particular node, so when 3657 * We don't want to pressure a particular node, so when
3663 * building the zones for node N, we make sure that the 3658 * building the zones for node N, we make sure that the
3664 * zones coming right after the local ones are those from 3659 * zones coming right after the local ones are those from
3665 * node N+1 (modulo N) 3660 * node N+1 (modulo N)
3666 */ 3661 */
3667 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3662 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3668 if (!node_online(node)) 3663 if (!node_online(node))
3669 continue; 3664 continue;
3670 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3665 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3671 } 3666 }
3672 for (node = 0; node < local_node; node++) { 3667 for (node = 0; node < local_node; node++) {
3673 if (!node_online(node)) 3668 if (!node_online(node))
3674 continue; 3669 continue;
3675 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3670 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3676 } 3671 }
3677 3672
3678 zonelist->_zonerefs[j].zone = NULL; 3673 zonelist->_zonerefs[j].zone = NULL;
3679 zonelist->_zonerefs[j].zone_idx = 0; 3674 zonelist->_zonerefs[j].zone_idx = 0;
3680 } 3675 }
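The node ordering built by the two loops above is a simple round-robin starting just after the local node. A minimal userspace sketch (the node count and local node below are hypothetical values, not taken from the commit) reproduces the interleaving:

#include <stdio.h>

/* Sketch of the fallback ordering above: local node first, then the
 * nodes after it, then the nodes before it (wrapping around). */
int main(void)
{
	int nr_nodes = 4;	/* hypothetical number of online nodes */
	int local_node = 2;	/* hypothetical pgdat->node_id */
	int node;

	printf("%d", local_node);
	for (node = local_node + 1; node < nr_nodes; node++)
		printf(" %d", node);
	for (node = 0; node < local_node; node++)
		printf(" %d", node);
	printf("\n");		/* prints: 2 3 0 1 */
	return 0;
}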
3681 3676
3682 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3677 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3683 static void build_zonelist_cache(pg_data_t *pgdat) 3678 static void build_zonelist_cache(pg_data_t *pgdat)
3684 { 3679 {
3685 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3680 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3686 } 3681 }
3687 3682
3688 #endif /* CONFIG_NUMA */ 3683 #endif /* CONFIG_NUMA */
3689 3684
3690 /* 3685 /*
3691 * Boot pageset table. One per cpu which is going to be used for all 3686 * Boot pageset table. One per cpu which is going to be used for all
3692 * zones and all nodes. The parameters will be set in such a way 3687 * zones and all nodes. The parameters will be set in such a way
3693 * that an item put on a list will immediately be handed over to 3688 * that an item put on a list will immediately be handed over to
3694 * the buddy list. This is safe since pageset manipulation is done 3689 * the buddy list. This is safe since pageset manipulation is done
3695 * with interrupts disabled. 3690 * with interrupts disabled.
3696 * 3691 *
3697 * The boot_pagesets must be kept even after bootup is complete for 3692 * The boot_pagesets must be kept even after bootup is complete for
3698 * unused processors and/or zones. They do play a role for bootstrapping 3693 * unused processors and/or zones. They do play a role for bootstrapping
3699 * hotplugged processors. 3694 * hotplugged processors.
3700 * 3695 *
3701 * zoneinfo_show() and maybe other functions do 3696 * zoneinfo_show() and maybe other functions do
3702 * not check if the processor is online before following the pageset pointer. 3697 * not check if the processor is online before following the pageset pointer.
3703 * Other parts of the kernel may not check if the zone is available. 3698 * Other parts of the kernel may not check if the zone is available.
3704 */ 3699 */
3705 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3700 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3706 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3701 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3707 static void setup_zone_pageset(struct zone *zone); 3702 static void setup_zone_pageset(struct zone *zone);
3708 3703
3709 /* 3704 /*
3710 * Global mutex to protect against size modification of zonelists 3705 * Global mutex to protect against size modification of zonelists
3711 * as well as to serialize pageset setup for the new populated zone. 3706 * as well as to serialize pageset setup for the new populated zone.
3712 */ 3707 */
3713 DEFINE_MUTEX(zonelists_mutex); 3708 DEFINE_MUTEX(zonelists_mutex);
3714 3709
3715 /* the return value is int just for stop_machine() */ 3710 /* the return value is int just for stop_machine() */
3716 static int __build_all_zonelists(void *data) 3711 static int __build_all_zonelists(void *data)
3717 { 3712 {
3718 int nid; 3713 int nid;
3719 int cpu; 3714 int cpu;
3720 pg_data_t *self = data; 3715 pg_data_t *self = data;
3721 3716
3722 #ifdef CONFIG_NUMA 3717 #ifdef CONFIG_NUMA
3723 memset(node_load, 0, sizeof(node_load)); 3718 memset(node_load, 0, sizeof(node_load));
3724 #endif 3719 #endif
3725 3720
3726 if (self && !node_online(self->node_id)) { 3721 if (self && !node_online(self->node_id)) {
3727 build_zonelists(self); 3722 build_zonelists(self);
3728 build_zonelist_cache(self); 3723 build_zonelist_cache(self);
3729 } 3724 }
3730 3725
3731 for_each_online_node(nid) { 3726 for_each_online_node(nid) {
3732 pg_data_t *pgdat = NODE_DATA(nid); 3727 pg_data_t *pgdat = NODE_DATA(nid);
3733 3728
3734 build_zonelists(pgdat); 3729 build_zonelists(pgdat);
3735 build_zonelist_cache(pgdat); 3730 build_zonelist_cache(pgdat);
3736 } 3731 }
3737 3732
3738 /* 3733 /*
3739 * Initialize the boot_pagesets that are going to be used 3734 * Initialize the boot_pagesets that are going to be used
3740 * for bootstrapping processors. The real pagesets for 3735 * for bootstrapping processors. The real pagesets for
3741 * each zone will be allocated later when the per cpu 3736 * each zone will be allocated later when the per cpu
3742 * allocator is available. 3737 * allocator is available.
3743 * 3738 *
3744 * boot_pagesets are also used for bootstrapping offline 3739 * boot_pagesets are also used for bootstrapping offline
3745 * cpus if the system is already booted because the pagesets 3740 * cpus if the system is already booted because the pagesets
3746 * are needed to initialize allocators on a specific cpu too. 3741 * are needed to initialize allocators on a specific cpu too.
3747 * F.e. the percpu allocator needs the page allocator which 3742 * F.e. the percpu allocator needs the page allocator which
3748 * needs the percpu allocator in order to allocate its pagesets 3743 * needs the percpu allocator in order to allocate its pagesets
3749 * (a chicken-egg dilemma). 3744 * (a chicken-egg dilemma).
3750 */ 3745 */
3751 for_each_possible_cpu(cpu) { 3746 for_each_possible_cpu(cpu) {
3752 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3747 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3753 3748
3754 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3749 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3755 /* 3750 /*
3756 * We now know the "local memory node" for each node-- 3751 * We now know the "local memory node" for each node--
3757 * i.e., the node of the first zone in the generic zonelist. 3752 * i.e., the node of the first zone in the generic zonelist.
3758 * Set up numa_mem percpu variable for on-line cpus. During 3753 * Set up numa_mem percpu variable for on-line cpus. During
3759 * boot, only the boot cpu should be on-line; we'll init the 3754 * boot, only the boot cpu should be on-line; we'll init the
3760 * secondary cpus' numa_mem as they come on-line. During 3755 * secondary cpus' numa_mem as they come on-line. During
3761 * node/memory hotplug, we'll fixup all on-line cpus. 3756 * node/memory hotplug, we'll fixup all on-line cpus.
3762 */ 3757 */
3763 if (cpu_online(cpu)) 3758 if (cpu_online(cpu))
3764 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3759 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3765 #endif 3760 #endif
3766 } 3761 }
3767 3762
3768 return 0; 3763 return 0;
3769 } 3764 }
3770 3765
3771 /* 3766 /*
3772 * Called with zonelists_mutex held always 3767 * Called with zonelists_mutex held always
3773 * unless system_state == SYSTEM_BOOTING. 3768 * unless system_state == SYSTEM_BOOTING.
3774 */ 3769 */
3775 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3770 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3776 { 3771 {
3777 set_zonelist_order(); 3772 set_zonelist_order();
3778 3773
3779 if (system_state == SYSTEM_BOOTING) { 3774 if (system_state == SYSTEM_BOOTING) {
3780 __build_all_zonelists(NULL); 3775 __build_all_zonelists(NULL);
3781 mminit_verify_zonelist(); 3776 mminit_verify_zonelist();
3782 cpuset_init_current_mems_allowed(); 3777 cpuset_init_current_mems_allowed();
3783 } else { 3778 } else {
3784 #ifdef CONFIG_MEMORY_HOTPLUG 3779 #ifdef CONFIG_MEMORY_HOTPLUG
3785 if (zone) 3780 if (zone)
3786 setup_zone_pageset(zone); 3781 setup_zone_pageset(zone);
3787 #endif 3782 #endif
3788 /* we have to stop all cpus to guarantee there is no user 3783 /* we have to stop all cpus to guarantee there is no user
3789 of zonelist */ 3784 of zonelist */
3790 stop_machine(__build_all_zonelists, pgdat, NULL); 3785 stop_machine(__build_all_zonelists, pgdat, NULL);
3791 /* cpuset refresh routine should be here */ 3786 /* cpuset refresh routine should be here */
3792 } 3787 }
3793 vm_total_pages = nr_free_pagecache_pages(); 3788 vm_total_pages = nr_free_pagecache_pages();
3794 /* 3789 /*
3795 * Disable grouping by mobility if the number of pages in the 3790 * Disable grouping by mobility if the number of pages in the
3796 * system is too low to allow the mechanism to work. It would be 3791 * system is too low to allow the mechanism to work. It would be
3797 * more accurate, but expensive to check per-zone. This check is 3792 * more accurate, but expensive to check per-zone. This check is
3798 * made on memory-hotadd so a system can start with mobility 3793 * made on memory-hotadd so a system can start with mobility
3799 * disabled and enable it later 3794 * disabled and enable it later
3800 */ 3795 */
3801 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3796 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3802 page_group_by_mobility_disabled = 1; 3797 page_group_by_mobility_disabled = 1;
3803 else 3798 else
3804 page_group_by_mobility_disabled = 0; 3799 page_group_by_mobility_disabled = 0;
3805 3800
3806 printk("Built %i zonelists in %s order, mobility grouping %s. " 3801 printk("Built %i zonelists in %s order, mobility grouping %s. "
3807 "Total pages: %ld\n", 3802 "Total pages: %ld\n",
3808 nr_online_nodes, 3803 nr_online_nodes,
3809 zonelist_order_name[current_zonelist_order], 3804 zonelist_order_name[current_zonelist_order],
3810 page_group_by_mobility_disabled ? "off" : "on", 3805 page_group_by_mobility_disabled ? "off" : "on",
3811 vm_total_pages); 3806 vm_total_pages);
3812 #ifdef CONFIG_NUMA 3807 #ifdef CONFIG_NUMA
3813 printk("Policy zone: %s\n", zone_names[policy_zone]); 3808 printk("Policy zone: %s\n", zone_names[policy_zone]);
3814 #endif 3809 #endif
3815 } 3810 }
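For a sense of scale for the mobility-grouping cutoff above (a rough sketch assuming pageblock_nr_pages = 512 and MIGRATE_TYPES = 6, both of which are configuration dependent): grouping is disabled when the system has fewer than 512 * 6 = 3072 pages, which with 4KiB pages is only about 12MiB of memory.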
3816 3811
3817 /* 3812 /*
3818 * Helper functions to size the waitqueue hash table. 3813 * Helper functions to size the waitqueue hash table.
3819 * Essentially these want to choose hash table sizes sufficiently 3814 * Essentially these want to choose hash table sizes sufficiently
3820 * large so that collisions trying to wait on pages are rare. 3815 * large so that collisions trying to wait on pages are rare.
3821 * But in fact, the number of active page waitqueues on typical 3816 * But in fact, the number of active page waitqueues on typical
3822 * systems is ridiculously low, less than 200. So this is still 3817 * systems is ridiculously low, less than 200. So this is still
3823 * conservative, even though it seems large. 3818 * conservative, even though it seems large.
3824 * 3819 *
3825 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3820 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3826 * waitqueues, i.e. the size of the waitq table given the number of pages. 3821 * waitqueues, i.e. the size of the waitq table given the number of pages.
3827 */ 3822 */
3828 #define PAGES_PER_WAITQUEUE 256 3823 #define PAGES_PER_WAITQUEUE 256
3829 3824
3830 #ifndef CONFIG_MEMORY_HOTPLUG 3825 #ifndef CONFIG_MEMORY_HOTPLUG
3831 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3826 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3832 { 3827 {
3833 unsigned long size = 1; 3828 unsigned long size = 1;
3834 3829
3835 pages /= PAGES_PER_WAITQUEUE; 3830 pages /= PAGES_PER_WAITQUEUE;
3836 3831
3837 while (size < pages) 3832 while (size < pages)
3838 size <<= 1; 3833 size <<= 1;
3839 3834
3840 /* 3835 /*
3841 * Once we have dozens or even hundreds of threads sleeping 3836 * Once we have dozens or even hundreds of threads sleeping
3842 * on IO we've got bigger problems than wait queue collision. 3837 * on IO we've got bigger problems than wait queue collision.
3843 * Limit the size of the wait table to a reasonable size. 3838 * Limit the size of the wait table to a reasonable size.
3844 */ 3839 */
3845 size = min(size, 4096UL); 3840 size = min(size, 4096UL);
3846 3841
3847 return max(size, 4UL); 3842 return max(size, 4UL);
3848 } 3843 }
3849 #else 3844 #else
3850 /* 3845 /*
3851 * A zone's size might be changed by hot-add, so it is not possible to determine 3846 * A zone's size might be changed by hot-add, so it is not possible to determine
3852 * a suitable size for its wait_table. So we use the maximum size now. 3847 * a suitable size for its wait_table. So we use the maximum size now.
3853 * 3848 *
3854 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3849 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3855 * 3850 *
3856 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3851 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3857 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3852 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3858 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3853 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3859 * 3854 *
3860 * The maximum number of entries is reached once a zone's memory is (512K + 256) 3855 * The maximum number of entries is reached once a zone's memory is (512K + 256)
3861 * pages or more when sized the traditional way (see above). That corresponds to: 3856 * pages or more when sized the traditional way (see above). That corresponds to:
3862 * 3857 *
3863 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3858 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3864 * ia64(16K page size) : = ( 8G + 4M)byte. 3859 * ia64(16K page size) : = ( 8G + 4M)byte.
3865 * powerpc (64K page size) : = (32G +16M)byte. 3860 * powerpc (64K page size) : = (32G +16M)byte.
3866 */ 3861 */
3867 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3862 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3868 { 3863 {
3869 return 4096UL; 3864 return 4096UL;
3870 } 3865 }
3871 #endif 3866 #endif
3872 3867
3873 /* 3868 /*
3874 * This is an integer logarithm so that shifts can be used later 3869 * This is an integer logarithm so that shifts can be used later
3875 * to extract the more random high bits from the multiplicative 3870 * to extract the more random high bits from the multiplicative
3876 * hash function before the remainder is taken. 3871 * hash function before the remainder is taken.
3877 */ 3872 */
3878 static inline unsigned long wait_table_bits(unsigned long size) 3873 static inline unsigned long wait_table_bits(unsigned long size)
3879 { 3874 {
3880 return ffz(~size); 3875 return ffz(~size);
3881 } 3876 }
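To make the wait-table sizing above concrete, here is a small userspace sketch (PAGES_PER_WAITQUEUE follows the definition above; the zone size is hypothetical, and hash_nr_entries()/table_bits() are local stand-ins for the kernel helpers, with table_bits() spelling out what ffz(~size) computes for a power-of-two size):

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

/* next power of two >= pages / PAGES_PER_WAITQUEUE, clamped to [4, 4096] */
static unsigned long hash_nr_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096UL)
		size = 4096UL;
	return size < 4UL ? 4UL : size;
}

/* log2 of a power-of-two size, the userspace analogue of ffz(~size) */
static unsigned long table_bits(unsigned long size)
{
	unsigned long bits = 0;

	while (size > 1) {
		size >>= 1;
		bits++;
	}
	return bits;
}

int main(void)
{
	/* a hypothetical 1GiB zone with 4KiB pages: 262144 pages */
	unsigned long pages = 262144;
	unsigned long size = hash_nr_entries(pages);

	printf("%lu entries, %lu bits\n", size, table_bits(size));
	/* prints: 1024 entries, 10 bits */
	return 0;
}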
3882 3877
3883 /* 3878 /*
3884 * Check if a pageblock contains reserved pages 3879 * Check if a pageblock contains reserved pages
3885 */ 3880 */
3886 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3881 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3887 { 3882 {
3888 unsigned long pfn; 3883 unsigned long pfn;
3889 3884
3890 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3885 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3891 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3886 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3892 return 1; 3887 return 1;
3893 } 3888 }
3894 return 0; 3889 return 0;
3895 } 3890 }
3896 3891
3897 /* 3892 /*
3898 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3893 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3899 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3894 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3900 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3895 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3901 * higher will lead to a bigger reserve which will get freed as contiguous 3896 * higher will lead to a bigger reserve which will get freed as contiguous
3902 * blocks as reclaim kicks in 3897 * blocks as reclaim kicks in
3903 */ 3898 */
3904 static void setup_zone_migrate_reserve(struct zone *zone) 3899 static void setup_zone_migrate_reserve(struct zone *zone)
3905 { 3900 {
3906 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3901 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3907 struct page *page; 3902 struct page *page;
3908 unsigned long block_migratetype; 3903 unsigned long block_migratetype;
3909 int reserve; 3904 int reserve;
3910 3905
3911 /* 3906 /*
3912 * Get the start pfn, end pfn and the number of blocks to reserve 3907 * Get the start pfn, end pfn and the number of blocks to reserve
3913 * We have to be careful to be aligned to pageblock_nr_pages to 3908 * We have to be careful to be aligned to pageblock_nr_pages to
3914 * make sure that we always check pfn_valid for the first page in 3909 * make sure that we always check pfn_valid for the first page in
3915 * the block. 3910 * the block.
3916 */ 3911 */
3917 start_pfn = zone->zone_start_pfn; 3912 start_pfn = zone->zone_start_pfn;
3918 end_pfn = zone_end_pfn(zone); 3913 end_pfn = zone_end_pfn(zone);
3919 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3914 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3920 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3915 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3921 pageblock_order; 3916 pageblock_order;
3922 3917
3923 /* 3918 /*
3924 * Reserve blocks are generally in place to help high-order atomic 3919 * Reserve blocks are generally in place to help high-order atomic
3925 * allocations that are short-lived. A min_free_kbytes value that 3920 * allocations that are short-lived. A min_free_kbytes value that
3926 * would result in more than 2 reserve blocks for atomic allocations 3921 * would result in more than 2 reserve blocks for atomic allocations
3927 * is assumed to be in place to help anti-fragmentation for the 3922 * is assumed to be in place to help anti-fragmentation for the
3928 * future allocation of hugepages at runtime. 3923 * future allocation of hugepages at runtime.
3929 */ 3924 */
3930 reserve = min(2, reserve); 3925 reserve = min(2, reserve);
3931 3926
3932 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3927 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3933 if (!pfn_valid(pfn)) 3928 if (!pfn_valid(pfn))
3934 continue; 3929 continue;
3935 page = pfn_to_page(pfn); 3930 page = pfn_to_page(pfn);
3936 3931
3937 /* Watch out for overlapping nodes */ 3932 /* Watch out for overlapping nodes */
3938 if (page_to_nid(page) != zone_to_nid(zone)) 3933 if (page_to_nid(page) != zone_to_nid(zone))
3939 continue; 3934 continue;
3940 3935
3941 block_migratetype = get_pageblock_migratetype(page); 3936 block_migratetype = get_pageblock_migratetype(page);
3942 3937
3943 /* Only test what is necessary when the reserves are not met */ 3938 /* Only test what is necessary when the reserves are not met */
3944 if (reserve > 0) { 3939 if (reserve > 0) {
3945 /* 3940 /*
3946 * Blocks with reserved pages will never be freed, skip 3941 * Blocks with reserved pages will never be freed, skip
3947 * them. 3942 * them.
3948 */ 3943 */
3949 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 3944 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3950 if (pageblock_is_reserved(pfn, block_end_pfn)) 3945 if (pageblock_is_reserved(pfn, block_end_pfn))
3951 continue; 3946 continue;
3952 3947
3953 /* If this block is reserved, account for it */ 3948 /* If this block is reserved, account for it */
3954 if (block_migratetype == MIGRATE_RESERVE) { 3949 if (block_migratetype == MIGRATE_RESERVE) {
3955 reserve--; 3950 reserve--;
3956 continue; 3951 continue;
3957 } 3952 }
3958 3953
3959 /* Suitable for reserving if this block is movable */ 3954 /* Suitable for reserving if this block is movable */
3960 if (block_migratetype == MIGRATE_MOVABLE) { 3955 if (block_migratetype == MIGRATE_MOVABLE) {
3961 set_pageblock_migratetype(page, 3956 set_pageblock_migratetype(page,
3962 MIGRATE_RESERVE); 3957 MIGRATE_RESERVE);
3963 move_freepages_block(zone, page, 3958 move_freepages_block(zone, page,
3964 MIGRATE_RESERVE); 3959 MIGRATE_RESERVE);
3965 reserve--; 3960 reserve--;
3966 continue; 3961 continue;
3967 } 3962 }
3968 } 3963 }
3969 3964
3970 /* 3965 /*
3971 * If the reserve is met and this is a previously reserved block, 3966 * If the reserve is met and this is a previously reserved block,
3972 * take it back 3967 * take it back
3973 */ 3968 */
3974 if (block_migratetype == MIGRATE_RESERVE) { 3969 if (block_migratetype == MIGRATE_RESERVE) {
3975 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3970 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3976 move_freepages_block(zone, page, MIGRATE_MOVABLE); 3971 move_freepages_block(zone, page, MIGRATE_MOVABLE);
3977 } 3972 }
3978 } 3973 }
3979 } 3974 }
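A quick worked example of the reserve sizing above (a sketch assuming pageblock_order = 9, i.e. pageblock_nr_pages = 512, and hypothetical watermarks): with min_wmark_pages(zone) = 1024, roundup(1024, 512) >> 9 = 2 and the min(2, reserve) cap leaves two MIGRATE_RESERVE blocks; a watermark of 128 pages rounds up to 512 and yields just one.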
3980 3975
3981 /* 3976 /*
3982 * Initially all pages are reserved - free ones are freed 3977 * Initially all pages are reserved - free ones are freed
3983 * up by free_all_bootmem() once the early boot process is 3978 * up by free_all_bootmem() once the early boot process is
3984 * done. Non-atomic initialization, single-pass. 3979 * done. Non-atomic initialization, single-pass.
3985 */ 3980 */
3986 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 3981 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3987 unsigned long start_pfn, enum memmap_context context) 3982 unsigned long start_pfn, enum memmap_context context)
3988 { 3983 {
3989 struct page *page; 3984 struct page *page;
3990 unsigned long end_pfn = start_pfn + size; 3985 unsigned long end_pfn = start_pfn + size;
3991 unsigned long pfn; 3986 unsigned long pfn;
3992 struct zone *z; 3987 struct zone *z;
3993 3988
3994 if (highest_memmap_pfn < end_pfn - 1) 3989 if (highest_memmap_pfn < end_pfn - 1)
3995 highest_memmap_pfn = end_pfn - 1; 3990 highest_memmap_pfn = end_pfn - 1;
3996 3991
3997 z = &NODE_DATA(nid)->node_zones[zone]; 3992 z = &NODE_DATA(nid)->node_zones[zone];
3998 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3993 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3999 /* 3994 /*
4000 * There can be holes in boot-time mem_map[]s 3995 * There can be holes in boot-time mem_map[]s
4001 * handed to this function. They do not 3996 * handed to this function. They do not
4002 * exist on hotplugged memory. 3997 * exist on hotplugged memory.
4003 */ 3998 */
4004 if (context == MEMMAP_EARLY) { 3999 if (context == MEMMAP_EARLY) {
4005 if (!early_pfn_valid(pfn)) 4000 if (!early_pfn_valid(pfn))
4006 continue; 4001 continue;
4007 if (!early_pfn_in_nid(pfn, nid)) 4002 if (!early_pfn_in_nid(pfn, nid))
4008 continue; 4003 continue;
4009 } 4004 }
4010 page = pfn_to_page(pfn); 4005 page = pfn_to_page(pfn);
4011 set_page_links(page, zone, nid, pfn); 4006 set_page_links(page, zone, nid, pfn);
4012 mminit_verify_page_links(page, zone, nid, pfn); 4007 mminit_verify_page_links(page, zone, nid, pfn);
4013 init_page_count(page); 4008 init_page_count(page);
4014 page_mapcount_reset(page); 4009 page_mapcount_reset(page);
4015 page_cpupid_reset_last(page); 4010 page_cpupid_reset_last(page);
4016 SetPageReserved(page); 4011 SetPageReserved(page);
4017 /* 4012 /*
4018 * Mark the block movable so that blocks are reserved for 4013 * Mark the block movable so that blocks are reserved for
4019 * movable at startup. This will force kernel allocations 4014 * movable at startup. This will force kernel allocations
4020 * to reserve their blocks rather than leaking throughout 4015 * to reserve their blocks rather than leaking throughout
4021 * the address space during boot when many long-lived 4016 * the address space during boot when many long-lived
4022 * kernel allocations are made. Later some blocks near 4017 * kernel allocations are made. Later some blocks near
4023 * the start are marked MIGRATE_RESERVE by 4018 * the start are marked MIGRATE_RESERVE by
4024 * setup_zone_migrate_reserve() 4019 * setup_zone_migrate_reserve()
4025 * 4020 *
4026 * The bitmap is created for the zone's valid pfn range, but the memmap 4021 * The bitmap is created for the zone's valid pfn range, but the memmap
4027 * can be created for invalid pages (for alignment), so 4022 * can be created for invalid pages (for alignment), so
4028 * check here that we do not call set_pageblock_migratetype() against a 4023 * check here that we do not call set_pageblock_migratetype() against a
4029 * pfn outside the zone. 4024 * pfn outside the zone.
4030 */ 4025 */
4031 if ((z->zone_start_pfn <= pfn) 4026 if ((z->zone_start_pfn <= pfn)
4032 && (pfn < zone_end_pfn(z)) 4027 && (pfn < zone_end_pfn(z))
4033 && !(pfn & (pageblock_nr_pages - 1))) 4028 && !(pfn & (pageblock_nr_pages - 1)))
4034 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4029 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4035 4030
4036 INIT_LIST_HEAD(&page->lru); 4031 INIT_LIST_HEAD(&page->lru);
4037 #ifdef WANT_PAGE_VIRTUAL 4032 #ifdef WANT_PAGE_VIRTUAL
4038 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 4033 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
4039 if (!is_highmem_idx(zone)) 4034 if (!is_highmem_idx(zone))
4040 set_page_address(page, __va(pfn << PAGE_SHIFT)); 4035 set_page_address(page, __va(pfn << PAGE_SHIFT));
4041 #endif 4036 #endif
4042 } 4037 }
4043 } 4038 }
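Note on the pageblock check above: pfn & (pageblock_nr_pages - 1) is zero only for the first pfn of each pageblock, so with a (configuration dependent) pageblock_nr_pages of 512 the MIGRATE_MOVABLE marking runs once per block (pfns 0, 512, 1024, ...) rather than once per page in the zone.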
4044 4039
4045 static void __meminit zone_init_free_lists(struct zone *zone) 4040 static void __meminit zone_init_free_lists(struct zone *zone)
4046 { 4041 {
4047 int order, t; 4042 int order, t;
4048 for_each_migratetype_order(order, t) { 4043 for_each_migratetype_order(order, t) {
4049 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 4044 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
4050 zone->free_area[order].nr_free = 0; 4045 zone->free_area[order].nr_free = 0;
4051 } 4046 }
4052 } 4047 }
4053 4048
4054 #ifndef __HAVE_ARCH_MEMMAP_INIT 4049 #ifndef __HAVE_ARCH_MEMMAP_INIT
4055 #define memmap_init(size, nid, zone, start_pfn) \ 4050 #define memmap_init(size, nid, zone, start_pfn) \
4056 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 4051 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
4057 #endif 4052 #endif
4058 4053
4059 static int __meminit zone_batchsize(struct zone *zone) 4054 static int __meminit zone_batchsize(struct zone *zone)
4060 { 4055 {
4061 #ifdef CONFIG_MMU 4056 #ifdef CONFIG_MMU
4062 int batch; 4057 int batch;
4063 4058
4064 /* 4059 /*
4065 * The per-cpu-pages pools are set to around 1/1000th of the 4060 * The per-cpu-pages pools are set to around 1/1000th of the
4066 * size of the zone. But no more than 1/2 of a meg. 4061 * size of the zone. But no more than 1/2 of a meg.
4067 * 4062 *
4068 * OK, so we don't know how big the cache is. So guess. 4063 * OK, so we don't know how big the cache is. So guess.
4069 */ 4064 */
4070 batch = zone->managed_pages / 1024; 4065 batch = zone->managed_pages / 1024;
4071 if (batch * PAGE_SIZE > 512 * 1024) 4066 if (batch * PAGE_SIZE > 512 * 1024)
4072 batch = (512 * 1024) / PAGE_SIZE; 4067 batch = (512 * 1024) / PAGE_SIZE;
4073 batch /= 4; /* We effectively *= 4 below */ 4068 batch /= 4; /* We effectively *= 4 below */
4074 if (batch < 1) 4069 if (batch < 1)
4075 batch = 1; 4070 batch = 1;
4076 4071
4077 /* 4072 /*
4078 * Clamp the batch to a 2^n - 1 value. Having a power 4073 * Clamp the batch to a 2^n - 1 value. Having a power
4079 * of 2 value was found to be more likely to have 4074 * of 2 value was found to be more likely to have
4080 * suboptimal cache aliasing properties in some cases. 4075 * suboptimal cache aliasing properties in some cases.
4081 * 4076 *
4082 * For example if 2 tasks are alternately allocating 4077 * For example if 2 tasks are alternately allocating
4083 * batches of pages, one task can end up with a lot 4078 * batches of pages, one task can end up with a lot
4084 * of pages of one half of the possible page colors 4079 * of pages of one half of the possible page colors
4085 * and the other with pages of the other colors. 4080 * and the other with pages of the other colors.
4086 */ 4081 */
4087 batch = rounddown_pow_of_two(batch + batch/2) - 1; 4082 batch = rounddown_pow_of_two(batch + batch/2) - 1;
4088 4083
4089 return batch; 4084 return batch;
4090 4085
4091 #else 4086 #else
4092 /* The deferral and batching of frees should be suppressed under NOMMU 4087 /* The deferral and batching of frees should be suppressed under NOMMU
4093 * conditions. 4088 * conditions.
4094 * 4089 *
4095 * The problem is that NOMMU needs to be able to allocate large chunks 4090 * The problem is that NOMMU needs to be able to allocate large chunks
4096 * of contiguous memory as there's no hardware page translation to 4091 * of contiguous memory as there's no hardware page translation to
4097 * assemble apparent contiguous memory from discontiguous pages. 4092 * assemble apparent contiguous memory from discontiguous pages.
4098 * 4093 *
4099 * Queueing large contiguous runs of pages for batching, however, 4094 * Queueing large contiguous runs of pages for batching, however,
4100 * causes the pages to actually be freed in smaller chunks. As there 4095 * causes the pages to actually be freed in smaller chunks. As there
4101 * can be a significant delay between the individual batches being 4096 * can be a significant delay between the individual batches being
4102 * recycled, this leads to the once large chunks of space being 4097 * recycled, this leads to the once large chunks of space being
4103 * fragmented and becoming unavailable for high-order allocations. 4098 * fragmented and becoming unavailable for high-order allocations.
4104 */ 4099 */
4105 return 0; 4100 return 0;
4106 #endif 4101 #endif
4107 } 4102 }
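A userspace sketch of the batch calculation above (assuming a 4KiB PAGE_SIZE and a hypothetical zone of 1,000,000 managed pages; rounddown_pow_of_two() is reimplemented locally since it is a kernel helper):

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* largest power of two <= n, standing in for the kernel's rounddown_pow_of_two() */
static unsigned long rounddown_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p * 2 <= n)
		p *= 2;
	return p;
}

int main(void)
{
	unsigned long managed_pages = 1000000;	/* hypothetical zone size */
	unsigned long batch;

	batch = managed_pages / 1024;		/* 976 */
	if (batch * PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / PAGE_SIZE;	/* capped to 128 */
	batch /= 4;				/* 32 */
	if (batch < 1)
		batch = 1;
	batch = rounddown_pow_of_two(batch + batch / 2) - 1;	/* 31 */

	printf("batch = %lu\n", batch);		/* prints: batch = 31 */
	return 0;
}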
4108 4103
4109 /* 4104 /*
4110 * pcp->high and pcp->batch values are related and dependent on one another: 4105 * pcp->high and pcp->batch values are related and dependent on one another:
4111 * ->batch must never be higher than ->high. 4106 * ->batch must never be higher than ->high.
4112 * The following function updates them in a safe manner without read side 4107 * The following function updates them in a safe manner without read side
4113 * locking. 4108 * locking.
4114 * 4109 *
4115 * Any new users of pcp->batch and pcp->high should ensure they can cope with 4110 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4116 * those fields changing asynchronously (according to the above rule). 4111 * those fields changing asynchronously (according to the above rule).
4117 * 4112 *
4118 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function 4113 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
4119 * outside of boot time (or some other assurance that no concurrent updaters 4114 * outside of boot time (or some other assurance that no concurrent updaters
4120 * exist). 4115 * exist).
4121 */ 4116 */
4122 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 4117 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4123 unsigned long batch) 4118 unsigned long batch)
4124 { 4119 {
4125 /* start with a fail safe value for batch */ 4120 /* start with a fail safe value for batch */
4126 pcp->batch = 1; 4121 pcp->batch = 1;
4127 smp_wmb(); 4122 smp_wmb();
4128 4123
4129 /* Update high, then batch, in order */ 4124 /* Update high, then batch, in order */
4130 pcp->high = high; 4125 pcp->high = high;
4131 smp_wmb(); 4126 smp_wmb();
4132 4127
4133 pcp->batch = batch; 4128 pcp->batch = batch;
4134 } 4129 }
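The ordering above is defensive: pcp->batch is first dropped to the fail-safe value 1, the new ->high is published, and only then is the real batch stored, with smp_wmb() between the stores. A lockless reader may briefly observe a batch of 1, but it should never observe a batch larger than whichever ->high value it reads, which preserves the ->batch <= ->high rule while the limits are being changed.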
4135 4130
4136 /* a companion to pageset_set_high() */ 4131 /* a companion to pageset_set_high() */
4137 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) 4132 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4138 { 4133 {
4139 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); 4134 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4140 } 4135 }
4141 4136
4142 static void pageset_init(struct per_cpu_pageset *p) 4137 static void pageset_init(struct per_cpu_pageset *p)
4143 { 4138 {
4144 struct per_cpu_pages *pcp; 4139 struct per_cpu_pages *pcp;
4145 int migratetype; 4140 int migratetype;
4146 4141
4147 memset(p, 0, sizeof(*p)); 4142 memset(p, 0, sizeof(*p));
4148 4143
4149 pcp = &p->pcp; 4144 pcp = &p->pcp;
4150 pcp->count = 0; 4145 pcp->count = 0;
4151 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4146 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4152 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4147 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4153 } 4148 }
4154 4149
4155 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4150 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4156 { 4151 {
4157 pageset_init(p); 4152 pageset_init(p);
4158 pageset_set_batch(p, batch); 4153 pageset_set_batch(p, batch);
4159 } 4154 }
4160 4155
4161 /* 4156 /*
4162 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist 4157 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
4163 * to the value high for the pageset p. 4158 * to the value high for the pageset p.
4164 */ 4159 */
4165 static void pageset_set_high(struct per_cpu_pageset *p, 4160 static void pageset_set_high(struct per_cpu_pageset *p,
4166 unsigned long high) 4161 unsigned long high)
4167 { 4162 {
4168 unsigned long batch = max(1UL, high / 4); 4163 unsigned long batch = max(1UL, high / 4);
4169 if ((high / 4) > (PAGE_SHIFT * 8)) 4164 if ((high / 4) > (PAGE_SHIFT * 8))
4170 batch = PAGE_SHIFT * 8; 4165 batch = PAGE_SHIFT * 8;
4171 4166
4172 pageset_update(&p->pcp, high, batch); 4167 pageset_update(&p->pcp, high, batch);
4173 } 4168 }
4174 4169
4175 static void __meminit pageset_set_high_and_batch(struct zone *zone, 4170 static void __meminit pageset_set_high_and_batch(struct zone *zone,
4176 struct per_cpu_pageset *pcp) 4171 struct per_cpu_pageset *pcp)
4177 { 4172 {
4178 if (percpu_pagelist_fraction) 4173 if (percpu_pagelist_fraction)
4179 pageset_set_high(pcp, 4174 pageset_set_high(pcp,
4180 (zone->managed_pages / 4175 (zone->managed_pages /
4181 percpu_pagelist_fraction)); 4176 percpu_pagelist_fraction));
4182 else 4177 else
4183 pageset_set_batch(pcp, zone_batchsize(zone)); 4178 pageset_set_batch(pcp, zone_batchsize(zone));
4184 } 4179 }
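To see how the two paths above differ, here is a userspace sketch of the sysctl path (PAGE_SHIFT, the managed page count and the fraction are hypothetical assumptions, and high_to_batch() is only a local helper mirroring the clamp in pageset_set_high()); the default path instead takes zone_batchsize(), e.g. 31 from the example above, giving high = 6 * 31 = 186:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed 4KiB pages */

/* mirrors the batch clamp performed by pageset_set_high() above */
static unsigned long high_to_batch(unsigned long high)
{
	unsigned long batch = high / 4;

	if (batch < 1)
		batch = 1;
	if ((high / 4) > (PAGE_SHIFT * 8))
		batch = PAGE_SHIFT * 8;
	return batch;
}

int main(void)
{
	unsigned long managed_pages = 1000000;	/* hypothetical zone size */
	unsigned long fraction = 8;		/* hypothetical percpu_pagelist_fraction */
	unsigned long high = managed_pages / fraction;

	printf("high = %lu, batch = %lu\n", high, high_to_batch(high));
	/* prints: high = 125000, batch = 96 */
	return 0;
}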
4185 4180
4186 static void __meminit zone_pageset_init(struct zone *zone, int cpu) 4181 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4187 { 4182 {
4188 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4183 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4189 4184
4190 pageset_init(pcp); 4185 pageset_init(pcp);
4191 pageset_set_high_and_batch(zone, pcp); 4186 pageset_set_high_and_batch(zone, pcp);
4192 } 4187 }
4193 4188
4194 static void __meminit setup_zone_pageset(struct zone *zone) 4189 static void __meminit setup_zone_pageset(struct zone *zone)
4195 { 4190 {
4196 int cpu; 4191 int cpu;
4197 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4192 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4198 for_each_possible_cpu(cpu) 4193 for_each_possible_cpu(cpu)
4199 zone_pageset_init(zone, cpu); 4194 zone_pageset_init(zone, cpu);
4200 } 4195 }
4201 4196
4202 /* 4197 /*
4203 * Allocate per cpu pagesets and initialize them. 4198 * Allocate per cpu pagesets and initialize them.
4204 * Before this call only boot pagesets were available. 4199 * Before this call only boot pagesets were available.
4205 */ 4200 */
4206 void __init setup_per_cpu_pageset(void) 4201 void __init setup_per_cpu_pageset(void)
4207 { 4202 {
4208 struct zone *zone; 4203 struct zone *zone;
4209 4204
4210 for_each_populated_zone(zone) 4205 for_each_populated_zone(zone)
4211 setup_zone_pageset(zone); 4206 setup_zone_pageset(zone);
4212 } 4207 }
4213 4208
4214 static noinline __init_refok 4209 static noinline __init_refok
4215 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4210 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4216 { 4211 {
4217 int i; 4212 int i;
4218 struct pglist_data *pgdat = zone->zone_pgdat; 4213 struct pglist_data *pgdat = zone->zone_pgdat;
4219 size_t alloc_size; 4214 size_t alloc_size;
4220 4215
4221 /* 4216 /*
4222 * The per-page waitqueue mechanism uses hashed waitqueues 4217 * The per-page waitqueue mechanism uses hashed waitqueues
4223 * per zone. 4218 * per zone.
4224 */ 4219 */
4225 zone->wait_table_hash_nr_entries = 4220 zone->wait_table_hash_nr_entries =
4226 wait_table_hash_nr_entries(zone_size_pages); 4221 wait_table_hash_nr_entries(zone_size_pages);
4227 zone->wait_table_bits = 4222 zone->wait_table_bits =
4228 wait_table_bits(zone->wait_table_hash_nr_entries); 4223 wait_table_bits(zone->wait_table_hash_nr_entries);
4229 alloc_size = zone->wait_table_hash_nr_entries 4224 alloc_size = zone->wait_table_hash_nr_entries
4230 * sizeof(wait_queue_head_t); 4225 * sizeof(wait_queue_head_t);
4231 4226
4232 if (!slab_is_available()) { 4227 if (!slab_is_available()) {
4233 zone->wait_table = (wait_queue_head_t *) 4228 zone->wait_table = (wait_queue_head_t *)
4234 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4229 alloc_bootmem_node_nopanic(pgdat, alloc_size);
4235 } else { 4230 } else {
4236 /* 4231 /*
4237 * This case means that a zone whose size was 0 gets new memory 4232 * This case means that a zone whose size was 0 gets new memory
4238 * via memory hot-add. 4233 * via memory hot-add.
4239 * But it may be the case that a new node was hot-added. In 4234 * But it may be the case that a new node was hot-added. In
4240 * this case vmalloc() will not be able to use this new node's 4235 * this case vmalloc() will not be able to use this new node's
4241 * memory - this wait_table must be initialized to use this new 4236 * memory - this wait_table must be initialized to use this new
4242 * node itself as well. 4237 * node itself as well.
4243 * To use this new node's memory, further consideration will be 4238 * To use this new node's memory, further consideration will be
4244 * necessary. 4239 * necessary.
4245 */ 4240 */
4246 zone->wait_table = vmalloc(alloc_size); 4241 zone->wait_table = vmalloc(alloc_size);
4247 } 4242 }
4248 if (!zone->wait_table) 4243 if (!zone->wait_table)
4249 return -ENOMEM; 4244 return -ENOMEM;
4250 4245
4251 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4246 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4252 init_waitqueue_head(zone->wait_table + i); 4247 init_waitqueue_head(zone->wait_table + i);
4253 4248
4254 return 0; 4249 return 0;
4255 } 4250 }
4256 4251
4257 static __meminit void zone_pcp_init(struct zone *zone) 4252 static __meminit void zone_pcp_init(struct zone *zone)
4258 { 4253 {
4259 /* 4254 /*
4260 * per cpu subsystem is not up at this point. The following code 4255 * per cpu subsystem is not up at this point. The following code
4261 * relies on the ability of the linker to provide the 4256 * relies on the ability of the linker to provide the
4262 * offset of a (static) per cpu variable into the per cpu area. 4257 * offset of a (static) per cpu variable into the per cpu area.
4263 */ 4258 */
4264 zone->pageset = &boot_pageset; 4259 zone->pageset = &boot_pageset;
4265 4260
4266 if (populated_zone(zone)) 4261 if (populated_zone(zone))
4267 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4262 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4268 zone->name, zone->present_pages, 4263 zone->name, zone->present_pages,
4269 zone_batchsize(zone)); 4264 zone_batchsize(zone));
4270 } 4265 }
4271 4266
4272 int __meminit init_currently_empty_zone(struct zone *zone, 4267 int __meminit init_currently_empty_zone(struct zone *zone,
4273 unsigned long zone_start_pfn, 4268 unsigned long zone_start_pfn,
4274 unsigned long size, 4269 unsigned long size,
4275 enum memmap_context context) 4270 enum memmap_context context)
4276 { 4271 {
4277 struct pglist_data *pgdat = zone->zone_pgdat; 4272 struct pglist_data *pgdat = zone->zone_pgdat;
4278 int ret; 4273 int ret;
4279 ret = zone_wait_table_init(zone, size); 4274 ret = zone_wait_table_init(zone, size);
4280 if (ret) 4275 if (ret)
4281 return ret; 4276 return ret;
4282 pgdat->nr_zones = zone_idx(zone) + 1; 4277 pgdat->nr_zones = zone_idx(zone) + 1;
4283 4278
4284 zone->zone_start_pfn = zone_start_pfn; 4279 zone->zone_start_pfn = zone_start_pfn;
4285 4280
4286 mminit_dprintk(MMINIT_TRACE, "memmap_init", 4281 mminit_dprintk(MMINIT_TRACE, "memmap_init",
4287 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 4282 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
4288 pgdat->node_id, 4283 pgdat->node_id,
4289 (unsigned long)zone_idx(zone), 4284 (unsigned long)zone_idx(zone),
4290 zone_start_pfn, (zone_start_pfn + size)); 4285 zone_start_pfn, (zone_start_pfn + size));
4291 4286
4292 zone_init_free_lists(zone); 4287 zone_init_free_lists(zone);
4293 4288
4294 return 0; 4289 return 0;
4295 } 4290 }
4296 4291
4297 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4292 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4298 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4293 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4299 /* 4294 /*
4300 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4295 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
4301 * Architectures may implement their own version but if add_active_range() 4296 * Architectures may implement their own version but if add_active_range()
4302 * was used and there are no special requirements, this is a convenient 4297 * was used and there are no special requirements, this is a convenient
4303 * alternative 4298 * alternative
4304 */ 4299 */
4305 int __meminit __early_pfn_to_nid(unsigned long pfn) 4300 int __meminit __early_pfn_to_nid(unsigned long pfn)
4306 { 4301 {
4307 unsigned long start_pfn, end_pfn; 4302 unsigned long start_pfn, end_pfn;
4308 int nid; 4303 int nid;
4309 /* 4304 /*
4310 * NOTE: The following SMP-unsafe globals are only used early in boot 4305 * NOTE: The following SMP-unsafe globals are only used early in boot
4311 * when the kernel is running single-threaded. 4306 * when the kernel is running single-threaded.
4312 */ 4307 */
4313 static unsigned long __meminitdata last_start_pfn, last_end_pfn; 4308 static unsigned long __meminitdata last_start_pfn, last_end_pfn;
4314 static int __meminitdata last_nid; 4309 static int __meminitdata last_nid;
4315 4310
4316 if (last_start_pfn <= pfn && pfn < last_end_pfn) 4311 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4317 return last_nid; 4312 return last_nid;
4318 4313
4319 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 4314 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
4320 if (nid != -1) { 4315 if (nid != -1) {
4321 last_start_pfn = start_pfn; 4316 last_start_pfn = start_pfn;
4322 last_end_pfn = end_pfn; 4317 last_end_pfn = end_pfn;
4323 last_nid = nid; 4318 last_nid = nid;
4324 } 4319 }
4325 4320
4326 return nid; 4321 return nid;
4327 } 4322 }
4328 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4323 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
4329 4324
4330 int __meminit early_pfn_to_nid(unsigned long pfn) 4325 int __meminit early_pfn_to_nid(unsigned long pfn)
4331 { 4326 {
4332 int nid; 4327 int nid;
4333 4328
4334 nid = __early_pfn_to_nid(pfn); 4329 nid = __early_pfn_to_nid(pfn);
4335 if (nid >= 0) 4330 if (nid >= 0)
4336 return nid; 4331 return nid;
4337 /* just returns 0 */ 4332 /* just returns 0 */
4338 return 0; 4333 return 0;
4339 } 4334 }
4340 4335
4341 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4336 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4342 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4337 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4343 { 4338 {
4344 int nid; 4339 int nid;
4345 4340
4346 nid = __early_pfn_to_nid(pfn); 4341 nid = __early_pfn_to_nid(pfn);
4347 if (nid >= 0 && nid != node) 4342 if (nid >= 0 && nid != node)
4348 return false; 4343 return false;
4349 return true; 4344 return true;
4350 } 4345 }
4351 #endif 4346 #endif
4352 4347
4353 /** 4348 /**
4354 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4349 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4355 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4350 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4356 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4351 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4357 * 4352 *
4358 * If an architecture guarantees that all ranges registered with 4353 * If an architecture guarantees that all ranges registered with
4359 * add_active_ranges() contain no holes and may be freed, this 4354 * add_active_ranges() contain no holes and may be freed, this
4360 * function may be used instead of calling free_bootmem() manually. 4355 * function may be used instead of calling free_bootmem() manually.
4361 */ 4356 */
4362 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4357 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4363 { 4358 {
4364 unsigned long start_pfn, end_pfn; 4359 unsigned long start_pfn, end_pfn;
4365 int i, this_nid; 4360 int i, this_nid;
4366 4361
4367 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4362 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4368 start_pfn = min(start_pfn, max_low_pfn); 4363 start_pfn = min(start_pfn, max_low_pfn);
4369 end_pfn = min(end_pfn, max_low_pfn); 4364 end_pfn = min(end_pfn, max_low_pfn);
4370 4365
4371 if (start_pfn < end_pfn) 4366 if (start_pfn < end_pfn)
4372 free_bootmem_node(NODE_DATA(this_nid), 4367 free_bootmem_node(NODE_DATA(this_nid),
4373 PFN_PHYS(start_pfn), 4368 PFN_PHYS(start_pfn),
4374 (end_pfn - start_pfn) << PAGE_SHIFT); 4369 (end_pfn - start_pfn) << PAGE_SHIFT);
4375 } 4370 }
4376 } 4371 }
4377 4372
4378 /** 4373 /**
4379 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4374 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4380 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4375 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4381 * 4376 *
4382 * If an architecture guarantees that all ranges registered with 4377 * If an architecture guarantees that all ranges registered with
4383 * add_active_ranges() contain no holes and may be freed, this 4378 * add_active_ranges() contain no holes and may be freed, this
4384 * function may be used instead of calling memory_present() manually. 4379 * function may be used instead of calling memory_present() manually.
4385 */ 4380 */
4386 void __init sparse_memory_present_with_active_regions(int nid) 4381 void __init sparse_memory_present_with_active_regions(int nid)
4387 { 4382 {
4388 unsigned long start_pfn, end_pfn; 4383 unsigned long start_pfn, end_pfn;
4389 int i, this_nid; 4384 int i, this_nid;
4390 4385
4391 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4386 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4392 memory_present(this_nid, start_pfn, end_pfn); 4387 memory_present(this_nid, start_pfn, end_pfn);
4393 } 4388 }
4394 4389
4395 /** 4390 /**
4396 * get_pfn_range_for_nid - Return the start and end page frames for a node 4391 * get_pfn_range_for_nid - Return the start and end page frames for a node
4397 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4392 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4398 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4393 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4399 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4394 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4400 * 4395 *
4401 * It returns the start and end page frame of a node based on information 4396 * It returns the start and end page frame of a node based on information
4402 * provided by an arch calling add_active_range(). If called for a node 4397 * provided by an arch calling add_active_range(). If called for a node
4403 * with no available memory, a warning is printed and the start and end 4398 * with no available memory, a warning is printed and the start and end
4404 * PFNs will be 0. 4399 * PFNs will be 0.
4405 */ 4400 */
4406 void __meminit get_pfn_range_for_nid(unsigned int nid, 4401 void __meminit get_pfn_range_for_nid(unsigned int nid,
4407 unsigned long *start_pfn, unsigned long *end_pfn) 4402 unsigned long *start_pfn, unsigned long *end_pfn)
4408 { 4403 {
4409 unsigned long this_start_pfn, this_end_pfn; 4404 unsigned long this_start_pfn, this_end_pfn;
4410 int i; 4405 int i;
4411 4406
4412 *start_pfn = -1UL; 4407 *start_pfn = -1UL;
4413 *end_pfn = 0; 4408 *end_pfn = 0;
4414 4409
4415 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4410 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4416 *start_pfn = min(*start_pfn, this_start_pfn); 4411 *start_pfn = min(*start_pfn, this_start_pfn);
4417 *end_pfn = max(*end_pfn, this_end_pfn); 4412 *end_pfn = max(*end_pfn, this_end_pfn);
4418 } 4413 }
4419 4414
4420 if (*start_pfn == -1UL) 4415 if (*start_pfn == -1UL)
4421 *start_pfn = 0; 4416 *start_pfn = 0;
4422 } 4417 }
4423 4418
4424 /* 4419 /*
4425 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4420 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4426 * assumption is made that zones within a node are ordered in monotonic 4421 * assumption is made that zones within a node are ordered in monotonic
4427 * increasing memory addresses so that the "highest" populated zone is used 4422 * increasing memory addresses so that the "highest" populated zone is used
4428 */ 4423 */
4429 static void __init find_usable_zone_for_movable(void) 4424 static void __init find_usable_zone_for_movable(void)
4430 { 4425 {
4431 int zone_index; 4426 int zone_index;
4432 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4427 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4433 if (zone_index == ZONE_MOVABLE) 4428 if (zone_index == ZONE_MOVABLE)
4434 continue; 4429 continue;
4435 4430
4436 if (arch_zone_highest_possible_pfn[zone_index] > 4431 if (arch_zone_highest_possible_pfn[zone_index] >
4437 arch_zone_lowest_possible_pfn[zone_index]) 4432 arch_zone_lowest_possible_pfn[zone_index])
4438 break; 4433 break;
4439 } 4434 }
4440 4435
4441 VM_BUG_ON(zone_index == -1); 4436 VM_BUG_ON(zone_index == -1);
4442 movable_zone = zone_index; 4437 movable_zone = zone_index;
4443 } 4438 }
4444 4439
4445 /* 4440 /*
4446 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4441 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4447 * because it is sized independent of architecture. Unlike the other zones, 4442 * because it is sized independent of architecture. Unlike the other zones,
4448 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4443 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4449 * in each node depending on the size of each node and how evenly kernelcore 4444 * in each node depending on the size of each node and how evenly kernelcore
4450 * is distributed. This helper function adjusts the zone ranges 4445 * is distributed. This helper function adjusts the zone ranges
4451 * provided by the architecture for a given node by using the end of the 4446 * provided by the architecture for a given node by using the end of the
4452 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4447 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4453 * zones within a node are in order of monotonically increasing memory addresses 4448 * zones within a node are in order of monotonically increasing memory addresses
4454 */ 4449 */
4455 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4450 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4456 unsigned long zone_type, 4451 unsigned long zone_type,
4457 unsigned long node_start_pfn, 4452 unsigned long node_start_pfn,
4458 unsigned long node_end_pfn, 4453 unsigned long node_end_pfn,
4459 unsigned long *zone_start_pfn, 4454 unsigned long *zone_start_pfn,
4460 unsigned long *zone_end_pfn) 4455 unsigned long *zone_end_pfn)
4461 { 4456 {
4462 /* Only adjust if ZONE_MOVABLE is on this node */ 4457 /* Only adjust if ZONE_MOVABLE is on this node */
4463 if (zone_movable_pfn[nid]) { 4458 if (zone_movable_pfn[nid]) {
4464 /* Size ZONE_MOVABLE */ 4459 /* Size ZONE_MOVABLE */
4465 if (zone_type == ZONE_MOVABLE) { 4460 if (zone_type == ZONE_MOVABLE) {
4466 *zone_start_pfn = zone_movable_pfn[nid]; 4461 *zone_start_pfn = zone_movable_pfn[nid];
4467 *zone_end_pfn = min(node_end_pfn, 4462 *zone_end_pfn = min(node_end_pfn,
4468 arch_zone_highest_possible_pfn[movable_zone]); 4463 arch_zone_highest_possible_pfn[movable_zone]);
4469 4464
4470 /* Adjust for ZONE_MOVABLE starting within this range */ 4465 /* Adjust for ZONE_MOVABLE starting within this range */
4471 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4466 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4472 *zone_end_pfn > zone_movable_pfn[nid]) { 4467 *zone_end_pfn > zone_movable_pfn[nid]) {
4473 *zone_end_pfn = zone_movable_pfn[nid]; 4468 *zone_end_pfn = zone_movable_pfn[nid];
4474 4469
4475 /* Check if this whole range is within ZONE_MOVABLE */ 4470 /* Check if this whole range is within ZONE_MOVABLE */
4476 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4471 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4477 *zone_start_pfn = *zone_end_pfn; 4472 *zone_start_pfn = *zone_end_pfn;
4478 } 4473 }
4479 } 4474 }
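A worked example of the three cases above (all pfns hypothetical): with a node spanning pfns 0..1,048,576 and zone_movable_pfn[nid] = 786,432, ZONE_MOVABLE itself gets 786,432..1,048,576 (further capped by the highest usable zone); a zone originally spanning 524,288..917,504 is truncated to end at 786,432; and a zone starting at or above 786,432 collapses to empty because its start is set equal to its end.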
4480 4475
4481 /* 4476 /*
4482 * Return the number of pages a zone spans in a node, including holes 4477 * Return the number of pages a zone spans in a node, including holes
4483 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4478 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4484 */ 4479 */
4485 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4480 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4486 unsigned long zone_type, 4481 unsigned long zone_type,
4487 unsigned long node_start_pfn, 4482 unsigned long node_start_pfn,
4488 unsigned long node_end_pfn, 4483 unsigned long node_end_pfn,
4489 unsigned long *ignored) 4484 unsigned long *ignored)
4490 { 4485 {
4491 unsigned long zone_start_pfn, zone_end_pfn; 4486 unsigned long zone_start_pfn, zone_end_pfn;
4492 4487
4493 /* Get the start and end of the zone */ 4488 /* Get the start and end of the zone */
4494 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4489 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4495 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4490 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4496 adjust_zone_range_for_zone_movable(nid, zone_type, 4491 adjust_zone_range_for_zone_movable(nid, zone_type,
4497 node_start_pfn, node_end_pfn, 4492 node_start_pfn, node_end_pfn,
4498 &zone_start_pfn, &zone_end_pfn); 4493 &zone_start_pfn, &zone_end_pfn);
4499 4494
4500 /* Check that this node has pages within the zone's required range */ 4495 /* Check that this node has pages within the zone's required range */
4501 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4496 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4502 return 0; 4497 return 0;
4503 4498
4504 /* Move the zone boundaries inside the node if necessary */ 4499 /* Move the zone boundaries inside the node if necessary */
4505 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4500 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4506 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4501 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4507 4502
4508 /* Return the spanned pages */ 4503 /* Return the spanned pages */
4509 return zone_end_pfn - zone_start_pfn; 4504 return zone_end_pfn - zone_start_pfn;
4510 } 4505 }
4511 4506
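A standalone illustration of the clamping above with example PFNs (the ZONE_MOVABLE adjustment is left out here):

#include <stdio.h>

/* Mirrors the overlap check and clipping in zone_spanned_pages_in_node(). */
static unsigned long spanned(unsigned long zone_lo, unsigned long zone_hi,
                             unsigned long node_lo, unsigned long node_hi)
{
    if (zone_hi < node_lo || zone_lo > node_hi)   /* zone and node do not overlap */
        return 0;
    if (zone_hi > node_hi) zone_hi = node_hi;     /* clip the zone to the node */
    if (zone_lo < node_lo) zone_lo = node_lo;
    return zone_hi - zone_lo;
}

int main(void)
{
    printf("%lu\n", spanned(0x1000, 0x100000, 0x80000, 0xc0000));   /* 0x40000 pages */
    printf("%lu\n", spanned(0x1000, 0x100000, 0x200000, 0x240000)); /* 0: node lies above zone */
    return 0;
}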
4512 /* 4507 /*
4513 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4508 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4514 * then all holes in the requested range will be accounted for. 4509 * then all holes in the requested range will be accounted for.
4515 */ 4510 */
4516 unsigned long __meminit __absent_pages_in_range(int nid, 4511 unsigned long __meminit __absent_pages_in_range(int nid,
4517 unsigned long range_start_pfn, 4512 unsigned long range_start_pfn,
4518 unsigned long range_end_pfn) 4513 unsigned long range_end_pfn)
4519 { 4514 {
4520 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4515 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4521 unsigned long start_pfn, end_pfn; 4516 unsigned long start_pfn, end_pfn;
4522 int i; 4517 int i;
4523 4518
4524 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4519 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4525 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4520 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4526 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4521 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4527 nr_absent -= end_pfn - start_pfn; 4522 nr_absent -= end_pfn - start_pfn;
4528 } 4523 }
4529 return nr_absent; 4524 return nr_absent;
4530 } 4525 }
4531 4526
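The subtraction above counts holes by starting from the full span and removing every present range after clamping it to the query window. A standalone illustration with made-up ranges (not the memblock iterator itself):

#include <stdio.h>

struct range { unsigned long start, end; };      /* [start, end) present memory, in PFNs */

static unsigned long clampv(unsigned long v, unsigned long lo, unsigned long hi)
{
    return v < lo ? lo : (v > hi ? hi : v);
}

/* Mirrors __absent_pages_in_range(): holes = span - sum of clamped present ranges. */
static unsigned long absent_pages(const struct range *r, int n,
                                  unsigned long range_start, unsigned long range_end)
{
    unsigned long nr_absent = range_end - range_start;
    for (int i = 0; i < n; i++) {
        unsigned long s = clampv(r[i].start, range_start, range_end);
        unsigned long e = clampv(r[i].end,   range_start, range_end);
        nr_absent -= e - s;
    }
    return nr_absent;
}

int main(void)
{
    /* Two present ranges with a hole between them (example values). */
    struct range mem[] = { { 0x1000, 0x4000 }, { 0x6000, 0x8000 } };
    printf("holes in [0x1000, 0x8000): %lu pages\n",
           absent_pages(mem, 2, 0x1000, 0x8000));   /* 0x2000 pages */
    return 0;
}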
4532 /** 4527 /**
4533 * absent_pages_in_range - Return number of page frames in holes within a range 4528 * absent_pages_in_range - Return number of page frames in holes within a range
4534 * @start_pfn: The start PFN to start searching for holes 4529 * @start_pfn: The start PFN to start searching for holes
4535 * @end_pfn: The end PFN to stop searching for holes 4530 * @end_pfn: The end PFN to stop searching for holes
4536 * 4531 *
4537 * It returns the number of page frames in memory holes within a range. 4532 * It returns the number of page frames in memory holes within a range.
4538 */ 4533 */
4539 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4534 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4540 unsigned long end_pfn) 4535 unsigned long end_pfn)
4541 { 4536 {
4542 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4537 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4543 } 4538 }
4544 4539
4545 /* Return the number of page frames in holes in a zone on a node */ 4540 /* Return the number of page frames in holes in a zone on a node */
4546 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4541 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4547 unsigned long zone_type, 4542 unsigned long zone_type,
4548 unsigned long node_start_pfn, 4543 unsigned long node_start_pfn,
4549 unsigned long node_end_pfn, 4544 unsigned long node_end_pfn,
4550 unsigned long *ignored) 4545 unsigned long *ignored)
4551 { 4546 {
4552 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4547 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4553 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4548 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4554 unsigned long zone_start_pfn, zone_end_pfn; 4549 unsigned long zone_start_pfn, zone_end_pfn;
4555 4550
4556 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4551 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4557 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4552 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4558 4553
4559 adjust_zone_range_for_zone_movable(nid, zone_type, 4554 adjust_zone_range_for_zone_movable(nid, zone_type,
4560 node_start_pfn, node_end_pfn, 4555 node_start_pfn, node_end_pfn,
4561 &zone_start_pfn, &zone_end_pfn); 4556 &zone_start_pfn, &zone_end_pfn);
4562 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4557 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4563 } 4558 }
4564 4559
4565 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4560 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4566 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4561 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4567 unsigned long zone_type, 4562 unsigned long zone_type,
4568 unsigned long node_start_pfn, 4563 unsigned long node_start_pfn,
4569 unsigned long node_end_pfn, 4564 unsigned long node_end_pfn,
4570 unsigned long *zones_size) 4565 unsigned long *zones_size)
4571 { 4566 {
4572 return zones_size[zone_type]; 4567 return zones_size[zone_type];
4573 } 4568 }
4574 4569
4575 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4570 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4576 unsigned long zone_type, 4571 unsigned long zone_type,
4577 unsigned long node_start_pfn, 4572 unsigned long node_start_pfn,
4578 unsigned long node_end_pfn, 4573 unsigned long node_end_pfn,
4579 unsigned long *zholes_size) 4574 unsigned long *zholes_size)
4580 { 4575 {
4581 if (!zholes_size) 4576 if (!zholes_size)
4582 return 0; 4577 return 0;
4583 4578
4584 return zholes_size[zone_type]; 4579 return zholes_size[zone_type];
4585 } 4580 }
4586 4581
4587 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4582 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4588 4583
4589 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4584 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4590 unsigned long node_start_pfn, 4585 unsigned long node_start_pfn,
4591 unsigned long node_end_pfn, 4586 unsigned long node_end_pfn,
4592 unsigned long *zones_size, 4587 unsigned long *zones_size,
4593 unsigned long *zholes_size) 4588 unsigned long *zholes_size)
4594 { 4589 {
4595 unsigned long realtotalpages, totalpages = 0; 4590 unsigned long realtotalpages, totalpages = 0;
4596 enum zone_type i; 4591 enum zone_type i;
4597 4592
4598 for (i = 0; i < MAX_NR_ZONES; i++) 4593 for (i = 0; i < MAX_NR_ZONES; i++)
4599 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4594 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4600 node_start_pfn, 4595 node_start_pfn,
4601 node_end_pfn, 4596 node_end_pfn,
4602 zones_size); 4597 zones_size);
4603 pgdat->node_spanned_pages = totalpages; 4598 pgdat->node_spanned_pages = totalpages;
4604 4599
4605 realtotalpages = totalpages; 4600 realtotalpages = totalpages;
4606 for (i = 0; i < MAX_NR_ZONES; i++) 4601 for (i = 0; i < MAX_NR_ZONES; i++)
4607 realtotalpages -= 4602 realtotalpages -=
4608 zone_absent_pages_in_node(pgdat->node_id, i, 4603 zone_absent_pages_in_node(pgdat->node_id, i,
4609 node_start_pfn, node_end_pfn, 4604 node_start_pfn, node_end_pfn,
4610 zholes_size); 4605 zholes_size);
4611 pgdat->node_present_pages = realtotalpages; 4606 pgdat->node_present_pages = realtotalpages;
4612 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4607 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4613 realtotalpages); 4608 realtotalpages);
4614 } 4609 }
4615 4610
4616 #ifndef CONFIG_SPARSEMEM 4611 #ifndef CONFIG_SPARSEMEM
4617 /* 4612 /*
4618 * Calculate the size of the zone->blockflags rounded to an unsigned long 4613 * Calculate the size of the zone->blockflags rounded to an unsigned long
4619 * Start by making sure zonesize is a multiple of pageblock_order by rounding 4614 * Start by making sure zonesize is a multiple of pageblock_order by rounding
4620 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 4615 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
4621 * round what is now in bits to nearest long in bits, then return it in 4616 * round what is now in bits to nearest long in bits, then return it in
4622 * bytes. 4617 * bytes.
4623 */ 4618 */
4624 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 4619 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4625 { 4620 {
4626 unsigned long usemapsize; 4621 unsigned long usemapsize;
4627 4622
4628 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 4623 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4629 usemapsize = roundup(zonesize, pageblock_nr_pages); 4624 usemapsize = roundup(zonesize, pageblock_nr_pages);
4630 usemapsize = usemapsize >> pageblock_order; 4625 usemapsize = usemapsize >> pageblock_order;
4631 usemapsize *= NR_PAGEBLOCK_BITS; 4626 usemapsize *= NR_PAGEBLOCK_BITS;
4632 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4627 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4633 4628
4634 return usemapsize / 8; 4629 return usemapsize / 8;
4635 } 4630 }
4636 4631
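To sanity-check the arithmetic above, here is a standalone version with assumed values (pageblock_order = 9, so 512 pages per pageblock, and NR_PAGEBLOCK_BITS = 4); the real constants come from the kernel configuration:

#include <stdio.h>

/* Example values only; the real ones come from the kernel configuration. */
#define PAGEBLOCK_ORDER     9
#define PAGEBLOCK_NR_PAGES  (1UL << PAGEBLOCK_ORDER)   /* 512 */
#define NR_PAGEBLOCK_BITS   4
#define BITS_PER_LONG_EX    (8 * sizeof(unsigned long))

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
    return ((x + to - 1) / to) * to;
}

static unsigned long usemap_size_example(unsigned long zone_start_pfn, unsigned long zonesize)
{
    unsigned long usemapsize;

    zonesize += zone_start_pfn & (PAGEBLOCK_NR_PAGES - 1);   /* account for unaligned start */
    usemapsize = roundup_ul(zonesize, PAGEBLOCK_NR_PAGES);   /* whole pageblocks */
    usemapsize >>= PAGEBLOCK_ORDER;                          /* number of pageblocks */
    usemapsize *= NR_PAGEBLOCK_BITS;                         /* bits needed */
    usemapsize = roundup_ul(usemapsize, BITS_PER_LONG_EX);   /* round to whole longs */
    return usemapsize / 8;                                   /* bytes */
}

int main(void)
{
    /* 1 GiB zone of 4 KiB pages starting on a pageblock boundary. */
    printf("%lu bytes\n", usemap_size_example(0, 262144));   /* 512 pageblocks * 4 bits = 256 bytes */
    return 0;
}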
4637 static void __init setup_usemap(struct pglist_data *pgdat, 4632 static void __init setup_usemap(struct pglist_data *pgdat,
4638 struct zone *zone, 4633 struct zone *zone,
4639 unsigned long zone_start_pfn, 4634 unsigned long zone_start_pfn,
4640 unsigned long zonesize) 4635 unsigned long zonesize)
4641 { 4636 {
4642 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4637 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4643 zone->pageblock_flags = NULL; 4638 zone->pageblock_flags = NULL;
4644 if (usemapsize) 4639 if (usemapsize)
4645 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4640 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4646 usemapsize); 4641 usemapsize);
4647 } 4642 }
4648 #else 4643 #else
4649 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4644 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4650 unsigned long zone_start_pfn, unsigned long zonesize) {} 4645 unsigned long zone_start_pfn, unsigned long zonesize) {}
4651 #endif /* CONFIG_SPARSEMEM */ 4646 #endif /* CONFIG_SPARSEMEM */
4652 4647
4653 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4648 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4654 4649
4655 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4650 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4656 void __paginginit set_pageblock_order(void) 4651 void __paginginit set_pageblock_order(void)
4657 { 4652 {
4658 unsigned int order; 4653 unsigned int order;
4659 4654
4660 /* Check that pageblock_nr_pages has not already been set up */ 4655 /* Check that pageblock_nr_pages has not already been set up */
4661 if (pageblock_order) 4656 if (pageblock_order)
4662 return; 4657 return;
4663 4658
4664 if (HPAGE_SHIFT > PAGE_SHIFT) 4659 if (HPAGE_SHIFT > PAGE_SHIFT)
4665 order = HUGETLB_PAGE_ORDER; 4660 order = HUGETLB_PAGE_ORDER;
4666 else 4661 else
4667 order = MAX_ORDER - 1; 4662 order = MAX_ORDER - 1;
4668 4663
4669 /* 4664 /*
4670 * Assume the largest contiguous order of interest is a huge page. 4665 * Assume the largest contiguous order of interest is a huge page.
4671 * This value may be variable depending on boot parameters on IA64 and 4666 * This value may be variable depending on boot parameters on IA64 and
4672 * powerpc. 4667 * powerpc.
4673 */ 4668 */
4674 pageblock_order = order; 4669 pageblock_order = order;
4675 } 4670 }
4676 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4671 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4677 4672
4678 /* 4673 /*
4679 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4674 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4680 * is unused as pageblock_order is set at compile-time. See 4675 * is unused as pageblock_order is set at compile-time. See
4681 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4676 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4682 * the kernel config 4677 * the kernel config
4683 */ 4678 */
4684 void __paginginit set_pageblock_order(void) 4679 void __paginginit set_pageblock_order(void)
4685 { 4680 {
4686 } 4681 }
4687 4682
4688 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4683 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4689 4684
4690 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 4685 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4691 unsigned long present_pages) 4686 unsigned long present_pages)
4692 { 4687 {
4693 unsigned long pages = spanned_pages; 4688 unsigned long pages = spanned_pages;
4694 4689
4695 /* 4690 /*
4696 * Provide a more accurate estimation if there are holes within 4691 * Provide a more accurate estimation if there are holes within
4697 * the zone and SPARSEMEM is in use. If there are holes within the 4692 * the zone and SPARSEMEM is in use. If there are holes within the
4698 * zone, each populated memory region may cost us one or two extra 4693 * zone, each populated memory region may cost us one or two extra
4699 * memmap pages due to alignment because memmap pages for each 4694 * memmap pages due to alignment because memmap pages for each
4700 * populated region may not be naturally aligned on a page boundary. 4695 * populated region may not be naturally aligned on a page boundary.
4701 * So the (present_pages >> 4) heuristic is a tradeoff for that. 4696 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4702 */ 4697 */
4703 if (spanned_pages > present_pages + (present_pages >> 4) && 4698 if (spanned_pages > present_pages + (present_pages >> 4) &&
4704 IS_ENABLED(CONFIG_SPARSEMEM)) 4699 IS_ENABLED(CONFIG_SPARSEMEM))
4705 pages = present_pages; 4700 pages = present_pages;
4706 4701
4707 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 4702 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4708 } 4703 }
4709 4704
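A standalone sketch of the heuristic above, assuming 4 KiB pages and a 64-byte struct page (illustrative values only): when the holes exceed roughly present_pages/16 under SPARSEMEM, the estimate is based on present_pages instead of spanned_pages.

#include <stdio.h>

#define PAGE_SHIFT_EX  12                      /* assumed 4 KiB pages */
#define PAGE_SIZE_EX   (1UL << PAGE_SHIFT_EX)
#define STRUCT_PAGE_SZ 64UL                    /* illustrative sizeof(struct page) */

/* Mirrors calc_memmap_size(): fall back to present_pages when the zone is very sparse. */
static unsigned long memmap_size(unsigned long spanned, unsigned long present, int sparsemem)
{
    unsigned long pages = spanned;

    if (sparsemem && spanned > present + (present >> 4))
        pages = present;                       /* holes > present/16: count only populated memory */

    /* PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT */
    return (pages * STRUCT_PAGE_SZ + PAGE_SIZE_EX - 1) / PAGE_SIZE_EX;
}

int main(void)
{
    /* A zone spanning 1M pages but with only half of them actually present. */
    printf("dense estimate:  %lu memmap pages\n", memmap_size(1 << 20, 1 << 19, 0));  /* 16384 */
    printf("sparse estimate: %lu memmap pages\n", memmap_size(1 << 20, 1 << 19, 1));  /* 8192 */
    return 0;
}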
4710 /* 4705 /*
4711 * Set up the zone data structures: 4706 * Set up the zone data structures:
4712 * - mark all pages reserved 4707 * - mark all pages reserved
4713 * - mark all memory queues empty 4708 * - mark all memory queues empty
4714 * - clear the memory bitmaps 4709 * - clear the memory bitmaps
4715 * 4710 *
4716 * NOTE: pgdat should get zeroed by caller. 4711 * NOTE: pgdat should get zeroed by caller.
4717 */ 4712 */
4718 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4713 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4719 unsigned long node_start_pfn, unsigned long node_end_pfn, 4714 unsigned long node_start_pfn, unsigned long node_end_pfn,
4720 unsigned long *zones_size, unsigned long *zholes_size) 4715 unsigned long *zones_size, unsigned long *zholes_size)
4721 { 4716 {
4722 enum zone_type j; 4717 enum zone_type j;
4723 int nid = pgdat->node_id; 4718 int nid = pgdat->node_id;
4724 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4719 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4725 int ret; 4720 int ret;
4726 4721
4727 pgdat_resize_init(pgdat); 4722 pgdat_resize_init(pgdat);
4728 #ifdef CONFIG_NUMA_BALANCING 4723 #ifdef CONFIG_NUMA_BALANCING
4729 spin_lock_init(&pgdat->numabalancing_migrate_lock); 4724 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4730 pgdat->numabalancing_migrate_nr_pages = 0; 4725 pgdat->numabalancing_migrate_nr_pages = 0;
4731 pgdat->numabalancing_migrate_next_window = jiffies; 4726 pgdat->numabalancing_migrate_next_window = jiffies;
4732 #endif 4727 #endif
4733 init_waitqueue_head(&pgdat->kswapd_wait); 4728 init_waitqueue_head(&pgdat->kswapd_wait);
4734 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4729 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4735 pgdat_page_cgroup_init(pgdat); 4730 pgdat_page_cgroup_init(pgdat);
4736 4731
4737 for (j = 0; j < MAX_NR_ZONES; j++) { 4732 for (j = 0; j < MAX_NR_ZONES; j++) {
4738 struct zone *zone = pgdat->node_zones + j; 4733 struct zone *zone = pgdat->node_zones + j;
4739 unsigned long size, realsize, freesize, memmap_pages; 4734 unsigned long size, realsize, freesize, memmap_pages;
4740 4735
4741 size = zone_spanned_pages_in_node(nid, j, node_start_pfn, 4736 size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
4742 node_end_pfn, zones_size); 4737 node_end_pfn, zones_size);
4743 realsize = freesize = size - zone_absent_pages_in_node(nid, j, 4738 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4744 node_start_pfn, 4739 node_start_pfn,
4745 node_end_pfn, 4740 node_end_pfn,
4746 zholes_size); 4741 zholes_size);
4747 4742
4748 /* 4743 /*
4749 * Adjust freesize so that it accounts for how much memory 4744 * Adjust freesize so that it accounts for how much memory
4750 * is used by this zone for memmap. This affects the watermark 4745 * is used by this zone for memmap. This affects the watermark
4751 * and per-cpu initialisations 4746 * and per-cpu initialisations
4752 */ 4747 */
4753 memmap_pages = calc_memmap_size(size, realsize); 4748 memmap_pages = calc_memmap_size(size, realsize);
4754 if (freesize >= memmap_pages) { 4749 if (freesize >= memmap_pages) {
4755 freesize -= memmap_pages; 4750 freesize -= memmap_pages;
4756 if (memmap_pages) 4751 if (memmap_pages)
4757 printk(KERN_DEBUG 4752 printk(KERN_DEBUG
4758 " %s zone: %lu pages used for memmap\n", 4753 " %s zone: %lu pages used for memmap\n",
4759 zone_names[j], memmap_pages); 4754 zone_names[j], memmap_pages);
4760 } else 4755 } else
4761 printk(KERN_WARNING 4756 printk(KERN_WARNING
4762 " %s zone: %lu pages exceeds freesize %lu\n", 4757 " %s zone: %lu pages exceeds freesize %lu\n",
4763 zone_names[j], memmap_pages, freesize); 4758 zone_names[j], memmap_pages, freesize);
4764 4759
4765 /* Account for reserved pages */ 4760 /* Account for reserved pages */
4766 if (j == 0 && freesize > dma_reserve) { 4761 if (j == 0 && freesize > dma_reserve) {
4767 freesize -= dma_reserve; 4762 freesize -= dma_reserve;
4768 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4763 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4769 zone_names[0], dma_reserve); 4764 zone_names[0], dma_reserve);
4770 } 4765 }
4771 4766
4772 if (!is_highmem_idx(j)) 4767 if (!is_highmem_idx(j))
4773 nr_kernel_pages += freesize; 4768 nr_kernel_pages += freesize;
4774 /* Charge for highmem memmap if there are enough kernel pages */ 4769 /* Charge for highmem memmap if there are enough kernel pages */
4775 else if (nr_kernel_pages > memmap_pages * 2) 4770 else if (nr_kernel_pages > memmap_pages * 2)
4776 nr_kernel_pages -= memmap_pages; 4771 nr_kernel_pages -= memmap_pages;
4777 nr_all_pages += freesize; 4772 nr_all_pages += freesize;
4778 4773
4779 zone->spanned_pages = size; 4774 zone->spanned_pages = size;
4780 zone->present_pages = realsize; 4775 zone->present_pages = realsize;
4781 /* 4776 /*
4782 * Set an approximate value for lowmem here; it will be adjusted 4777 * Set an approximate value for lowmem here; it will be adjusted
4783 * when the bootmem allocator frees pages into the buddy system. 4778 * when the bootmem allocator frees pages into the buddy system.
4784 * And all highmem pages will be managed by the buddy system. 4779 * And all highmem pages will be managed by the buddy system.
4785 */ 4780 */
4786 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; 4781 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4787 #ifdef CONFIG_NUMA 4782 #ifdef CONFIG_NUMA
4788 zone->node = nid; 4783 zone->node = nid;
4789 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) 4784 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4790 / 100; 4785 / 100;
4791 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; 4786 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4792 #endif 4787 #endif
4793 zone->name = zone_names[j]; 4788 zone->name = zone_names[j];
4794 spin_lock_init(&zone->lock); 4789 spin_lock_init(&zone->lock);
4795 spin_lock_init(&zone->lru_lock); 4790 spin_lock_init(&zone->lru_lock);
4796 zone_seqlock_init(zone); 4791 zone_seqlock_init(zone);
4797 zone->zone_pgdat = pgdat; 4792 zone->zone_pgdat = pgdat;
4798 zone_pcp_init(zone); 4793 zone_pcp_init(zone);
4799 4794
4800 /* For bootup, initialized properly in watermark setup */ 4795 /* For bootup, initialized properly in watermark setup */
4801 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); 4796 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
4802 4797
4803 lruvec_init(&zone->lruvec); 4798 lruvec_init(&zone->lruvec);
4804 if (!size) 4799 if (!size)
4805 continue; 4800 continue;
4806 4801
4807 set_pageblock_order(); 4802 set_pageblock_order();
4808 setup_usemap(pgdat, zone, zone_start_pfn, size); 4803 setup_usemap(pgdat, zone, zone_start_pfn, size);
4809 ret = init_currently_empty_zone(zone, zone_start_pfn, 4804 ret = init_currently_empty_zone(zone, zone_start_pfn,
4810 size, MEMMAP_EARLY); 4805 size, MEMMAP_EARLY);
4811 BUG_ON(ret); 4806 BUG_ON(ret);
4812 memmap_init(size, nid, j, zone_start_pfn); 4807 memmap_init(size, nid, j, zone_start_pfn);
4813 zone_start_pfn += size; 4808 zone_start_pfn += size;
4814 } 4809 }
4815 } 4810 }
4816 4811
4817 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4812 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4818 { 4813 {
4819 /* Skip empty nodes */ 4814 /* Skip empty nodes */
4820 if (!pgdat->node_spanned_pages) 4815 if (!pgdat->node_spanned_pages)
4821 return; 4816 return;
4822 4817
4823 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4818 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4824 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4819 /* ia64 gets its own node_mem_map, before this, without bootmem */
4825 if (!pgdat->node_mem_map) { 4820 if (!pgdat->node_mem_map) {
4826 unsigned long size, start, end; 4821 unsigned long size, start, end;
4827 struct page *map; 4822 struct page *map;
4828 4823
4829 /* 4824 /*
4830 * The zone's endpoints aren't required to be MAX_ORDER 4825 * The zone's endpoints aren't required to be MAX_ORDER
4831 * aligned but the node_mem_map endpoints must be in order 4826 * aligned but the node_mem_map endpoints must be in order
4832 * for the buddy allocator to function correctly. 4827 * for the buddy allocator to function correctly.
4833 */ 4828 */
4834 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4829 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4835 end = pgdat_end_pfn(pgdat); 4830 end = pgdat_end_pfn(pgdat);
4836 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4831 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4837 size = (end - start) * sizeof(struct page); 4832 size = (end - start) * sizeof(struct page);
4838 map = alloc_remap(pgdat->node_id, size); 4833 map = alloc_remap(pgdat->node_id, size);
4839 if (!map) 4834 if (!map)
4840 map = alloc_bootmem_node_nopanic(pgdat, size); 4835 map = alloc_bootmem_node_nopanic(pgdat, size);
4841 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4836 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4842 } 4837 }
4843 #ifndef CONFIG_NEED_MULTIPLE_NODES 4838 #ifndef CONFIG_NEED_MULTIPLE_NODES
4844 /* 4839 /*
4845 * With no DISCONTIG, the global mem_map is just set as node 0's 4840 * With no DISCONTIG, the global mem_map is just set as node 0's
4846 */ 4841 */
4847 if (pgdat == NODE_DATA(0)) { 4842 if (pgdat == NODE_DATA(0)) {
4848 mem_map = NODE_DATA(0)->node_mem_map; 4843 mem_map = NODE_DATA(0)->node_mem_map;
4849 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4844 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4850 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4845 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4851 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4846 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4852 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4847 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4853 } 4848 }
4854 #endif 4849 #endif
4855 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4850 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4856 } 4851 }
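The start/end rounding above can be checked with ordinary arithmetic. The sketch below is standalone and assumes MAX_ORDER_NR_PAGES of 1024 and a 64-byte struct page purely for illustration:

#include <stdio.h>

#define MAX_ORDER_NR_PAGES_EX 1024UL   /* example value only */
#define STRUCT_PAGE_SZ        64UL     /* illustrative sizeof(struct page) */

int main(void)
{
    /* A node whose span is not MAX_ORDER aligned (example PFNs). */
    unsigned long node_start_pfn = 0x10203, node_end_pfn = 0x20801;

    unsigned long start = node_start_pfn & ~(MAX_ORDER_NR_PAGES_EX - 1);  /* round down */
    unsigned long end   = (node_end_pfn + MAX_ORDER_NR_PAGES_EX - 1) &
                          ~(MAX_ORDER_NR_PAGES_EX - 1);                   /* ALIGN() up */
    unsigned long size  = (end - start) * STRUCT_PAGE_SZ;

    printf("mem_map covers [%#lx, %#lx), %lu bytes\n", start, end, size);
    /* node_mem_map then points at map + (node_start_pfn - start). */
    return 0;
}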
4857 4852
4858 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4853 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4859 unsigned long node_start_pfn, unsigned long *zholes_size) 4854 unsigned long node_start_pfn, unsigned long *zholes_size)
4860 { 4855 {
4861 pg_data_t *pgdat = NODE_DATA(nid); 4856 pg_data_t *pgdat = NODE_DATA(nid);
4862 unsigned long start_pfn = 0; 4857 unsigned long start_pfn = 0;
4863 unsigned long end_pfn = 0; 4858 unsigned long end_pfn = 0;
4864 4859
4865 /* pg_data_t should be reset to zero when it's allocated */ 4860 /* pg_data_t should be reset to zero when it's allocated */
4866 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 4861 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4867 4862
4868 pgdat->node_id = nid; 4863 pgdat->node_id = nid;
4869 pgdat->node_start_pfn = node_start_pfn; 4864 pgdat->node_start_pfn = node_start_pfn;
4870 init_zone_allows_reclaim(nid); 4865 init_zone_allows_reclaim(nid);
4871 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4866 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4872 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 4867 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4873 #endif 4868 #endif
4874 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 4869 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4875 zones_size, zholes_size); 4870 zones_size, zholes_size);
4876 4871
4877 alloc_node_mem_map(pgdat); 4872 alloc_node_mem_map(pgdat);
4878 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4873 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4879 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4874 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4880 nid, (unsigned long)pgdat, 4875 nid, (unsigned long)pgdat,
4881 (unsigned long)pgdat->node_mem_map); 4876 (unsigned long)pgdat->node_mem_map);
4882 #endif 4877 #endif
4883 4878
4884 free_area_init_core(pgdat, start_pfn, end_pfn, 4879 free_area_init_core(pgdat, start_pfn, end_pfn,
4885 zones_size, zholes_size); 4880 zones_size, zholes_size);
4886 } 4881 }
4887 4882
4888 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4883 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4889 4884
4890 #if MAX_NUMNODES > 1 4885 #if MAX_NUMNODES > 1
4891 /* 4886 /*
4892 * Figure out the number of possible node ids. 4887 * Figure out the number of possible node ids.
4893 */ 4888 */
4894 void __init setup_nr_node_ids(void) 4889 void __init setup_nr_node_ids(void)
4895 { 4890 {
4896 unsigned int node; 4891 unsigned int node;
4897 unsigned int highest = 0; 4892 unsigned int highest = 0;
4898 4893
4899 for_each_node_mask(node, node_possible_map) 4894 for_each_node_mask(node, node_possible_map)
4900 highest = node; 4895 highest = node;
4901 nr_node_ids = highest + 1; 4896 nr_node_ids = highest + 1;
4902 } 4897 }
4903 #endif 4898 #endif
4904 4899
4905 /** 4900 /**
4906 * node_map_pfn_alignment - determine the maximum internode alignment 4901 * node_map_pfn_alignment - determine the maximum internode alignment
4907 * 4902 *
4908 * This function should be called after node map is populated and sorted. 4903 * This function should be called after node map is populated and sorted.
4909 * It calculates the maximum power of two alignment which can distinguish 4904 * It calculates the maximum power of two alignment which can distinguish
4910 * all the nodes. 4905 * all the nodes.
4911 * 4906 *
4912 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4907 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4913 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4908 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4914 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 4909 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4915 * shifted, 1GiB is enough and this function will indicate so. 4910 * shifted, 1GiB is enough and this function will indicate so.
4916 * 4911 *
4917 * This is used to test whether pfn -> nid mapping of the chosen memory 4912 * This is used to test whether pfn -> nid mapping of the chosen memory
4918 * model has fine enough granularity to avoid incorrect mapping for the 4913 * model has fine enough granularity to avoid incorrect mapping for the
4919 * populated node map. 4914 * populated node map.
4920 * 4915 *
4921 * Returns the determined alignment in PFNs. 0 if there is no alignment 4916 * Returns the determined alignment in PFNs. 0 if there is no alignment
4922 * requirement (single node). 4917 * requirement (single node).
4923 */ 4918 */
4924 unsigned long __init node_map_pfn_alignment(void) 4919 unsigned long __init node_map_pfn_alignment(void)
4925 { 4920 {
4926 unsigned long accl_mask = 0, last_end = 0; 4921 unsigned long accl_mask = 0, last_end = 0;
4927 unsigned long start, end, mask; 4922 unsigned long start, end, mask;
4928 int last_nid = -1; 4923 int last_nid = -1;
4929 int i, nid; 4924 int i, nid;
4930 4925
4931 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 4926 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
4932 if (!start || last_nid < 0 || last_nid == nid) { 4927 if (!start || last_nid < 0 || last_nid == nid) {
4933 last_nid = nid; 4928 last_nid = nid;
4934 last_end = end; 4929 last_end = end;
4935 continue; 4930 continue;
4936 } 4931 }
4937 4932
4938 /* 4933 /*
4939 * Start with a mask granular enough to pin-point to the 4934 * Start with a mask granular enough to pin-point to the
4940 * start pfn and tick off bits one-by-one until it becomes 4935 * start pfn and tick off bits one-by-one until it becomes
4941 * too coarse to separate the current node from the last. 4936 * too coarse to separate the current node from the last.
4942 */ 4937 */
4943 mask = ~((1 << __ffs(start)) - 1); 4938 mask = ~((1 << __ffs(start)) - 1);
4944 while (mask && last_end <= (start & (mask << 1))) 4939 while (mask && last_end <= (start & (mask << 1)))
4945 mask <<= 1; 4940 mask <<= 1;
4946 4941
4947 /* accumulate all internode masks */ 4942 /* accumulate all internode masks */
4948 accl_mask |= mask; 4943 accl_mask |= mask;
4949 } 4944 }
4950 4945
4951 /* convert mask to number of pages */ 4946 /* convert mask to number of pages */
4952 return ~accl_mask + 1; 4947 return ~accl_mask + 1;
4953 } 4948 }
4954 4949
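Putting numbers on the example in the comment above, with an assumed PAGE_SHIFT of 12 (4 KiB pages):

#include <stdio.h>

#define PAGE_SHIFT_EX 12   /* assumed 4 KiB pages */

int main(void)
{
    /* All nodes 1 GiB-sized and 1 GiB-aligned: 1 GiB granularity is enough. */
    printf("1 GiB alignment   = %lu pfns\n", 1UL << (30 - PAGE_SHIFT_EX));   /* 262144 */

    /* Nodes shifted by 256 MiB: pfn -> nid mapping must resolve 256 MiB granules. */
    printf("256 MiB alignment = %lu pfns\n", 1UL << (28 - PAGE_SHIFT_EX));   /* 65536 */
    return 0;
}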
4955 /* Find the lowest pfn for a node */ 4950 /* Find the lowest pfn for a node */
4956 static unsigned long __init find_min_pfn_for_node(int nid) 4951 static unsigned long __init find_min_pfn_for_node(int nid)
4957 { 4952 {
4958 unsigned long min_pfn = ULONG_MAX; 4953 unsigned long min_pfn = ULONG_MAX;
4959 unsigned long start_pfn; 4954 unsigned long start_pfn;
4960 int i; 4955 int i;
4961 4956
4962 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 4957 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
4963 min_pfn = min(min_pfn, start_pfn); 4958 min_pfn = min(min_pfn, start_pfn);
4964 4959
4965 if (min_pfn == ULONG_MAX) { 4960 if (min_pfn == ULONG_MAX) {
4966 printk(KERN_WARNING 4961 printk(KERN_WARNING
4967 "Could not find start_pfn for node %d\n", nid); 4962 "Could not find start_pfn for node %d\n", nid);
4968 return 0; 4963 return 0;
4969 } 4964 }
4970 4965
4971 return min_pfn; 4966 return min_pfn;
4972 } 4967 }
4973 4968
4974 /** 4969 /**
4975 * find_min_pfn_with_active_regions - Find the minimum PFN registered 4970 * find_min_pfn_with_active_regions - Find the minimum PFN registered
4976 * 4971 *
4977 * It returns the minimum PFN based on information provided via 4972 * It returns the minimum PFN based on information provided via
4978 * add_active_range(). 4973 * add_active_range().
4979 */ 4974 */
4980 unsigned long __init find_min_pfn_with_active_regions(void) 4975 unsigned long __init find_min_pfn_with_active_regions(void)
4981 { 4976 {
4982 return find_min_pfn_for_node(MAX_NUMNODES); 4977 return find_min_pfn_for_node(MAX_NUMNODES);
4983 } 4978 }
4984 4979
4985 /* 4980 /*
4986 * early_calculate_totalpages() 4981 * early_calculate_totalpages()
4987 * Sum pages in active regions for movable zone. 4982 * Sum pages in active regions for movable zone.
4988 * Populate N_MEMORY for calculating usable_nodes. 4983 * Populate N_MEMORY for calculating usable_nodes.
4989 */ 4984 */
4990 static unsigned long __init early_calculate_totalpages(void) 4985 static unsigned long __init early_calculate_totalpages(void)
4991 { 4986 {
4992 unsigned long totalpages = 0; 4987 unsigned long totalpages = 0;
4993 unsigned long start_pfn, end_pfn; 4988 unsigned long start_pfn, end_pfn;
4994 int i, nid; 4989 int i, nid;
4995 4990
4996 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 4991 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
4997 unsigned long pages = end_pfn - start_pfn; 4992 unsigned long pages = end_pfn - start_pfn;
4998 4993
4999 totalpages += pages; 4994 totalpages += pages;
5000 if (pages) 4995 if (pages)
5001 node_set_state(nid, N_MEMORY); 4996 node_set_state(nid, N_MEMORY);
5002 } 4997 }
5003 return totalpages; 4998 return totalpages;
5004 } 4999 }
5005 5000
5006 /* 5001 /*
5007 * Find the PFN the Movable zone begins in each node. Kernel memory 5002 * Find the PFN the Movable zone begins in each node. Kernel memory
5008 * is spread evenly between nodes as long as the nodes have enough 5003 * is spread evenly between nodes as long as the nodes have enough
5009 * memory. When they don't, some nodes will have more kernelcore than 5004 * memory. When they don't, some nodes will have more kernelcore than
5010 * others 5005 * others
5011 */ 5006 */
5012 static void __init find_zone_movable_pfns_for_nodes(void) 5007 static void __init find_zone_movable_pfns_for_nodes(void)
5013 { 5008 {
5014 int i, nid; 5009 int i, nid;
5015 unsigned long usable_startpfn; 5010 unsigned long usable_startpfn;
5016 unsigned long kernelcore_node, kernelcore_remaining; 5011 unsigned long kernelcore_node, kernelcore_remaining;
5017 /* save the state before borrowing the nodemask */ 5012 /* save the state before borrowing the nodemask */
5018 nodemask_t saved_node_state = node_states[N_MEMORY]; 5013 nodemask_t saved_node_state = node_states[N_MEMORY];
5019 unsigned long totalpages = early_calculate_totalpages(); 5014 unsigned long totalpages = early_calculate_totalpages();
5020 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5015 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
5021 5016
5022 /* 5017 /*
5023 * If movablecore was specified, calculate the size of 5018 * If movablecore was specified, calculate the size of
5024 * kernelcore that corresponds so that memory usable for 5019 * kernelcore that corresponds so that memory usable for
5025 * any allocation type is evenly spread. If both kernelcore 5020 * any allocation type is evenly spread. If both kernelcore
5026 * and movablecore are specified, then the value of kernelcore 5021 * and movablecore are specified, then the value of kernelcore
5027 * will be used for required_kernelcore if it's greater than 5022 * will be used for required_kernelcore if it's greater than
5028 * what movablecore would have allowed. 5023 * what movablecore would have allowed.
5029 */ 5024 */
5030 if (required_movablecore) { 5025 if (required_movablecore) {
5031 unsigned long corepages; 5026 unsigned long corepages;
5032 5027
5033 /* 5028 /*
5034 * Round-up so that ZONE_MOVABLE is at least as large as what 5029 * Round-up so that ZONE_MOVABLE is at least as large as what
5035 * was requested by the user 5030 * was requested by the user
5036 */ 5031 */
5037 required_movablecore = 5032 required_movablecore =
5038 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 5033 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
5039 corepages = totalpages - required_movablecore; 5034 corepages = totalpages - required_movablecore;
5040 5035
5041 required_kernelcore = max(required_kernelcore, corepages); 5036 required_kernelcore = max(required_kernelcore, corepages);
5042 } 5037 }
5043 5038
5044 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 5039 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
5045 if (!required_kernelcore) 5040 if (!required_kernelcore)
5046 goto out; 5041 goto out;
5047 5042
5048 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5043 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
5049 find_usable_zone_for_movable(); 5044 find_usable_zone_for_movable();
5050 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5045 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
5051 5046
5052 restart: 5047 restart:
5053 /* Spread kernelcore memory as evenly as possible throughout nodes */ 5048 /* Spread kernelcore memory as evenly as possible throughout nodes */
5054 kernelcore_node = required_kernelcore / usable_nodes; 5049 kernelcore_node = required_kernelcore / usable_nodes;
5055 for_each_node_state(nid, N_MEMORY) { 5050 for_each_node_state(nid, N_MEMORY) {
5056 unsigned long start_pfn, end_pfn; 5051 unsigned long start_pfn, end_pfn;
5057 5052
5058 /* 5053 /*
5059 * Recalculate kernelcore_node if the division per node 5054 * Recalculate kernelcore_node if the division per node
5060 * now exceeds what is necessary to satisfy the requested 5055 * now exceeds what is necessary to satisfy the requested
5061 * amount of memory for the kernel 5056 * amount of memory for the kernel
5062 */ 5057 */
5063 if (required_kernelcore < kernelcore_node) 5058 if (required_kernelcore < kernelcore_node)
5064 kernelcore_node = required_kernelcore / usable_nodes; 5059 kernelcore_node = required_kernelcore / usable_nodes;
5065 5060
5066 /* 5061 /*
5067 * As the map is walked, we track how much memory is usable 5062 * As the map is walked, we track how much memory is usable
5068 * by the kernel using kernelcore_remaining. When it is 5063 * by the kernel using kernelcore_remaining. When it is
5069 * 0, the rest of the node is usable by ZONE_MOVABLE 5064 * 0, the rest of the node is usable by ZONE_MOVABLE
5070 */ 5065 */
5071 kernelcore_remaining = kernelcore_node; 5066 kernelcore_remaining = kernelcore_node;
5072 5067
5073 /* Go through each range of PFNs within this node */ 5068 /* Go through each range of PFNs within this node */
5074 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 5069 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
5075 unsigned long size_pages; 5070 unsigned long size_pages;
5076 5071
5077 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 5072 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
5078 if (start_pfn >= end_pfn) 5073 if (start_pfn >= end_pfn)
5079 continue; 5074 continue;
5080 5075
5081 /* Account for what is only usable for kernelcore */ 5076 /* Account for what is only usable for kernelcore */
5082 if (start_pfn < usable_startpfn) { 5077 if (start_pfn < usable_startpfn) {
5083 unsigned long kernel_pages; 5078 unsigned long kernel_pages;
5084 kernel_pages = min(end_pfn, usable_startpfn) 5079 kernel_pages = min(end_pfn, usable_startpfn)
5085 - start_pfn; 5080 - start_pfn;
5086 5081
5087 kernelcore_remaining -= min(kernel_pages, 5082 kernelcore_remaining -= min(kernel_pages,
5088 kernelcore_remaining); 5083 kernelcore_remaining);
5089 required_kernelcore -= min(kernel_pages, 5084 required_kernelcore -= min(kernel_pages,
5090 required_kernelcore); 5085 required_kernelcore);
5091 5086
5092 /* Continue if range is now fully accounted */ 5087 /* Continue if range is now fully accounted */
5093 if (end_pfn <= usable_startpfn) { 5088 if (end_pfn <= usable_startpfn) {
5094 5089
5095 /* 5090 /*
5096 * Push zone_movable_pfn to the end so 5091 * Push zone_movable_pfn to the end so
5097 * that if we have to rebalance 5092 * that if we have to rebalance
5098 * kernelcore across nodes, we will 5093 * kernelcore across nodes, we will
5099 * not double account here 5094 * not double account here
5100 */ 5095 */
5101 zone_movable_pfn[nid] = end_pfn; 5096 zone_movable_pfn[nid] = end_pfn;
5102 continue; 5097 continue;
5103 } 5098 }
5104 start_pfn = usable_startpfn; 5099 start_pfn = usable_startpfn;
5105 } 5100 }
5106 5101
5107 /* 5102 /*
5108 * The usable PFN range for ZONE_MOVABLE is from 5103 * The usable PFN range for ZONE_MOVABLE is from
5109 * start_pfn->end_pfn. Calculate size_pages as the 5104 * start_pfn->end_pfn. Calculate size_pages as the
5110 * number of pages used as kernelcore 5105 * number of pages used as kernelcore
5111 */ 5106 */
5112 size_pages = end_pfn - start_pfn; 5107 size_pages = end_pfn - start_pfn;
5113 if (size_pages > kernelcore_remaining) 5108 if (size_pages > kernelcore_remaining)
5114 size_pages = kernelcore_remaining; 5109 size_pages = kernelcore_remaining;
5115 zone_movable_pfn[nid] = start_pfn + size_pages; 5110 zone_movable_pfn[nid] = start_pfn + size_pages;
5116 5111
5117 /* 5112 /*
5118 * Some kernelcore has been met, update counts and 5113 * Some kernelcore has been met, update counts and
5119 * break if the kernelcore for this node has been 5114 * break if the kernelcore for this node has been
5120 * satisfied 5115 * satisfied
5121 */ 5116 */
5122 required_kernelcore -= min(required_kernelcore, 5117 required_kernelcore -= min(required_kernelcore,
5123 size_pages); 5118 size_pages);
5124 kernelcore_remaining -= size_pages; 5119 kernelcore_remaining -= size_pages;
5125 if (!kernelcore_remaining) 5120 if (!kernelcore_remaining)
5126 break; 5121 break;
5127 } 5122 }
5128 } 5123 }
5129 5124
5130 /* 5125 /*
5131 * If there is still required_kernelcore, we do another pass with one 5126 * If there is still required_kernelcore, we do another pass with one
5132 * less node in the count. This will push zone_movable_pfn[nid] further 5127 * less node in the count. This will push zone_movable_pfn[nid] further
5133 * along on the nodes that still have memory until kernelcore is 5128 * along on the nodes that still have memory until kernelcore is
5134 * satisfied 5129 * satisfied
5135 */ 5130 */
5136 usable_nodes--; 5131 usable_nodes--;
5137 if (usable_nodes && required_kernelcore > usable_nodes) 5132 if (usable_nodes && required_kernelcore > usable_nodes)
5138 goto restart; 5133 goto restart;
5139 5134
5140 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5135 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5141 for (nid = 0; nid < MAX_NUMNODES; nid++) 5136 for (nid = 0; nid < MAX_NUMNODES; nid++)
5142 zone_movable_pfn[nid] = 5137 zone_movable_pfn[nid] =
5143 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 5138 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
5144 5139
5145 out: 5140 out:
5146 /* restore the node_state */ 5141 /* restore the node_state */
5147 node_states[N_MEMORY] = saved_node_state; 5142 node_states[N_MEMORY] = saved_node_state;
5148 } 5143 }
5149 5144
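The kernelcore spreading above involves several passes and corner cases; the standalone sketch below (example values, one contiguous range per node, every node large enough, no restart pass, no MAX_ORDER rounding) only illustrates the basic even split that determines where ZONE_MOVABLE begins on each node:

#include <stdio.h>

/* Greatly simplified: each node is one contiguous range, all of it above the
 * lowest PFN ZONE_MOVABLE may use, and every node has enough memory, so no
 * restart pass is needed.  Example values only. */
struct node_range { unsigned long start_pfn, end_pfn; };

int main(void)
{
    struct node_range nodes[2] = { { 0x00000, 0x40000 },    /* node 0: 1 GiB of 4 KiB pages */
                                   { 0x40000, 0x80000 } };  /* node 1: 1 GiB */
    unsigned long required_kernelcore = 0x20000;            /* 512 MiB requested as kernelcore */
    unsigned long kernelcore_node = required_kernelcore / 2;/* spread evenly over 2 nodes */

    for (int nid = 0; nid < 2; nid++) {
        unsigned long zone_movable_pfn = nodes[nid].start_pfn + kernelcore_node;
        printf("node %d: kernel pages [%#lx, %#lx), ZONE_MOVABLE from %#lx\n",
               nid, nodes[nid].start_pfn, zone_movable_pfn, zone_movable_pfn);
    }
    return 0;
}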
5150 /* Any regular or high memory on that node? */ 5145 /* Any regular or high memory on that node? */
5151 static void check_for_memory(pg_data_t *pgdat, int nid) 5146 static void check_for_memory(pg_data_t *pgdat, int nid)
5152 { 5147 {
5153 enum zone_type zone_type; 5148 enum zone_type zone_type;
5154 5149
5155 if (N_MEMORY == N_NORMAL_MEMORY) 5150 if (N_MEMORY == N_NORMAL_MEMORY)
5156 return; 5151 return;
5157 5152
5158 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 5153 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
5159 struct zone *zone = &pgdat->node_zones[zone_type]; 5154 struct zone *zone = &pgdat->node_zones[zone_type];
5160 if (populated_zone(zone)) { 5155 if (populated_zone(zone)) {
5161 node_set_state(nid, N_HIGH_MEMORY); 5156 node_set_state(nid, N_HIGH_MEMORY);
5162 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 5157 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
5163 zone_type <= ZONE_NORMAL) 5158 zone_type <= ZONE_NORMAL)
5164 node_set_state(nid, N_NORMAL_MEMORY); 5159 node_set_state(nid, N_NORMAL_MEMORY);
5165 break; 5160 break;
5166 } 5161 }
5167 } 5162 }
5168 } 5163 }
5169 5164
5170 /** 5165 /**
5171 * free_area_init_nodes - Initialise all pg_data_t and zone data 5166 * free_area_init_nodes - Initialise all pg_data_t and zone data
5172 * @max_zone_pfn: an array of max PFNs for each zone 5167 * @max_zone_pfn: an array of max PFNs for each zone
5173 * 5168 *
5174 * This will call free_area_init_node() for each active node in the system. 5169 * This will call free_area_init_node() for each active node in the system.
5175 * Using the page ranges provided by add_active_range(), the size of each 5170 * Using the page ranges provided by add_active_range(), the size of each
5176 * zone in each node and their holes are calculated. If the maximum PFNs 5171 * zone in each node and their holes are calculated. If the maximum PFNs
5177 * of two adjacent zones match, it is assumed that the zone is empty. 5172 * of two adjacent zones match, it is assumed that the zone is empty.
5178 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 5173 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
5179 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 5174 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
5180 * starts where the previous one ended. For example, ZONE_DMA32 starts 5175 * starts where the previous one ended. For example, ZONE_DMA32 starts
5181 * at arch_max_dma_pfn. 5176 * at arch_max_dma_pfn.
5182 */ 5177 */
5183 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 5178 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5184 { 5179 {
5185 unsigned long start_pfn, end_pfn; 5180 unsigned long start_pfn, end_pfn;
5186 int i, nid; 5181 int i, nid;
5187 5182
5188 /* Record where the zone boundaries are */ 5183 /* Record where the zone boundaries are */
5189 memset(arch_zone_lowest_possible_pfn, 0, 5184 memset(arch_zone_lowest_possible_pfn, 0,
5190 sizeof(arch_zone_lowest_possible_pfn)); 5185 sizeof(arch_zone_lowest_possible_pfn));
5191 memset(arch_zone_highest_possible_pfn, 0, 5186 memset(arch_zone_highest_possible_pfn, 0,
5192 sizeof(arch_zone_highest_possible_pfn)); 5187 sizeof(arch_zone_highest_possible_pfn));
5193 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 5188 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
5194 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 5189 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
5195 for (i = 1; i < MAX_NR_ZONES; i++) { 5190 for (i = 1; i < MAX_NR_ZONES; i++) {
5196 if (i == ZONE_MOVABLE) 5191 if (i == ZONE_MOVABLE)
5197 continue; 5192 continue;
5198 arch_zone_lowest_possible_pfn[i] = 5193 arch_zone_lowest_possible_pfn[i] =
5199 arch_zone_highest_possible_pfn[i-1]; 5194 arch_zone_highest_possible_pfn[i-1];
5200 arch_zone_highest_possible_pfn[i] = 5195 arch_zone_highest_possible_pfn[i] =
5201 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 5196 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
5202 } 5197 }
5203 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 5198 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
5204 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 5199 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
5205 5200
5206 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5201 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
5207 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5202 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
5208 find_zone_movable_pfns_for_nodes(); 5203 find_zone_movable_pfns_for_nodes();
5209 5204
5210 /* Print out the zone ranges */ 5205 /* Print out the zone ranges */
5211 printk("Zone ranges:\n"); 5206 printk("Zone ranges:\n");
5212 for (i = 0; i < MAX_NR_ZONES; i++) { 5207 for (i = 0; i < MAX_NR_ZONES; i++) {
5213 if (i == ZONE_MOVABLE) 5208 if (i == ZONE_MOVABLE)
5214 continue; 5209 continue;
5215 printk(KERN_CONT " %-8s ", zone_names[i]); 5210 printk(KERN_CONT " %-8s ", zone_names[i]);
5216 if (arch_zone_lowest_possible_pfn[i] == 5211 if (arch_zone_lowest_possible_pfn[i] ==
5217 arch_zone_highest_possible_pfn[i]) 5212 arch_zone_highest_possible_pfn[i])
5218 printk(KERN_CONT "empty\n"); 5213 printk(KERN_CONT "empty\n");
5219 else 5214 else
5220 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5215 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
5221 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5216 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5222 (arch_zone_highest_possible_pfn[i] 5217 (arch_zone_highest_possible_pfn[i]
5223 << PAGE_SHIFT) - 1); 5218 << PAGE_SHIFT) - 1);
5224 } 5219 }
5225 5220
5226 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5221 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5227 printk("Movable zone start for each node\n"); 5222 printk("Movable zone start for each node\n");
5228 for (i = 0; i < MAX_NUMNODES; i++) { 5223 for (i = 0; i < MAX_NUMNODES; i++) {
5229 if (zone_movable_pfn[i]) 5224 if (zone_movable_pfn[i])
5230 printk(" Node %d: %#010lx\n", i, 5225 printk(" Node %d: %#010lx\n", i,
5231 zone_movable_pfn[i] << PAGE_SHIFT); 5226 zone_movable_pfn[i] << PAGE_SHIFT);
5232 } 5227 }
5233 5228
5234 /* Print out the early node map */ 5229 /* Print out the early node map */
5235 printk("Early memory node ranges\n"); 5230 printk("Early memory node ranges\n");
5236 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5231 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5237 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5232 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5238 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5233 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5239 5234
5240 /* Initialise every node */ 5235 /* Initialise every node */
5241 mminit_verify_pageflags_layout(); 5236 mminit_verify_pageflags_layout();
5242 setup_nr_node_ids(); 5237 setup_nr_node_ids();
5243 for_each_online_node(nid) { 5238 for_each_online_node(nid) {
5244 pg_data_t *pgdat = NODE_DATA(nid); 5239 pg_data_t *pgdat = NODE_DATA(nid);
5245 free_area_init_node(nid, NULL, 5240 free_area_init_node(nid, NULL,
5246 find_min_pfn_for_node(nid), NULL); 5241 find_min_pfn_for_node(nid), NULL);
5247 5242
5248 /* Any memory on that node */ 5243 /* Any memory on that node */
5249 if (pgdat->node_present_pages) 5244 if (pgdat->node_present_pages)
5250 node_set_state(nid, N_MEMORY); 5245 node_set_state(nid, N_MEMORY);
5251 check_for_memory(pgdat, nid); 5246 check_for_memory(pgdat, nid);
5252 } 5247 }
5253 } 5248 }
5254 5249
5255 static int __init cmdline_parse_core(char *p, unsigned long *core) 5250 static int __init cmdline_parse_core(char *p, unsigned long *core)
5256 { 5251 {
5257 unsigned long long coremem; 5252 unsigned long long coremem;
5258 if (!p) 5253 if (!p)
5259 return -EINVAL; 5254 return -EINVAL;
5260 5255
5261 coremem = memparse(p, &p); 5256 coremem = memparse(p, &p);
5262 *core = coremem >> PAGE_SHIFT; 5257 *core = coremem >> PAGE_SHIFT;
5263 5258
5264 /* Paranoid check that UL is enough for the coremem value */ 5259 /* Paranoid check that UL is enough for the coremem value */
5265 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 5260 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
5266 5261
5267 return 0; 5262 return 0;
5268 } 5263 }
5269 5264
5270 /* 5265 /*
5271 * kernelcore=size sets the amount of memory for use for allocations that 5266 * kernelcore=size sets the amount of memory for use for allocations that
5272 * cannot be reclaimed or migrated. 5267 * cannot be reclaimed or migrated.
5273 */ 5268 */
5274 static int __init cmdline_parse_kernelcore(char *p) 5269 static int __init cmdline_parse_kernelcore(char *p)
5275 { 5270 {
5276 return cmdline_parse_core(p, &required_kernelcore); 5271 return cmdline_parse_core(p, &required_kernelcore);
5277 } 5272 }
5278 5273
5279 /* 5274 /*
5280 * movablecore=size sets the amount of memory for use for allocations that 5275 * movablecore=size sets the amount of memory for use for allocations that
5281 * can be reclaimed or migrated. 5276 * can be reclaimed or migrated.
5282 */ 5277 */
5283 static int __init cmdline_parse_movablecore(char *p) 5278 static int __init cmdline_parse_movablecore(char *p)
5284 { 5279 {
5285 return cmdline_parse_core(p, &required_movablecore); 5280 return cmdline_parse_core(p, &required_movablecore);
5286 } 5281 }
5287 5282
5288 early_param("kernelcore", cmdline_parse_kernelcore); 5283 early_param("kernelcore", cmdline_parse_kernelcore);
5289 early_param("movablecore", cmdline_parse_movablecore); 5284 early_param("movablecore", cmdline_parse_movablecore);
5290 5285
5291 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5286 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
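Both parameters above take a memparse()-style size on the kernel command line (plain bytes or a K/M/G suffix), which cmdline_parse_core() converts to pages by shifting right by PAGE_SHIFT. The sizes below are arbitrary examples, not recommendations:

    kernelcore=512M    reserve roughly 512 MiB, spread across nodes, for unmovable allocations
    movablecore=4G     request roughly 4 GiB of ZONE_MOVABLE; the corresponding kernelcore is derived from it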
5292 5287
5293 void adjust_managed_page_count(struct page *page, long count) 5288 void adjust_managed_page_count(struct page *page, long count)
5294 { 5289 {
5295 spin_lock(&managed_page_count_lock); 5290 spin_lock(&managed_page_count_lock);
5296 page_zone(page)->managed_pages += count; 5291 page_zone(page)->managed_pages += count;
5297 totalram_pages += count; 5292 totalram_pages += count;
5298 #ifdef CONFIG_HIGHMEM 5293 #ifdef CONFIG_HIGHMEM
5299 if (PageHighMem(page)) 5294 if (PageHighMem(page))
5300 totalhigh_pages += count; 5295 totalhigh_pages += count;
5301 #endif 5296 #endif
5302 spin_unlock(&managed_page_count_lock); 5297 spin_unlock(&managed_page_count_lock);
5303 } 5298 }
5304 EXPORT_SYMBOL(adjust_managed_page_count); 5299 EXPORT_SYMBOL(adjust_managed_page_count);
5305 5300
5306 unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 5301 unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
5307 { 5302 {
5308 void *pos; 5303 void *pos;
5309 unsigned long pages = 0; 5304 unsigned long pages = 0;
5310 5305
5311 start = (void *)PAGE_ALIGN((unsigned long)start); 5306 start = (void *)PAGE_ALIGN((unsigned long)start);
5312 end = (void *)((unsigned long)end & PAGE_MASK); 5307 end = (void *)((unsigned long)end & PAGE_MASK);
5313 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 5308 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5314 if ((unsigned int)poison <= 0xFF) 5309 if ((unsigned int)poison <= 0xFF)
5315 memset(pos, poison, PAGE_SIZE); 5310 memset(pos, poison, PAGE_SIZE);
5316 free_reserved_page(virt_to_page(pos)); 5311 free_reserved_page(virt_to_page(pos));
5317 } 5312 }
5318 5313
5319 if (pages && s) 5314 if (pages && s)
5320 pr_info("Freeing %s memory: %ldK (%p - %p)\n", 5315 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
5321 s, pages << (PAGE_SHIFT - 10), start, end); 5316 s, pages << (PAGE_SHIFT - 10), start, end);
5322 5317
5323 return pages; 5318 return pages;
5324 } 5319 }
5325 EXPORT_SYMBOL(free_reserved_area); 5320 EXPORT_SYMBOL(free_reserved_area);
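As a usage sketch (call shape only, not a standalone program): a caller hands free_reserved_area() the virtual bounds of a reserved region. Per the code above, a poison value in 0x00-0xFF memsets each page first, a negative value (e.g. -1) skips the poisoning, and the label shows up in the "Freeing %s memory" line. The region symbols below are hypothetical.

/* Hypothetical region symbols; the call shape matches the signature above. */
extern char region_start[], region_end[];

static void release_boot_scratch(void)
{
    unsigned long freed;

    /* Poison with 0xcc, return the pages to the buddy allocator, and log
     * them under the "boot scratch" label. */
    freed = free_reserved_area(region_start, region_end, 0xcc, "boot scratch");
    (void)freed;    /* number of pages actually freed */
}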
5326 5321
5327 #ifdef CONFIG_HIGHMEM 5322 #ifdef CONFIG_HIGHMEM
5328 void free_highmem_page(struct page *page) 5323 void free_highmem_page(struct page *page)
5329 { 5324 {
5330 __free_reserved_page(page); 5325 __free_reserved_page(page);
5331 totalram_pages++; 5326 totalram_pages++;
5332 page_zone(page)->managed_pages++; 5327 page_zone(page)->managed_pages++;
5333 totalhigh_pages++; 5328 totalhigh_pages++;
5334 } 5329 }
5335 #endif 5330 #endif
5336 5331
5337 5332
5338 void __init mem_init_print_info(const char *str) 5333 void __init mem_init_print_info(const char *str)
5339 { 5334 {
5340 unsigned long physpages, codesize, datasize, rosize, bss_size; 5335 unsigned long physpages, codesize, datasize, rosize, bss_size;
5341 unsigned long init_code_size, init_data_size; 5336 unsigned long init_code_size, init_data_size;
5342 5337
5343 physpages = get_num_physpages(); 5338 physpages = get_num_physpages();
5344 codesize = _etext - _stext; 5339 codesize = _etext - _stext;
5345 datasize = _edata - _sdata; 5340 datasize = _edata - _sdata;
5346 rosize = __end_rodata - __start_rodata; 5341 rosize = __end_rodata - __start_rodata;
5347 bss_size = __bss_stop - __bss_start; 5342 bss_size = __bss_stop - __bss_start;
5348 init_data_size = __init_end - __init_begin; 5343 init_data_size = __init_end - __init_begin;
5349 init_code_size = _einittext - _sinittext; 5344 init_code_size = _einittext - _sinittext;
5350 5345
5351 /* 5346 /*
5352 * Detect special cases and adjust section sizes accordingly: 5347 * Detect special cases and adjust section sizes accordingly:
5353 * 1) .init.* may be embedded into .data sections 5348 * 1) .init.* may be embedded into .data sections
5354 * 2) .init.text.* may be out of [__init_begin, __init_end], 5349 * 2) .init.text.* may be out of [__init_begin, __init_end],
5355 * please refer to arch/tile/kernel/vmlinux.lds.S. 5350 * please refer to arch/tile/kernel/vmlinux.lds.S.
5356 * 3) .rodata.* may be embedded into .text or .data sections. 5351 * 3) .rodata.* may be embedded into .text or .data sections.
5357 */ 5352 */
5358 #define adj_init_size(start, end, size, pos, adj) \ 5353 #define adj_init_size(start, end, size, pos, adj) \
5359 do { \ 5354 do { \
5360 if (start <= pos && pos < end && size > adj) \ 5355 if (start <= pos && pos < end && size > adj) \
5361 size -= adj; \ 5356 size -= adj; \
5362 } while (0) 5357 } while (0)
5363 5358
5364 adj_init_size(__init_begin, __init_end, init_data_size, 5359 adj_init_size(__init_begin, __init_end, init_data_size,
5365 _sinittext, init_code_size); 5360 _sinittext, init_code_size);
5366 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 5361 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5367 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 5362 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5368 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 5363 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5369 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 5364 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5370 5365
5371 #undef adj_init_size 5366 #undef adj_init_size
5372 5367
5373 printk("Memory: %luK/%luK available " 5368 printk("Memory: %luK/%luK available "
5374 "(%luK kernel code, %luK rwdata, %luK rodata, " 5369 "(%luK kernel code, %luK rwdata, %luK rodata, "
5375 "%luK init, %luK bss, %luK reserved" 5370 "%luK init, %luK bss, %luK reserved"
5376 #ifdef CONFIG_HIGHMEM 5371 #ifdef CONFIG_HIGHMEM
5377 ", %luK highmem" 5372 ", %luK highmem"
5378 #endif 5373 #endif
5379 "%s%s)\n", 5374 "%s%s)\n",
5380 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), 5375 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5381 codesize >> 10, datasize >> 10, rosize >> 10, 5376 codesize >> 10, datasize >> 10, rosize >> 10,
5382 (init_data_size + init_code_size) >> 10, bss_size >> 10, 5377 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5383 (physpages - totalram_pages) << (PAGE_SHIFT-10), 5378 (physpages - totalram_pages) << (PAGE_SHIFT-10),
5384 #ifdef CONFIG_HIGHMEM 5379 #ifdef CONFIG_HIGHMEM
5385 totalhigh_pages << (PAGE_SHIFT-10), 5380 totalhigh_pages << (PAGE_SHIFT-10),
5386 #endif 5381 #endif
5387 str ? ", " : "", str ? str : ""); 5382 str ? ", " : "", str ? str : "");
5388 } 5383 }
5389 5384
5390 /** 5385 /**
5391 * set_dma_reserve - set the specified number of pages reserved in the first zone 5386 * set_dma_reserve - set the specified number of pages reserved in the first zone
5392 * @new_dma_reserve: The number of pages to mark reserved 5387 * @new_dma_reserve: The number of pages to mark reserved
5393 * 5388 *
5394 * The per-cpu batchsize and zone watermarks are determined by present_pages. 5389 * The per-cpu batchsize and zone watermarks are determined by present_pages.
5395 * In the DMA zone, a significant percentage may be consumed by kernel image 5390 * In the DMA zone, a significant percentage may be consumed by kernel image
5396 * and other unfreeable allocations which can skew the watermarks badly. This 5391 * and other unfreeable allocations which can skew the watermarks badly. This
5397 * function may optionally be used to account for unfreeable pages in the 5392 * function may optionally be used to account for unfreeable pages in the
5398 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 5393 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
5399 * smaller per-cpu batchsize. 5394 * smaller per-cpu batchsize.
5400 */ 5395 */
5401 void __init set_dma_reserve(unsigned long new_dma_reserve) 5396 void __init set_dma_reserve(unsigned long new_dma_reserve)
5402 { 5397 {
5403 dma_reserve = new_dma_reserve; 5398 dma_reserve = new_dma_reserve;
5404 } 5399 }
5405 5400
5406 void __init free_area_init(unsigned long *zones_size) 5401 void __init free_area_init(unsigned long *zones_size)
5407 { 5402 {
5408 free_area_init_node(0, zones_size, 5403 free_area_init_node(0, zones_size,
5409 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 5404 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
5410 } 5405 }
5411 5406
5412 static int page_alloc_cpu_notify(struct notifier_block *self, 5407 static int page_alloc_cpu_notify(struct notifier_block *self,
5413 unsigned long action, void *hcpu) 5408 unsigned long action, void *hcpu)
5414 { 5409 {
5415 int cpu = (unsigned long)hcpu; 5410 int cpu = (unsigned long)hcpu;
5416 5411
5417 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 5412 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
5418 lru_add_drain_cpu(cpu); 5413 lru_add_drain_cpu(cpu);
5419 drain_pages(cpu); 5414 drain_pages(cpu);
5420 5415
5421 /* 5416 /*
5422 * Spill the event counters of the dead processor 5417 * Spill the event counters of the dead processor
5423 * into the current processor's event counters. 5418 * into the current processor's event counters.
5424 * This artificially elevates the count of the current 5419 * This artificially elevates the count of the current
5425 * processor. 5420 * processor.
5426 */ 5421 */
5427 vm_events_fold_cpu(cpu); 5422 vm_events_fold_cpu(cpu);
5428 5423
5429 /* 5424 /*
5430 * Zero the differential counters of the dead processor 5425 * Zero the differential counters of the dead processor
5431 * so that the vm statistics are consistent. 5426 * so that the vm statistics are consistent.
5432 * 5427 *
5433 * This is only okay since the processor is dead and cannot 5428 * This is only okay since the processor is dead and cannot
5434 * race with what we are doing. 5429 * race with what we are doing.
5435 */ 5430 */
5436 cpu_vm_stats_fold(cpu); 5431 cpu_vm_stats_fold(cpu);
5437 } 5432 }
5438 return NOTIFY_OK; 5433 return NOTIFY_OK;
5439 } 5434 }
5440 5435
5441 void __init page_alloc_init(void) 5436 void __init page_alloc_init(void)
5442 { 5437 {
5443 hotcpu_notifier(page_alloc_cpu_notify, 0); 5438 hotcpu_notifier(page_alloc_cpu_notify, 0);
5444 } 5439 }
5445 5440
5446 /* 5441 /*
5447 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 5442 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
5448 * or min_free_kbytes changes. 5443 * or min_free_kbytes changes.
5449 */ 5444 */
5450 static void calculate_totalreserve_pages(void) 5445 static void calculate_totalreserve_pages(void)
5451 { 5446 {
5452 struct pglist_data *pgdat; 5447 struct pglist_data *pgdat;
5453 unsigned long reserve_pages = 0; 5448 unsigned long reserve_pages = 0;
5454 enum zone_type i, j; 5449 enum zone_type i, j;
5455 5450
5456 for_each_online_pgdat(pgdat) { 5451 for_each_online_pgdat(pgdat) {
5457 for (i = 0; i < MAX_NR_ZONES; i++) { 5452 for (i = 0; i < MAX_NR_ZONES; i++) {
5458 struct zone *zone = pgdat->node_zones + i; 5453 struct zone *zone = pgdat->node_zones + i;
5459 unsigned long max = 0; 5454 unsigned long max = 0;
5460 5455
5461 /* Find valid and maximum lowmem_reserve in the zone */ 5456 /* Find valid and maximum lowmem_reserve in the zone */
5462 for (j = i; j < MAX_NR_ZONES; j++) { 5457 for (j = i; j < MAX_NR_ZONES; j++) {
5463 if (zone->lowmem_reserve[j] > max) 5458 if (zone->lowmem_reserve[j] > max)
5464 max = zone->lowmem_reserve[j]; 5459 max = zone->lowmem_reserve[j];
5465 } 5460 }
5466 5461
5467 /* we treat the high watermark as reserved pages. */ 5462 /* we treat the high watermark as reserved pages. */
5468 max += high_wmark_pages(zone); 5463 max += high_wmark_pages(zone);
5469 5464
5470 if (max > zone->managed_pages) 5465 if (max > zone->managed_pages)
5471 max = zone->managed_pages; 5466 max = zone->managed_pages;
5472 reserve_pages += max; 5467 reserve_pages += max;
5473 /* 5468 /*
5474 * Lowmem reserves are not available to 5469 * Lowmem reserves are not available to
5475 * GFP_HIGHUSER page cache allocations and 5470 * GFP_HIGHUSER page cache allocations and
5476 * kswapd tries to balance zones to their high 5471 * kswapd tries to balance zones to their high
5477 * watermark. As a result, neither should be 5472 * watermark. As a result, neither should be
5478 * regarded as dirtyable memory, to prevent a 5473 * regarded as dirtyable memory, to prevent a
5479 * situation where reclaim has to clean pages 5474 * situation where reclaim has to clean pages
5480 * in order to balance the zones. 5475 * in order to balance the zones.
5481 */ 5476 */
5482 zone->dirty_balance_reserve = max; 5477 zone->dirty_balance_reserve = max;
5483 } 5478 }
5484 } 5479 }
5485 dirty_balance_reserve = reserve_pages; 5480 dirty_balance_reserve = reserve_pages;
5486 totalreserve_pages = reserve_pages; 5481 totalreserve_pages = reserve_pages;
5487 } 5482 }
5488 5483
5489 /* 5484 /*
5490 * setup_per_zone_lowmem_reserve - called whenever 5485 * setup_per_zone_lowmem_reserve - called whenever
5491 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 5486 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
5492 * has a correct pages reserved value, so an adequate number of 5487 * has a correct pages reserved value, so an adequate number of
5493 * pages are left in the zone after a successful __alloc_pages(). 5488 * pages are left in the zone after a successful __alloc_pages().
5494 */ 5489 */
5495 static void setup_per_zone_lowmem_reserve(void) 5490 static void setup_per_zone_lowmem_reserve(void)
5496 { 5491 {
5497 struct pglist_data *pgdat; 5492 struct pglist_data *pgdat;
5498 enum zone_type j, idx; 5493 enum zone_type j, idx;
5499 5494
5500 for_each_online_pgdat(pgdat) { 5495 for_each_online_pgdat(pgdat) {
5501 for (j = 0; j < MAX_NR_ZONES; j++) { 5496 for (j = 0; j < MAX_NR_ZONES; j++) {
5502 struct zone *zone = pgdat->node_zones + j; 5497 struct zone *zone = pgdat->node_zones + j;
5503 unsigned long managed_pages = zone->managed_pages; 5498 unsigned long managed_pages = zone->managed_pages;
5504 5499
5505 zone->lowmem_reserve[j] = 0; 5500 zone->lowmem_reserve[j] = 0;
5506 5501
5507 idx = j; 5502 idx = j;
5508 while (idx) { 5503 while (idx) {
5509 struct zone *lower_zone; 5504 struct zone *lower_zone;
5510 5505
5511 idx--; 5506 idx--;
5512 5507
5513 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5508 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5514 sysctl_lowmem_reserve_ratio[idx] = 1; 5509 sysctl_lowmem_reserve_ratio[idx] = 1;
5515 5510
5516 lower_zone = pgdat->node_zones + idx; 5511 lower_zone = pgdat->node_zones + idx;
5517 lower_zone->lowmem_reserve[j] = managed_pages / 5512 lower_zone->lowmem_reserve[j] = managed_pages /
5518 sysctl_lowmem_reserve_ratio[idx]; 5513 sysctl_lowmem_reserve_ratio[idx];
5519 managed_pages += lower_zone->managed_pages; 5514 managed_pages += lower_zone->managed_pages;
5520 } 5515 }
5521 } 5516 }
5522 } 5517 }
5523 5518
5524 /* update totalreserve_pages */ 5519 /* update totalreserve_pages */
5525 calculate_totalreserve_pages(); 5520 calculate_totalreserve_pages();
5526 } 5521 }
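/*
 * Worked example of the loop above, with assumed (not default) numbers:
 * suppose sysctl_lowmem_reserve_ratio is { 256, 32 } for ZONE_DMA and
 * ZONE_NORMAL, ZONE_HIGHMEM has 800000 managed pages and ZONE_NORMAL has
 * 200000. For j = ZONE_HIGHMEM the downward walk produces:
 *
 *   ZONE_NORMAL->lowmem_reserve[ZONE_HIGHMEM] = 800000 / 32             = 25000 pages
 *   ZONE_DMA->lowmem_reserve[ZONE_HIGHMEM]    = (800000 + 200000) / 256 = 3906 pages
 *
 * i.e. each lower zone holds back progressively more pages from
 * allocations that could have been satisfied by the higher zone.
 */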
5527 5522
5528 static void __setup_per_zone_wmarks(void) 5523 static void __setup_per_zone_wmarks(void)
5529 { 5524 {
5530 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5525 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5531 unsigned long lowmem_pages = 0; 5526 unsigned long lowmem_pages = 0;
5532 struct zone *zone; 5527 struct zone *zone;
5533 unsigned long flags; 5528 unsigned long flags;
5534 5529
5535 /* Calculate total number of !ZONE_HIGHMEM pages */ 5530 /* Calculate total number of !ZONE_HIGHMEM pages */
5536 for_each_zone(zone) { 5531 for_each_zone(zone) {
5537 if (!is_highmem(zone)) 5532 if (!is_highmem(zone))
5538 lowmem_pages += zone->managed_pages; 5533 lowmem_pages += zone->managed_pages;
5539 } 5534 }
5540 5535
5541 for_each_zone(zone) { 5536 for_each_zone(zone) {
5542 u64 tmp; 5537 u64 tmp;
5543 5538
5544 spin_lock_irqsave(&zone->lock, flags); 5539 spin_lock_irqsave(&zone->lock, flags);
5545 tmp = (u64)pages_min * zone->managed_pages; 5540 tmp = (u64)pages_min * zone->managed_pages;
5546 do_div(tmp, lowmem_pages); 5541 do_div(tmp, lowmem_pages);
5547 if (is_highmem(zone)) { 5542 if (is_highmem(zone)) {
5548 /* 5543 /*
5549 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5544 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5550 * need highmem pages, so cap pages_min to a small 5545 * need highmem pages, so cap pages_min to a small
5551 * value here. 5546 * value here.
5552 * 5547 *
5553 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5548 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5554 * deltas control async page reclaim, and so should 5549 * deltas control async page reclaim, and so should
5555 * not be capped for highmem. 5550 * not be capped for highmem.
5556 */ 5551 */
5557 unsigned long min_pages; 5552 unsigned long min_pages;
5558 5553
5559 min_pages = zone->managed_pages / 1024; 5554 min_pages = zone->managed_pages / 1024;
5560 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 5555 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
5561 zone->watermark[WMARK_MIN] = min_pages; 5556 zone->watermark[WMARK_MIN] = min_pages;
5562 } else { 5557 } else {
5563 /* 5558 /*
5564 * If it's a lowmem zone, reserve a number of pages 5559 * If it's a lowmem zone, reserve a number of pages
5565 * proportionate to the zone's size. 5560 * proportionate to the zone's size.
5566 */ 5561 */
5567 zone->watermark[WMARK_MIN] = tmp; 5562 zone->watermark[WMARK_MIN] = tmp;
5568 } 5563 }
5569 5564
5570 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5565 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5571 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5566 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5572 5567
5573 __mod_zone_page_state(zone, NR_ALLOC_BATCH, 5568 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
5574 high_wmark_pages(zone) - 5569 high_wmark_pages(zone) -
5575 low_wmark_pages(zone) - 5570 low_wmark_pages(zone) -
5576 zone_page_state(zone, NR_ALLOC_BATCH)); 5571 zone_page_state(zone, NR_ALLOC_BATCH));
5577 5572
5578 setup_zone_migrate_reserve(zone); 5573 setup_zone_migrate_reserve(zone);
5579 spin_unlock_irqrestore(&zone->lock, flags); 5574 spin_unlock_irqrestore(&zone->lock, flags);
5580 } 5575 }
5581 5576
5582 /* update totalreserve_pages */ 5577 /* update totalreserve_pages */
5583 calculate_totalreserve_pages(); 5578 calculate_totalreserve_pages();
5584 } 5579 }
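/*
 * To make the arithmetic above concrete, a worked example under assumed
 * numbers (4 KB pages, min_free_kbytes = 4096, and a lowmem zone that
 * holds half of all lowmem):
 *
 *   pages_min  = 4096 >> (12 - 10)   = 1024 pages
 *   tmp        = 1024 * 1/2          = 512 pages (this zone's share)
 *   WMARK_MIN  = 512
 *   WMARK_LOW  = 512 + (512 >> 2)    = 640 pages
 *   WMARK_HIGH = 512 + (512 >> 1)    = 768 pages
 */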
5585 5580
5586 /** 5581 /**
5587 * setup_per_zone_wmarks - called when min_free_kbytes changes 5582 * setup_per_zone_wmarks - called when min_free_kbytes changes
5588 * or when memory is hot-{added|removed} 5583 * or when memory is hot-{added|removed}
5589 * 5584 *
5590 * Ensures that the watermark[min,low,high] values for each zone are set 5585 * Ensures that the watermark[min,low,high] values for each zone are set
5591 * correctly with respect to min_free_kbytes. 5586 * correctly with respect to min_free_kbytes.
5592 */ 5587 */
5593 void setup_per_zone_wmarks(void) 5588 void setup_per_zone_wmarks(void)
5594 { 5589 {
5595 mutex_lock(&zonelists_mutex); 5590 mutex_lock(&zonelists_mutex);
5596 __setup_per_zone_wmarks(); 5591 __setup_per_zone_wmarks();
5597 mutex_unlock(&zonelists_mutex); 5592 mutex_unlock(&zonelists_mutex);
5598 } 5593 }
5599 5594
5600 /* 5595 /*
5601 * The inactive anon list should be small enough that the VM never has to 5596 * The inactive anon list should be small enough that the VM never has to
5602 * do too much work, but large enough that each inactive page has a chance 5597 * do too much work, but large enough that each inactive page has a chance
5603 * to be referenced again before it is swapped out. 5598 * to be referenced again before it is swapped out.
5604 * 5599 *
5605 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5600 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5606 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5601 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5607 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5602 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5608 * the anonymous pages are kept on the inactive list. 5603 * the anonymous pages are kept on the inactive list.
5609 * 5604 *
5610 * total target max 5605 * total target max
5611 * memory ratio inactive anon 5606 * memory ratio inactive anon
5612 * ------------------------------------- 5607 * -------------------------------------
5613 * 10MB 1 5MB 5608 * 10MB 1 5MB
5614 * 100MB 1 50MB 5609 * 100MB 1 50MB
5615 * 1GB 3 250MB 5610 * 1GB 3 250MB
5616 * 10GB 10 0.9GB 5611 * 10GB 10 0.9GB
5617 * 100GB 31 3GB 5612 * 100GB 31 3GB
5618 * 1TB 101 10GB 5613 * 1TB 101 10GB
5619 * 10TB 320 32GB 5614 * 10TB 320 32GB
5620 */ 5615 */
5621 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5616 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5622 { 5617 {
5623 unsigned int gb, ratio; 5618 unsigned int gb, ratio;
5624 5619
5625 /* Zone size in gigabytes */ 5620 /* Zone size in gigabytes */
5626 gb = zone->managed_pages >> (30 - PAGE_SHIFT); 5621 gb = zone->managed_pages >> (30 - PAGE_SHIFT);
5627 if (gb) 5622 if (gb)
5628 ratio = int_sqrt(10 * gb); 5623 ratio = int_sqrt(10 * gb);
5629 else 5624 else
5630 ratio = 1; 5625 ratio = 1;
5631 5626
5632 zone->inactive_ratio = ratio; 5627 zone->inactive_ratio = ratio;
5633 } 5628 }
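/*
 * Quick check of the table above with an assumed zone size: a 16 GB zone
 * gives
 *
 *   gb    = 16
 *   ratio = int_sqrt(10 * 16) = int_sqrt(160) = 12   (12*12 = 144 <= 160 < 169)
 *
 * i.e. roughly a 12:1 active:inactive target, keeping about 1/13 of the
 * anonymous pages on the inactive list.
 */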
5634 5629
5635 static void __meminit setup_per_zone_inactive_ratio(void) 5630 static void __meminit setup_per_zone_inactive_ratio(void)
5636 { 5631 {
5637 struct zone *zone; 5632 struct zone *zone;
5638 5633
5639 for_each_zone(zone) 5634 for_each_zone(zone)
5640 calculate_zone_inactive_ratio(zone); 5635 calculate_zone_inactive_ratio(zone);
5641 } 5636 }
5642 5637
5643 /* 5638 /*
5644 * Initialise min_free_kbytes. 5639 * Initialise min_free_kbytes.
5645 * 5640 *
5646 * For small machines we want it small (128k min). For large machines 5641 * For small machines we want it small (128k min). For large machines
5647 * we want it large (64MB max). But it is not linear, because network 5642 * we want it large (64MB max). But it is not linear, because network
5648 * bandwidth does not increase linearly with machine size. We use 5643 * bandwidth does not increase linearly with machine size. We use
5649 * 5644 *
5650 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5645 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5651 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5646 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5652 * 5647 *
5653 * which yields 5648 * which yields
5654 * 5649 *
5655 * 16MB: 512k 5650 * 16MB: 512k
5656 * 32MB: 724k 5651 * 32MB: 724k
5657 * 64MB: 1024k 5652 * 64MB: 1024k
5658 * 128MB: 1448k 5653 * 128MB: 1448k
5659 * 256MB: 2048k 5654 * 256MB: 2048k
5660 * 512MB: 2896k 5655 * 512MB: 2896k
5661 * 1024MB: 4096k 5656 * 1024MB: 4096k
5662 * 2048MB: 5792k 5657 * 2048MB: 5792k
5663 * 4096MB: 8192k 5658 * 4096MB: 8192k
5664 * 8192MB: 11584k 5659 * 8192MB: 11584k
5665 * 16384MB: 16384k 5660 * 16384MB: 16384k
5666 */ 5661 */
5667 int __meminit init_per_zone_wmark_min(void) 5662 int __meminit init_per_zone_wmark_min(void)
5668 { 5663 {
5669 unsigned long lowmem_kbytes; 5664 unsigned long lowmem_kbytes;
5670 int new_min_free_kbytes; 5665 int new_min_free_kbytes;
5671 5666
5672 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5667 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5673 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5668 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5674 5669
5675 if (new_min_free_kbytes > user_min_free_kbytes) { 5670 if (new_min_free_kbytes > user_min_free_kbytes) {
5676 min_free_kbytes = new_min_free_kbytes; 5671 min_free_kbytes = new_min_free_kbytes;
5677 if (min_free_kbytes < 128) 5672 if (min_free_kbytes < 128)
5678 min_free_kbytes = 128; 5673 min_free_kbytes = 128;
5679 if (min_free_kbytes > 65536) 5674 if (min_free_kbytes > 65536)
5680 min_free_kbytes = 65536; 5675 min_free_kbytes = 65536;
5681 } else { 5676 } else {
5682 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 5677 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
5683 new_min_free_kbytes, user_min_free_kbytes); 5678 new_min_free_kbytes, user_min_free_kbytes);
5684 } 5679 }
5685 setup_per_zone_wmarks(); 5680 setup_per_zone_wmarks();
5686 refresh_zone_stat_thresholds(); 5681 refresh_zone_stat_thresholds();
5687 setup_per_zone_lowmem_reserve(); 5682 setup_per_zone_lowmem_reserve();
5688 setup_per_zone_inactive_ratio(); 5683 setup_per_zone_inactive_ratio();
5689 return 0; 5684 return 0;
5690 } 5685 }
5691 module_init(init_per_zone_wmark_min) 5686 module_init(init_per_zone_wmark_min)
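/*
 * Checking the formula against the table above with an assumed 2 GB of
 * lowmem:
 *
 *   lowmem_kbytes   = 2048 * 1024              = 2097152
 *   min_free_kbytes = int_sqrt(2097152 * 16)   = int_sqrt(33554432) = 5792
 *
 * which matches the 2048MB row and falls inside the [128, 65536] clamp
 * applied above.
 */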
5692 5687
5693 /* 5688 /*
5694 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5689 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5695 * that we can call two helper functions whenever min_free_kbytes 5690 * that we can call two helper functions whenever min_free_kbytes
5696 * changes. 5691 * changes.
5697 */ 5692 */
5698 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5693 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5699 void __user *buffer, size_t *length, loff_t *ppos) 5694 void __user *buffer, size_t *length, loff_t *ppos)
5700 { 5695 {
5701 proc_dointvec(table, write, buffer, length, ppos); 5696 proc_dointvec(table, write, buffer, length, ppos);
5702 if (write) { 5697 if (write) {
5703 user_min_free_kbytes = min_free_kbytes; 5698 user_min_free_kbytes = min_free_kbytes;
5704 setup_per_zone_wmarks(); 5699 setup_per_zone_wmarks();
5705 } 5700 }
5706 return 0; 5701 return 0;
5707 } 5702 }
5708 5703
5709 #ifdef CONFIG_NUMA 5704 #ifdef CONFIG_NUMA
5710 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5705 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5711 void __user *buffer, size_t *length, loff_t *ppos) 5706 void __user *buffer, size_t *length, loff_t *ppos)
5712 { 5707 {
5713 struct zone *zone; 5708 struct zone *zone;
5714 int rc; 5709 int rc;
5715 5710
5716 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5711 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5717 if (rc) 5712 if (rc)
5718 return rc; 5713 return rc;
5719 5714
5720 for_each_zone(zone) 5715 for_each_zone(zone)
5721 zone->min_unmapped_pages = (zone->managed_pages * 5716 zone->min_unmapped_pages = (zone->managed_pages *
5722 sysctl_min_unmapped_ratio) / 100; 5717 sysctl_min_unmapped_ratio) / 100;
5723 return 0; 5718 return 0;
5724 } 5719 }
5725 5720
5726 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5721 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5727 void __user *buffer, size_t *length, loff_t *ppos) 5722 void __user *buffer, size_t *length, loff_t *ppos)
5728 { 5723 {
5729 struct zone *zone; 5724 struct zone *zone;
5730 int rc; 5725 int rc;
5731 5726
5732 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5727 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5733 if (rc) 5728 if (rc)
5734 return rc; 5729 return rc;
5735 5730
5736 for_each_zone(zone) 5731 for_each_zone(zone)
5737 zone->min_slab_pages = (zone->managed_pages * 5732 zone->min_slab_pages = (zone->managed_pages *
5738 sysctl_min_slab_ratio) / 100; 5733 sysctl_min_slab_ratio) / 100;
5739 return 0; 5734 return 0;
5740 } 5735 }
5741 #endif 5736 #endif
5742 5737
5743 /* 5738 /*
5744 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5739 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5745 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5740 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5746 * whenever sysctl_lowmem_reserve_ratio changes. 5741 * whenever sysctl_lowmem_reserve_ratio changes.
5747 * 5742 *
5748 * The reserve ratio obviously has absolutely no relation to the 5743 * The reserve ratio obviously has absolutely no relation to the
5749 * minimum watermarks. The lowmem reserve ratio can only make sense 5744 * minimum watermarks. The lowmem reserve ratio can only make sense
5750 * as a function of the boot-time zone sizes. 5745 * as a function of the boot-time zone sizes.
5751 */ 5746 */
5752 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5747 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5753 void __user *buffer, size_t *length, loff_t *ppos) 5748 void __user *buffer, size_t *length, loff_t *ppos)
5754 { 5749 {
5755 proc_dointvec_minmax(table, write, buffer, length, ppos); 5750 proc_dointvec_minmax(table, write, buffer, length, ppos);
5756 setup_per_zone_lowmem_reserve(); 5751 setup_per_zone_lowmem_reserve();
5757 return 0; 5752 return 0;
5758 } 5753 }
5759 5754
5760 /* 5755 /*
5761 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5756 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5762 * cpu. It is the fraction of total pages in each zone that a hot per cpu 5757 * cpu. It is the fraction of total pages in each zone that a hot per cpu
5763 * pagelist can hold before it gets flushed back to the buddy allocator. 5758 * pagelist can hold before it gets flushed back to the buddy allocator.
5764 */ 5759 */
5765 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5760 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5766 void __user *buffer, size_t *length, loff_t *ppos) 5761 void __user *buffer, size_t *length, loff_t *ppos)
5767 { 5762 {
5768 struct zone *zone; 5763 struct zone *zone;
5769 unsigned int cpu; 5764 unsigned int cpu;
5770 int ret; 5765 int ret;
5771 5766
5772 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5767 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5773 if (!write || (ret < 0)) 5768 if (!write || (ret < 0))
5774 return ret; 5769 return ret;
5775 5770
5776 mutex_lock(&pcp_batch_high_lock); 5771 mutex_lock(&pcp_batch_high_lock);
5777 for_each_populated_zone(zone) { 5772 for_each_populated_zone(zone) {
5778 unsigned long high; 5773 unsigned long high;
5779 high = zone->managed_pages / percpu_pagelist_fraction; 5774 high = zone->managed_pages / percpu_pagelist_fraction;
5780 for_each_possible_cpu(cpu) 5775 for_each_possible_cpu(cpu)
5781 pageset_set_high(per_cpu_ptr(zone->pageset, cpu), 5776 pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
5782 high); 5777 high);
5783 } 5778 }
5784 mutex_unlock(&pcp_batch_high_lock); 5779 mutex_unlock(&pcp_batch_high_lock);
5785 return 0; 5780 return 0;
5786 } 5781 }
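/*
 * For example (assumed value, not a default): writing 100 to
 * /proc/sys/vm/percpu_pagelist_fraction on a zone with 1048576 managed
 * pages (4 GB of 4 KB pages) gives
 *
 *   pcp->high = 1048576 / 100 = 10485 pages   (~41 MB per CPU before the
 *                                               list is drained to buddy)
 */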
5787 5782
5788 int hashdist = HASHDIST_DEFAULT; 5783 int hashdist = HASHDIST_DEFAULT;
5789 5784
5790 #ifdef CONFIG_NUMA 5785 #ifdef CONFIG_NUMA
5791 static int __init set_hashdist(char *str) 5786 static int __init set_hashdist(char *str)
5792 { 5787 {
5793 if (!str) 5788 if (!str)
5794 return 0; 5789 return 0;
5795 hashdist = simple_strtoul(str, &str, 0); 5790 hashdist = simple_strtoul(str, &str, 0);
5796 return 1; 5791 return 1;
5797 } 5792 }
5798 __setup("hashdist=", set_hashdist); 5793 __setup("hashdist=", set_hashdist);
5799 #endif 5794 #endif
5800 5795
5801 /* 5796 /*
5802 * allocate a large system hash table from bootmem 5797 * allocate a large system hash table from bootmem
5803 * - it is assumed that the hash table must contain an exact power-of-2 5798 * - it is assumed that the hash table must contain an exact power-of-2
5804 * quantity of entries 5799 * quantity of entries
5805 * - limit is the number of hash buckets, not the total allocation size 5800 * - limit is the number of hash buckets, not the total allocation size
5806 */ 5801 */
5807 void *__init alloc_large_system_hash(const char *tablename, 5802 void *__init alloc_large_system_hash(const char *tablename,
5808 unsigned long bucketsize, 5803 unsigned long bucketsize,
5809 unsigned long numentries, 5804 unsigned long numentries,
5810 int scale, 5805 int scale,
5811 int flags, 5806 int flags,
5812 unsigned int *_hash_shift, 5807 unsigned int *_hash_shift,
5813 unsigned int *_hash_mask, 5808 unsigned int *_hash_mask,
5814 unsigned long low_limit, 5809 unsigned long low_limit,
5815 unsigned long high_limit) 5810 unsigned long high_limit)
5816 { 5811 {
5817 unsigned long long max = high_limit; 5812 unsigned long long max = high_limit;
5818 unsigned long log2qty, size; 5813 unsigned long log2qty, size;
5819 void *table = NULL; 5814 void *table = NULL;
5820 5815
5821 /* allow the kernel cmdline to have a say */ 5816 /* allow the kernel cmdline to have a say */
5822 if (!numentries) { 5817 if (!numentries) {
5823 /* round applicable memory size up to nearest megabyte */ 5818 /* round applicable memory size up to nearest megabyte */
5824 numentries = nr_kernel_pages; 5819 numentries = nr_kernel_pages;
5825 5820
5826 /* It isn't necessary when PAGE_SIZE >= 1MB */ 5821 /* It isn't necessary when PAGE_SIZE >= 1MB */
5827 if (PAGE_SHIFT < 20) 5822 if (PAGE_SHIFT < 20)
5828 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 5823 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
5829 5824
5830 /* limit to 1 bucket per 2^scale bytes of low memory */ 5825 /* limit to 1 bucket per 2^scale bytes of low memory */
5831 if (scale > PAGE_SHIFT) 5826 if (scale > PAGE_SHIFT)
5832 numentries >>= (scale - PAGE_SHIFT); 5827 numentries >>= (scale - PAGE_SHIFT);
5833 else 5828 else
5834 numentries <<= (PAGE_SHIFT - scale); 5829 numentries <<= (PAGE_SHIFT - scale);
5835 5830
5836 /* Make sure we've got at least a 0-order allocation.. */ 5831 /* Make sure we've got at least a 0-order allocation.. */
5837 if (unlikely(flags & HASH_SMALL)) { 5832 if (unlikely(flags & HASH_SMALL)) {
5838 /* Makes no sense without HASH_EARLY */ 5833 /* Makes no sense without HASH_EARLY */
5839 WARN_ON(!(flags & HASH_EARLY)); 5834 WARN_ON(!(flags & HASH_EARLY));
5840 if (!(numentries >> *_hash_shift)) { 5835 if (!(numentries >> *_hash_shift)) {
5841 numentries = 1UL << *_hash_shift; 5836 numentries = 1UL << *_hash_shift;
5842 BUG_ON(!numentries); 5837 BUG_ON(!numentries);
5843 } 5838 }
5844 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5839 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5845 numentries = PAGE_SIZE / bucketsize; 5840 numentries = PAGE_SIZE / bucketsize;
5846 } 5841 }
5847 numentries = roundup_pow_of_two(numentries); 5842 numentries = roundup_pow_of_two(numentries);
5848 5843
5849 /* limit allocation size to 1/16 total memory by default */ 5844 /* limit allocation size to 1/16 total memory by default */
5850 if (max == 0) { 5845 if (max == 0) {
5851 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5846 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5852 do_div(max, bucketsize); 5847 do_div(max, bucketsize);
5853 } 5848 }
5854 max = min(max, 0x80000000ULL); 5849 max = min(max, 0x80000000ULL);
5855 5850
5856 if (numentries < low_limit) 5851 if (numentries < low_limit)
5857 numentries = low_limit; 5852 numentries = low_limit;
5858 if (numentries > max) 5853 if (numentries > max)
5859 numentries = max; 5854 numentries = max;
5860 5855
5861 log2qty = ilog2(numentries); 5856 log2qty = ilog2(numentries);
5862 5857
5863 do { 5858 do {
5864 size = bucketsize << log2qty; 5859 size = bucketsize << log2qty;
5865 if (flags & HASH_EARLY) 5860 if (flags & HASH_EARLY)
5866 table = alloc_bootmem_nopanic(size); 5861 table = alloc_bootmem_nopanic(size);
5867 else if (hashdist) 5862 else if (hashdist)
5868 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5863 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5869 else { 5864 else {
5870 /* 5865 /*
5871 * If bucketsize is not a power-of-two, we may free 5866 * If bucketsize is not a power-of-two, we may free
5872 * some pages at the end of the hash table, which 5867 * some pages at the end of the hash table, which
5873 * alloc_pages_exact() does automatically. 5868 * alloc_pages_exact() does automatically.
5874 */ 5869 */
5875 if (get_order(size) < MAX_ORDER) { 5870 if (get_order(size) < MAX_ORDER) {
5876 table = alloc_pages_exact(size, GFP_ATOMIC); 5871 table = alloc_pages_exact(size, GFP_ATOMIC);
5877 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5872 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5878 } 5873 }
5879 } 5874 }
5880 } while (!table && size > PAGE_SIZE && --log2qty); 5875 } while (!table && size > PAGE_SIZE && --log2qty);
5881 5876
5882 if (!table) 5877 if (!table)
5883 panic("Failed to allocate %s hash table\n", tablename); 5878 panic("Failed to allocate %s hash table\n", tablename);
5884 5879
5885 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5880 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5886 tablename, 5881 tablename,
5887 (1UL << log2qty), 5882 (1UL << log2qty),
5888 ilog2(size) - PAGE_SHIFT, 5883 ilog2(size) - PAGE_SHIFT,
5889 size); 5884 size);
5890 5885
5891 if (_hash_shift) 5886 if (_hash_shift)
5892 *_hash_shift = log2qty; 5887 *_hash_shift = log2qty;
5893 if (_hash_mask) 5888 if (_hash_mask)
5894 *_hash_mask = (1 << log2qty) - 1; 5889 *_hash_mask = (1 << log2qty) - 1;
5895 5890
5896 return table; 5891 return table;
5897 } 5892 }
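/*
 * A minimal sketch of a caller; every name below (the table name, the
 * variables, the scale value) is an assumption made for this example
 * rather than something defined in this file.
 */
static struct hlist_head *example_hashtable;
static unsigned int example_hash_shift;
static unsigned int example_hash_mask;

static void __init example_hash_init(void)
{
	/*
	 * scale = 14: at most one bucket per 16 KB of low memory;
	 * numentries = 0 lets the function size the table from RAM.
	 */
	example_hashtable = alloc_large_system_hash("Example-cache",
					sizeof(struct hlist_head),
					0, 14, 0,
					&example_hash_shift,
					&example_hash_mask,
					0, 0);
}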
5898 5893
5899 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5894 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5900 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5895 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5901 unsigned long pfn) 5896 unsigned long pfn)
5902 { 5897 {
5903 #ifdef CONFIG_SPARSEMEM 5898 #ifdef CONFIG_SPARSEMEM
5904 return __pfn_to_section(pfn)->pageblock_flags; 5899 return __pfn_to_section(pfn)->pageblock_flags;
5905 #else 5900 #else
5906 return zone->pageblock_flags; 5901 return zone->pageblock_flags;
5907 #endif /* CONFIG_SPARSEMEM */ 5902 #endif /* CONFIG_SPARSEMEM */
5908 } 5903 }
5909 5904
5910 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 5905 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5911 { 5906 {
5912 #ifdef CONFIG_SPARSEMEM 5907 #ifdef CONFIG_SPARSEMEM
5913 pfn &= (PAGES_PER_SECTION-1); 5908 pfn &= (PAGES_PER_SECTION-1);
5914 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5909 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5915 #else 5910 #else
5916 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); 5911 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
5917 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5912 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5918 #endif /* CONFIG_SPARSEMEM */ 5913 #endif /* CONFIG_SPARSEMEM */
5919 } 5914 }
5920 5915
5921 /** 5916 /**
5922 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages 5917 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
5923 * @page: The page within the block of interest 5918 * @page: The page within the block of interest
5924 * @start_bitidx: The first bit of interest to retrieve 5919 * @start_bitidx: The first bit of interest to retrieve
5925 * @end_bitidx: The last bit of interest 5920 * @end_bitidx: The last bit of interest
5926 * returns pageblock_bits flags 5921 * returns pageblock_bits flags
5927 */ 5922 */
5928 unsigned long get_pageblock_flags_group(struct page *page, 5923 unsigned long get_pageblock_flags_group(struct page *page,
5929 int start_bitidx, int end_bitidx) 5924 int start_bitidx, int end_bitidx)
5930 { 5925 {
5931 struct zone *zone; 5926 struct zone *zone;
5932 unsigned long *bitmap; 5927 unsigned long *bitmap;
5933 unsigned long pfn, bitidx; 5928 unsigned long pfn, bitidx;
5934 unsigned long flags = 0; 5929 unsigned long flags = 0;
5935 unsigned long value = 1; 5930 unsigned long value = 1;
5936 5931
5937 zone = page_zone(page); 5932 zone = page_zone(page);
5938 pfn = page_to_pfn(page); 5933 pfn = page_to_pfn(page);
5939 bitmap = get_pageblock_bitmap(zone, pfn); 5934 bitmap = get_pageblock_bitmap(zone, pfn);
5940 bitidx = pfn_to_bitidx(zone, pfn); 5935 bitidx = pfn_to_bitidx(zone, pfn);
5941 5936
5942 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5937 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5943 if (test_bit(bitidx + start_bitidx, bitmap)) 5938 if (test_bit(bitidx + start_bitidx, bitmap))
5944 flags |= value; 5939 flags |= value;
5945 5940
5946 return flags; 5941 return flags;
5947 } 5942 }
5948 5943
5949 /** 5944 /**
5950 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages 5945 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
5951 * @page: The page within the block of interest 5946 * @page: The page within the block of interest
5952 * @start_bitidx: The first bit of interest 5947 * @start_bitidx: The first bit of interest
5953 * @end_bitidx: The last bit of interest 5948 * @end_bitidx: The last bit of interest
5954 * @flags: The flags to set 5949 * @flags: The flags to set
5955 */ 5950 */
5956 void set_pageblock_flags_group(struct page *page, unsigned long flags, 5951 void set_pageblock_flags_group(struct page *page, unsigned long flags,
5957 int start_bitidx, int end_bitidx) 5952 int start_bitidx, int end_bitidx)
5958 { 5953 {
5959 struct zone *zone; 5954 struct zone *zone;
5960 unsigned long *bitmap; 5955 unsigned long *bitmap;
5961 unsigned long pfn, bitidx; 5956 unsigned long pfn, bitidx;
5962 unsigned long value = 1; 5957 unsigned long value = 1;
5963 5958
5964 zone = page_zone(page); 5959 zone = page_zone(page);
5965 pfn = page_to_pfn(page); 5960 pfn = page_to_pfn(page);
5966 bitmap = get_pageblock_bitmap(zone, pfn); 5961 bitmap = get_pageblock_bitmap(zone, pfn);
5967 bitidx = pfn_to_bitidx(zone, pfn); 5962 bitidx = pfn_to_bitidx(zone, pfn);
5968 VM_BUG_ON(!zone_spans_pfn(zone, pfn)); 5963 VM_BUG_ON(!zone_spans_pfn(zone, pfn));
5969 5964
5970 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5965 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5971 if (flags & value) 5966 if (flags & value)
5972 __set_bit(bitidx + start_bitidx, bitmap); 5967 __set_bit(bitidx + start_bitidx, bitmap);
5973 else 5968 else
5974 __clear_bit(bitidx + start_bitidx, bitmap); 5969 __clear_bit(bitidx + start_bitidx, bitmap);
5975 } 5970 }
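/*
 * These two accessors are what the migratetype helpers elsewhere in the
 * tree are built on. A sketch, using the PB_migrate/PB_migrate_end bit
 * indices assumed to be defined in pageblock-flags.h:
 */
static inline int example_get_migratetype(struct page *page)
{
	return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end);
}

static inline void example_set_migratetype(struct page *page, int migratetype)
{
	set_pageblock_flags_group(page, (unsigned long)migratetype,
				  PB_migrate, PB_migrate_end);
}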
5976 5971
5977 /* 5972 /*
5978 * This function checks whether pageblock includes unmovable pages or not. 5973 * This function checks whether pageblock includes unmovable pages or not.
5979 * If @count is not zero, it is okay to include up to @count unmovable pages 5974 * If @count is not zero, it is okay to include up to @count unmovable pages
5980 * 5975 *
5981 * PageLRU check without isolation or lru_lock could race so that 5976 * PageLRU check without isolation or lru_lock could race so that
5982 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 5977 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5983 * expect this function to be exact. 5978 * expect this function to be exact.
5984 */ 5979 */
5985 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 5980 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
5986 bool skip_hwpoisoned_pages) 5981 bool skip_hwpoisoned_pages)
5987 { 5982 {
5988 unsigned long pfn, iter, found; 5983 unsigned long pfn, iter, found;
5989 int mt; 5984 int mt;
5990 5985
5991 /* 5986 /*
5992 * To avoid noisy data, lru_add_drain_all() should be called first. 5987 * To avoid noisy data, lru_add_drain_all() should be called first.
5993 * If the zone is ZONE_MOVABLE, it never contains unmovable pages. 5988 * If the zone is ZONE_MOVABLE, it never contains unmovable pages.
5994 */ 5989 */
5995 if (zone_idx(zone) == ZONE_MOVABLE) 5990 if (zone_idx(zone) == ZONE_MOVABLE)
5996 return false; 5991 return false;
5997 mt = get_pageblock_migratetype(page); 5992 mt = get_pageblock_migratetype(page);
5998 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 5993 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5999 return false; 5994 return false;
6000 5995
6001 pfn = page_to_pfn(page); 5996 pfn = page_to_pfn(page);
6002 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5997 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
6003 unsigned long check = pfn + iter; 5998 unsigned long check = pfn + iter;
6004 5999
6005 if (!pfn_valid_within(check)) 6000 if (!pfn_valid_within(check))
6006 continue; 6001 continue;
6007 6002
6008 page = pfn_to_page(check); 6003 page = pfn_to_page(check);
6009 6004
6010 /* 6005 /*
6011 * Hugepages are not in LRU lists, but they're movable. 6006 * Hugepages are not in LRU lists, but they're movable.
6012 * We need not scan over tail pages because we don't 6007 * We need not scan over tail pages because we don't
6013 * handle each tail page individually in migration. 6008 * handle each tail page individually in migration.
6014 */ 6009 */
6015 if (PageHuge(page)) { 6010 if (PageHuge(page)) {
6016 iter = round_up(iter + 1, 1<<compound_order(page)) - 1; 6011 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
6017 continue; 6012 continue;
6018 } 6013 }
6019 6014
6020 /* 6015 /*
6021 * We can't use page_count() without pinning the page 6016 * We can't use page_count() without pinning the page
6022 * because another CPU can free the compound page. 6017 * because another CPU can free the compound page.
6023 * This check already skips compound tails of THP 6018 * This check already skips compound tails of THP
6024 * because their page->_count is zero at all times. 6019 * because their page->_count is zero at all times.
6025 */ 6020 */
6026 if (!atomic_read(&page->_count)) { 6021 if (!atomic_read(&page->_count)) {
6027 if (PageBuddy(page)) 6022 if (PageBuddy(page))
6028 iter += (1 << page_order(page)) - 1; 6023 iter += (1 << page_order(page)) - 1;
6029 continue; 6024 continue;
6030 } 6025 }
6031 6026
6032 /* 6027 /*
6033 * The HWPoisoned page may be not in buddy system, and 6028 * The HWPoisoned page may be not in buddy system, and
6034 * page_count() is not 0. 6029 * page_count() is not 0.
6035 */ 6030 */
6036 if (skip_hwpoisoned_pages && PageHWPoison(page)) 6031 if (skip_hwpoisoned_pages && PageHWPoison(page))
6037 continue; 6032 continue;
6038 6033
6039 if (!PageLRU(page)) 6034 if (!PageLRU(page))
6040 found++; 6035 found++;
6041 /* 6036 /*
6042 * If there are RECLAIMABLE pages, we need to check them. 6037 * If there are RECLAIMABLE pages, we need to check them.
6043 * But for now, memory offline itself doesn't call shrink_slab() 6038 * But for now, memory offline itself doesn't call shrink_slab()
6044 * and this still needs to be fixed. 6039 * and this still needs to be fixed.
6045 */ 6040 */
6046 /* 6041 /*
6047 * If the page is not RAM, page_count() should be 0. 6042 * If the page is not RAM, page_count() should be 0.
6048 * We don't need further checks. This is a _used_, non-movable page. 6043 * We don't need further checks. This is a _used_, non-movable page.
6049 * 6044 *
6050 * The problematic thing here is PG_reserved pages. PG_reserved 6045 * The problematic thing here is PG_reserved pages. PG_reserved
6051 * is set to both of a memory hole page and a _used_ kernel 6046 * is set to both of a memory hole page and a _used_ kernel
6052 * page at boot. 6047 * page at boot.
6053 */ 6048 */
6054 if (found > count) 6049 if (found > count)
6055 return true; 6050 return true;
6056 } 6051 }
6057 return false; 6052 return false;
6058 } 6053 }
6059 6054
6060 bool is_pageblock_removable_nolock(struct page *page) 6055 bool is_pageblock_removable_nolock(struct page *page)
6061 { 6056 {
6062 struct zone *zone; 6057 struct zone *zone;
6063 unsigned long pfn; 6058 unsigned long pfn;
6064 6059
6065 /* 6060 /*
6066 * We have to be careful here because we are iterating over memory 6061 * We have to be careful here because we are iterating over memory
6067 * sections which are not zone aware so we might end up outside of 6062 * sections which are not zone aware so we might end up outside of
6068 * the zone but still within the section. 6063 * the zone but still within the section.
6069 * We have to take care about the node as well. If the node is offline 6064 * We have to take care about the node as well. If the node is offline
6070 * its NODE_DATA will be NULL - see page_zone. 6065 * its NODE_DATA will be NULL - see page_zone.
6071 */ 6066 */
6072 if (!node_online(page_to_nid(page))) 6067 if (!node_online(page_to_nid(page)))
6073 return false; 6068 return false;
6074 6069
6075 zone = page_zone(page); 6070 zone = page_zone(page);
6076 pfn = page_to_pfn(page); 6071 pfn = page_to_pfn(page);
6077 if (!zone_spans_pfn(zone, pfn)) 6072 if (!zone_spans_pfn(zone, pfn))
6078 return false; 6073 return false;
6079 6074
6080 return !has_unmovable_pages(zone, page, 0, true); 6075 return !has_unmovable_pages(zone, page, 0, true);
6081 } 6076 }
6082 6077
6083 #ifdef CONFIG_CMA 6078 #ifdef CONFIG_CMA
6084 6079
6085 static unsigned long pfn_max_align_down(unsigned long pfn) 6080 static unsigned long pfn_max_align_down(unsigned long pfn)
6086 { 6081 {
6087 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 6082 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
6088 pageblock_nr_pages) - 1); 6083 pageblock_nr_pages) - 1);
6089 } 6084 }
6090 6085
6091 static unsigned long pfn_max_align_up(unsigned long pfn) 6086 static unsigned long pfn_max_align_up(unsigned long pfn)
6092 { 6087 {
6093 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 6088 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
6094 pageblock_nr_pages)); 6089 pageblock_nr_pages));
6095 } 6090 }
6096 6091
6097 /* [start, end) must belong to a single zone. */ 6092 /* [start, end) must belong to a single zone. */
6098 static int __alloc_contig_migrate_range(struct compact_control *cc, 6093 static int __alloc_contig_migrate_range(struct compact_control *cc,
6099 unsigned long start, unsigned long end) 6094 unsigned long start, unsigned long end)
6100 { 6095 {
6101 /* This function is based on compact_zone() from compaction.c. */ 6096 /* This function is based on compact_zone() from compaction.c. */
6102 unsigned long nr_reclaimed; 6097 unsigned long nr_reclaimed;
6103 unsigned long pfn = start; 6098 unsigned long pfn = start;
6104 unsigned int tries = 0; 6099 unsigned int tries = 0;
6105 int ret = 0; 6100 int ret = 0;
6106 6101
6107 migrate_prep(); 6102 migrate_prep();
6108 6103
6109 while (pfn < end || !list_empty(&cc->migratepages)) { 6104 while (pfn < end || !list_empty(&cc->migratepages)) {
6110 if (fatal_signal_pending(current)) { 6105 if (fatal_signal_pending(current)) {
6111 ret = -EINTR; 6106 ret = -EINTR;
6112 break; 6107 break;
6113 } 6108 }
6114 6109
6115 if (list_empty(&cc->migratepages)) { 6110 if (list_empty(&cc->migratepages)) {
6116 cc->nr_migratepages = 0; 6111 cc->nr_migratepages = 0;
6117 pfn = isolate_migratepages_range(cc->zone, cc, 6112 pfn = isolate_migratepages_range(cc->zone, cc,
6118 pfn, end, true); 6113 pfn, end, true);
6119 if (!pfn) { 6114 if (!pfn) {
6120 ret = -EINTR; 6115 ret = -EINTR;
6121 break; 6116 break;
6122 } 6117 }
6123 tries = 0; 6118 tries = 0;
6124 } else if (++tries == 5) { 6119 } else if (++tries == 5) {
6125 ret = ret < 0 ? ret : -EBUSY; 6120 ret = ret < 0 ? ret : -EBUSY;
6126 break; 6121 break;
6127 } 6122 }
6128 6123
6129 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 6124 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
6130 &cc->migratepages); 6125 &cc->migratepages);
6131 cc->nr_migratepages -= nr_reclaimed; 6126 cc->nr_migratepages -= nr_reclaimed;
6132 6127
6133 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 6128 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
6134 0, MIGRATE_SYNC, MR_CMA); 6129 0, MIGRATE_SYNC, MR_CMA);
6135 } 6130 }
6136 if (ret < 0) { 6131 if (ret < 0) {
6137 putback_movable_pages(&cc->migratepages); 6132 putback_movable_pages(&cc->migratepages);
6138 return ret; 6133 return ret;
6139 } 6134 }
6140 return 0; 6135 return 0;
6141 } 6136 }
6142 6137
6143 /** 6138 /**
6144 * alloc_contig_range() -- tries to allocate given range of pages 6139 * alloc_contig_range() -- tries to allocate given range of pages
6145 * @start: start PFN to allocate 6140 * @start: start PFN to allocate
6146 * @end: one-past-the-last PFN to allocate 6141 * @end: one-past-the-last PFN to allocate
6147 * @migratetype: migratetype of the underlying pageblocks (either 6142 * @migratetype: migratetype of the underlying pageblocks (either
6148 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 6143 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
6149 * in range must have the same migratetype and it must 6144 * in range must have the same migratetype and it must
6150 * be either of the two. 6145 * be either of the two.
6151 * 6146 *
6152 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 6147 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
6153 * aligned, however it's the caller's responsibility to guarantee that 6148 * aligned, however it's the caller's responsibility to guarantee that
6154 * we are the only thread that changes migrate type of pageblocks the 6149 * we are the only thread that changes migrate type of pageblocks the
6155 * pages fall in. 6150 * pages fall in.
6156 * 6151 *
6157 * The PFN range must belong to a single zone. 6152 * The PFN range must belong to a single zone.
6158 * 6153 *
6159 * Returns zero on success or negative error code. On success all 6154 * Returns zero on success or negative error code. On success all
6160 * pages which PFN is in [start, end) are allocated for the caller and 6155 * pages which PFN is in [start, end) are allocated for the caller and
6161 * need to be freed with free_contig_range(). 6156 * need to be freed with free_contig_range().
6162 */ 6157 */
6163 int alloc_contig_range(unsigned long start, unsigned long end, 6158 int alloc_contig_range(unsigned long start, unsigned long end,
6164 unsigned migratetype) 6159 unsigned migratetype)
6165 { 6160 {
6166 unsigned long outer_start, outer_end; 6161 unsigned long outer_start, outer_end;
6167 int ret = 0, order; 6162 int ret = 0, order;
6168 6163
6169 struct compact_control cc = { 6164 struct compact_control cc = {
6170 .nr_migratepages = 0, 6165 .nr_migratepages = 0,
6171 .order = -1, 6166 .order = -1,
6172 .zone = page_zone(pfn_to_page(start)), 6167 .zone = page_zone(pfn_to_page(start)),
6173 .sync = true, 6168 .sync = true,
6174 .ignore_skip_hint = true, 6169 .ignore_skip_hint = true,
6175 }; 6170 };
6176 INIT_LIST_HEAD(&cc.migratepages); 6171 INIT_LIST_HEAD(&cc.migratepages);
6177 6172
6178 /* 6173 /*
6179 * What we do here is we mark all pageblocks in range as 6174 * What we do here is we mark all pageblocks in range as
6180 * MIGRATE_ISOLATE. Because pageblock and max order pages may 6175 * MIGRATE_ISOLATE. Because pageblock and max order pages may
6181 * have different sizes, and due to the way the page allocator 6176 * have different sizes, and due to the way the page allocator
6182 * works, we align the range to the bigger of the two sizes so 6177 * works, we align the range to the bigger of the two sizes so
6183 * that the page allocator won't try to merge buddies from 6178 * that the page allocator won't try to merge buddies from
6184 * different pageblocks and change MIGRATE_ISOLATE to some 6179 * different pageblocks and change MIGRATE_ISOLATE to some
6185 * other migration type. 6180 * other migration type.
6186 * 6181 *
6187 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 6182 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
6188 * migrate the pages from an unaligned range (ie. pages that 6183 * migrate the pages from an unaligned range (ie. pages that
6189 * we are interested in). This will put all the pages in 6184 * we are interested in). This will put all the pages in
6190 * range back to page allocator as MIGRATE_ISOLATE. 6185 * range back to page allocator as MIGRATE_ISOLATE.
6191 * 6186 *
6192 * When this is done, we take the pages in range from page 6187 * When this is done, we take the pages in range from page
6193 * allocator removing them from the buddy system. This way 6188 * allocator removing them from the buddy system. This way
6194 * page allocator will never consider using them. 6189 * page allocator will never consider using them.
6195 * 6190 *
6196 * This lets us mark the pageblocks back as 6191 * This lets us mark the pageblocks back as
6197 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 6192 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
6198 * aligned range but not in the unaligned, original range are 6193 * aligned range but not in the unaligned, original range are
6199 * put back to page allocator so that buddy can use them. 6194 * put back to page allocator so that buddy can use them.
6200 */ 6195 */
6201 6196
6202 ret = start_isolate_page_range(pfn_max_align_down(start), 6197 ret = start_isolate_page_range(pfn_max_align_down(start),
6203 pfn_max_align_up(end), migratetype, 6198 pfn_max_align_up(end), migratetype,
6204 false); 6199 false);
6205 if (ret) 6200 if (ret)
6206 return ret; 6201 return ret;
6207 6202
6208 ret = __alloc_contig_migrate_range(&cc, start, end); 6203 ret = __alloc_contig_migrate_range(&cc, start, end);
6209 if (ret) 6204 if (ret)
6210 goto done; 6205 goto done;
6211 6206
6212 /* 6207 /*
6213 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 6208 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
6214 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 6209 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
6215 * more, all pages in [start, end) are free in page allocator. 6210 * more, all pages in [start, end) are free in page allocator.
6216 * What we are going to do is to allocate all pages from 6211 * What we are going to do is to allocate all pages from
6217 * [start, end) (that is remove them from page allocator). 6212 * [start, end) (that is remove them from page allocator).
6218 * 6213 *
6219 * The only problem is that pages at the beginning and at the 6214 * The only problem is that pages at the beginning and at the
6220 * end of the interesting range may not be aligned with pages that 6215 * end of the interesting range may not be aligned with pages that
6221 * page allocator holds, ie. they can be part of higher order 6216 * page allocator holds, ie. they can be part of higher order
6222 * pages. Because of this, we reserve the bigger range and 6217 * pages. Because of this, we reserve the bigger range and
6223 * once this is done free the pages we are not interested in. 6218 * once this is done free the pages we are not interested in.
6224 * 6219 *
6225 * We don't have to hold zone->lock here because the pages are 6220 * We don't have to hold zone->lock here because the pages are
6226 * isolated thus they won't get removed from buddy. 6221 * isolated thus they won't get removed from buddy.
6227 */ 6222 */
6228 6223
6229 lru_add_drain_all(); 6224 lru_add_drain_all();
6230 drain_all_pages(); 6225 drain_all_pages();
6231 6226
6232 order = 0; 6227 order = 0;
6233 outer_start = start; 6228 outer_start = start;
6234 while (!PageBuddy(pfn_to_page(outer_start))) { 6229 while (!PageBuddy(pfn_to_page(outer_start))) {
6235 if (++order >= MAX_ORDER) { 6230 if (++order >= MAX_ORDER) {
6236 ret = -EBUSY; 6231 ret = -EBUSY;
6237 goto done; 6232 goto done;
6238 } 6233 }
6239 outer_start &= ~0UL << order; 6234 outer_start &= ~0UL << order;
6240 } 6235 }
6241 6236
6242 /* Make sure the range is really isolated. */ 6237 /* Make sure the range is really isolated. */
6243 if (test_pages_isolated(outer_start, end, false)) { 6238 if (test_pages_isolated(outer_start, end, false)) {
6244 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 6239 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
6245 outer_start, end); 6240 outer_start, end);
6246 ret = -EBUSY; 6241 ret = -EBUSY;
6247 goto done; 6242 goto done;
6248 } 6243 }
6249 6244
6250 6245
6251 /* Grab isolated pages from freelists. */ 6246 /* Grab isolated pages from freelists. */
6252 outer_end = isolate_freepages_range(&cc, outer_start, end); 6247 outer_end = isolate_freepages_range(&cc, outer_start, end);
6253 if (!outer_end) { 6248 if (!outer_end) {
6254 ret = -EBUSY; 6249 ret = -EBUSY;
6255 goto done; 6250 goto done;
6256 } 6251 }
6257 6252
6258 /* Free head and tail (if any) */ 6253 /* Free head and tail (if any) */
6259 if (start != outer_start) 6254 if (start != outer_start)
6260 free_contig_range(outer_start, start - outer_start); 6255 free_contig_range(outer_start, start - outer_start);
6261 if (end != outer_end) 6256 if (end != outer_end)
6262 free_contig_range(end, outer_end - end); 6257 free_contig_range(end, outer_end - end);
6263 6258
6264 done: 6259 done:
6265 undo_isolate_page_range(pfn_max_align_down(start), 6260 undo_isolate_page_range(pfn_max_align_down(start),
6266 pfn_max_align_up(end), migratetype); 6261 pfn_max_align_up(end), migratetype);
6267 return ret; 6262 return ret;
6268 } 6263 }
6269 6264
6270 void free_contig_range(unsigned long pfn, unsigned nr_pages) 6265 void free_contig_range(unsigned long pfn, unsigned nr_pages)
6271 { 6266 {
6272 unsigned int count = 0; 6267 unsigned int count = 0;
6273 6268
6274 for (; nr_pages--; pfn++) { 6269 for (; nr_pages--; pfn++) {
6275 struct page *page = pfn_to_page(pfn); 6270 struct page *page = pfn_to_page(pfn);
6276 6271
6277 count += page_count(page) != 1; 6272 count += page_count(page) != 1;
6278 __free_page(page); 6273 __free_page(page);
6279 } 6274 }
6280 WARN(count != 0, "%d pages are still in use!\n", count); 6275 WARN(count != 0, "%d pages are still in use!\n", count);
6281 } 6276 }
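/*
 * Minimal usage sketch of the pair above, with a hypothetical caller and
 * placeholder pfn/count values; a CMA-style user would request the range
 * from MIGRATE_CMA pageblocks and later hand it back:
 */
#if 0	/* illustrative only */
static struct page *grab_contig_pages(unsigned long base_pfn,
				      unsigned long count)
{
	/* Fails with a negative errno if the range cannot be emptied. */
	if (alloc_contig_range(base_pfn, base_pfn + count, MIGRATE_CMA))
		return NULL;
	return pfn_to_page(base_pfn);
}

static void release_contig_pages(struct page *page, unsigned long count)
{
	free_contig_range(page_to_pfn(page), count);
}
#endif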
6282 #endif 6277 #endif
6283 6278
6284 #ifdef CONFIG_MEMORY_HOTPLUG 6279 #ifdef CONFIG_MEMORY_HOTPLUG
6285 /* 6280 /*
6286 * The zone indicated has a new number of managed_pages; batch sizes and percpu 6281 * The zone indicated has a new number of managed_pages; batch sizes and percpu
6287 * page high values need to be recalculated. 6282 * page high values need to be recalculated.
6288 */ 6283 */
6289 void __meminit zone_pcp_update(struct zone *zone) 6284 void __meminit zone_pcp_update(struct zone *zone)
6290 { 6285 {
6291 unsigned cpu; 6286 unsigned cpu;
6292 mutex_lock(&pcp_batch_high_lock); 6287 mutex_lock(&pcp_batch_high_lock);
6293 for_each_possible_cpu(cpu) 6288 for_each_possible_cpu(cpu)
6294 pageset_set_high_and_batch(zone, 6289 pageset_set_high_and_batch(zone,
6295 per_cpu_ptr(zone->pageset, cpu)); 6290 per_cpu_ptr(zone->pageset, cpu));
6296 mutex_unlock(&pcp_batch_high_lock); 6291 mutex_unlock(&pcp_batch_high_lock);
6297 } 6292 }
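/*
 * Calling-pattern sketch (hedged): the memory hotplug path is expected to
 * adjust zone->managed_pages when memory is onlined or offlined and then
 * call zone_pcp_update(zone), so that every CPU's pageset gets a
 * high/batch pair sized for the new zone, serialized by
 * pcp_batch_high_lock.
 */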
6298 #endif 6293 #endif
6299 6294
6300 void zone_pcp_reset(struct zone *zone) 6295 void zone_pcp_reset(struct zone *zone)
6301 { 6296 {
6302 unsigned long flags; 6297 unsigned long flags;
6303 int cpu; 6298 int cpu;
6304 struct per_cpu_pageset *pset; 6299 struct per_cpu_pageset *pset;
6305 6300
6306 /* avoid races with drain_pages() */ 6301 /* avoid races with drain_pages() */
6307 local_irq_save(flags); 6302 local_irq_save(flags);
6308 if (zone->pageset != &boot_pageset) { 6303 if (zone->pageset != &boot_pageset) {
6309 for_each_online_cpu(cpu) { 6304 for_each_online_cpu(cpu) {
6310 pset = per_cpu_ptr(zone->pageset, cpu); 6305 pset = per_cpu_ptr(zone->pageset, cpu);
6311 drain_zonestat(zone, pset); 6306 drain_zonestat(zone, pset);
6312 } 6307 }
6313 free_percpu(zone->pageset); 6308 free_percpu(zone->pageset);
6314 zone->pageset = &boot_pageset; 6309 zone->pageset = &boot_pageset;
6315 } 6310 }
6316 local_irq_restore(flags); 6311 local_irq_restore(flags);
6317 } 6312 }
6318 6313
6319 #ifdef CONFIG_MEMORY_HOTREMOVE 6314 #ifdef CONFIG_MEMORY_HOTREMOVE
6320 /* 6315 /*
6321 * All pages in the range must be isolated before calling this. 6316 * All pages in the range must be isolated before calling this.
6322 */ 6317 */
6323 void 6318 void
6324 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 6319 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6325 { 6320 {
6326 struct page *page; 6321 struct page *page;
6327 struct zone *zone; 6322 struct zone *zone;
6328 int order, i; 6323 int order, i;
6329 unsigned long pfn; 6324 unsigned long pfn;
6330 unsigned long flags; 6325 unsigned long flags;
6331 /* find the first valid pfn */ 6326 /* find the first valid pfn */
6332 for (pfn = start_pfn; pfn < end_pfn; pfn++) 6327 for (pfn = start_pfn; pfn < end_pfn; pfn++)
6333 if (pfn_valid(pfn)) 6328 if (pfn_valid(pfn))
6334 break; 6329 break;
6335 if (pfn == end_pfn) 6330 if (pfn == end_pfn)
6336 return; 6331 return;
6337 zone = page_zone(pfn_to_page(pfn)); 6332 zone = page_zone(pfn_to_page(pfn));
6338 spin_lock_irqsave(&zone->lock, flags); 6333 spin_lock_irqsave(&zone->lock, flags);
6339 pfn = start_pfn; 6334 pfn = start_pfn;
6340 while (pfn < end_pfn) { 6335 while (pfn < end_pfn) {
6341 if (!pfn_valid(pfn)) { 6336 if (!pfn_valid(pfn)) {
6342 pfn++; 6337 pfn++;
6343 continue; 6338 continue;
6344 } 6339 }
6345 page = pfn_to_page(pfn); 6340 page = pfn_to_page(pfn);
6346 /* 6341 /*
6347 * The HWPoisoned page may not be in the buddy system, and 6342 * The HWPoisoned page may not be in the buddy system, and
6348 * page_count() is not 0. 6343 * page_count() is not 0.
6349 */ 6344 */
6350 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 6345 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6351 pfn++; 6346 pfn++;
6352 SetPageReserved(page); 6347 SetPageReserved(page);
6353 continue; 6348 continue;
6354 } 6349 }
6355 6350
6356 BUG_ON(page_count(page)); 6351 BUG_ON(page_count(page));
6357 BUG_ON(!PageBuddy(page)); 6352 BUG_ON(!PageBuddy(page));
6358 order = page_order(page); 6353 order = page_order(page);
6359 #ifdef CONFIG_DEBUG_VM 6354 #ifdef CONFIG_DEBUG_VM
6360 printk(KERN_INFO "remove from free list %lx %d %lx\n", 6355 printk(KERN_INFO "remove from free list %lx %d %lx\n",
6361 pfn, 1 << order, end_pfn); 6356 pfn, 1 << order, end_pfn);
6362 #endif 6357 #endif
6363 list_del(&page->lru); 6358 list_del(&page->lru);
6364 rmv_page_order(page); 6359 rmv_page_order(page);
6365 zone->free_area[order].nr_free--; 6360 zone->free_area[order].nr_free--;
6366 for (i = 0; i < (1 << order); i++) 6361 for (i = 0; i < (1 << order); i++)
6367 SetPageReserved((page+i)); 6362 SetPageReserved((page+i));
6368 pfn += (1 << order); 6363 pfn += (1 << order);
6369 } 6364 }
6370 spin_unlock_irqrestore(&zone->lock, flags); 6365 spin_unlock_irqrestore(&zone->lock, flags);
6371 } 6366 }
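/*
 * Context sketch (hedged): the hot-remove path is expected to isolate the
 * whole range and migrate everything movable away before calling
 * __offline_isolated_pages(), so by this point every remaining page is
 * either a free buddy block, pulled off the free list and marked reserved
 * above, or a HWPoisoned page that was never returned to the buddy
 * allocator and is simply marked reserved and skipped.
 */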
6372 #endif 6367 #endif
6373 6368
6374 #ifdef CONFIG_MEMORY_FAILURE 6369 #ifdef CONFIG_MEMORY_FAILURE
6375 bool is_free_buddy_page(struct page *page) 6370 bool is_free_buddy_page(struct page *page)
6376 { 6371 {
6377 struct zone *zone = page_zone(page); 6372 struct zone *zone = page_zone(page);
6378 unsigned long pfn = page_to_pfn(page); 6373 unsigned long pfn = page_to_pfn(page);
6379 unsigned long flags; 6374 unsigned long flags;
6380 int order; 6375 int order;
6381 6376
6382 spin_lock_irqsave(&zone->lock, flags); 6377 spin_lock_irqsave(&zone->lock, flags);
6383 for (order = 0; order < MAX_ORDER; order++) { 6378 for (order = 0; order < MAX_ORDER; order++) {
6384 struct page *page_head = page - (pfn & ((1 << order) - 1)); 6379 struct page *page_head = page - (pfn & ((1 << order) - 1));
6385 6380
6386 if (PageBuddy(page_head) && page_order(page_head) >= order) 6381 if (PageBuddy(page_head) && page_order(page_head) >= order)
6387 break; 6382 break;
6388 } 6383 }
6389 spin_unlock_irqrestore(&zone->lock, flags); 6384 spin_unlock_irqrestore(&zone->lock, flags);
6390 6385
6391 return order < MAX_ORDER; 6386 return order < MAX_ORDER;
6392 } 6387 }
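/*
 * Worked example with an assumed pfn of 0x12345: the candidate buddy
 * heads probed by the loop are
 *
 *	order 0: 0x12345	order 3: 0x12340
 *	order 1: 0x12344	order 4: 0x12340
 *	order 2: 0x12344	...and so on up to MAX_ORDER - 1
 *
 * If, say, pfn 0x12340 is a PageBuddy head of order >= 3, the page lies
 * inside a free buddy block and the function returns true.
 */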
6393 #endif 6388 #endif
6394 6389
6395 static const struct trace_print_flags pageflag_names[] = { 6390 static const struct trace_print_flags pageflag_names[] = {
6396 {1UL << PG_locked, "locked" }, 6391 {1UL << PG_locked, "locked" },
6397 {1UL << PG_error, "error" }, 6392 {1UL << PG_error, "error" },
6398 {1UL << PG_referenced, "referenced" }, 6393 {1UL << PG_referenced, "referenced" },
6399 {1UL << PG_uptodate, "uptodate" }, 6394 {1UL << PG_uptodate, "uptodate" },
6400 {1UL << PG_dirty, "dirty" }, 6395 {1UL << PG_dirty, "dirty" },
6401 {1UL << PG_lru, "lru" }, 6396 {1UL << PG_lru, "lru" },
6402 {1UL << PG_active, "active" }, 6397 {1UL << PG_active, "active" },
6403 {1UL << PG_slab, "slab" }, 6398 {1UL << PG_slab, "slab" },
6404 {1UL << PG_owner_priv_1, "owner_priv_1" }, 6399 {1UL << PG_owner_priv_1, "owner_priv_1" },
6405 {1UL << PG_arch_1, "arch_1" }, 6400 {1UL << PG_arch_1, "arch_1" },
6406 {1UL << PG_reserved, "reserved" }, 6401 {1UL << PG_reserved, "reserved" },
6407 {1UL << PG_private, "private" }, 6402 {1UL << PG_private, "private" },
6408 {1UL << PG_private_2, "private_2" }, 6403 {1UL << PG_private_2, "private_2" },
6409 {1UL << PG_writeback, "writeback" }, 6404 {1UL << PG_writeback, "writeback" },
6410 #ifdef CONFIG_PAGEFLAGS_EXTENDED 6405 #ifdef CONFIG_PAGEFLAGS_EXTENDED
6411 {1UL << PG_head, "head" }, 6406 {1UL << PG_head, "head" },
6412 {1UL << PG_tail, "tail" }, 6407 {1UL << PG_tail, "tail" },
6413 #else 6408 #else
6414 {1UL << PG_compound, "compound" }, 6409 {1UL << PG_compound, "compound" },
6415 #endif 6410 #endif
6416 {1UL << PG_swapcache, "swapcache" }, 6411 {1UL << PG_swapcache, "swapcache" },
6417 {1UL << PG_mappedtodisk, "mappedtodisk" }, 6412 {1UL << PG_mappedtodisk, "mappedtodisk" },
6418 {1UL << PG_reclaim, "reclaim" }, 6413 {1UL << PG_reclaim, "reclaim" },
6419 {1UL << PG_swapbacked, "swapbacked" }, 6414 {1UL << PG_swapbacked, "swapbacked" },
6420 {1UL << PG_unevictable, "unevictable" }, 6415 {1UL << PG_unevictable, "unevictable" },
6421 #ifdef CONFIG_MMU 6416 #ifdef CONFIG_MMU
6422 {1UL << PG_mlocked, "mlocked" }, 6417 {1UL << PG_mlocked, "mlocked" },
6423 #endif 6418 #endif
6424 #ifdef CONFIG_ARCH_USES_PG_UNCACHED 6419 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
6425 {1UL << PG_uncached, "uncached" }, 6420 {1UL << PG_uncached, "uncached" },
6426 #endif 6421 #endif
6427 #ifdef CONFIG_MEMORY_FAILURE 6422 #ifdef CONFIG_MEMORY_FAILURE
6428 {1UL << PG_hwpoison, "hwpoison" }, 6423 {1UL << PG_hwpoison, "hwpoison" },
6429 #endif 6424 #endif
6430 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6425 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6431 {1UL << PG_compound_lock, "compound_lock" }, 6426 {1UL << PG_compound_lock, "compound_lock" },
6432 #endif 6427 #endif
6433 }; 6428 };
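/*
 * Note: this table is expected to carry one entry per page flag; the
 * BUILD_BUG_ON() in dump_page_flags() below fails the build when the
 * number of entries here stops matching __NR_PAGEFLAGS.
 */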
6434 6429
6435 static void dump_page_flags(unsigned long flags) 6430 static void dump_page_flags(unsigned long flags)
6436 { 6431 {
6437 const char *delim = ""; 6432 const char *delim = "";
6438 unsigned long mask; 6433 unsigned long mask;
6439 int i; 6434 int i;
6440 6435
6441 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 6436 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6442 6437
6443 printk(KERN_ALERT "page flags: %#lx(", flags); 6438 printk(KERN_ALERT "page flags: %#lx(", flags);
6444 6439
6445 /* remove zone id */ 6440 /* remove zone id */
6446 flags &= (1UL << NR_PAGEFLAGS) - 1; 6441 flags &= (1UL << NR_PAGEFLAGS) - 1;
6447 6442
6448 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { 6443 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6449 6444
6450 mask = pageflag_names[i].mask; 6445 mask = pageflag_names[i].mask;
6451 if ((flags & mask) != mask) 6446 if ((flags & mask) != mask)
6452 continue; 6447 continue;
6453 6448
6454 flags &= ~mask; 6449 flags &= ~mask;
6455 printk("%s%s", delim, pageflag_names[i].name); 6450 printk("%s%s", delim, pageflag_names[i].name);
6456 delim = "|"; 6451 delim = "|";
6457 } 6452 }
6458 6453
6459 /* check for left over flags */ 6454 /* check for left over flags */
6460 if (flags) 6455 if (flags)
6461 printk("%s%#lx", delim, flags); 6456 printk("%s%#lx", delim, flags);
6462 6457
6463 printk(")\n"); 6458 printk(")\n");
6464 } 6459 }
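/*
 * Illustrative output (the hex value is arbitrary): for a page that is
 * locked, uptodate and on the LRU this prints roughly
 *
 *	page flags: 0x...(locked|uptodate|lru)
 *
 * with any bits that have no entry in pageflag_names[] appended as a raw
 * hex value before the closing parenthesis.
 */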
6465 6460
6466 void dump_page(struct page *page) 6461 void dump_page(struct page *page)
6467 { 6462 {
6468 printk(KERN_ALERT 6463 printk(KERN_ALERT
6469 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6464 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6470 page, atomic_read(&page->_count), page_mapcount(page), 6465 page, atomic_read(&page->_count), page_mapcount(page),
6471 page->mapping, page->index); 6466 page->mapping, page->index);