Commit f161eedc71da293a9bcfcf3d7f6c1da070a61ef0

Authored by Mel Gorman
Committed by Jiri Slaby
1 parent 3e7379c0f4

mm: page_alloc: lookup pageblock migratetype with IRQs enabled during free

commit cfc47a2803db42140167b92d991ef04018e162c7 upstream.

get_pageblock_migratetype() is called during free with IRQs disabled.
This is unnecessary and disables IRQs for longer than necessary.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>

Showing 1 changed file with 1 addition and 1 deletion

1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/compaction.h> 54 #include <linux/compaction.h>
55 #include <trace/events/kmem.h> 55 #include <trace/events/kmem.h>
56 #include <linux/ftrace_event.h> 56 #include <linux/ftrace_event.h>
57 #include <linux/memcontrol.h> 57 #include <linux/memcontrol.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/mm_inline.h> 59 #include <linux/mm_inline.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/page-debug-flags.h> 61 #include <linux/page-debug-flags.h>
62 #include <linux/hugetlb.h> 62 #include <linux/hugetlb.h>
63 #include <linux/sched/rt.h> 63 #include <linux/sched/rt.h>
64 64
65 #include <asm/sections.h> 65 #include <asm/sections.h>
66 #include <asm/tlbflush.h> 66 #include <asm/tlbflush.h>
67 #include <asm/div64.h> 67 #include <asm/div64.h>
68 #include "internal.h" 68 #include "internal.h"
69 69
70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
71 static DEFINE_MUTEX(pcp_batch_high_lock); 71 static DEFINE_MUTEX(pcp_batch_high_lock);
72 #define MIN_PERCPU_PAGELIST_FRACTION (8) 72 #define MIN_PERCPU_PAGELIST_FRACTION (8)
73 73
74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
75 DEFINE_PER_CPU(int, numa_node); 75 DEFINE_PER_CPU(int, numa_node);
76 EXPORT_PER_CPU_SYMBOL(numa_node); 76 EXPORT_PER_CPU_SYMBOL(numa_node);
77 #endif 77 #endif
78 78
79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
80 /* 80 /*
81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
84 * defined in <linux/topology.h>. 84 * defined in <linux/topology.h>.
85 */ 85 */
86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
87 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 87 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
88 #endif 88 #endif
89 89
90 /* 90 /*
91 * Array of node states. 91 * Array of node states.
92 */ 92 */
93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
94 [N_POSSIBLE] = NODE_MASK_ALL, 94 [N_POSSIBLE] = NODE_MASK_ALL,
95 [N_ONLINE] = { { [0] = 1UL } }, 95 [N_ONLINE] = { { [0] = 1UL } },
96 #ifndef CONFIG_NUMA 96 #ifndef CONFIG_NUMA
97 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 97 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
98 #ifdef CONFIG_HIGHMEM 98 #ifdef CONFIG_HIGHMEM
99 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 99 [N_HIGH_MEMORY] = { { [0] = 1UL } },
100 #endif 100 #endif
101 #ifdef CONFIG_MOVABLE_NODE 101 #ifdef CONFIG_MOVABLE_NODE
102 [N_MEMORY] = { { [0] = 1UL } }, 102 [N_MEMORY] = { { [0] = 1UL } },
103 #endif 103 #endif
104 [N_CPU] = { { [0] = 1UL } }, 104 [N_CPU] = { { [0] = 1UL } },
105 #endif /* NUMA */ 105 #endif /* NUMA */
106 }; 106 };
107 EXPORT_SYMBOL(node_states); 107 EXPORT_SYMBOL(node_states);
108 108
109 /* Protect totalram_pages and zone->managed_pages */ 109 /* Protect totalram_pages and zone->managed_pages */
110 static DEFINE_SPINLOCK(managed_page_count_lock); 110 static DEFINE_SPINLOCK(managed_page_count_lock);
111 111
112 unsigned long totalram_pages __read_mostly; 112 unsigned long totalram_pages __read_mostly;
113 unsigned long totalreserve_pages __read_mostly; 113 unsigned long totalreserve_pages __read_mostly;
114 /* 114 /*
115 * When calculating the number of globally allowed dirty pages, there 115 * When calculating the number of globally allowed dirty pages, there
116 * is a certain number of per-zone reserves that should not be 116 * is a certain number of per-zone reserves that should not be
117 * considered dirtyable memory. This is the sum of those reserves 117 * considered dirtyable memory. This is the sum of those reserves
118 * over all existing zones that contribute dirtyable memory. 118 * over all existing zones that contribute dirtyable memory.
119 */ 119 */
120 unsigned long dirty_balance_reserve __read_mostly; 120 unsigned long dirty_balance_reserve __read_mostly;
121 121
122 int percpu_pagelist_fraction; 122 int percpu_pagelist_fraction;
123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
124 124
125 #ifdef CONFIG_PM_SLEEP 125 #ifdef CONFIG_PM_SLEEP
126 /* 126 /*
127 * The following functions are used by the suspend/hibernate code to temporarily 127 * The following functions are used by the suspend/hibernate code to temporarily
128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
129 * while devices are suspended. To avoid races with the suspend/hibernate code, 129 * while devices are suspended. To avoid races with the suspend/hibernate code,
130 * they should always be called with pm_mutex held (gfp_allowed_mask also should 130 * they should always be called with pm_mutex held (gfp_allowed_mask also should
131 * only be modified with pm_mutex held, unless the suspend/hibernate code is 131 * only be modified with pm_mutex held, unless the suspend/hibernate code is
132 * guaranteed not to run in parallel with that modification). 132 * guaranteed not to run in parallel with that modification).
133 */ 133 */
134 134
135 static gfp_t saved_gfp_mask; 135 static gfp_t saved_gfp_mask;
136 136
137 void pm_restore_gfp_mask(void) 137 void pm_restore_gfp_mask(void)
138 { 138 {
139 WARN_ON(!mutex_is_locked(&pm_mutex)); 139 WARN_ON(!mutex_is_locked(&pm_mutex));
140 if (saved_gfp_mask) { 140 if (saved_gfp_mask) {
141 gfp_allowed_mask = saved_gfp_mask; 141 gfp_allowed_mask = saved_gfp_mask;
142 saved_gfp_mask = 0; 142 saved_gfp_mask = 0;
143 } 143 }
144 } 144 }
145 145
146 void pm_restrict_gfp_mask(void) 146 void pm_restrict_gfp_mask(void)
147 { 147 {
148 WARN_ON(!mutex_is_locked(&pm_mutex)); 148 WARN_ON(!mutex_is_locked(&pm_mutex));
149 WARN_ON(saved_gfp_mask); 149 WARN_ON(saved_gfp_mask);
150 saved_gfp_mask = gfp_allowed_mask; 150 saved_gfp_mask = gfp_allowed_mask;
151 gfp_allowed_mask &= ~GFP_IOFS; 151 gfp_allowed_mask &= ~GFP_IOFS;
152 } 152 }
153 153
154 bool pm_suspended_storage(void) 154 bool pm_suspended_storage(void)
155 { 155 {
156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
157 return false; 157 return false;
158 return true; 158 return true;
159 } 159 }
160 #endif /* CONFIG_PM_SLEEP */ 160 #endif /* CONFIG_PM_SLEEP */
161 161
162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
163 int pageblock_order __read_mostly; 163 int pageblock_order __read_mostly;
164 #endif 164 #endif
165 165
166 static void __free_pages_ok(struct page *page, unsigned int order); 166 static void __free_pages_ok(struct page *page, unsigned int order);
167 167
168 /* 168 /*
169 * results with 256, 32 in the lowmem_reserve sysctl: 169 * results with 256, 32 in the lowmem_reserve sysctl:
170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
171 * 1G machine -> (16M dma, 784M normal, 224M high) 171 * 1G machine -> (16M dma, 784M normal, 224M high)
172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
175 * 175 *
176 * TBD: should special case ZONE_DMA32 machines here - in those we normally 176 * TBD: should special case ZONE_DMA32 machines here - in those we normally
177 * don't need any ZONE_NORMAL reservation 177 * don't need any ZONE_NORMAL reservation
178 */ 178 */
179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
180 #ifdef CONFIG_ZONE_DMA 180 #ifdef CONFIG_ZONE_DMA
181 256, 181 256,
182 #endif 182 #endif
183 #ifdef CONFIG_ZONE_DMA32 183 #ifdef CONFIG_ZONE_DMA32
184 256, 184 256,
185 #endif 185 #endif
186 #ifdef CONFIG_HIGHMEM 186 #ifdef CONFIG_HIGHMEM
187 32, 187 32,
188 #endif 188 #endif
189 32, 189 32,
190 }; 190 };
191 191
192 EXPORT_SYMBOL(totalram_pages); 192 EXPORT_SYMBOL(totalram_pages);
193 193
194 static char * const zone_names[MAX_NR_ZONES] = { 194 static char * const zone_names[MAX_NR_ZONES] = {
195 #ifdef CONFIG_ZONE_DMA 195 #ifdef CONFIG_ZONE_DMA
196 "DMA", 196 "DMA",
197 #endif 197 #endif
198 #ifdef CONFIG_ZONE_DMA32 198 #ifdef CONFIG_ZONE_DMA32
199 "DMA32", 199 "DMA32",
200 #endif 200 #endif
201 "Normal", 201 "Normal",
202 #ifdef CONFIG_HIGHMEM 202 #ifdef CONFIG_HIGHMEM
203 "HighMem", 203 "HighMem",
204 #endif 204 #endif
205 "Movable", 205 "Movable",
206 }; 206 };
207 207
208 int min_free_kbytes = 1024; 208 int min_free_kbytes = 1024;
209 int user_min_free_kbytes; 209 int user_min_free_kbytes;
210 210
211 static unsigned long __meminitdata nr_kernel_pages; 211 static unsigned long __meminitdata nr_kernel_pages;
212 static unsigned long __meminitdata nr_all_pages; 212 static unsigned long __meminitdata nr_all_pages;
213 static unsigned long __meminitdata dma_reserve; 213 static unsigned long __meminitdata dma_reserve;
214 214
215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
218 static unsigned long __initdata required_kernelcore; 218 static unsigned long __initdata required_kernelcore;
219 static unsigned long __initdata required_movablecore; 219 static unsigned long __initdata required_movablecore;
220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
221 221
222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
223 int movable_zone; 223 int movable_zone;
224 EXPORT_SYMBOL(movable_zone); 224 EXPORT_SYMBOL(movable_zone);
225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
226 226
227 #if MAX_NUMNODES > 1 227 #if MAX_NUMNODES > 1
228 int nr_node_ids __read_mostly = MAX_NUMNODES; 228 int nr_node_ids __read_mostly = MAX_NUMNODES;
229 int nr_online_nodes __read_mostly = 1; 229 int nr_online_nodes __read_mostly = 1;
230 EXPORT_SYMBOL(nr_node_ids); 230 EXPORT_SYMBOL(nr_node_ids);
231 EXPORT_SYMBOL(nr_online_nodes); 231 EXPORT_SYMBOL(nr_online_nodes);
232 #endif 232 #endif
233 233
234 int page_group_by_mobility_disabled __read_mostly; 234 int page_group_by_mobility_disabled __read_mostly;
235 235
236 void set_pageblock_migratetype(struct page *page, int migratetype) 236 void set_pageblock_migratetype(struct page *page, int migratetype)
237 { 237 {
238 238
239 if (unlikely(page_group_by_mobility_disabled)) 239 if (unlikely(page_group_by_mobility_disabled))
240 migratetype = MIGRATE_UNMOVABLE; 240 migratetype = MIGRATE_UNMOVABLE;
241 241
242 set_pageblock_flags_group(page, (unsigned long)migratetype, 242 set_pageblock_flags_group(page, (unsigned long)migratetype,
243 PB_migrate, PB_migrate_end); 243 PB_migrate, PB_migrate_end);
244 } 244 }
245 245
246 bool oom_killer_disabled __read_mostly; 246 bool oom_killer_disabled __read_mostly;
247 247
248 #ifdef CONFIG_DEBUG_VM 248 #ifdef CONFIG_DEBUG_VM
249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
250 { 250 {
251 int ret = 0; 251 int ret = 0;
252 unsigned seq; 252 unsigned seq;
253 unsigned long pfn = page_to_pfn(page); 253 unsigned long pfn = page_to_pfn(page);
254 unsigned long sp, start_pfn; 254 unsigned long sp, start_pfn;
255 255
256 do { 256 do {
257 seq = zone_span_seqbegin(zone); 257 seq = zone_span_seqbegin(zone);
258 start_pfn = zone->zone_start_pfn; 258 start_pfn = zone->zone_start_pfn;
259 sp = zone->spanned_pages; 259 sp = zone->spanned_pages;
260 if (!zone_spans_pfn(zone, pfn)) 260 if (!zone_spans_pfn(zone, pfn))
261 ret = 1; 261 ret = 1;
262 } while (zone_span_seqretry(zone, seq)); 262 } while (zone_span_seqretry(zone, seq));
263 263
264 if (ret) 264 if (ret)
265 pr_err("page %lu outside zone [ %lu - %lu ]\n", 265 pr_err("page %lu outside zone [ %lu - %lu ]\n",
266 pfn, start_pfn, start_pfn + sp); 266 pfn, start_pfn, start_pfn + sp);
267 267
268 return ret; 268 return ret;
269 } 269 }
270 270
271 static int page_is_consistent(struct zone *zone, struct page *page) 271 static int page_is_consistent(struct zone *zone, struct page *page)
272 { 272 {
273 if (!pfn_valid_within(page_to_pfn(page))) 273 if (!pfn_valid_within(page_to_pfn(page)))
274 return 0; 274 return 0;
275 if (zone != page_zone(page)) 275 if (zone != page_zone(page))
276 return 0; 276 return 0;
277 277
278 return 1; 278 return 1;
279 } 279 }
280 /* 280 /*
281 * Temporary debugging check for pages not lying within a given zone. 281 * Temporary debugging check for pages not lying within a given zone.
282 */ 282 */
283 static int bad_range(struct zone *zone, struct page *page) 283 static int bad_range(struct zone *zone, struct page *page)
284 { 284 {
285 if (page_outside_zone_boundaries(zone, page)) 285 if (page_outside_zone_boundaries(zone, page))
286 return 1; 286 return 1;
287 if (!page_is_consistent(zone, page)) 287 if (!page_is_consistent(zone, page))
288 return 1; 288 return 1;
289 289
290 return 0; 290 return 0;
291 } 291 }
292 #else 292 #else
293 static inline int bad_range(struct zone *zone, struct page *page) 293 static inline int bad_range(struct zone *zone, struct page *page)
294 { 294 {
295 return 0; 295 return 0;
296 } 296 }
297 #endif 297 #endif
298 298
299 static void bad_page(struct page *page) 299 static void bad_page(struct page *page)
300 { 300 {
301 static unsigned long resume; 301 static unsigned long resume;
302 static unsigned long nr_shown; 302 static unsigned long nr_shown;
303 static unsigned long nr_unshown; 303 static unsigned long nr_unshown;
304 304
305 /* Don't complain about poisoned pages */ 305 /* Don't complain about poisoned pages */
306 if (PageHWPoison(page)) { 306 if (PageHWPoison(page)) {
307 page_mapcount_reset(page); /* remove PageBuddy */ 307 page_mapcount_reset(page); /* remove PageBuddy */
308 return; 308 return;
309 } 309 }
310 310
311 /* 311 /*
312 * Allow a burst of 60 reports, then keep quiet for that minute; 312 * Allow a burst of 60 reports, then keep quiet for that minute;
313 * or allow a steady drip of one report per second. 313 * or allow a steady drip of one report per second.
314 */ 314 */
315 if (nr_shown == 60) { 315 if (nr_shown == 60) {
316 if (time_before(jiffies, resume)) { 316 if (time_before(jiffies, resume)) {
317 nr_unshown++; 317 nr_unshown++;
318 goto out; 318 goto out;
319 } 319 }
320 if (nr_unshown) { 320 if (nr_unshown) {
321 printk(KERN_ALERT 321 printk(KERN_ALERT
322 "BUG: Bad page state: %lu messages suppressed\n", 322 "BUG: Bad page state: %lu messages suppressed\n",
323 nr_unshown); 323 nr_unshown);
324 nr_unshown = 0; 324 nr_unshown = 0;
325 } 325 }
326 nr_shown = 0; 326 nr_shown = 0;
327 } 327 }
328 if (nr_shown++ == 0) 328 if (nr_shown++ == 0)
329 resume = jiffies + 60 * HZ; 329 resume = jiffies + 60 * HZ;
330 330
331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
332 current->comm, page_to_pfn(page)); 332 current->comm, page_to_pfn(page));
333 dump_page(page); 333 dump_page(page);
334 334
335 print_modules(); 335 print_modules();
336 dump_stack(); 336 dump_stack();
337 out: 337 out:
338 /* Leave bad fields for debug, except PageBuddy could make trouble */ 338 /* Leave bad fields for debug, except PageBuddy could make trouble */
339 page_mapcount_reset(page); /* remove PageBuddy */ 339 page_mapcount_reset(page); /* remove PageBuddy */
340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
341 } 341 }
342 342
343 /* 343 /*
344 * Higher-order pages are called "compound pages". They are structured thusly: 344 * Higher-order pages are called "compound pages". They are structured thusly:
345 * 345 *
346 * The first PAGE_SIZE page is called the "head page". 346 * The first PAGE_SIZE page is called the "head page".
347 * 347 *
348 * The remaining PAGE_SIZE pages are called "tail pages". 348 * The remaining PAGE_SIZE pages are called "tail pages".
349 * 349 *
350 * All pages have PG_compound set. All tail pages have their ->first_page 350 * All pages have PG_compound set. All tail pages have their ->first_page
351 * pointing at the head page. 351 * pointing at the head page.
352 * 352 *
353 * The first tail page's ->lru.next holds the address of the compound page's 353 * The first tail page's ->lru.next holds the address of the compound page's
354 * put_page() function. Its ->lru.prev holds the order of allocation. 354 * put_page() function. Its ->lru.prev holds the order of allocation.
355 * This usage means that zero-order pages may not be compound. 355 * This usage means that zero-order pages may not be compound.
356 */ 356 */
357 357
358 static void free_compound_page(struct page *page) 358 static void free_compound_page(struct page *page)
359 { 359 {
360 __free_pages_ok(page, compound_order(page)); 360 __free_pages_ok(page, compound_order(page));
361 } 361 }
362 362
363 void prep_compound_page(struct page *page, unsigned long order) 363 void prep_compound_page(struct page *page, unsigned long order)
364 { 364 {
365 int i; 365 int i;
366 int nr_pages = 1 << order; 366 int nr_pages = 1 << order;
367 367
368 set_compound_page_dtor(page, free_compound_page); 368 set_compound_page_dtor(page, free_compound_page);
369 set_compound_order(page, order); 369 set_compound_order(page, order);
370 __SetPageHead(page); 370 __SetPageHead(page);
371 for (i = 1; i < nr_pages; i++) { 371 for (i = 1; i < nr_pages; i++) {
372 struct page *p = page + i; 372 struct page *p = page + i;
373 set_page_count(p, 0); 373 set_page_count(p, 0);
374 p->first_page = page; 374 p->first_page = page;
375 /* Make sure p->first_page is always valid for PageTail() */ 375 /* Make sure p->first_page is always valid for PageTail() */
376 smp_wmb(); 376 smp_wmb();
377 __SetPageTail(p); 377 __SetPageTail(p);
378 } 378 }
379 } 379 }
380 380
381 /* update __split_huge_page_refcount if you change this function */ 381 /* update __split_huge_page_refcount if you change this function */
382 static int destroy_compound_page(struct page *page, unsigned long order) 382 static int destroy_compound_page(struct page *page, unsigned long order)
383 { 383 {
384 int i; 384 int i;
385 int nr_pages = 1 << order; 385 int nr_pages = 1 << order;
386 int bad = 0; 386 int bad = 0;
387 387
388 if (unlikely(compound_order(page) != order)) { 388 if (unlikely(compound_order(page) != order)) {
389 bad_page(page); 389 bad_page(page);
390 bad++; 390 bad++;
391 } 391 }
392 392
393 __ClearPageHead(page); 393 __ClearPageHead(page);
394 394
395 for (i = 1; i < nr_pages; i++) { 395 for (i = 1; i < nr_pages; i++) {
396 struct page *p = page + i; 396 struct page *p = page + i;
397 397
398 if (unlikely(!PageTail(p) || (p->first_page != page))) { 398 if (unlikely(!PageTail(p) || (p->first_page != page))) {
399 bad_page(page); 399 bad_page(page);
400 bad++; 400 bad++;
401 } 401 }
402 __ClearPageTail(p); 402 __ClearPageTail(p);
403 } 403 }
404 404
405 return bad; 405 return bad;
406 } 406 }
407 407
408 static inline void prep_zero_page(struct page *page, unsigned int order, 408 static inline void prep_zero_page(struct page *page, unsigned int order,
409 gfp_t gfp_flags) 409 gfp_t gfp_flags)
410 { 410 {
411 int i; 411 int i;
412 412
413 /* 413 /*
414 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 414 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
415 * and __GFP_HIGHMEM from hard or soft interrupt context. 415 * and __GFP_HIGHMEM from hard or soft interrupt context.
416 */ 416 */
417 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 417 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
418 for (i = 0; i < (1 << order); i++) 418 for (i = 0; i < (1 << order); i++)
419 clear_highpage(page + i); 419 clear_highpage(page + i);
420 } 420 }
421 421
422 #ifdef CONFIG_DEBUG_PAGEALLOC 422 #ifdef CONFIG_DEBUG_PAGEALLOC
423 unsigned int _debug_guardpage_minorder; 423 unsigned int _debug_guardpage_minorder;
424 424
425 static int __init debug_guardpage_minorder_setup(char *buf) 425 static int __init debug_guardpage_minorder_setup(char *buf)
426 { 426 {
427 unsigned long res; 427 unsigned long res;
428 428
429 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 429 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
430 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 430 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
431 return 0; 431 return 0;
432 } 432 }
433 _debug_guardpage_minorder = res; 433 _debug_guardpage_minorder = res;
434 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 434 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
435 return 0; 435 return 0;
436 } 436 }
437 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 437 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
438 438
439 static inline void set_page_guard_flag(struct page *page) 439 static inline void set_page_guard_flag(struct page *page)
440 { 440 {
441 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 441 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
442 } 442 }
443 443
444 static inline void clear_page_guard_flag(struct page *page) 444 static inline void clear_page_guard_flag(struct page *page)
445 { 445 {
446 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 446 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
447 } 447 }
448 #else 448 #else
449 static inline void set_page_guard_flag(struct page *page) { } 449 static inline void set_page_guard_flag(struct page *page) { }
450 static inline void clear_page_guard_flag(struct page *page) { } 450 static inline void clear_page_guard_flag(struct page *page) { }
451 #endif 451 #endif
452 452
453 static inline void set_page_order(struct page *page, unsigned int order) 453 static inline void set_page_order(struct page *page, unsigned int order)
454 { 454 {
455 set_page_private(page, order); 455 set_page_private(page, order);
456 __SetPageBuddy(page); 456 __SetPageBuddy(page);
457 } 457 }
458 458
459 static inline void rmv_page_order(struct page *page) 459 static inline void rmv_page_order(struct page *page)
460 { 460 {
461 __ClearPageBuddy(page); 461 __ClearPageBuddy(page);
462 set_page_private(page, 0); 462 set_page_private(page, 0);
463 } 463 }
464 464
465 /* 465 /*
466 * Locate the struct page for both the matching buddy in our 466 * Locate the struct page for both the matching buddy in our
467 * pair (buddy1) and the combined O(n+1) page they form (page). 467 * pair (buddy1) and the combined O(n+1) page they form (page).
468 * 468 *
469 * 1) Any buddy B1 will have an order O twin B2 which satisfies 469 * 1) Any buddy B1 will have an order O twin B2 which satisfies
470 * the following equation: 470 * the following equation:
471 * B2 = B1 ^ (1 << O) 471 * B2 = B1 ^ (1 << O)
472 * For example, if the starting buddy (buddy2) is #8 its order 472 * For example, if the starting buddy (buddy2) is #8 its order
473 * 1 buddy is #10: 473 * 1 buddy is #10:
474 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 474 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
475 * 475 *
476 * 2) Any buddy B will have an order O+1 parent P which 476 * 2) Any buddy B will have an order O+1 parent P which
477 * satisfies the following equation: 477 * satisfies the following equation:
478 * P = B & ~(1 << O) 478 * P = B & ~(1 << O)
479 * 479 *
480 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 480 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
481 */ 481 */
482 static inline unsigned long 482 static inline unsigned long
483 __find_buddy_index(unsigned long page_idx, unsigned int order) 483 __find_buddy_index(unsigned long page_idx, unsigned int order)
484 { 484 {
485 return page_idx ^ (1 << order); 485 return page_idx ^ (1 << order);
486 } 486 }
487 487
488 /* 488 /*
489 * This function checks whether a page is free && is the buddy 489 * This function checks whether a page is free && is the buddy
490 * we can do coalesce a page and its buddy if 490 * we can do coalesce a page and its buddy if
491 * (a) the buddy is not in a hole && 491 * (a) the buddy is not in a hole &&
492 * (b) the buddy is in the buddy system && 492 * (b) the buddy is in the buddy system &&
493 * (c) a page and its buddy have the same order && 493 * (c) a page and its buddy have the same order &&
494 * (d) a page and its buddy are in the same zone. 494 * (d) a page and its buddy are in the same zone.
495 * 495 *
496 * For recording whether a page is in the buddy system, we set ->_mapcount 496 * For recording whether a page is in the buddy system, we set ->_mapcount
497 * PAGE_BUDDY_MAPCOUNT_VALUE. 497 * PAGE_BUDDY_MAPCOUNT_VALUE.
498 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is 498 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
499 * serialized by zone->lock. 499 * serialized by zone->lock.
500 * 500 *
501 * For recording page's order, we use page_private(page). 501 * For recording page's order, we use page_private(page).
502 */ 502 */
503 static inline int page_is_buddy(struct page *page, struct page *buddy, 503 static inline int page_is_buddy(struct page *page, struct page *buddy,
504 unsigned int order) 504 unsigned int order)
505 { 505 {
506 if (!pfn_valid_within(page_to_pfn(buddy))) 506 if (!pfn_valid_within(page_to_pfn(buddy)))
507 return 0; 507 return 0;
508 508
509 if (page_is_guard(buddy) && page_order(buddy) == order) { 509 if (page_is_guard(buddy) && page_order(buddy) == order) {
510 VM_BUG_ON(page_count(buddy) != 0); 510 VM_BUG_ON(page_count(buddy) != 0);
511 511
512 if (page_zone_id(page) != page_zone_id(buddy)) 512 if (page_zone_id(page) != page_zone_id(buddy))
513 return 0; 513 return 0;
514 514
515 return 1; 515 return 1;
516 } 516 }
517 517
518 if (PageBuddy(buddy) && page_order(buddy) == order) { 518 if (PageBuddy(buddy) && page_order(buddy) == order) {
519 VM_BUG_ON(page_count(buddy) != 0); 519 VM_BUG_ON(page_count(buddy) != 0);
520 520
521 /* 521 /*
522 * zone check is done late to avoid uselessly 522 * zone check is done late to avoid uselessly
523 * calculating zone/node ids for pages that could 523 * calculating zone/node ids for pages that could
524 * never merge. 524 * never merge.
525 */ 525 */
526 if (page_zone_id(page) != page_zone_id(buddy)) 526 if (page_zone_id(page) != page_zone_id(buddy))
527 return 0; 527 return 0;
528 528
529 return 1; 529 return 1;
530 } 530 }
531 return 0; 531 return 0;
532 } 532 }
533 533
534 /* 534 /*
535 * Freeing function for a buddy system allocator. 535 * Freeing function for a buddy system allocator.
536 * 536 *
537 * The concept of a buddy system is to maintain direct-mapped table 537 * The concept of a buddy system is to maintain direct-mapped table
538 * (containing bit values) for memory blocks of various "orders". 538 * (containing bit values) for memory blocks of various "orders".
539 * The bottom level table contains the map for the smallest allocatable 539 * The bottom level table contains the map for the smallest allocatable
540 * units of memory (here, pages), and each level above it describes 540 * units of memory (here, pages), and each level above it describes
541 * pairs of units from the levels below, hence, "buddies". 541 * pairs of units from the levels below, hence, "buddies".
542 * At a high level, all that happens here is marking the table entry 542 * At a high level, all that happens here is marking the table entry
543 * at the bottom level available, and propagating the changes upward 543 * at the bottom level available, and propagating the changes upward
544 * as necessary, plus some accounting needed to play nicely with other 544 * as necessary, plus some accounting needed to play nicely with other
545 * parts of the VM system. 545 * parts of the VM system.
546 * At each level, we keep a list of pages, which are heads of continuous 546 * At each level, we keep a list of pages, which are heads of continuous
547 * free pages of length of (1 << order) and marked with _mapcount 547 * free pages of length of (1 << order) and marked with _mapcount
548 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) 548 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
549 * field. 549 * field.
550 * So when we are allocating or freeing one, we can derive the state of the 550 * So when we are allocating or freeing one, we can derive the state of the
551 * other. That is, if we allocate a small block, and both were 551 * other. That is, if we allocate a small block, and both were
552 * free, the remainder of the region must be split into blocks. 552 * free, the remainder of the region must be split into blocks.
553 * If a block is freed, and its buddy is also free, then this 553 * If a block is freed, and its buddy is also free, then this
554 * triggers coalescing into a block of larger size. 554 * triggers coalescing into a block of larger size.
555 * 555 *
556 * -- nyc 556 * -- nyc
557 */ 557 */
558 558
559 static inline void __free_one_page(struct page *page, 559 static inline void __free_one_page(struct page *page,
560 unsigned long pfn, 560 unsigned long pfn,
561 struct zone *zone, unsigned int order, 561 struct zone *zone, unsigned int order,
562 int migratetype) 562 int migratetype)
563 { 563 {
564 unsigned long page_idx; 564 unsigned long page_idx;
565 unsigned long combined_idx; 565 unsigned long combined_idx;
566 unsigned long uninitialized_var(buddy_idx); 566 unsigned long uninitialized_var(buddy_idx);
567 struct page *buddy; 567 struct page *buddy;
568 568
569 VM_BUG_ON(!zone_is_initialized(zone)); 569 VM_BUG_ON(!zone_is_initialized(zone));
570 570
571 if (unlikely(PageCompound(page))) 571 if (unlikely(PageCompound(page)))
572 if (unlikely(destroy_compound_page(page, order))) 572 if (unlikely(destroy_compound_page(page, order)))
573 return; 573 return;
574 574
575 VM_BUG_ON(migratetype == -1); 575 VM_BUG_ON(migratetype == -1);
576 576
577 page_idx = pfn & ((1 << MAX_ORDER) - 1); 577 page_idx = pfn & ((1 << MAX_ORDER) - 1);
578 578
579 VM_BUG_ON(page_idx & ((1 << order) - 1)); 579 VM_BUG_ON(page_idx & ((1 << order) - 1));
580 VM_BUG_ON(bad_range(zone, page)); 580 VM_BUG_ON(bad_range(zone, page));
581 581
582 while (order < MAX_ORDER-1) { 582 while (order < MAX_ORDER-1) {
583 buddy_idx = __find_buddy_index(page_idx, order); 583 buddy_idx = __find_buddy_index(page_idx, order);
584 buddy = page + (buddy_idx - page_idx); 584 buddy = page + (buddy_idx - page_idx);
585 if (!page_is_buddy(page, buddy, order)) 585 if (!page_is_buddy(page, buddy, order))
586 break; 586 break;
587 /* 587 /*
588 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 588 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
589 * merge with it and move up one order. 589 * merge with it and move up one order.
590 */ 590 */
591 if (page_is_guard(buddy)) { 591 if (page_is_guard(buddy)) {
592 clear_page_guard_flag(buddy); 592 clear_page_guard_flag(buddy);
593 set_page_private(page, 0); 593 set_page_private(page, 0);
594 __mod_zone_freepage_state(zone, 1 << order, 594 __mod_zone_freepage_state(zone, 1 << order,
595 migratetype); 595 migratetype);
596 } else { 596 } else {
597 list_del(&buddy->lru); 597 list_del(&buddy->lru);
598 zone->free_area[order].nr_free--; 598 zone->free_area[order].nr_free--;
599 rmv_page_order(buddy); 599 rmv_page_order(buddy);
600 } 600 }
601 combined_idx = buddy_idx & page_idx; 601 combined_idx = buddy_idx & page_idx;
602 page = page + (combined_idx - page_idx); 602 page = page + (combined_idx - page_idx);
603 page_idx = combined_idx; 603 page_idx = combined_idx;
604 order++; 604 order++;
605 } 605 }
606 set_page_order(page, order); 606 set_page_order(page, order);
607 607
608 /* 608 /*
609 * If this is not the largest possible page, check if the buddy 609 * If this is not the largest possible page, check if the buddy
610 * of the next-highest order is free. If it is, it's possible 610 * of the next-highest order is free. If it is, it's possible
611 * that pages are being freed that will coalesce soon. In case, 611 * that pages are being freed that will coalesce soon. In case,
612 * that is happening, add the free page to the tail of the list 612 * that is happening, add the free page to the tail of the list
613 * so it's less likely to be used soon and more likely to be merged 613 * so it's less likely to be used soon and more likely to be merged
614 * as a higher order page 614 * as a higher order page
615 */ 615 */
616 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 616 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
617 struct page *higher_page, *higher_buddy; 617 struct page *higher_page, *higher_buddy;
618 combined_idx = buddy_idx & page_idx; 618 combined_idx = buddy_idx & page_idx;
619 higher_page = page + (combined_idx - page_idx); 619 higher_page = page + (combined_idx - page_idx);
620 buddy_idx = __find_buddy_index(combined_idx, order + 1); 620 buddy_idx = __find_buddy_index(combined_idx, order + 1);
621 higher_buddy = higher_page + (buddy_idx - combined_idx); 621 higher_buddy = higher_page + (buddy_idx - combined_idx);
622 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 622 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
623 list_add_tail(&page->lru, 623 list_add_tail(&page->lru,
624 &zone->free_area[order].free_list[migratetype]); 624 &zone->free_area[order].free_list[migratetype]);
625 goto out; 625 goto out;
626 } 626 }
627 } 627 }
628 628
629 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 629 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
630 out: 630 out:
631 zone->free_area[order].nr_free++; 631 zone->free_area[order].nr_free++;
632 } 632 }
633 633
634 static inline int free_pages_check(struct page *page) 634 static inline int free_pages_check(struct page *page)
635 { 635 {
636 if (unlikely(page_mapcount(page) | 636 if (unlikely(page_mapcount(page) |
637 (page->mapping != NULL) | 637 (page->mapping != NULL) |
638 (atomic_read(&page->_count) != 0) | 638 (atomic_read(&page->_count) != 0) |
639 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 639 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
640 (mem_cgroup_bad_page_check(page)))) { 640 (mem_cgroup_bad_page_check(page)))) {
641 bad_page(page); 641 bad_page(page);
642 return 1; 642 return 1;
643 } 643 }
644 page_nid_reset_last(page); 644 page_nid_reset_last(page);
645 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 645 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
646 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 646 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
647 return 0; 647 return 0;
648 } 648 }
649 649
650 /* 650 /*
651 * Frees a number of pages from the PCP lists 651 * Frees a number of pages from the PCP lists
652 * Assumes all pages on list are in same zone, and of same order. 652 * Assumes all pages on list are in same zone, and of same order.
653 * count is the number of pages to free. 653 * count is the number of pages to free.
654 * 654 *
655 * If the zone was previously in an "all pages pinned" state then look to 655 * If the zone was previously in an "all pages pinned" state then look to
656 * see if this freeing clears that state. 656 * see if this freeing clears that state.
657 * 657 *
658 * And clear the zone's pages_scanned counter, to hold off the "all pages are 658 * And clear the zone's pages_scanned counter, to hold off the "all pages are
659 * pinned" detection logic. 659 * pinned" detection logic.
660 */ 660 */
661 static void free_pcppages_bulk(struct zone *zone, int count, 661 static void free_pcppages_bulk(struct zone *zone, int count,
662 struct per_cpu_pages *pcp) 662 struct per_cpu_pages *pcp)
663 { 663 {
664 int migratetype = 0; 664 int migratetype = 0;
665 int batch_free = 0; 665 int batch_free = 0;
666 int to_free = count; 666 int to_free = count;
667 667
668 spin_lock(&zone->lock); 668 spin_lock(&zone->lock);
669 zone->pages_scanned = 0; 669 zone->pages_scanned = 0;
670 670
671 while (to_free) { 671 while (to_free) {
672 struct page *page; 672 struct page *page;
673 struct list_head *list; 673 struct list_head *list;
674 674
675 /* 675 /*
676 * Remove pages from lists in a round-robin fashion. A 676 * Remove pages from lists in a round-robin fashion. A
677 * batch_free count is maintained that is incremented when an 677 * batch_free count is maintained that is incremented when an
678 * empty list is encountered. This is so more pages are freed 678 * empty list is encountered. This is so more pages are freed
679 * off fuller lists instead of spinning excessively around empty 679 * off fuller lists instead of spinning excessively around empty
680 * lists 680 * lists
681 */ 681 */
682 do { 682 do {
683 batch_free++; 683 batch_free++;
684 if (++migratetype == MIGRATE_PCPTYPES) 684 if (++migratetype == MIGRATE_PCPTYPES)
685 migratetype = 0; 685 migratetype = 0;
686 list = &pcp->lists[migratetype]; 686 list = &pcp->lists[migratetype];
687 } while (list_empty(list)); 687 } while (list_empty(list));
688 688
689 /* This is the only non-empty list. Free them all. */ 689 /* This is the only non-empty list. Free them all. */
690 if (batch_free == MIGRATE_PCPTYPES) 690 if (batch_free == MIGRATE_PCPTYPES)
691 batch_free = to_free; 691 batch_free = to_free;
692 692
693 do { 693 do {
694 int mt; /* migratetype of the to-be-freed page */ 694 int mt; /* migratetype of the to-be-freed page */
695 695
696 page = list_entry(list->prev, struct page, lru); 696 page = list_entry(list->prev, struct page, lru);
697 /* must delete as __free_one_page list manipulates */ 697 /* must delete as __free_one_page list manipulates */
698 list_del(&page->lru); 698 list_del(&page->lru);
699 mt = get_freepage_migratetype(page); 699 mt = get_freepage_migratetype(page);
700 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 700 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
701 __free_one_page(page, page_to_pfn(page), zone, 0, mt); 701 __free_one_page(page, page_to_pfn(page), zone, 0, mt);
702 trace_mm_page_pcpu_drain(page, 0, mt); 702 trace_mm_page_pcpu_drain(page, 0, mt);
703 if (likely(!is_migrate_isolate_page(page))) { 703 if (likely(!is_migrate_isolate_page(page))) {
704 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 704 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
705 if (is_migrate_cma(mt)) 705 if (is_migrate_cma(mt))
706 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 706 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
707 } 707 }
708 } while (--to_free && --batch_free && !list_empty(list)); 708 } while (--to_free && --batch_free && !list_empty(list));
709 } 709 }
710 spin_unlock(&zone->lock); 710 spin_unlock(&zone->lock);
711 } 711 }
712 712
713 static void free_one_page(struct zone *zone, 713 static void free_one_page(struct zone *zone,
714 struct page *page, unsigned long pfn, 714 struct page *page, unsigned long pfn,
715 unsigned int order, 715 unsigned int order,
716 int migratetype) 716 int migratetype)
717 { 717 {
718 spin_lock(&zone->lock); 718 spin_lock(&zone->lock);
719 zone->pages_scanned = 0; 719 zone->pages_scanned = 0;
720 720
721 __free_one_page(page, pfn, zone, order, migratetype); 721 __free_one_page(page, pfn, zone, order, migratetype);
722 if (unlikely(!is_migrate_isolate(migratetype))) 722 if (unlikely(!is_migrate_isolate(migratetype)))
723 __mod_zone_freepage_state(zone, 1 << order, migratetype); 723 __mod_zone_freepage_state(zone, 1 << order, migratetype);
724 spin_unlock(&zone->lock); 724 spin_unlock(&zone->lock);
725 } 725 }
726 726
727 static bool free_pages_prepare(struct page *page, unsigned int order) 727 static bool free_pages_prepare(struct page *page, unsigned int order)
728 { 728 {
729 int i; 729 int i;
730 int bad = 0; 730 int bad = 0;
731 731
732 trace_mm_page_free(page, order); 732 trace_mm_page_free(page, order);
733 kmemcheck_free_shadow(page, order); 733 kmemcheck_free_shadow(page, order);
734 734
735 if (PageAnon(page)) 735 if (PageAnon(page))
736 page->mapping = NULL; 736 page->mapping = NULL;
737 for (i = 0; i < (1 << order); i++) 737 for (i = 0; i < (1 << order); i++)
738 bad += free_pages_check(page + i); 738 bad += free_pages_check(page + i);
739 if (bad) 739 if (bad)
740 return false; 740 return false;
741 741
742 if (!PageHighMem(page)) { 742 if (!PageHighMem(page)) {
743 debug_check_no_locks_freed(page_address(page), 743 debug_check_no_locks_freed(page_address(page),
744 PAGE_SIZE << order); 744 PAGE_SIZE << order);
745 debug_check_no_obj_freed(page_address(page), 745 debug_check_no_obj_freed(page_address(page),
746 PAGE_SIZE << order); 746 PAGE_SIZE << order);
747 } 747 }
748 arch_free_page(page, order); 748 arch_free_page(page, order);
749 kernel_map_pages(page, 1 << order, 0); 749 kernel_map_pages(page, 1 << order, 0);
750 750
751 return true; 751 return true;
752 } 752 }
753 753
754 static void __free_pages_ok(struct page *page, unsigned int order) 754 static void __free_pages_ok(struct page *page, unsigned int order)
755 { 755 {
756 unsigned long flags; 756 unsigned long flags;
757 int migratetype; 757 int migratetype;
758 unsigned long pfn = page_to_pfn(page); 758 unsigned long pfn = page_to_pfn(page);
759 759
760 if (!free_pages_prepare(page, order)) 760 if (!free_pages_prepare(page, order))
761 return; 761 return;
762 762
763 migratetype = get_pfnblock_migratetype(page, pfn);
763 local_irq_save(flags); 764 local_irq_save(flags);
764 __count_vm_events(PGFREE, 1 << order); 765 __count_vm_events(PGFREE, 1 << order);
765 migratetype = get_pfnblock_migratetype(page, pfn);
766 set_freepage_migratetype(page, migratetype); 766 set_freepage_migratetype(page, migratetype);
767 free_one_page(page_zone(page), page, pfn, order, migratetype); 767 free_one_page(page_zone(page), page, pfn, order, migratetype);
768 local_irq_restore(flags); 768 local_irq_restore(flags);
769 } 769 }
770 770
771 void __init __free_pages_bootmem(struct page *page, unsigned int order) 771 void __init __free_pages_bootmem(struct page *page, unsigned int order)
772 { 772 {
773 unsigned int nr_pages = 1 << order; 773 unsigned int nr_pages = 1 << order;
774 struct page *p = page; 774 struct page *p = page;
775 unsigned int loop; 775 unsigned int loop;
776 776
777 prefetchw(p); 777 prefetchw(p);
778 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 778 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
779 prefetchw(p + 1); 779 prefetchw(p + 1);
780 __ClearPageReserved(p); 780 __ClearPageReserved(p);
781 set_page_count(p, 0); 781 set_page_count(p, 0);
782 } 782 }
783 __ClearPageReserved(p); 783 __ClearPageReserved(p);
784 set_page_count(p, 0); 784 set_page_count(p, 0);
785 785
786 page_zone(page)->managed_pages += nr_pages; 786 page_zone(page)->managed_pages += nr_pages;
787 set_page_refcounted(page); 787 set_page_refcounted(page);
788 __free_pages(page, order); 788 __free_pages(page, order);
789 } 789 }
790 790
791 #ifdef CONFIG_CMA 791 #ifdef CONFIG_CMA
792 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 792 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
793 void __init init_cma_reserved_pageblock(struct page *page) 793 void __init init_cma_reserved_pageblock(struct page *page)
794 { 794 {
795 unsigned i = pageblock_nr_pages; 795 unsigned i = pageblock_nr_pages;
796 struct page *p = page; 796 struct page *p = page;
797 797
798 do { 798 do {
799 __ClearPageReserved(p); 799 __ClearPageReserved(p);
800 set_page_count(p, 0); 800 set_page_count(p, 0);
801 } while (++p, --i); 801 } while (++p, --i);
802 802
803 set_pageblock_migratetype(page, MIGRATE_CMA); 803 set_pageblock_migratetype(page, MIGRATE_CMA);
804 804
805 if (pageblock_order >= MAX_ORDER) { 805 if (pageblock_order >= MAX_ORDER) {
806 i = pageblock_nr_pages; 806 i = pageblock_nr_pages;
807 p = page; 807 p = page;
808 do { 808 do {
809 set_page_refcounted(p); 809 set_page_refcounted(p);
810 __free_pages(p, MAX_ORDER - 1); 810 __free_pages(p, MAX_ORDER - 1);
811 p += MAX_ORDER_NR_PAGES; 811 p += MAX_ORDER_NR_PAGES;
812 } while (i -= MAX_ORDER_NR_PAGES); 812 } while (i -= MAX_ORDER_NR_PAGES);
813 } else { 813 } else {
814 set_page_refcounted(page); 814 set_page_refcounted(page);
815 __free_pages(page, pageblock_order); 815 __free_pages(page, pageblock_order);
816 } 816 }
817 817
818 adjust_managed_page_count(page, pageblock_nr_pages); 818 adjust_managed_page_count(page, pageblock_nr_pages);
819 } 819 }
820 #endif 820 #endif
821 821
822 /* 822 /*
823 * The order of subdivision here is critical for the IO subsystem. 823 * The order of subdivision here is critical for the IO subsystem.
824 * Please do not alter this order without good reasons and regression 824 * Please do not alter this order without good reasons and regression
825 * testing. Specifically, as large blocks of memory are subdivided, 825 * testing. Specifically, as large blocks of memory are subdivided,
826 * the order in which smaller blocks are delivered depends on the order 826 * the order in which smaller blocks are delivered depends on the order
827 * they're subdivided in this function. This is the primary factor 827 * they're subdivided in this function. This is the primary factor
828 * influencing the order in which pages are delivered to the IO 828 * influencing the order in which pages are delivered to the IO
829 * subsystem according to empirical testing, and this is also justified 829 * subsystem according to empirical testing, and this is also justified
830 * by considering the behavior of a buddy system containing a single 830 * by considering the behavior of a buddy system containing a single
831 * large block of memory acted on by a series of small allocations. 831 * large block of memory acted on by a series of small allocations.
832 * This behavior is a critical factor in sglist merging's success. 832 * This behavior is a critical factor in sglist merging's success.
833 * 833 *
834 * -- nyc 834 * -- nyc
835 */ 835 */
836 static inline void expand(struct zone *zone, struct page *page, 836 static inline void expand(struct zone *zone, struct page *page,
837 int low, int high, struct free_area *area, 837 int low, int high, struct free_area *area,
838 int migratetype) 838 int migratetype)
839 { 839 {
840 unsigned long size = 1 << high; 840 unsigned long size = 1 << high;
841 841
842 while (high > low) { 842 while (high > low) {
843 area--; 843 area--;
844 high--; 844 high--;
845 size >>= 1; 845 size >>= 1;
846 VM_BUG_ON(bad_range(zone, &page[size])); 846 VM_BUG_ON(bad_range(zone, &page[size]));
847 847
848 #ifdef CONFIG_DEBUG_PAGEALLOC 848 #ifdef CONFIG_DEBUG_PAGEALLOC
849 if (high < debug_guardpage_minorder()) { 849 if (high < debug_guardpage_minorder()) {
850 /* 850 /*
851 * Mark as guard pages (or page), that will allow to 851 * Mark as guard pages (or page), that will allow to
852 * merge back to allocator when buddy will be freed. 852 * merge back to allocator when buddy will be freed.
853 * Corresponding page table entries will not be touched, 853 * Corresponding page table entries will not be touched,
854 * pages will stay not present in virtual address space 854 * pages will stay not present in virtual address space
855 */ 855 */
856 INIT_LIST_HEAD(&page[size].lru); 856 INIT_LIST_HEAD(&page[size].lru);
857 set_page_guard_flag(&page[size]); 857 set_page_guard_flag(&page[size]);
858 set_page_private(&page[size], high); 858 set_page_private(&page[size], high);
859 /* Guard pages are not available for any usage */ 859 /* Guard pages are not available for any usage */
860 __mod_zone_freepage_state(zone, -(1 << high), 860 __mod_zone_freepage_state(zone, -(1 << high),
861 migratetype); 861 migratetype);
862 continue; 862 continue;
863 } 863 }
864 #endif 864 #endif
865 list_add(&page[size].lru, &area->free_list[migratetype]); 865 list_add(&page[size].lru, &area->free_list[migratetype]);
866 area->nr_free++; 866 area->nr_free++;
867 set_page_order(&page[size], high); 867 set_page_order(&page[size], high);
868 } 868 }
869 } 869 }
870 870
871 /* 871 /*
872 * This page is about to be returned from the page allocator 872 * This page is about to be returned from the page allocator
873 */ 873 */
874 static inline int check_new_page(struct page *page) 874 static inline int check_new_page(struct page *page)
875 { 875 {
876 if (unlikely(page_mapcount(page) | 876 if (unlikely(page_mapcount(page) |
877 (page->mapping != NULL) | 877 (page->mapping != NULL) |
878 (atomic_read(&page->_count) != 0) | 878 (atomic_read(&page->_count) != 0) |
879 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 879 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
880 (mem_cgroup_bad_page_check(page)))) { 880 (mem_cgroup_bad_page_check(page)))) {
881 bad_page(page); 881 bad_page(page);
882 return 1; 882 return 1;
883 } 883 }
884 return 0; 884 return 0;
885 } 885 }
886 886
887 static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) 887 static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
888 { 888 {
889 int i; 889 int i;
890 890
891 for (i = 0; i < (1 << order); i++) { 891 for (i = 0; i < (1 << order); i++) {
892 struct page *p = page + i; 892 struct page *p = page + i;
893 if (unlikely(check_new_page(p))) 893 if (unlikely(check_new_page(p)))
894 return 1; 894 return 1;
895 } 895 }
896 896
897 set_page_private(page, 0); 897 set_page_private(page, 0);
898 set_page_refcounted(page); 898 set_page_refcounted(page);
899 899
900 arch_alloc_page(page, order); 900 arch_alloc_page(page, order);
901 kernel_map_pages(page, 1 << order, 1); 901 kernel_map_pages(page, 1 << order, 1);
902 902
903 if (gfp_flags & __GFP_ZERO) 903 if (gfp_flags & __GFP_ZERO)
904 prep_zero_page(page, order, gfp_flags); 904 prep_zero_page(page, order, gfp_flags);
905 905
906 if (order && (gfp_flags & __GFP_COMP)) 906 if (order && (gfp_flags & __GFP_COMP))
907 prep_compound_page(page, order); 907 prep_compound_page(page, order);
908 908
909 return 0; 909 return 0;
910 } 910 }
911 911
912 /* 912 /*
913 * Go through the free lists for the given migratetype and remove 913 * Go through the free lists for the given migratetype and remove
914 * the smallest available page from the freelists 914 * the smallest available page from the freelists
915 */ 915 */
916 static inline 916 static inline
917 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 917 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
918 int migratetype) 918 int migratetype)
919 { 919 {
920 unsigned int current_order; 920 unsigned int current_order;
921 struct free_area *area; 921 struct free_area *area;
922 struct page *page; 922 struct page *page;
923 923
924 /* Find a page of the appropriate size in the preferred list */ 924 /* Find a page of the appropriate size in the preferred list */
925 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 925 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
926 area = &(zone->free_area[current_order]); 926 area = &(zone->free_area[current_order]);
927 if (list_empty(&area->free_list[migratetype])) 927 if (list_empty(&area->free_list[migratetype]))
928 continue; 928 continue;
929 929
930 page = list_entry(area->free_list[migratetype].next, 930 page = list_entry(area->free_list[migratetype].next,
931 struct page, lru); 931 struct page, lru);
932 list_del(&page->lru); 932 list_del(&page->lru);
933 rmv_page_order(page); 933 rmv_page_order(page);
934 area->nr_free--; 934 area->nr_free--;
935 expand(zone, page, order, current_order, area, migratetype); 935 expand(zone, page, order, current_order, area, migratetype);
936 set_freepage_migratetype(page, migratetype); 936 set_freepage_migratetype(page, migratetype);
937 return page; 937 return page;
938 } 938 }
939 939
940 return NULL; 940 return NULL;
941 } 941 }
942 942
943 943
944 /* 944 /*
945 * This array describes the order in which lists are fallen back to when 945 * This array describes the order in which lists are fallen back to when
946 * the free lists for the desired migrate type are depleted 946 * the free lists for the desired migrate type are depleted
947 */ 947 */
948 static int fallbacks[MIGRATE_TYPES][4] = { 948 static int fallbacks[MIGRATE_TYPES][4] = {
949 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 949 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
950 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 950 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
951 #ifdef CONFIG_CMA 951 #ifdef CONFIG_CMA
952 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 952 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
953 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 953 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
954 #else 954 #else
955 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 955 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
956 #endif 956 #endif
957 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 957 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
958 #ifdef CONFIG_MEMORY_ISOLATION 958 #ifdef CONFIG_MEMORY_ISOLATION
959 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 959 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
960 #endif 960 #endif
961 }; 961 };
962 962
963 /* 963 /*
964 * Move the free pages in a range to the free lists of the requested type. 964 * Move the free pages in a range to the free lists of the requested type.
965 * Note that start_page and end_page are not aligned on a pageblock 965 * Note that start_page and end_page are not aligned on a pageblock
966 * boundary. If alignment is required, use move_freepages_block() 966 * boundary. If alignment is required, use move_freepages_block()
967 */ 967 */
968 int move_freepages(struct zone *zone, 968 int move_freepages(struct zone *zone,
969 struct page *start_page, struct page *end_page, 969 struct page *start_page, struct page *end_page,
970 int migratetype) 970 int migratetype)
971 { 971 {
972 struct page *page; 972 struct page *page;
973 unsigned long order; 973 unsigned long order;
974 int pages_moved = 0; 974 int pages_moved = 0;
975 975
976 #ifndef CONFIG_HOLES_IN_ZONE 976 #ifndef CONFIG_HOLES_IN_ZONE
977 /* 977 /*
978 * page_zone is not safe to call in this context when 978 * page_zone is not safe to call in this context when
979 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 979 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
980 * anyway as we check zone boundaries in move_freepages_block(). 980 * anyway as we check zone boundaries in move_freepages_block().
981 * Remove at a later date when no bug reports exist related to 981 * Remove at a later date when no bug reports exist related to
982 * grouping pages by mobility 982 * grouping pages by mobility
983 */ 983 */
984 BUG_ON(page_zone(start_page) != page_zone(end_page)); 984 BUG_ON(page_zone(start_page) != page_zone(end_page));
985 #endif 985 #endif
986 986
987 for (page = start_page; page <= end_page;) { 987 for (page = start_page; page <= end_page;) {
988 /* Make sure we are not inadvertently changing nodes */ 988 /* Make sure we are not inadvertently changing nodes */
989 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 989 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
990 990
991 if (!pfn_valid_within(page_to_pfn(page))) { 991 if (!pfn_valid_within(page_to_pfn(page))) {
992 page++; 992 page++;
993 continue; 993 continue;
994 } 994 }
995 995
996 if (!PageBuddy(page)) { 996 if (!PageBuddy(page)) {
997 page++; 997 page++;
998 continue; 998 continue;
999 } 999 }
1000 1000
1001 order = page_order(page); 1001 order = page_order(page);
1002 list_move(&page->lru, 1002 list_move(&page->lru,
1003 &zone->free_area[order].free_list[migratetype]); 1003 &zone->free_area[order].free_list[migratetype]);
1004 set_freepage_migratetype(page, migratetype); 1004 set_freepage_migratetype(page, migratetype);
1005 page += 1 << order; 1005 page += 1 << order;
1006 pages_moved += 1 << order; 1006 pages_moved += 1 << order;
1007 } 1007 }
1008 1008
1009 return pages_moved; 1009 return pages_moved;
1010 } 1010 }
1011 1011
1012 int move_freepages_block(struct zone *zone, struct page *page, 1012 int move_freepages_block(struct zone *zone, struct page *page,
1013 int migratetype) 1013 int migratetype)
1014 { 1014 {
1015 unsigned long start_pfn, end_pfn; 1015 unsigned long start_pfn, end_pfn;
1016 struct page *start_page, *end_page; 1016 struct page *start_page, *end_page;
1017 1017
1018 start_pfn = page_to_pfn(page); 1018 start_pfn = page_to_pfn(page);
1019 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 1019 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
1020 start_page = pfn_to_page(start_pfn); 1020 start_page = pfn_to_page(start_pfn);
1021 end_page = start_page + pageblock_nr_pages - 1; 1021 end_page = start_page + pageblock_nr_pages - 1;
1022 end_pfn = start_pfn + pageblock_nr_pages - 1; 1022 end_pfn = start_pfn + pageblock_nr_pages - 1;
1023 1023
1024 /* Do not cross zone boundaries */ 1024 /* Do not cross zone boundaries */
1025 if (!zone_spans_pfn(zone, start_pfn)) 1025 if (!zone_spans_pfn(zone, start_pfn))
1026 start_page = page; 1026 start_page = page;
1027 if (!zone_spans_pfn(zone, end_pfn)) 1027 if (!zone_spans_pfn(zone, end_pfn))
1028 return 0; 1028 return 0;
1029 1029
1030 return move_freepages(zone, start_page, end_page, migratetype); 1030 return move_freepages(zone, start_page, end_page, migratetype);
1031 } 1031 }
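
As a quick illustration of the pageblock-alignment arithmetic used by move_freepages_block() above, the following standalone sketch (not part of the kernel source; the 512-pages-per-pageblock value is only an assumed example) rounds a pfn down to the start of its pageblock and computes the block's last pfn:

#include <stdio.h>

/* Assumed example value: 512 pages per pageblock (2MiB with 4KiB pages). */
#define PAGEBLOCK_NR_PAGES 512UL

int main(void)
{
	unsigned long pfn = 262919; /* an arbitrary pfn somewhere inside a pageblock */

	/* Same arithmetic as move_freepages_block(): mask off the low bits to
	 * get the first pfn of the pageblock, then add the block size minus
	 * one to get its last pfn. Requires the block size to be a power of two. */
	unsigned long start_pfn = pfn & ~(PAGEBLOCK_NR_PAGES - 1);
	unsigned long end_pfn = start_pfn + PAGEBLOCK_NR_PAGES - 1;

	printf("pfn %lu -> pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
	/* prints: pfn 262919 -> pageblock [262656, 263167] */
	return 0;
}
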
1032 1032
1033 static void change_pageblock_range(struct page *pageblock_page, 1033 static void change_pageblock_range(struct page *pageblock_page,
1034 int start_order, int migratetype) 1034 int start_order, int migratetype)
1035 { 1035 {
1036 int nr_pageblocks = 1 << (start_order - pageblock_order); 1036 int nr_pageblocks = 1 << (start_order - pageblock_order);
1037 1037
1038 while (nr_pageblocks--) { 1038 while (nr_pageblocks--) {
1039 set_pageblock_migratetype(pageblock_page, migratetype); 1039 set_pageblock_migratetype(pageblock_page, migratetype);
1040 pageblock_page += pageblock_nr_pages; 1040 pageblock_page += pageblock_nr_pages;
1041 } 1041 }
1042 } 1042 }
1043 1043
1044 /* 1044 /*
1045 * If breaking a large block of pages, move all free pages to the preferred 1045 * If breaking a large block of pages, move all free pages to the preferred
1046 * allocation list. If falling back for a reclaimable kernel allocation, be 1046 * allocation list. If falling back for a reclaimable kernel allocation, be
1047 * more aggressive about taking ownership of free pages. 1047 * more aggressive about taking ownership of free pages.
1048 * 1048 *
1049 * On the other hand, never change migration type of MIGRATE_CMA pageblocks 1049 * On the other hand, never change migration type of MIGRATE_CMA pageblocks
1050 * nor move CMA pages to different free lists. We don't want unmovable pages 1050 * nor move CMA pages to different free lists. We don't want unmovable pages
1051 * to be allocated from MIGRATE_CMA areas. 1051 * to be allocated from MIGRATE_CMA areas.
1052 * 1052 *
1053 * Returns the new migratetype of the pageblock (or the same old migratetype 1053 * Returns the new migratetype of the pageblock (or the same old migratetype
1054 * if it was unchanged). 1054 * if it was unchanged).
1055 */ 1055 */
1056 static int try_to_steal_freepages(struct zone *zone, struct page *page, 1056 static int try_to_steal_freepages(struct zone *zone, struct page *page,
1057 int start_type, int fallback_type) 1057 int start_type, int fallback_type)
1058 { 1058 {
1059 int current_order = page_order(page); 1059 int current_order = page_order(page);
1060 1060
1061 /* 1061 /*
1062 * When borrowing from MIGRATE_CMA, we need to release the excess 1062 * When borrowing from MIGRATE_CMA, we need to release the excess
1063 * buddy pages to CMA itself. We also ensure the freepage_migratetype 1063 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1064 * is set to CMA so it is returned to the correct freelist in case 1064 * is set to CMA so it is returned to the correct freelist in case
1065 * the page ends up not actually being allocated from the pcp lists. 1065 * the page ends up not actually being allocated from the pcp lists.
1066 */ 1066 */
1067 if (is_migrate_cma(fallback_type)) 1067 if (is_migrate_cma(fallback_type))
1068 return fallback_type; 1068 return fallback_type;
1069 1069
1070 /* Take ownership for orders >= pageblock_order */ 1070 /* Take ownership for orders >= pageblock_order */
1071 if (current_order >= pageblock_order) { 1071 if (current_order >= pageblock_order) {
1072 change_pageblock_range(page, current_order, start_type); 1072 change_pageblock_range(page, current_order, start_type);
1073 return start_type; 1073 return start_type;
1074 } 1074 }
1075 1075
1076 if (current_order >= pageblock_order / 2 || 1076 if (current_order >= pageblock_order / 2 ||
1077 start_type == MIGRATE_RECLAIMABLE || 1077 start_type == MIGRATE_RECLAIMABLE ||
1078 page_group_by_mobility_disabled) { 1078 page_group_by_mobility_disabled) {
1079 int pages; 1079 int pages;
1080 1080
1081 pages = move_freepages_block(zone, page, start_type); 1081 pages = move_freepages_block(zone, page, start_type);
1082 1082
1083 /* Claim the whole block if over half of it is free */ 1083 /* Claim the whole block if over half of it is free */
1084 if (pages >= (1 << (pageblock_order-1)) || 1084 if (pages >= (1 << (pageblock_order-1)) ||
1085 page_group_by_mobility_disabled) { 1085 page_group_by_mobility_disabled) {
1086 1086
1087 set_pageblock_migratetype(page, start_type); 1087 set_pageblock_migratetype(page, start_type);
1088 return start_type; 1088 return start_type;
1089 } 1089 }
1090 1090
1091 } 1091 }
1092 1092
1093 return fallback_type; 1093 return fallback_type;
1094 } 1094 }
1095 1095
1096 /* Remove an element from the buddy allocator from the fallback list */ 1096 /* Remove an element from the buddy allocator from the fallback list */
1097 static inline struct page * 1097 static inline struct page *
1098 __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) 1098 __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1099 { 1099 {
1100 struct free_area *area; 1100 struct free_area *area;
1101 unsigned int current_order; 1101 unsigned int current_order;
1102 struct page *page; 1102 struct page *page;
1103 int migratetype, new_type, i; 1103 int migratetype, new_type, i;
1104 1104
1105 /* Find the largest possible block of pages in the other list */ 1105 /* Find the largest possible block of pages in the other list */
1106 for (current_order = MAX_ORDER-1; 1106 for (current_order = MAX_ORDER-1;
1107 current_order >= order && current_order <= MAX_ORDER-1; 1107 current_order >= order && current_order <= MAX_ORDER-1;
1108 --current_order) { 1108 --current_order) {
1109 for (i = 0;; i++) { 1109 for (i = 0;; i++) {
1110 migratetype = fallbacks[start_migratetype][i]; 1110 migratetype = fallbacks[start_migratetype][i];
1111 1111
1112 /* MIGRATE_RESERVE handled later if necessary */ 1112 /* MIGRATE_RESERVE handled later if necessary */
1113 if (migratetype == MIGRATE_RESERVE) 1113 if (migratetype == MIGRATE_RESERVE)
1114 break; 1114 break;
1115 1115
1116 area = &(zone->free_area[current_order]); 1116 area = &(zone->free_area[current_order]);
1117 if (list_empty(&area->free_list[migratetype])) 1117 if (list_empty(&area->free_list[migratetype]))
1118 continue; 1118 continue;
1119 1119
1120 page = list_entry(area->free_list[migratetype].next, 1120 page = list_entry(area->free_list[migratetype].next,
1121 struct page, lru); 1121 struct page, lru);
1122 area->nr_free--; 1122 area->nr_free--;
1123 1123
1124 new_type = try_to_steal_freepages(zone, page, 1124 new_type = try_to_steal_freepages(zone, page,
1125 start_migratetype, 1125 start_migratetype,
1126 migratetype); 1126 migratetype);
1127 1127
1128 /* Remove the page from the freelists */ 1128 /* Remove the page from the freelists */
1129 list_del(&page->lru); 1129 list_del(&page->lru);
1130 rmv_page_order(page); 1130 rmv_page_order(page);
1131 1131
1132 expand(zone, page, order, current_order, area, 1132 expand(zone, page, order, current_order, area,
1133 new_type); 1133 new_type);
1134 /* The freepage_migratetype may differ from pageblock's 1134 /* The freepage_migratetype may differ from pageblock's
1135 * migratetype depending on the decisions in 1135 * migratetype depending on the decisions in
1136 * try_to_steal_freepages. This is OK as long as it does 1136 * try_to_steal_freepages. This is OK as long as it does
1137 * not differ for MIGRATE_CMA type. 1137 * not differ for MIGRATE_CMA type.
1138 */ 1138 */
1139 set_freepage_migratetype(page, new_type); 1139 set_freepage_migratetype(page, new_type);
1140 1140
1141 trace_mm_page_alloc_extfrag(page, order, current_order, 1141 trace_mm_page_alloc_extfrag(page, order, current_order,
1142 start_migratetype, migratetype, new_type); 1142 start_migratetype, migratetype, new_type);
1143 1143
1144 return page; 1144 return page;
1145 } 1145 }
1146 } 1146 }
1147 1147
1148 return NULL; 1148 return NULL;
1149 } 1149 }
1150 1150
1151 /* 1151 /*
1152 * Do the hard work of removing an element from the buddy allocator. 1152 * Do the hard work of removing an element from the buddy allocator.
1153 * Call me with the zone->lock already held. 1153 * Call me with the zone->lock already held.
1154 */ 1154 */
1155 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1155 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1156 int migratetype) 1156 int migratetype)
1157 { 1157 {
1158 struct page *page; 1158 struct page *page;
1159 1159
1160 retry_reserve: 1160 retry_reserve:
1161 page = __rmqueue_smallest(zone, order, migratetype); 1161 page = __rmqueue_smallest(zone, order, migratetype);
1162 1162
1163 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1163 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1164 page = __rmqueue_fallback(zone, order, migratetype); 1164 page = __rmqueue_fallback(zone, order, migratetype);
1165 1165
1166 /* 1166 /*
1167 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1167 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1168 * is used because __rmqueue_smallest is an inline function 1168 * is used because __rmqueue_smallest is an inline function
1169 * and we want just one call site 1169 * and we want just one call site
1170 */ 1170 */
1171 if (!page) { 1171 if (!page) {
1172 migratetype = MIGRATE_RESERVE; 1172 migratetype = MIGRATE_RESERVE;
1173 goto retry_reserve; 1173 goto retry_reserve;
1174 } 1174 }
1175 } 1175 }
1176 1176
1177 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1177 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1178 return page; 1178 return page;
1179 } 1179 }
1180 1180
1181 /* 1181 /*
1182 * Obtain a specified number of elements from the buddy allocator, all under 1182 * Obtain a specified number of elements from the buddy allocator, all under
1183 * a single hold of the lock, for efficiency. Add them to the supplied list. 1183 * a single hold of the lock, for efficiency. Add them to the supplied list.
1184 * Returns the number of new pages which were placed at *list. 1184 * Returns the number of new pages which were placed at *list.
1185 */ 1185 */
1186 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1186 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1187 unsigned long count, struct list_head *list, 1187 unsigned long count, struct list_head *list,
1188 int migratetype, bool cold) 1188 int migratetype, bool cold)
1189 { 1189 {
1190 int i; 1190 int i;
1191 1191
1192 spin_lock(&zone->lock); 1192 spin_lock(&zone->lock);
1193 for (i = 0; i < count; ++i) { 1193 for (i = 0; i < count; ++i) {
1194 struct page *page = __rmqueue(zone, order, migratetype); 1194 struct page *page = __rmqueue(zone, order, migratetype);
1195 if (unlikely(page == NULL)) 1195 if (unlikely(page == NULL))
1196 break; 1196 break;
1197 1197
1198 /* 1198 /*
1199 * Split buddy pages returned by expand() are received here 1199 * Split buddy pages returned by expand() are received here
1200 * in physical page order. The page is added to the caller's 1200 * in physical page order. The page is added to the caller's
1201 * list and the list head then moves forward. From the caller's 1201 * list and the list head then moves forward. From the caller's
1202 * perspective, the linked list is ordered by page number in 1202 * perspective, the linked list is ordered by page number in
1203 * some conditions. This is useful for IO devices that can 1203 * some conditions. This is useful for IO devices that can
1204 * merge IO requests if the physical pages are ordered 1204 * merge IO requests if the physical pages are ordered
1205 * properly. 1205 * properly.
1206 */ 1206 */
1207 if (likely(!cold)) 1207 if (likely(!cold))
1208 list_add(&page->lru, list); 1208 list_add(&page->lru, list);
1209 else 1209 else
1210 list_add_tail(&page->lru, list); 1210 list_add_tail(&page->lru, list);
1211 list = &page->lru; 1211 list = &page->lru;
1212 if (is_migrate_cma(get_freepage_migratetype(page))) 1212 if (is_migrate_cma(get_freepage_migratetype(page)))
1213 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1213 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1214 -(1 << order)); 1214 -(1 << order));
1215 } 1215 }
1216 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1216 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1217 spin_unlock(&zone->lock); 1217 spin_unlock(&zone->lock);
1218 return i; 1218 return i;
1219 } 1219 }
1220 1220
1221 #ifdef CONFIG_NUMA 1221 #ifdef CONFIG_NUMA
1222 /* 1222 /*
1223 * Called from the vmstat counter updater to drain pagesets of this 1223 * Called from the vmstat counter updater to drain pagesets of this
1224 * currently executing processor on remote nodes after they have 1224 * currently executing processor on remote nodes after they have
1225 * expired. 1225 * expired.
1226 * 1226 *
1227 * Note that this function must be called with the thread pinned to 1227 * Note that this function must be called with the thread pinned to
1228 * a single processor. 1228 * a single processor.
1229 */ 1229 */
1230 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1230 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1231 { 1231 {
1232 unsigned long flags; 1232 unsigned long flags;
1233 int to_drain; 1233 int to_drain;
1234 unsigned long batch; 1234 unsigned long batch;
1235 1235
1236 local_irq_save(flags); 1236 local_irq_save(flags);
1237 batch = ACCESS_ONCE(pcp->batch); 1237 batch = ACCESS_ONCE(pcp->batch);
1238 if (pcp->count >= batch) 1238 if (pcp->count >= batch)
1239 to_drain = batch; 1239 to_drain = batch;
1240 else 1240 else
1241 to_drain = pcp->count; 1241 to_drain = pcp->count;
1242 if (to_drain > 0) { 1242 if (to_drain > 0) {
1243 free_pcppages_bulk(zone, to_drain, pcp); 1243 free_pcppages_bulk(zone, to_drain, pcp);
1244 pcp->count -= to_drain; 1244 pcp->count -= to_drain;
1245 } 1245 }
1246 local_irq_restore(flags); 1246 local_irq_restore(flags);
1247 } 1247 }
1248 #endif 1248 #endif
1249 1249
1250 /* 1250 /*
1251 * Drain pages of the indicated processor. 1251 * Drain pages of the indicated processor.
1252 * 1252 *
1253 * The processor must either be the current processor and the 1253 * The processor must either be the current processor and the
1254 * thread pinned to the current processor or a processor that 1254 * thread pinned to the current processor or a processor that
1255 * is not online. 1255 * is not online.
1256 */ 1256 */
1257 static void drain_pages(unsigned int cpu) 1257 static void drain_pages(unsigned int cpu)
1258 { 1258 {
1259 unsigned long flags; 1259 unsigned long flags;
1260 struct zone *zone; 1260 struct zone *zone;
1261 1261
1262 for_each_populated_zone(zone) { 1262 for_each_populated_zone(zone) {
1263 struct per_cpu_pageset *pset; 1263 struct per_cpu_pageset *pset;
1264 struct per_cpu_pages *pcp; 1264 struct per_cpu_pages *pcp;
1265 1265
1266 local_irq_save(flags); 1266 local_irq_save(flags);
1267 pset = per_cpu_ptr(zone->pageset, cpu); 1267 pset = per_cpu_ptr(zone->pageset, cpu);
1268 1268
1269 pcp = &pset->pcp; 1269 pcp = &pset->pcp;
1270 if (pcp->count) { 1270 if (pcp->count) {
1271 free_pcppages_bulk(zone, pcp->count, pcp); 1271 free_pcppages_bulk(zone, pcp->count, pcp);
1272 pcp->count = 0; 1272 pcp->count = 0;
1273 } 1273 }
1274 local_irq_restore(flags); 1274 local_irq_restore(flags);
1275 } 1275 }
1276 } 1276 }
1277 1277
1278 /* 1278 /*
1279 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1279 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1280 */ 1280 */
1281 void drain_local_pages(void *arg) 1281 void drain_local_pages(void *arg)
1282 { 1282 {
1283 drain_pages(smp_processor_id()); 1283 drain_pages(smp_processor_id());
1284 } 1284 }
1285 1285
1286 /* 1286 /*
1287 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1287 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1288 * 1288 *
1289 * Note that this code is protected against sending an IPI to an offline 1289 * Note that this code is protected against sending an IPI to an offline
1290 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1290 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1291 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1291 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1292 * nothing keeps CPUs from showing up after we populated the cpumask and 1292 * nothing keeps CPUs from showing up after we populated the cpumask and
1293 * before the call to on_each_cpu_mask(). 1293 * before the call to on_each_cpu_mask().
1294 */ 1294 */
1295 void drain_all_pages(void) 1295 void drain_all_pages(void)
1296 { 1296 {
1297 int cpu; 1297 int cpu;
1298 struct per_cpu_pageset *pcp; 1298 struct per_cpu_pageset *pcp;
1299 struct zone *zone; 1299 struct zone *zone;
1300 1300
1301 /* 1301 /*
1302 * Allocate in the BSS so we won't require allocation in 1302 * Allocate in the BSS so we won't require allocation in
1303 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1303 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1304 */ 1304 */
1305 static cpumask_t cpus_with_pcps; 1305 static cpumask_t cpus_with_pcps;
1306 1306
1307 /* 1307 /*
1308 * We don't care about racing with CPU hotplug events 1308 * We don't care about racing with CPU hotplug events
1309 * as offline notification will cause the notified 1309 * as offline notification will cause the notified
1310 * cpu to drain that CPU's pcps and on_each_cpu_mask 1310 * cpu to drain that CPU's pcps and on_each_cpu_mask
1311 * disables preemption as part of its processing 1311 * disables preemption as part of its processing
1312 */ 1312 */
1313 for_each_online_cpu(cpu) { 1313 for_each_online_cpu(cpu) {
1314 bool has_pcps = false; 1314 bool has_pcps = false;
1315 for_each_populated_zone(zone) { 1315 for_each_populated_zone(zone) {
1316 pcp = per_cpu_ptr(zone->pageset, cpu); 1316 pcp = per_cpu_ptr(zone->pageset, cpu);
1317 if (pcp->pcp.count) { 1317 if (pcp->pcp.count) {
1318 has_pcps = true; 1318 has_pcps = true;
1319 break; 1319 break;
1320 } 1320 }
1321 } 1321 }
1322 if (has_pcps) 1322 if (has_pcps)
1323 cpumask_set_cpu(cpu, &cpus_with_pcps); 1323 cpumask_set_cpu(cpu, &cpus_with_pcps);
1324 else 1324 else
1325 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1325 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1326 } 1326 }
1327 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1327 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1328 } 1328 }
1329 1329
1330 #ifdef CONFIG_HIBERNATION 1330 #ifdef CONFIG_HIBERNATION
1331 1331
1332 void mark_free_pages(struct zone *zone) 1332 void mark_free_pages(struct zone *zone)
1333 { 1333 {
1334 unsigned long pfn, max_zone_pfn; 1334 unsigned long pfn, max_zone_pfn;
1335 unsigned long flags; 1335 unsigned long flags;
1336 unsigned int order, t; 1336 unsigned int order, t;
1337 struct list_head *curr; 1337 struct list_head *curr;
1338 1338
1339 if (zone_is_empty(zone)) 1339 if (zone_is_empty(zone))
1340 return; 1340 return;
1341 1341
1342 spin_lock_irqsave(&zone->lock, flags); 1342 spin_lock_irqsave(&zone->lock, flags);
1343 1343
1344 max_zone_pfn = zone_end_pfn(zone); 1344 max_zone_pfn = zone_end_pfn(zone);
1345 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1345 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1346 if (pfn_valid(pfn)) { 1346 if (pfn_valid(pfn)) {
1347 struct page *page = pfn_to_page(pfn); 1347 struct page *page = pfn_to_page(pfn);
1348 1348
1349 if (!swsusp_page_is_forbidden(page)) 1349 if (!swsusp_page_is_forbidden(page))
1350 swsusp_unset_page_free(page); 1350 swsusp_unset_page_free(page);
1351 } 1351 }
1352 1352
1353 for_each_migratetype_order(order, t) { 1353 for_each_migratetype_order(order, t) {
1354 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1354 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1355 unsigned long i; 1355 unsigned long i;
1356 1356
1357 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1357 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1358 for (i = 0; i < (1UL << order); i++) 1358 for (i = 0; i < (1UL << order); i++)
1359 swsusp_set_page_free(pfn_to_page(pfn + i)); 1359 swsusp_set_page_free(pfn_to_page(pfn + i));
1360 } 1360 }
1361 } 1361 }
1362 spin_unlock_irqrestore(&zone->lock, flags); 1362 spin_unlock_irqrestore(&zone->lock, flags);
1363 } 1363 }
1364 #endif /* CONFIG_HIBERNATION */ 1364 #endif /* CONFIG_HIBERNATION */
1365 1365
1366 /* 1366 /*
1367 * Free a 0-order page 1367 * Free a 0-order page
1368 * cold == true ? free a cold page : free a hot page 1368 * cold == true ? free a cold page : free a hot page
1369 */ 1369 */
1370 void free_hot_cold_page(struct page *page, bool cold) 1370 void free_hot_cold_page(struct page *page, bool cold)
1371 { 1371 {
1372 struct zone *zone = page_zone(page); 1372 struct zone *zone = page_zone(page);
1373 struct per_cpu_pages *pcp; 1373 struct per_cpu_pages *pcp;
1374 unsigned long flags; 1374 unsigned long flags;
1375 unsigned long pfn = page_to_pfn(page); 1375 unsigned long pfn = page_to_pfn(page);
1376 int migratetype; 1376 int migratetype;
1377 1377
1378 if (!free_pages_prepare(page, 0)) 1378 if (!free_pages_prepare(page, 0))
1379 return; 1379 return;
1380 1380
1381 migratetype = get_pfnblock_migratetype(page, pfn); 1381 migratetype = get_pfnblock_migratetype(page, pfn);
1382 set_freepage_migratetype(page, migratetype); 1382 set_freepage_migratetype(page, migratetype);
1383 local_irq_save(flags); 1383 local_irq_save(flags);
1384 __count_vm_event(PGFREE); 1384 __count_vm_event(PGFREE);
1385 1385
1386 /* 1386 /*
1387 * We only track unmovable, reclaimable and movable on pcp lists. 1387 * We only track unmovable, reclaimable and movable on pcp lists.
1388 * Free ISOLATE pages back to the allocator because they are being 1388 * Free ISOLATE pages back to the allocator because they are being
1389 * offlined but treat RESERVE as movable pages so we can get those 1389 * offlined but treat RESERVE as movable pages so we can get those
1390 * areas back if necessary. Otherwise, we may have to free 1390 * areas back if necessary. Otherwise, we may have to free
1391 * excessively into the page allocator 1391 * excessively into the page allocator
1392 */ 1392 */
1393 if (migratetype >= MIGRATE_PCPTYPES) { 1393 if (migratetype >= MIGRATE_PCPTYPES) {
1394 if (unlikely(is_migrate_isolate(migratetype))) { 1394 if (unlikely(is_migrate_isolate(migratetype))) {
1395 free_one_page(zone, page, pfn, 0, migratetype); 1395 free_one_page(zone, page, pfn, 0, migratetype);
1396 goto out; 1396 goto out;
1397 } 1397 }
1398 migratetype = MIGRATE_MOVABLE; 1398 migratetype = MIGRATE_MOVABLE;
1399 } 1399 }
1400 1400
1401 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1401 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1402 if (!cold) 1402 if (!cold)
1403 list_add(&page->lru, &pcp->lists[migratetype]); 1403 list_add(&page->lru, &pcp->lists[migratetype]);
1404 else 1404 else
1405 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1405 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1406 pcp->count++; 1406 pcp->count++;
1407 if (pcp->count >= pcp->high) { 1407 if (pcp->count >= pcp->high) {
1408 unsigned long batch = ACCESS_ONCE(pcp->batch); 1408 unsigned long batch = ACCESS_ONCE(pcp->batch);
1409 free_pcppages_bulk(zone, batch, pcp); 1409 free_pcppages_bulk(zone, batch, pcp);
1410 pcp->count -= batch; 1410 pcp->count -= batch;
1411 } 1411 }
1412 1412
1413 out: 1413 out:
1414 local_irq_restore(flags); 1414 local_irq_restore(flags);
1415 } 1415 }
1416 1416
1417 /* 1417 /*
1418 * Free a list of 0-order pages 1418 * Free a list of 0-order pages
1419 */ 1419 */
1420 void free_hot_cold_page_list(struct list_head *list, bool cold) 1420 void free_hot_cold_page_list(struct list_head *list, bool cold)
1421 { 1421 {
1422 struct page *page, *next; 1422 struct page *page, *next;
1423 1423
1424 list_for_each_entry_safe(page, next, list, lru) { 1424 list_for_each_entry_safe(page, next, list, lru) {
1425 trace_mm_page_free_batched(page, cold); 1425 trace_mm_page_free_batched(page, cold);
1426 free_hot_cold_page(page, cold); 1426 free_hot_cold_page(page, cold);
1427 } 1427 }
1428 } 1428 }
1429 1429
1430 /* 1430 /*
1431 * split_page takes a non-compound higher-order page, and splits it into 1431 * split_page takes a non-compound higher-order page, and splits it into
1432 * n (1<<order) sub-pages: page[0..n] 1432 * n (1<<order) sub-pages: page[0..n]
1433 * Each sub-page must be freed individually. 1433 * Each sub-page must be freed individually.
1434 * 1434 *
1435 * Note: this is probably too low level an operation for use in drivers. 1435 * Note: this is probably too low level an operation for use in drivers.
1436 * Please consult with lkml before using this in your driver. 1436 * Please consult with lkml before using this in your driver.
1437 */ 1437 */
1438 void split_page(struct page *page, unsigned int order) 1438 void split_page(struct page *page, unsigned int order)
1439 { 1439 {
1440 int i; 1440 int i;
1441 1441
1442 VM_BUG_ON(PageCompound(page)); 1442 VM_BUG_ON(PageCompound(page));
1443 VM_BUG_ON(!page_count(page)); 1443 VM_BUG_ON(!page_count(page));
1444 1444
1445 #ifdef CONFIG_KMEMCHECK 1445 #ifdef CONFIG_KMEMCHECK
1446 /* 1446 /*
1447 * Split shadow pages too, because free(page[0]) would 1447 * Split shadow pages too, because free(page[0]) would
1448 * otherwise free the whole shadow. 1448 * otherwise free the whole shadow.
1449 */ 1449 */
1450 if (kmemcheck_page_is_tracked(page)) 1450 if (kmemcheck_page_is_tracked(page))
1451 split_page(virt_to_page(page[0].shadow), order); 1451 split_page(virt_to_page(page[0].shadow), order);
1452 #endif 1452 #endif
1453 1453
1454 for (i = 1; i < (1 << order); i++) 1454 for (i = 1; i < (1 << order); i++)
1455 set_page_refcounted(page + i); 1455 set_page_refcounted(page + i);
1456 } 1456 }
1457 EXPORT_SYMBOL_GPL(split_page); 1457 EXPORT_SYMBOL_GPL(split_page);
1458 1458
1459 static int __isolate_free_page(struct page *page, unsigned int order) 1459 static int __isolate_free_page(struct page *page, unsigned int order)
1460 { 1460 {
1461 unsigned long watermark; 1461 unsigned long watermark;
1462 struct zone *zone; 1462 struct zone *zone;
1463 int mt; 1463 int mt;
1464 1464
1465 BUG_ON(!PageBuddy(page)); 1465 BUG_ON(!PageBuddy(page));
1466 1466
1467 zone = page_zone(page); 1467 zone = page_zone(page);
1468 mt = get_pageblock_migratetype(page); 1468 mt = get_pageblock_migratetype(page);
1469 1469
1470 if (!is_migrate_isolate(mt)) { 1470 if (!is_migrate_isolate(mt)) {
1471 /* Obey watermarks as if the page was being allocated */ 1471 /* Obey watermarks as if the page was being allocated */
1472 watermark = low_wmark_pages(zone) + (1 << order); 1472 watermark = low_wmark_pages(zone) + (1 << order);
1473 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1473 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1474 return 0; 1474 return 0;
1475 1475
1476 __mod_zone_freepage_state(zone, -(1UL << order), mt); 1476 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1477 } 1477 }
1478 1478
1479 /* Remove page from free list */ 1479 /* Remove page from free list */
1480 list_del(&page->lru); 1480 list_del(&page->lru);
1481 zone->free_area[order].nr_free--; 1481 zone->free_area[order].nr_free--;
1482 rmv_page_order(page); 1482 rmv_page_order(page);
1483 1483
1484 /* Set the pageblock if the isolated page is at least a pageblock */ 1484 /* Set the pageblock if the isolated page is at least a pageblock */
1485 if (order >= pageblock_order - 1) { 1485 if (order >= pageblock_order - 1) {
1486 struct page *endpage = page + (1 << order) - 1; 1486 struct page *endpage = page + (1 << order) - 1;
1487 for (; page < endpage; page += pageblock_nr_pages) { 1487 for (; page < endpage; page += pageblock_nr_pages) {
1488 int mt = get_pageblock_migratetype(page); 1488 int mt = get_pageblock_migratetype(page);
1489 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) 1489 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
1490 set_pageblock_migratetype(page, 1490 set_pageblock_migratetype(page,
1491 MIGRATE_MOVABLE); 1491 MIGRATE_MOVABLE);
1492 } 1492 }
1493 } 1493 }
1494 1494
1495 return 1UL << order; 1495 return 1UL << order;
1496 } 1496 }
1497 1497
1498 /* 1498 /*
1499 * Similar to split_page except the page is already free. As this is only 1499 * Similar to split_page except the page is already free. As this is only
1500 * being used for migration, the migratetype of the block also changes. 1500 * being used for migration, the migratetype of the block also changes.
1501 * As this is called with interrupts disabled, the caller is responsible 1501 * As this is called with interrupts disabled, the caller is responsible
1502 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1502 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1503 * are enabled. 1503 * are enabled.
1504 * 1504 *
1505 * Note: this is probably too low level an operation for use in drivers. 1505 * Note: this is probably too low level an operation for use in drivers.
1506 * Please consult with lkml before using this in your driver. 1506 * Please consult with lkml before using this in your driver.
1507 */ 1507 */
1508 int split_free_page(struct page *page) 1508 int split_free_page(struct page *page)
1509 { 1509 {
1510 unsigned int order; 1510 unsigned int order;
1511 int nr_pages; 1511 int nr_pages;
1512 1512
1513 order = page_order(page); 1513 order = page_order(page);
1514 1514
1515 nr_pages = __isolate_free_page(page, order); 1515 nr_pages = __isolate_free_page(page, order);
1516 if (!nr_pages) 1516 if (!nr_pages)
1517 return 0; 1517 return 0;
1518 1518
1519 /* Split into individual pages */ 1519 /* Split into individual pages */
1520 set_page_refcounted(page); 1520 set_page_refcounted(page);
1521 split_page(page, order); 1521 split_page(page, order);
1522 return nr_pages; 1522 return nr_pages;
1523 } 1523 }
1524 1524
1525 /* 1525 /*
1526 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1526 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1527 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1527 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1528 * or two. 1528 * or two.
1529 */ 1529 */
1530 static inline 1530 static inline
1531 struct page *buffered_rmqueue(struct zone *preferred_zone, 1531 struct page *buffered_rmqueue(struct zone *preferred_zone,
1532 struct zone *zone, unsigned int order, 1532 struct zone *zone, unsigned int order,
1533 gfp_t gfp_flags, int migratetype) 1533 gfp_t gfp_flags, int migratetype)
1534 { 1534 {
1535 unsigned long flags; 1535 unsigned long flags;
1536 struct page *page; 1536 struct page *page;
1537 bool cold = ((gfp_flags & __GFP_COLD) != 0); 1537 bool cold = ((gfp_flags & __GFP_COLD) != 0);
1538 1538
1539 again: 1539 again:
1540 if (likely(order == 0)) { 1540 if (likely(order == 0)) {
1541 struct per_cpu_pages *pcp; 1541 struct per_cpu_pages *pcp;
1542 struct list_head *list; 1542 struct list_head *list;
1543 1543
1544 local_irq_save(flags); 1544 local_irq_save(flags);
1545 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1545 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1546 list = &pcp->lists[migratetype]; 1546 list = &pcp->lists[migratetype];
1547 if (list_empty(list)) { 1547 if (list_empty(list)) {
1548 pcp->count += rmqueue_bulk(zone, 0, 1548 pcp->count += rmqueue_bulk(zone, 0,
1549 pcp->batch, list, 1549 pcp->batch, list,
1550 migratetype, cold); 1550 migratetype, cold);
1551 if (unlikely(list_empty(list))) 1551 if (unlikely(list_empty(list)))
1552 goto failed; 1552 goto failed;
1553 } 1553 }
1554 1554
1555 if (cold) 1555 if (cold)
1556 page = list_entry(list->prev, struct page, lru); 1556 page = list_entry(list->prev, struct page, lru);
1557 else 1557 else
1558 page = list_entry(list->next, struct page, lru); 1558 page = list_entry(list->next, struct page, lru);
1559 1559
1560 list_del(&page->lru); 1560 list_del(&page->lru);
1561 pcp->count--; 1561 pcp->count--;
1562 } else { 1562 } else {
1563 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1563 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1564 /* 1564 /*
1565 * __GFP_NOFAIL is not to be used in new code. 1565 * __GFP_NOFAIL is not to be used in new code.
1566 * 1566 *
1567 * All __GFP_NOFAIL callers should be fixed so that they 1567 * All __GFP_NOFAIL callers should be fixed so that they
1568 * properly detect and handle allocation failures. 1568 * properly detect and handle allocation failures.
1569 * 1569 *
1570 * We most definitely don't want callers attempting to 1570 * We most definitely don't want callers attempting to
1571 * allocate greater than order-1 page units with 1571 * allocate greater than order-1 page units with
1572 * __GFP_NOFAIL. 1572 * __GFP_NOFAIL.
1573 */ 1573 */
1574 WARN_ON_ONCE(order > 1); 1574 WARN_ON_ONCE(order > 1);
1575 } 1575 }
1576 spin_lock_irqsave(&zone->lock, flags); 1576 spin_lock_irqsave(&zone->lock, flags);
1577 page = __rmqueue(zone, order, migratetype); 1577 page = __rmqueue(zone, order, migratetype);
1578 spin_unlock(&zone->lock); 1578 spin_unlock(&zone->lock);
1579 if (!page) 1579 if (!page)
1580 goto failed; 1580 goto failed;
1581 __mod_zone_freepage_state(zone, -(1 << order), 1581 __mod_zone_freepage_state(zone, -(1 << order),
1582 get_freepage_migratetype(page)); 1582 get_freepage_migratetype(page));
1583 } 1583 }
1584 1584
1585 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1585 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1586 1586
1587 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1587 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1588 zone_statistics(preferred_zone, zone, gfp_flags); 1588 zone_statistics(preferred_zone, zone, gfp_flags);
1589 local_irq_restore(flags); 1589 local_irq_restore(flags);
1590 1590
1591 VM_BUG_ON(bad_range(zone, page)); 1591 VM_BUG_ON(bad_range(zone, page));
1592 if (prep_new_page(page, order, gfp_flags)) 1592 if (prep_new_page(page, order, gfp_flags))
1593 goto again; 1593 goto again;
1594 return page; 1594 return page;
1595 1595
1596 failed: 1596 failed:
1597 local_irq_restore(flags); 1597 local_irq_restore(flags);
1598 return NULL; 1598 return NULL;
1599 } 1599 }
1600 1600
1601 #ifdef CONFIG_FAIL_PAGE_ALLOC 1601 #ifdef CONFIG_FAIL_PAGE_ALLOC
1602 1602
1603 static struct { 1603 static struct {
1604 struct fault_attr attr; 1604 struct fault_attr attr;
1605 1605
1606 u32 ignore_gfp_highmem; 1606 u32 ignore_gfp_highmem;
1607 u32 ignore_gfp_wait; 1607 u32 ignore_gfp_wait;
1608 u32 min_order; 1608 u32 min_order;
1609 } fail_page_alloc = { 1609 } fail_page_alloc = {
1610 .attr = FAULT_ATTR_INITIALIZER, 1610 .attr = FAULT_ATTR_INITIALIZER,
1611 .ignore_gfp_wait = 1, 1611 .ignore_gfp_wait = 1,
1612 .ignore_gfp_highmem = 1, 1612 .ignore_gfp_highmem = 1,
1613 .min_order = 1, 1613 .min_order = 1,
1614 }; 1614 };
1615 1615
1616 static int __init setup_fail_page_alloc(char *str) 1616 static int __init setup_fail_page_alloc(char *str)
1617 { 1617 {
1618 return setup_fault_attr(&fail_page_alloc.attr, str); 1618 return setup_fault_attr(&fail_page_alloc.attr, str);
1619 } 1619 }
1620 __setup("fail_page_alloc=", setup_fail_page_alloc); 1620 __setup("fail_page_alloc=", setup_fail_page_alloc);
1621 1621
1622 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1622 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1623 { 1623 {
1624 if (order < fail_page_alloc.min_order) 1624 if (order < fail_page_alloc.min_order)
1625 return false; 1625 return false;
1626 if (gfp_mask & __GFP_NOFAIL) 1626 if (gfp_mask & __GFP_NOFAIL)
1627 return false; 1627 return false;
1628 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1628 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1629 return false; 1629 return false;
1630 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1630 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1631 return false; 1631 return false;
1632 1632
1633 return should_fail(&fail_page_alloc.attr, 1 << order); 1633 return should_fail(&fail_page_alloc.attr, 1 << order);
1634 } 1634 }
1635 1635
1636 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1636 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1637 1637
1638 static int __init fail_page_alloc_debugfs(void) 1638 static int __init fail_page_alloc_debugfs(void)
1639 { 1639 {
1640 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1640 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1641 struct dentry *dir; 1641 struct dentry *dir;
1642 1642
1643 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1643 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1644 &fail_page_alloc.attr); 1644 &fail_page_alloc.attr);
1645 if (IS_ERR(dir)) 1645 if (IS_ERR(dir))
1646 return PTR_ERR(dir); 1646 return PTR_ERR(dir);
1647 1647
1648 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1648 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1649 &fail_page_alloc.ignore_gfp_wait)) 1649 &fail_page_alloc.ignore_gfp_wait))
1650 goto fail; 1650 goto fail;
1651 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1651 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1652 &fail_page_alloc.ignore_gfp_highmem)) 1652 &fail_page_alloc.ignore_gfp_highmem))
1653 goto fail; 1653 goto fail;
1654 if (!debugfs_create_u32("min-order", mode, dir, 1654 if (!debugfs_create_u32("min-order", mode, dir,
1655 &fail_page_alloc.min_order)) 1655 &fail_page_alloc.min_order))
1656 goto fail; 1656 goto fail;
1657 1657
1658 return 0; 1658 return 0;
1659 fail: 1659 fail:
1660 debugfs_remove_recursive(dir); 1660 debugfs_remove_recursive(dir);
1661 1661
1662 return -ENOMEM; 1662 return -ENOMEM;
1663 } 1663 }
1664 1664
1665 late_initcall(fail_page_alloc_debugfs); 1665 late_initcall(fail_page_alloc_debugfs);
1666 1666
1667 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1667 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1668 1668
1669 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1669 #else /* CONFIG_FAIL_PAGE_ALLOC */
1670 1670
1671 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1671 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1672 { 1672 {
1673 return false; 1673 return false;
1674 } 1674 }
1675 1675
1676 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1676 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1677 1677
1678 /* 1678 /*
1679 * Return true if free pages are above 'mark'. This takes into account the order 1679 * Return true if free pages are above 'mark'. This takes into account the order
1680 * of the allocation. 1680 * of the allocation.
1681 */ 1681 */
1682 static bool __zone_watermark_ok(struct zone *z, unsigned int order, 1682 static bool __zone_watermark_ok(struct zone *z, unsigned int order,
1683 unsigned long mark, int classzone_idx, int alloc_flags, 1683 unsigned long mark, int classzone_idx, int alloc_flags,
1684 long free_pages) 1684 long free_pages)
1685 { 1685 {
1686 /* free_pages may go negative - that's OK */ 1686 /* free_pages may go negative - that's OK */
1687 long min = mark; 1687 long min = mark;
1688 long lowmem_reserve = z->lowmem_reserve[classzone_idx]; 1688 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1689 int o; 1689 int o;
1690 long free_cma = 0; 1690 long free_cma = 0;
1691 1691
1692 free_pages -= (1 << order) - 1; 1692 free_pages -= (1 << order) - 1;
1693 if (alloc_flags & ALLOC_HIGH) 1693 if (alloc_flags & ALLOC_HIGH)
1694 min -= min / 2; 1694 min -= min / 2;
1695 if (alloc_flags & ALLOC_HARDER) 1695 if (alloc_flags & ALLOC_HARDER)
1696 min -= min / 4; 1696 min -= min / 4;
1697 #ifdef CONFIG_CMA 1697 #ifdef CONFIG_CMA
1698 /* If allocation can't use CMA areas don't use free CMA pages */ 1698 /* If allocation can't use CMA areas don't use free CMA pages */
1699 if (!(alloc_flags & ALLOC_CMA)) 1699 if (!(alloc_flags & ALLOC_CMA))
1700 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); 1700 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
1701 #endif 1701 #endif
1702 1702
1703 if (free_pages - free_cma <= min + lowmem_reserve) 1703 if (free_pages - free_cma <= min + lowmem_reserve)
1704 return false; 1704 return false;
1705 for (o = 0; o < order; o++) { 1705 for (o = 0; o < order; o++) {
1706 /* At the next order, this order's pages become unavailable */ 1706 /* At the next order, this order's pages become unavailable */
1707 free_pages -= z->free_area[o].nr_free << o; 1707 free_pages -= z->free_area[o].nr_free << o;
1708 1708
1709 /* Require fewer higher order pages to be free */ 1709 /* Require fewer higher order pages to be free */
1710 min >>= 1; 1710 min >>= 1;
1711 1711
1712 if (free_pages <= min) 1712 if (free_pages <= min)
1713 return false; 1713 return false;
1714 } 1714 }
1715 return true; 1715 return true;
1716 } 1716 }
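
To see what the loop in __zone_watermark_ok() above is doing, here is a simplified standalone sketch (an illustration only: it ignores lowmem_reserve, the ALLOC_HIGH/ALLOC_HARDER adjustments and the CMA accounting, and the per-order free counts and watermark are invented numbers):

#include <stdbool.h>
#include <stdio.h>

#define NR_ORDERS 11 /* mirrors MAX_ORDER in this kernel; illustrative only */

/*
 * Simplified version of the __zone_watermark_ok() loop: an order-N request
 * first needs the total free count (minus the 2^N - 1 pages that cannot be
 * part of the block) to be above the watermark, then, for every order below
 * N, it discards that order's pages (they are too small to help) while
 * halving the required minimum.
 */
static bool watermark_ok(unsigned int order, long mark, const long *nr_free)
{
	long free_pages = 0;
	long min = mark;
	unsigned int o;

	for (o = 0; o < NR_ORDERS; o++)
		free_pages += nr_free[o] << o;

	free_pages -= (1 << order) - 1;
	if (free_pages <= min)
		return false;

	for (o = 0; o < order; o++) {
		free_pages -= nr_free[o] << o;
		min >>= 1;
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	/* Invented per-order free page counts: 200 order-0, 100 order-1, ... */
	long nr_free[NR_ORDERS] = { 200, 100, 40, 10 };

	/* 640 total free pages against a watermark of 128 for an order-2 request */
	printf("order-2 ok: %d\n", watermark_ok(2, 128, nr_free));
	return 0;
}
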
1717 1717
1718 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 1718 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
1719 int classzone_idx, int alloc_flags) 1719 int classzone_idx, int alloc_flags)
1720 { 1720 {
1721 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1721 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1722 zone_page_state(z, NR_FREE_PAGES)); 1722 zone_page_state(z, NR_FREE_PAGES));
1723 } 1723 }
1724 1724
1725 bool zone_watermark_ok_safe(struct zone *z, unsigned int order, 1725 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
1726 unsigned long mark, int classzone_idx, int alloc_flags) 1726 unsigned long mark, int classzone_idx, int alloc_flags)
1727 { 1727 {
1728 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1728 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1729 1729
1730 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1730 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1731 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1731 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1732 1732
1733 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1733 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1734 free_pages); 1734 free_pages);
1735 } 1735 }
1736 1736
1737 #ifdef CONFIG_NUMA 1737 #ifdef CONFIG_NUMA
1738 /* 1738 /*
1739 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1739 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1740 * skip over zones that are not allowed by the cpuset, or that have 1740 * skip over zones that are not allowed by the cpuset, or that have
1741 * been recently (in last second) found to be nearly full. See further 1741 * been recently (in last second) found to be nearly full. See further
1742 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1742 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1743 * that have to skip over a lot of full or unallowed zones. 1743 * that have to skip over a lot of full or unallowed zones.
1744 * 1744 *
1745 * If the zonelist cache is present in the passed in zonelist, then 1745 * If the zonelist cache is present in the passed in zonelist, then
1746 * returns a pointer to the allowed node mask (either the current 1746 * returns a pointer to the allowed node mask (either the current
1747 * task's mems_allowed, or node_states[N_MEMORY].) 1747 * task's mems_allowed, or node_states[N_MEMORY].)
1748 * 1748 *
1749 * If the zonelist cache is not available for this zonelist, does 1749 * If the zonelist cache is not available for this zonelist, does
1750 * nothing and returns NULL. 1750 * nothing and returns NULL.
1751 * 1751 *
1752 * If the fullzones BITMAP in the zonelist cache is stale (more than 1752 * If the fullzones BITMAP in the zonelist cache is stale (more than
1753 * a second since last zap'd) then we zap it out (clear its bits.) 1753 * a second since last zap'd) then we zap it out (clear its bits.)
1754 * 1754 *
1755 * We hold off even calling zlc_setup, until after we've checked the 1755 * We hold off even calling zlc_setup, until after we've checked the
1756 * first zone in the zonelist, on the theory that most allocations will 1756 * first zone in the zonelist, on the theory that most allocations will
1757 * be satisfied from that first zone, so best to examine that zone as 1757 * be satisfied from that first zone, so best to examine that zone as
1758 * quickly as we can. 1758 * quickly as we can.
1759 */ 1759 */
1760 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1760 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1761 { 1761 {
1762 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1762 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1763 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1763 nodemask_t *allowednodes; /* zonelist_cache approximation */
1764 1764
1765 zlc = zonelist->zlcache_ptr; 1765 zlc = zonelist->zlcache_ptr;
1766 if (!zlc) 1766 if (!zlc)
1767 return NULL; 1767 return NULL;
1768 1768
1769 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1769 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1770 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1770 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1771 zlc->last_full_zap = jiffies; 1771 zlc->last_full_zap = jiffies;
1772 } 1772 }
1773 1773
1774 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1774 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1775 &cpuset_current_mems_allowed : 1775 &cpuset_current_mems_allowed :
1776 &node_states[N_MEMORY]; 1776 &node_states[N_MEMORY];
1777 return allowednodes; 1777 return allowednodes;
1778 } 1778 }
1779 1779
1780 /* 1780 /*
1781 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1781 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1782 * if it is worth looking at further for free memory: 1782 * if it is worth looking at further for free memory:
1783 * 1) Check that the zone isn't thought to be full (doesn't have its 1783 * 1) Check that the zone isn't thought to be full (doesn't have its
1784 * bit set in the zonelist_cache fullzones BITMAP). 1784 * bit set in the zonelist_cache fullzones BITMAP).
1785 * 2) Check that the zone's node (obtained from the zonelist_cache 1785 * 2) Check that the zone's node (obtained from the zonelist_cache
1786 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1786 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1787 * Return true (non-zero) if zone is worth looking at further, or 1787 * Return true (non-zero) if zone is worth looking at further, or
1788 * else return false (zero) if it is not. 1788 * else return false (zero) if it is not.
1789 * 1789 *
1790 * This check -ignores- the distinction between various watermarks, 1790 * This check -ignores- the distinction between various watermarks,
1791 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1791 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1792 * found to be full for any variation of these watermarks, it will 1792 * found to be full for any variation of these watermarks, it will
1793 * be considered full for up to one second by all requests, unless 1793 * be considered full for up to one second by all requests, unless
1794 * we are so low on memory on all allowed nodes that we are forced 1794 * we are so low on memory on all allowed nodes that we are forced
1795 * into the second scan of the zonelist. 1795 * into the second scan of the zonelist.
1796 * 1796 *
1797 * In the second scan we ignore this zonelist cache and exactly 1797 * In the second scan we ignore this zonelist cache and exactly
1798 * apply the watermarks to all zones, even if it is slower to do so. 1798 * apply the watermarks to all zones, even if it is slower to do so.
1799 * We are low on memory in the second scan, and should leave no stone 1799 * We are low on memory in the second scan, and should leave no stone
1800 * unturned looking for a free page. 1800 * unturned looking for a free page.
1801 */ 1801 */
1802 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1802 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1803 nodemask_t *allowednodes) 1803 nodemask_t *allowednodes)
1804 { 1804 {
1805 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1805 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1806 int i; /* index of *z in zonelist zones */ 1806 int i; /* index of *z in zonelist zones */
1807 int n; /* node that zone *z is on */ 1807 int n; /* node that zone *z is on */
1808 1808
1809 zlc = zonelist->zlcache_ptr; 1809 zlc = zonelist->zlcache_ptr;
1810 if (!zlc) 1810 if (!zlc)
1811 return 1; 1811 return 1;
1812 1812
1813 i = z - zonelist->_zonerefs; 1813 i = z - zonelist->_zonerefs;
1814 n = zlc->z_to_n[i]; 1814 n = zlc->z_to_n[i];
1815 1815
1816 /* This zone is worth trying if it is allowed but not full */ 1816 /* This zone is worth trying if it is allowed but not full */
1817 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1817 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1818 } 1818 }
1819 1819
1820 /* 1820 /*
1821 * Given 'z' scanning a zonelist, set the corresponding bit in 1821 * Given 'z' scanning a zonelist, set the corresponding bit in
1822 * zlc->fullzones, so that subsequent attempts to allocate a page 1822 * zlc->fullzones, so that subsequent attempts to allocate a page
1823 * from that zone don't waste time re-examining it. 1823 * from that zone don't waste time re-examining it.
1824 */ 1824 */
1825 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1825 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1826 { 1826 {
1827 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1827 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1828 int i; /* index of *z in zonelist zones */ 1828 int i; /* index of *z in zonelist zones */
1829 1829
1830 zlc = zonelist->zlcache_ptr; 1830 zlc = zonelist->zlcache_ptr;
1831 if (!zlc) 1831 if (!zlc)
1832 return; 1832 return;
1833 1833
1834 i = z - zonelist->_zonerefs; 1834 i = z - zonelist->_zonerefs;
1835 1835
1836 set_bit(i, zlc->fullzones); 1836 set_bit(i, zlc->fullzones);
1837 } 1837 }
1838 1838
1839 /* 1839 /*
1840 * clear all zones full, called after direct reclaim makes progress so that 1840 * clear all zones full, called after direct reclaim makes progress so that
1841 * a zone that was recently full is not skipped over for up to a second 1841 * a zone that was recently full is not skipped over for up to a second
1842 */ 1842 */
1843 static void zlc_clear_zones_full(struct zonelist *zonelist) 1843 static void zlc_clear_zones_full(struct zonelist *zonelist)
1844 { 1844 {
1845 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1845 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1846 1846
1847 zlc = zonelist->zlcache_ptr; 1847 zlc = zonelist->zlcache_ptr;
1848 if (!zlc) 1848 if (!zlc)
1849 return; 1849 return;
1850 1850
1851 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1851 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1852 } 1852 }
1853 1853
1854 static bool zone_local(struct zone *local_zone, struct zone *zone) 1854 static bool zone_local(struct zone *local_zone, struct zone *zone)
1855 { 1855 {
1856 return local_zone->node == zone->node; 1856 return local_zone->node == zone->node;
1857 } 1857 }
1858 1858
1859 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1859 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1860 { 1860 {
1861 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1861 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1862 } 1862 }
1863 1863
1864 static void __paginginit init_zone_allows_reclaim(int nid) 1864 static void __paginginit init_zone_allows_reclaim(int nid)
1865 { 1865 {
1866 int i; 1866 int i;
1867 1867
1868 for_each_node_state(i, N_MEMORY) 1868 for_each_node_state(i, N_MEMORY)
1869 if (node_distance(nid, i) <= RECLAIM_DISTANCE) 1869 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1870 node_set(i, NODE_DATA(nid)->reclaim_nodes); 1870 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1871 else 1871 else
1872 zone_reclaim_mode = 1; 1872 zone_reclaim_mode = 1;
1873 } 1873 }
1874 1874
1875 #else /* CONFIG_NUMA */ 1875 #else /* CONFIG_NUMA */
1876 1876
1877 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1877 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1878 { 1878 {
1879 return NULL; 1879 return NULL;
1880 } 1880 }
1881 1881
1882 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1882 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1883 nodemask_t *allowednodes) 1883 nodemask_t *allowednodes)
1884 { 1884 {
1885 return 1; 1885 return 1;
1886 } 1886 }
1887 1887
1888 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1888 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1889 { 1889 {
1890 } 1890 }
1891 1891
1892 static void zlc_clear_zones_full(struct zonelist *zonelist) 1892 static void zlc_clear_zones_full(struct zonelist *zonelist)
1893 { 1893 {
1894 } 1894 }
1895 1895
1896 static bool zone_local(struct zone *local_zone, struct zone *zone) 1896 static bool zone_local(struct zone *local_zone, struct zone *zone)
1897 { 1897 {
1898 return true; 1898 return true;
1899 } 1899 }
1900 1900
1901 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1901 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1902 { 1902 {
1903 return true; 1903 return true;
1904 } 1904 }
1905 1905
1906 static inline void init_zone_allows_reclaim(int nid) 1906 static inline void init_zone_allows_reclaim(int nid)
1907 { 1907 {
1908 } 1908 }
1909 #endif /* CONFIG_NUMA */ 1909 #endif /* CONFIG_NUMA */
1910 1910
1911 /* 1911 /*
1912 * get_page_from_freelist goes through the zonelist trying to allocate 1912 * get_page_from_freelist goes through the zonelist trying to allocate
1913 * a page. 1913 * a page.
1914 */ 1914 */
1915 static struct page * 1915 static struct page *
1916 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1916 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1917 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1917 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1918 struct zone *preferred_zone, int classzone_idx, int migratetype) 1918 struct zone *preferred_zone, int classzone_idx, int migratetype)
1919 { 1919 {
1920 struct zoneref *z; 1920 struct zoneref *z;
1921 struct page *page = NULL; 1921 struct page *page = NULL;
1922 struct zone *zone; 1922 struct zone *zone;
1923 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1923 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1924 int zlc_active = 0; /* set if using zonelist_cache */ 1924 int zlc_active = 0; /* set if using zonelist_cache */
1925 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1925 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1926 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && 1926 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
1927 (gfp_mask & __GFP_WRITE); 1927 (gfp_mask & __GFP_WRITE);
1928 1928
1929 zonelist_scan: 1929 zonelist_scan:
1930 /* 1930 /*
1931 * Scan zonelist, looking for a zone with enough free. 1931 * Scan zonelist, looking for a zone with enough free.
1932 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 1932 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
1933 */ 1933 */
1934 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1934 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1935 high_zoneidx, nodemask) { 1935 high_zoneidx, nodemask) {
1936 unsigned long mark; 1936 unsigned long mark;
1937 1937
1938 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1938 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1939 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1939 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1940 continue; 1940 continue;
1941 if (cpusets_enabled() && 1941 if (cpusets_enabled() &&
1942 (alloc_flags & ALLOC_CPUSET) && 1942 (alloc_flags & ALLOC_CPUSET) &&
1943 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1943 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1944 continue; 1944 continue;
1945 /* 1945 /*
1946 * Distribute pages in proportion to the individual 1946 * Distribute pages in proportion to the individual
1947 * zone size to ensure fair page aging. The zone a 1947 * zone size to ensure fair page aging. The zone a
1948 * page was allocated in should have no effect on the 1948 * page was allocated in should have no effect on the
1949 * time the page has in memory before being reclaimed. 1949 * time the page has in memory before being reclaimed.
1950 */ 1950 */
1951 if (alloc_flags & ALLOC_FAIR) { 1951 if (alloc_flags & ALLOC_FAIR) {
1952 if (!zone_local(preferred_zone, zone)) 1952 if (!zone_local(preferred_zone, zone))
1953 continue; 1953 continue;
1954 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) 1954 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1955 continue; 1955 continue;
1956 } 1956 }
1957 /* 1957 /*
1958 * When allocating a page cache page for writing, we 1958 * When allocating a page cache page for writing, we
1959 * want to get it from a zone that is within its dirty 1959 * want to get it from a zone that is within its dirty
1960 * limit, such that no single zone holds more than its 1960 * limit, such that no single zone holds more than its
1961 * proportional share of globally allowed dirty pages. 1961 * proportional share of globally allowed dirty pages.
1962 * The dirty limits take into account the zone's 1962 * The dirty limits take into account the zone's
1963 * lowmem reserves and high watermark so that kswapd 1963 * lowmem reserves and high watermark so that kswapd
1964 * should be able to balance it without having to 1964 * should be able to balance it without having to
1965 * write pages from its LRU list. 1965 * write pages from its LRU list.
1966 * 1966 *
1967 * This may look like it could increase pressure on 1967 * This may look like it could increase pressure on
1968 * lower zones by failing allocations in higher zones 1968 * lower zones by failing allocations in higher zones
1969 * before they are full. But the pages that do spill 1969 * before they are full. But the pages that do spill
1970 * over are limited as the lower zones are protected 1970 * over are limited as the lower zones are protected
1971 * by this very same mechanism. It should not become 1971 * by this very same mechanism. It should not become
1972 * a practical burden to them. 1972 * a practical burden to them.
1973 * 1973 *
1974 * XXX: For now, allow allocations to potentially 1974 * XXX: For now, allow allocations to potentially
1975 * exceed the per-zone dirty limit in the slowpath 1975 * exceed the per-zone dirty limit in the slowpath
1976 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1976 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1977 * which is important when on a NUMA setup the allowed 1977 * which is important when on a NUMA setup the allowed
1978 * zones are together not big enough to reach the 1978 * zones are together not big enough to reach the
1979 * global limit. The proper fix for these situations 1979 * global limit. The proper fix for these situations
1980 * will require awareness of zones in the 1980 * will require awareness of zones in the
1981 * dirty-throttling and the flusher threads. 1981 * dirty-throttling and the flusher threads.
1982 */ 1982 */
1983 if (consider_zone_dirty && !zone_dirty_ok(zone)) 1983 if (consider_zone_dirty && !zone_dirty_ok(zone))
1984 continue; 1984 continue;
1985 1985
1986 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1986 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1987 if (!zone_watermark_ok(zone, order, mark, 1987 if (!zone_watermark_ok(zone, order, mark,
1988 classzone_idx, alloc_flags)) { 1988 classzone_idx, alloc_flags)) {
1989 int ret; 1989 int ret;
1990 1990
1991 /* Checked here to keep the fast path fast */ 1991 /* Checked here to keep the fast path fast */
1992 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1992 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1993 if (alloc_flags & ALLOC_NO_WATERMARKS) 1993 if (alloc_flags & ALLOC_NO_WATERMARKS)
1994 goto try_this_zone; 1994 goto try_this_zone;
1995 1995
1996 if (IS_ENABLED(CONFIG_NUMA) && 1996 if (IS_ENABLED(CONFIG_NUMA) &&
1997 !did_zlc_setup && nr_online_nodes > 1) { 1997 !did_zlc_setup && nr_online_nodes > 1) {
1998 /* 1998 /*
1999 * we do zlc_setup if there are multiple nodes 1999 * we do zlc_setup if there are multiple nodes
2000 * and before considering the first zone allowed 2000 * and before considering the first zone allowed
2001 * by the cpuset. 2001 * by the cpuset.
2002 */ 2002 */
2003 allowednodes = zlc_setup(zonelist, alloc_flags); 2003 allowednodes = zlc_setup(zonelist, alloc_flags);
2004 zlc_active = 1; 2004 zlc_active = 1;
2005 did_zlc_setup = 1; 2005 did_zlc_setup = 1;
2006 } 2006 }
2007 2007
2008 if (zone_reclaim_mode == 0 || 2008 if (zone_reclaim_mode == 0 ||
2009 !zone_allows_reclaim(preferred_zone, zone)) 2009 !zone_allows_reclaim(preferred_zone, zone))
2010 goto this_zone_full; 2010 goto this_zone_full;
2011 2011
2012 /* 2012 /*
2013 * As we may have just activated ZLC, check if the first 2013 * As we may have just activated ZLC, check if the first
2014 * eligible zone has failed zone_reclaim recently. 2014 * eligible zone has failed zone_reclaim recently.
2015 */ 2015 */
2016 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 2016 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
2017 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 2017 !zlc_zone_worth_trying(zonelist, z, allowednodes))
2018 continue; 2018 continue;
2019 2019
2020 ret = zone_reclaim(zone, gfp_mask, order); 2020 ret = zone_reclaim(zone, gfp_mask, order);
2021 switch (ret) { 2021 switch (ret) {
2022 case ZONE_RECLAIM_NOSCAN: 2022 case ZONE_RECLAIM_NOSCAN:
2023 /* did not scan */ 2023 /* did not scan */
2024 continue; 2024 continue;
2025 case ZONE_RECLAIM_FULL: 2025 case ZONE_RECLAIM_FULL:
2026 /* scanned but unreclaimable */ 2026 /* scanned but unreclaimable */
2027 continue; 2027 continue;
2028 default: 2028 default:
2029 /* did we reclaim enough */ 2029 /* did we reclaim enough */
2030 if (zone_watermark_ok(zone, order, mark, 2030 if (zone_watermark_ok(zone, order, mark,
2031 classzone_idx, alloc_flags)) 2031 classzone_idx, alloc_flags))
2032 goto try_this_zone; 2032 goto try_this_zone;
2033 2033
2034 /* 2034 /*
2035 * Failed to reclaim enough to meet watermark. 2035 * Failed to reclaim enough to meet watermark.
2036 * Only mark the zone full if checking the min 2036 * Only mark the zone full if checking the min
2037 * watermark or if we failed to reclaim just 2037 * watermark or if we failed to reclaim just
2038 * 1<<order pages or else the page allocator 2038 * 1<<order pages or else the page allocator
2039 * fastpath will prematurely mark zones full 2039 * fastpath will prematurely mark zones full
2040 * when the watermark is between the low and 2040 * when the watermark is between the low and
2041 * min watermarks. 2041 * min watermarks.
2042 */ 2042 */
2043 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || 2043 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
2044 ret == ZONE_RECLAIM_SOME) 2044 ret == ZONE_RECLAIM_SOME)
2045 goto this_zone_full; 2045 goto this_zone_full;
2046 2046
2047 continue; 2047 continue;
2048 } 2048 }
2049 } 2049 }
2050 2050
2051 try_this_zone: 2051 try_this_zone:
2052 page = buffered_rmqueue(preferred_zone, zone, order, 2052 page = buffered_rmqueue(preferred_zone, zone, order,
2053 gfp_mask, migratetype); 2053 gfp_mask, migratetype);
2054 if (page) 2054 if (page)
2055 break; 2055 break;
2056 this_zone_full: 2056 this_zone_full:
2057 if (IS_ENABLED(CONFIG_NUMA) && zlc_active) 2057 if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
2058 zlc_mark_zone_full(zonelist, z); 2058 zlc_mark_zone_full(zonelist, z);
2059 } 2059 }
2060 2060
2061 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { 2061 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
2062 /* Disable zlc cache for second zonelist scan */ 2062 /* Disable zlc cache for second zonelist scan */
2063 zlc_active = 0; 2063 zlc_active = 0;
2064 goto zonelist_scan; 2064 goto zonelist_scan;
2065 } 2065 }
2066 2066
2067 if (page) 2067 if (page)
2068 /* 2068 /*
2069 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was 2069 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2070 * necessary to allocate the page. The expectation is 2070 * necessary to allocate the page. The expectation is
2071 * that the caller is taking steps that will free more 2071 * that the caller is taking steps that will free more
2072 * memory. The caller should avoid the page being used 2072 * memory. The caller should avoid the page being used
2073 * for !PFMEMALLOC purposes. 2073 * for !PFMEMALLOC purposes.
2074 */ 2074 */
2075 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 2075 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2076 2076
2077 return page; 2077 return page;
2078 } 2078 }
2079 2079
2080 /* 2080 /*
2081 * Large machines with many possible nodes should not always dump per-node 2081 * Large machines with many possible nodes should not always dump per-node
2082 * meminfo in irq context. 2082 * meminfo in irq context.
2083 */ 2083 */
2084 static inline bool should_suppress_show_mem(void) 2084 static inline bool should_suppress_show_mem(void)
2085 { 2085 {
2086 bool ret = false; 2086 bool ret = false;
2087 2087
2088 #if NODES_SHIFT > 8 2088 #if NODES_SHIFT > 8
2089 ret = in_interrupt(); 2089 ret = in_interrupt();
2090 #endif 2090 #endif
2091 return ret; 2091 return ret;
2092 } 2092 }
2093 2093
2094 static DEFINE_RATELIMIT_STATE(nopage_rs, 2094 static DEFINE_RATELIMIT_STATE(nopage_rs,
2095 DEFAULT_RATELIMIT_INTERVAL, 2095 DEFAULT_RATELIMIT_INTERVAL,
2096 DEFAULT_RATELIMIT_BURST); 2096 DEFAULT_RATELIMIT_BURST);
2097 2097
2098 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 2098 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2099 { 2099 {
2100 unsigned int filter = SHOW_MEM_FILTER_NODES; 2100 unsigned int filter = SHOW_MEM_FILTER_NODES;
2101 2101
2102 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 2102 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
2103 debug_guardpage_minorder() > 0) 2103 debug_guardpage_minorder() > 0)
2104 return; 2104 return;
2105 2105
2106 /* 2106 /*
2107 * Walking all memory to count page types is very expensive and should 2107 * Walking all memory to count page types is very expensive and should
2108 * be inhibited in non-blockable contexts. 2108 * be inhibited in non-blockable contexts.
2109 */ 2109 */
2110 if (!(gfp_mask & __GFP_WAIT)) 2110 if (!(gfp_mask & __GFP_WAIT))
2111 filter |= SHOW_MEM_FILTER_PAGE_COUNT; 2111 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2112 2112
2113 /* 2113 /*
2114 * This documents exceptions given to allocations in certain 2114 * This documents exceptions given to allocations in certain
2115 * contexts that are allowed to allocate outside current's set 2115 * contexts that are allowed to allocate outside current's set
2116 * of allowed nodes. 2116 * of allowed nodes.
2117 */ 2117 */
2118 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2118 if (!(gfp_mask & __GFP_NOMEMALLOC))
2119 if (test_thread_flag(TIF_MEMDIE) || 2119 if (test_thread_flag(TIF_MEMDIE) ||
2120 (current->flags & (PF_MEMALLOC | PF_EXITING))) 2120 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2121 filter &= ~SHOW_MEM_FILTER_NODES; 2121 filter &= ~SHOW_MEM_FILTER_NODES;
2122 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 2122 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
2123 filter &= ~SHOW_MEM_FILTER_NODES; 2123 filter &= ~SHOW_MEM_FILTER_NODES;
2124 2124
2125 if (fmt) { 2125 if (fmt) {
2126 struct va_format vaf; 2126 struct va_format vaf;
2127 va_list args; 2127 va_list args;
2128 2128
2129 va_start(args, fmt); 2129 va_start(args, fmt);
2130 2130
2131 vaf.fmt = fmt; 2131 vaf.fmt = fmt;
2132 vaf.va = &args; 2132 vaf.va = &args;
2133 2133
2134 pr_warn("%pV", &vaf); 2134 pr_warn("%pV", &vaf);
2135 2135
2136 va_end(args); 2136 va_end(args);
2137 } 2137 }
2138 2138
2139 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 2139 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
2140 current->comm, order, gfp_mask); 2140 current->comm, order, gfp_mask);
2141 2141
2142 dump_stack(); 2142 dump_stack();
2143 if (!should_suppress_show_mem()) 2143 if (!should_suppress_show_mem())
2144 show_mem(filter); 2144 show_mem(filter);
2145 } 2145 }
2146 2146
2147 static inline int 2147 static inline int
2148 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 2148 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2149 unsigned long did_some_progress, 2149 unsigned long did_some_progress,
2150 unsigned long pages_reclaimed) 2150 unsigned long pages_reclaimed)
2151 { 2151 {
2152 /* Do not loop if specifically requested */ 2152 /* Do not loop if specifically requested */
2153 if (gfp_mask & __GFP_NORETRY) 2153 if (gfp_mask & __GFP_NORETRY)
2154 return 0; 2154 return 0;
2155 2155
2156 /* Always retry if specifically requested */ 2156 /* Always retry if specifically requested */
2157 if (gfp_mask & __GFP_NOFAIL) 2157 if (gfp_mask & __GFP_NOFAIL)
2158 return 1; 2158 return 1;
2159 2159
2160 /* 2160 /*
2161 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 2161 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2162 * making forward progress without invoking OOM. Suspend also disables 2162 * making forward progress without invoking OOM. Suspend also disables
2163 * storage devices so kswapd will not help. Bail if we are suspending. 2163 * storage devices so kswapd will not help. Bail if we are suspending.
2164 */ 2164 */
2165 if (!did_some_progress && pm_suspended_storage()) 2165 if (!did_some_progress && pm_suspended_storage())
2166 return 0; 2166 return 0;
2167 2167
2168 /* 2168 /*
2169 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 2169 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2170 * means __GFP_NOFAIL, but that may not be true in other 2170 * means __GFP_NOFAIL, but that may not be true in other
2171 * implementations. 2171 * implementations.
2172 */ 2172 */
2173 if (order <= PAGE_ALLOC_COSTLY_ORDER) 2173 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2174 return 1; 2174 return 1;
2175 2175
2176 /* 2176 /*
2177 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 2177 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2178 * specified, then we retry until we no longer reclaim any pages 2178 * specified, then we retry until we no longer reclaim any pages
2179 * (above), or we've reclaimed an order of pages at least as 2179 * (above), or we've reclaimed an order of pages at least as
2180 * large as the allocation's order. In both cases, if the 2180 * large as the allocation's order. In both cases, if the
2181 * allocation still fails, we stop retrying. 2181 * allocation still fails, we stop retrying.
2182 */ 2182 */
2183 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2183 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2184 return 1; 2184 return 1;
2185 2185
2186 return 0; 2186 return 0;
2187 } 2187 }
2188 2188
2189 static inline struct page * 2189 static inline struct page *
2190 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2190 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2191 struct zonelist *zonelist, enum zone_type high_zoneidx, 2191 struct zonelist *zonelist, enum zone_type high_zoneidx,
2192 nodemask_t *nodemask, struct zone *preferred_zone, 2192 nodemask_t *nodemask, struct zone *preferred_zone,
2193 int classzone_idx, int migratetype) 2193 int classzone_idx, int migratetype)
2194 { 2194 {
2195 struct page *page; 2195 struct page *page;
2196 2196
2197 /* Acquire the OOM killer lock for the zones in zonelist */ 2197 /* Acquire the OOM killer lock for the zones in zonelist */
2198 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2198 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2199 schedule_timeout_uninterruptible(1); 2199 schedule_timeout_uninterruptible(1);
2200 return NULL; 2200 return NULL;
2201 } 2201 }
2202 2202
2203 /* 2203 /*
2204 * Go through the zonelist yet one more time, keep very high watermark 2204 * Go through the zonelist yet one more time, keep very high watermark
2205 * here, this is only to catch a parallel oom killing, we must fail if 2205 * here, this is only to catch a parallel oom killing, we must fail if
2206 * we're still under heavy pressure. 2206 * we're still under heavy pressure.
2207 */ 2207 */
2208 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2208 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2209 order, zonelist, high_zoneidx, 2209 order, zonelist, high_zoneidx,
2210 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2210 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2211 preferred_zone, classzone_idx, migratetype); 2211 preferred_zone, classzone_idx, migratetype);
2212 if (page) 2212 if (page)
2213 goto out; 2213 goto out;
2214 2214
2215 if (!(gfp_mask & __GFP_NOFAIL)) { 2215 if (!(gfp_mask & __GFP_NOFAIL)) {
2216 /* The OOM killer will not help higher order allocs */ 2216 /* The OOM killer will not help higher order allocs */
2217 if (order > PAGE_ALLOC_COSTLY_ORDER) 2217 if (order > PAGE_ALLOC_COSTLY_ORDER)
2218 goto out; 2218 goto out;
2219 /* The OOM killer does not needlessly kill tasks for lowmem */ 2219 /* The OOM killer does not needlessly kill tasks for lowmem */
2220 if (high_zoneidx < ZONE_NORMAL) 2220 if (high_zoneidx < ZONE_NORMAL)
2221 goto out; 2221 goto out;
2222 /* 2222 /*
2223 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2223 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2224 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2224 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2225 * The caller should handle page allocation failure by itself if 2225 * The caller should handle page allocation failure by itself if
2226 * it specifies __GFP_THISNODE. 2226 * it specifies __GFP_THISNODE.
2227 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2227 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2228 */ 2228 */
2229 if (gfp_mask & __GFP_THISNODE) 2229 if (gfp_mask & __GFP_THISNODE)
2230 goto out; 2230 goto out;
2231 } 2231 }
2232 /* Exhausted what can be done so it's blamo time */ 2232 /* Exhausted what can be done so it's blamo time */
2233 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2233 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2234 2234
2235 out: 2235 out:
2236 clear_zonelist_oom(zonelist, gfp_mask); 2236 clear_zonelist_oom(zonelist, gfp_mask);
2237 return page; 2237 return page;
2238 } 2238 }
2239 2239
2240 #ifdef CONFIG_COMPACTION 2240 #ifdef CONFIG_COMPACTION
2241 /* Try memory compaction for high-order allocations before reclaim */ 2241 /* Try memory compaction for high-order allocations before reclaim */
2242 static struct page * 2242 static struct page *
2243 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2243 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2244 struct zonelist *zonelist, enum zone_type high_zoneidx, 2244 struct zonelist *zonelist, enum zone_type high_zoneidx,
2245 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2245 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2246 int classzone_idx, int migratetype, enum migrate_mode mode, 2246 int classzone_idx, int migratetype, enum migrate_mode mode,
2247 bool *contended_compaction, bool *deferred_compaction, 2247 bool *contended_compaction, bool *deferred_compaction,
2248 unsigned long *did_some_progress) 2248 unsigned long *did_some_progress)
2249 { 2249 {
2250 if (!order) 2250 if (!order)
2251 return NULL; 2251 return NULL;
2252 2252
2253 if (compaction_deferred(preferred_zone, order)) { 2253 if (compaction_deferred(preferred_zone, order)) {
2254 *deferred_compaction = true; 2254 *deferred_compaction = true;
2255 return NULL; 2255 return NULL;
2256 } 2256 }
2257 2257
2258 current->flags |= PF_MEMALLOC; 2258 current->flags |= PF_MEMALLOC;
2259 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2259 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2260 nodemask, mode, 2260 nodemask, mode,
2261 contended_compaction); 2261 contended_compaction);
2262 current->flags &= ~PF_MEMALLOC; 2262 current->flags &= ~PF_MEMALLOC;
2263 2263
2264 if (*did_some_progress != COMPACT_SKIPPED) { 2264 if (*did_some_progress != COMPACT_SKIPPED) {
2265 struct page *page; 2265 struct page *page;
2266 2266
2267 /* Page migration frees to the PCP lists but we want merging */ 2267 /* Page migration frees to the PCP lists but we want merging */
2268 drain_pages(get_cpu()); 2268 drain_pages(get_cpu());
2269 put_cpu(); 2269 put_cpu();
2270 2270
2271 page = get_page_from_freelist(gfp_mask, nodemask, 2271 page = get_page_from_freelist(gfp_mask, nodemask,
2272 order, zonelist, high_zoneidx, 2272 order, zonelist, high_zoneidx,
2273 alloc_flags & ~ALLOC_NO_WATERMARKS, 2273 alloc_flags & ~ALLOC_NO_WATERMARKS,
2274 preferred_zone, classzone_idx, migratetype); 2274 preferred_zone, classzone_idx, migratetype);
2275 if (page) { 2275 if (page) {
2276 preferred_zone->compact_blockskip_flush = false; 2276 preferred_zone->compact_blockskip_flush = false;
2277 compaction_defer_reset(preferred_zone, order, true); 2277 compaction_defer_reset(preferred_zone, order, true);
2278 count_vm_event(COMPACTSUCCESS); 2278 count_vm_event(COMPACTSUCCESS);
2279 return page; 2279 return page;
2280 } 2280 }
2281 2281
2282 /* 2282 /*
2283 * It's bad if compaction run occurs and fails. 2283 * It's bad if compaction run occurs and fails.
2284 * The most likely reason is that pages exist, 2284 * The most likely reason is that pages exist,
2285 * but not enough to satisfy watermarks. 2285 * but not enough to satisfy watermarks.
2286 */ 2286 */
2287 count_vm_event(COMPACTFAIL); 2287 count_vm_event(COMPACTFAIL);
2288 2288
2289 /* 2289 /*
2290 * As async compaction considers a subset of pageblocks, only 2290 * As async compaction considers a subset of pageblocks, only
2291 * defer if the failure was a sync compaction failure. 2291 * defer if the failure was a sync compaction failure.
2292 */ 2292 */
2293 if (mode != MIGRATE_ASYNC) 2293 if (mode != MIGRATE_ASYNC)
2294 defer_compaction(preferred_zone, order); 2294 defer_compaction(preferred_zone, order);
2295 2295
2296 cond_resched(); 2296 cond_resched();
2297 } 2297 }
2298 2298
2299 return NULL; 2299 return NULL;
2300 } 2300 }
2301 #else 2301 #else
2302 static inline struct page * 2302 static inline struct page *
2303 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2303 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2304 struct zonelist *zonelist, enum zone_type high_zoneidx, 2304 struct zonelist *zonelist, enum zone_type high_zoneidx,
2305 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2305 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2306 int classzone_idx, int migratetype, 2306 int classzone_idx, int migratetype,
2307 enum migrate_mode mode, bool *contended_compaction, 2307 enum migrate_mode mode, bool *contended_compaction,
2308 bool *deferred_compaction, unsigned long *did_some_progress) 2308 bool *deferred_compaction, unsigned long *did_some_progress)
2309 { 2309 {
2310 return NULL; 2310 return NULL;
2311 } 2311 }
2312 #endif /* CONFIG_COMPACTION */ 2312 #endif /* CONFIG_COMPACTION */
2313 2313
2314 /* Perform direct synchronous page reclaim */ 2314 /* Perform direct synchronous page reclaim */
2315 static int 2315 static int
2316 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2316 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2317 nodemask_t *nodemask) 2317 nodemask_t *nodemask)
2318 { 2318 {
2319 struct reclaim_state reclaim_state; 2319 struct reclaim_state reclaim_state;
2320 int progress; 2320 int progress;
2321 2321
2322 cond_resched(); 2322 cond_resched();
2323 2323
2324 /* We now go into synchronous reclaim */ 2324 /* We now go into synchronous reclaim */
2325 cpuset_memory_pressure_bump(); 2325 cpuset_memory_pressure_bump();
2326 current->flags |= PF_MEMALLOC; 2326 current->flags |= PF_MEMALLOC;
2327 lockdep_set_current_reclaim_state(gfp_mask); 2327 lockdep_set_current_reclaim_state(gfp_mask);
2328 reclaim_state.reclaimed_slab = 0; 2328 reclaim_state.reclaimed_slab = 0;
2329 current->reclaim_state = &reclaim_state; 2329 current->reclaim_state = &reclaim_state;
2330 2330
2331 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2331 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2332 2332
2333 current->reclaim_state = NULL; 2333 current->reclaim_state = NULL;
2334 lockdep_clear_current_reclaim_state(); 2334 lockdep_clear_current_reclaim_state();
2335 current->flags &= ~PF_MEMALLOC; 2335 current->flags &= ~PF_MEMALLOC;
2336 2336
2337 cond_resched(); 2337 cond_resched();
2338 2338
2339 return progress; 2339 return progress;
2340 } 2340 }
2341 2341
2342 /* The really slow allocator path where we enter direct reclaim */ 2342 /* The really slow allocator path where we enter direct reclaim */
2343 static inline struct page * 2343 static inline struct page *
2344 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2344 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2345 struct zonelist *zonelist, enum zone_type high_zoneidx, 2345 struct zonelist *zonelist, enum zone_type high_zoneidx,
2346 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2346 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2347 int classzone_idx, int migratetype, unsigned long *did_some_progress) 2347 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2348 { 2348 {
2349 struct page *page = NULL; 2349 struct page *page = NULL;
2350 bool drained = false; 2350 bool drained = false;
2351 2351
2352 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2352 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2353 nodemask); 2353 nodemask);
2354 if (unlikely(!(*did_some_progress))) 2354 if (unlikely(!(*did_some_progress)))
2355 return NULL; 2355 return NULL;
2356 2356
2357 /* After successful reclaim, reconsider all zones for allocation */ 2357 /* After successful reclaim, reconsider all zones for allocation */
2358 if (IS_ENABLED(CONFIG_NUMA)) 2358 if (IS_ENABLED(CONFIG_NUMA))
2359 zlc_clear_zones_full(zonelist); 2359 zlc_clear_zones_full(zonelist);
2360 2360
2361 retry: 2361 retry:
2362 page = get_page_from_freelist(gfp_mask, nodemask, order, 2362 page = get_page_from_freelist(gfp_mask, nodemask, order,
2363 zonelist, high_zoneidx, 2363 zonelist, high_zoneidx,
2364 alloc_flags & ~ALLOC_NO_WATERMARKS, 2364 alloc_flags & ~ALLOC_NO_WATERMARKS,
2365 preferred_zone, classzone_idx, 2365 preferred_zone, classzone_idx,
2366 migratetype); 2366 migratetype);
2367 2367
2368 /* 2368 /*
2369 * If an allocation failed after direct reclaim, it could be because 2369 * If an allocation failed after direct reclaim, it could be because
2370 * pages are pinned on the per-cpu lists. Drain them and try again 2370 * pages are pinned on the per-cpu lists. Drain them and try again
2371 */ 2371 */
2372 if (!page && !drained) { 2372 if (!page && !drained) {
2373 drain_all_pages(); 2373 drain_all_pages();
2374 drained = true; 2374 drained = true;
2375 goto retry; 2375 goto retry;
2376 } 2376 }
2377 2377
2378 return page; 2378 return page;
2379 } 2379 }
2380 2380
2381 /* 2381 /*
2382 * This is called in the allocator slow-path if the allocation request is of 2382 * This is called in the allocator slow-path if the allocation request is of
2383 * sufficient urgency to ignore watermarks and take other desperate measures 2383 * sufficient urgency to ignore watermarks and take other desperate measures
2384 */ 2384 */
2385 static inline struct page * 2385 static inline struct page *
2386 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2386 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2387 struct zonelist *zonelist, enum zone_type high_zoneidx, 2387 struct zonelist *zonelist, enum zone_type high_zoneidx,
2388 nodemask_t *nodemask, struct zone *preferred_zone, 2388 nodemask_t *nodemask, struct zone *preferred_zone,
2389 int classzone_idx, int migratetype) 2389 int classzone_idx, int migratetype)
2390 { 2390 {
2391 struct page *page; 2391 struct page *page;
2392 2392
2393 do { 2393 do {
2394 page = get_page_from_freelist(gfp_mask, nodemask, order, 2394 page = get_page_from_freelist(gfp_mask, nodemask, order,
2395 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2395 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2396 preferred_zone, classzone_idx, migratetype); 2396 preferred_zone, classzone_idx, migratetype);
2397 2397
2398 if (!page && gfp_mask & __GFP_NOFAIL) 2398 if (!page && gfp_mask & __GFP_NOFAIL)
2399 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2399 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2400 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2400 } while (!page && (gfp_mask & __GFP_NOFAIL));
2401 2401
2402 return page; 2402 return page;
2403 } 2403 }
2404 2404
2405 static void reset_alloc_batches(struct zonelist *zonelist, 2405 static void reset_alloc_batches(struct zonelist *zonelist,
2406 enum zone_type high_zoneidx, 2406 enum zone_type high_zoneidx,
2407 struct zone *preferred_zone) 2407 struct zone *preferred_zone)
2408 { 2408 {
2409 struct zoneref *z; 2409 struct zoneref *z;
2410 struct zone *zone; 2410 struct zone *zone;
2411 2411
2412 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 2412 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2413 /* 2413 /*
2414 * Only reset the batches of zones that were actually 2414 * Only reset the batches of zones that were actually
2415 * considered in the fairness pass, we don't want to 2415 * considered in the fairness pass, we don't want to
2416 * trash fairness information for zones that are not 2416 * trash fairness information for zones that are not
2417 * actually part of this zonelist's round-robin cycle. 2417 * actually part of this zonelist's round-robin cycle.
2418 */ 2418 */
2419 if (!zone_local(preferred_zone, zone)) 2419 if (!zone_local(preferred_zone, zone))
2420 continue; 2420 continue;
2421 mod_zone_page_state(zone, NR_ALLOC_BATCH, 2421 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2422 high_wmark_pages(zone) - low_wmark_pages(zone) - 2422 high_wmark_pages(zone) - low_wmark_pages(zone) -
2423 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); 2423 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
2424 } 2424 }
2425 } 2425 }
2426 2426
2427 static void wake_all_kswapds(unsigned int order, 2427 static void wake_all_kswapds(unsigned int order,
2428 struct zonelist *zonelist, 2428 struct zonelist *zonelist,
2429 enum zone_type high_zoneidx, 2429 enum zone_type high_zoneidx,
2430 struct zone *preferred_zone) 2430 struct zone *preferred_zone)
2431 { 2431 {
2432 struct zoneref *z; 2432 struct zoneref *z;
2433 struct zone *zone; 2433 struct zone *zone;
2434 2434
2435 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2435 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2436 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2436 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2437 } 2437 }
2438 2438
2439 static inline int 2439 static inline int
2440 gfp_to_alloc_flags(gfp_t gfp_mask) 2440 gfp_to_alloc_flags(gfp_t gfp_mask)
2441 { 2441 {
2442 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2442 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2443 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); 2443 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
2444 2444
2445 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2445 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2446 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2446 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2447 2447
2448 /* 2448 /*
2449 * The caller may dip into page reserves a bit more if the caller 2449 * The caller may dip into page reserves a bit more if the caller
2450 * cannot run direct reclaim, or if the caller has realtime scheduling 2450 * cannot run direct reclaim, or if the caller has realtime scheduling
2451 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2451 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2452 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). 2452 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
2453 */ 2453 */
2454 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2454 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2455 2455
2456 if (atomic) { 2456 if (atomic) {
2457 /* 2457 /*
2458 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 2458 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
2459 * if it can't schedule. 2459 * if it can't schedule.
2460 */ 2460 */
2461 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2461 if (!(gfp_mask & __GFP_NOMEMALLOC))
2462 alloc_flags |= ALLOC_HARDER; 2462 alloc_flags |= ALLOC_HARDER;
2463 /* 2463 /*
2464 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 2464 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
2465 * comment for __cpuset_node_allowed_softwall(). 2465 * comment for __cpuset_node_allowed_softwall().
2466 */ 2466 */
2467 alloc_flags &= ~ALLOC_CPUSET; 2467 alloc_flags &= ~ALLOC_CPUSET;
2468 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2468 } else if (unlikely(rt_task(current)) && !in_interrupt())
2469 alloc_flags |= ALLOC_HARDER; 2469 alloc_flags |= ALLOC_HARDER;
2470 2470
2471 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2471 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2472 if (gfp_mask & __GFP_MEMALLOC) 2472 if (gfp_mask & __GFP_MEMALLOC)
2473 alloc_flags |= ALLOC_NO_WATERMARKS; 2473 alloc_flags |= ALLOC_NO_WATERMARKS;
2474 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 2474 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2475 alloc_flags |= ALLOC_NO_WATERMARKS; 2475 alloc_flags |= ALLOC_NO_WATERMARKS;
2476 else if (!in_interrupt() && 2476 else if (!in_interrupt() &&
2477 ((current->flags & PF_MEMALLOC) || 2477 ((current->flags & PF_MEMALLOC) ||
2478 unlikely(test_thread_flag(TIF_MEMDIE)))) 2478 unlikely(test_thread_flag(TIF_MEMDIE))))
2479 alloc_flags |= ALLOC_NO_WATERMARKS; 2479 alloc_flags |= ALLOC_NO_WATERMARKS;
2480 } 2480 }
2481 #ifdef CONFIG_CMA 2481 #ifdef CONFIG_CMA
2482 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2482 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2483 alloc_flags |= ALLOC_CMA; 2483 alloc_flags |= ALLOC_CMA;
2484 #endif 2484 #endif
2485 return alloc_flags; 2485 return alloc_flags;
2486 } 2486 }
2487 2487
2488 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 2488 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2489 { 2489 {
2490 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2490 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2491 } 2491 }
2492 2492
2493 static inline struct page * 2493 static inline struct page *
2494 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2494 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2495 struct zonelist *zonelist, enum zone_type high_zoneidx, 2495 struct zonelist *zonelist, enum zone_type high_zoneidx,
2496 nodemask_t *nodemask, struct zone *preferred_zone, 2496 nodemask_t *nodemask, struct zone *preferred_zone,
2497 int classzone_idx, int migratetype) 2497 int classzone_idx, int migratetype)
2498 { 2498 {
2499 const gfp_t wait = gfp_mask & __GFP_WAIT; 2499 const gfp_t wait = gfp_mask & __GFP_WAIT;
2500 struct page *page = NULL; 2500 struct page *page = NULL;
2501 int alloc_flags; 2501 int alloc_flags;
2502 unsigned long pages_reclaimed = 0; 2502 unsigned long pages_reclaimed = 0;
2503 unsigned long did_some_progress; 2503 unsigned long did_some_progress;
2504 enum migrate_mode migration_mode = MIGRATE_ASYNC; 2504 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2505 bool deferred_compaction = false; 2505 bool deferred_compaction = false;
2506 bool contended_compaction = false; 2506 bool contended_compaction = false;
2507 2507
2508 /* 2508 /*
2509 * In the slowpath, we sanity check order to avoid ever trying to 2509 * In the slowpath, we sanity check order to avoid ever trying to
2510 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2510 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2511 * be using allocators in order of preference for an area that is 2511 * be using allocators in order of preference for an area that is
2512 * too large. 2512 * too large.
2513 */ 2513 */
2514 if (order >= MAX_ORDER) { 2514 if (order >= MAX_ORDER) {
2515 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2515 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2516 return NULL; 2516 return NULL;
2517 } 2517 }
2518 2518
2519 /* 2519 /*
2520 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2520 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2521 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2521 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2522 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2522 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
2523 * using a larger set of nodes after it has established that the 2523 * using a larger set of nodes after it has established that the
2524 * allowed per node queues are empty and that nodes are 2524 * allowed per node queues are empty and that nodes are
2525 * over allocated. 2525 * over allocated.
2526 */ 2526 */
2527 if (IS_ENABLED(CONFIG_NUMA) && 2527 if (IS_ENABLED(CONFIG_NUMA) &&
2528 (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2528 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2529 goto nopage; 2529 goto nopage;
2530 2530
2531 restart: 2531 restart:
2532 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2532 if (!(gfp_mask & __GFP_NO_KSWAPD))
2533 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); 2533 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
2534 2534
2535 /* 2535 /*
2536 * OK, we're below the kswapd watermark and have kicked background 2536 * OK, we're below the kswapd watermark and have kicked background
2537 * reclaim. Now things get more complex, so set up alloc_flags according 2537 * reclaim. Now things get more complex, so set up alloc_flags according
2538 * to how we want to proceed. 2538 * to how we want to proceed.
2539 */ 2539 */
2540 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2540 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2541 2541
2542 /* 2542 /*
2543 * Find the true preferred zone if the allocation is unconstrained by 2543 * Find the true preferred zone if the allocation is unconstrained by
2544 * cpusets. 2544 * cpusets.
2545 */ 2545 */
2546 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { 2546 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
2547 struct zoneref *preferred_zoneref; 2547 struct zoneref *preferred_zoneref;
2548 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2548 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2549 NULL, 2549 NULL,
2550 &preferred_zone); 2550 &preferred_zone);
2551 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2551 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2552 } 2552 }
2553 2553
2554 rebalance: 2554 rebalance:
2555 /* This is the last chance, in general, before the goto nopage. */ 2555 /* This is the last chance, in general, before the goto nopage. */
2556 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2556 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2557 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2557 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2558 preferred_zone, classzone_idx, migratetype); 2558 preferred_zone, classzone_idx, migratetype);
2559 if (page) 2559 if (page)
2560 goto got_pg; 2560 goto got_pg;
2561 2561
2562 /* Allocate without watermarks if the context allows */ 2562 /* Allocate without watermarks if the context allows */
2563 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2563 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2564 /* 2564 /*
2565 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds 2565 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2566 * the allocation is high priority and these types of 2566 * the allocation is high priority and these types of
2567 * allocations are system rather than user oriented 2567 * allocations are system rather than user oriented
2568 */ 2568 */
2569 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2569 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2570 2570
2571 page = __alloc_pages_high_priority(gfp_mask, order, 2571 page = __alloc_pages_high_priority(gfp_mask, order,
2572 zonelist, high_zoneidx, nodemask, 2572 zonelist, high_zoneidx, nodemask,
2573 preferred_zone, classzone_idx, migratetype); 2573 preferred_zone, classzone_idx, migratetype);
2574 if (page) { 2574 if (page) {
2575 goto got_pg; 2575 goto got_pg;
2576 } 2576 }
2577 } 2577 }
2578 2578
2579 /* Atomic allocations - we can't balance anything */ 2579 /* Atomic allocations - we can't balance anything */
2580 if (!wait) 2580 if (!wait)
2581 goto nopage; 2581 goto nopage;
2582 2582
2583 /* Avoid recursion of direct reclaim */ 2583 /* Avoid recursion of direct reclaim */
2584 if (current->flags & PF_MEMALLOC) 2584 if (current->flags & PF_MEMALLOC)
2585 goto nopage; 2585 goto nopage;
2586 2586
2587 /* Avoid allocations with no watermarks from looping endlessly */ 2587 /* Avoid allocations with no watermarks from looping endlessly */
2588 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2588 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2589 goto nopage; 2589 goto nopage;
2590 2590
2591 /* 2591 /*
2592 * Try direct compaction. The first pass is asynchronous. Subsequent 2592 * Try direct compaction. The first pass is asynchronous. Subsequent
2593 * attempts after direct reclaim are synchronous 2593 * attempts after direct reclaim are synchronous
2594 */ 2594 */
2595 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2595 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2596 high_zoneidx, nodemask, alloc_flags, 2596 high_zoneidx, nodemask, alloc_flags,
2597 preferred_zone, 2597 preferred_zone,
2598 classzone_idx, migratetype, 2598 classzone_idx, migratetype,
2599 migration_mode, &contended_compaction, 2599 migration_mode, &contended_compaction,
2600 &deferred_compaction, 2600 &deferred_compaction,
2601 &did_some_progress); 2601 &did_some_progress);
2602 if (page) 2602 if (page)
2603 goto got_pg; 2603 goto got_pg;
2604 migration_mode = MIGRATE_SYNC_LIGHT; 2604 migration_mode = MIGRATE_SYNC_LIGHT;
2605 2605
2606 /* 2606 /*
2607 * If compaction is deferred for high-order allocations, it is because 2607 * If compaction is deferred for high-order allocations, it is because
2608 * sync compaction recently failed. If this is the case and the caller 2608 * sync compaction recently failed. If this is the case and the caller
2609 * requested a movable allocation that does not heavily disrupt the 2609 * requested a movable allocation that does not heavily disrupt the
2610 * system then fail the allocation instead of entering direct reclaim. 2610 * system then fail the allocation instead of entering direct reclaim.
2611 */ 2611 */
2612 if ((deferred_compaction || contended_compaction) && 2612 if ((deferred_compaction || contended_compaction) &&
2613 (gfp_mask & __GFP_NO_KSWAPD)) 2613 (gfp_mask & __GFP_NO_KSWAPD))
2614 goto nopage; 2614 goto nopage;
2615 2615
2616 /* Try direct reclaim and then allocating */ 2616 /* Try direct reclaim and then allocating */
2617 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2617 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2618 zonelist, high_zoneidx, 2618 zonelist, high_zoneidx,
2619 nodemask, 2619 nodemask,
2620 alloc_flags, preferred_zone, 2620 alloc_flags, preferred_zone,
2621 classzone_idx, migratetype, 2621 classzone_idx, migratetype,
2622 &did_some_progress); 2622 &did_some_progress);
2623 if (page) 2623 if (page)
2624 goto got_pg; 2624 goto got_pg;
2625 2625
2626 /* 2626 /*
2627 * If we failed to make any progress reclaiming, then we are 2627 * If we failed to make any progress reclaiming, then we are
2628 * running out of options and have to consider going OOM 2628 * running out of options and have to consider going OOM
2629 */ 2629 */
2630 if (!did_some_progress) { 2630 if (!did_some_progress) {
2631 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2631 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2632 if (oom_killer_disabled) 2632 if (oom_killer_disabled)
2633 goto nopage; 2633 goto nopage;
2634 /* Coredumps can quickly deplete all memory reserves */ 2634 /* Coredumps can quickly deplete all memory reserves */
2635 if ((current->flags & PF_DUMPCORE) && 2635 if ((current->flags & PF_DUMPCORE) &&
2636 !(gfp_mask & __GFP_NOFAIL)) 2636 !(gfp_mask & __GFP_NOFAIL))
2637 goto nopage; 2637 goto nopage;
2638 page = __alloc_pages_may_oom(gfp_mask, order, 2638 page = __alloc_pages_may_oom(gfp_mask, order,
2639 zonelist, high_zoneidx, 2639 zonelist, high_zoneidx,
2640 nodemask, preferred_zone, 2640 nodemask, preferred_zone,
2641 classzone_idx, migratetype); 2641 classzone_idx, migratetype);
2642 if (page) 2642 if (page)
2643 goto got_pg; 2643 goto got_pg;
2644 2644
2645 if (!(gfp_mask & __GFP_NOFAIL)) { 2645 if (!(gfp_mask & __GFP_NOFAIL)) {
2646 /* 2646 /*
2647 * The oom killer is not called for high-order 2647 * The oom killer is not called for high-order
2648 * allocations that may fail, so if no progress 2648 * allocations that may fail, so if no progress
2649 * is being made, there are no other options and 2649 * is being made, there are no other options and
2650 * retrying is unlikely to help. 2650 * retrying is unlikely to help.
2651 */ 2651 */
2652 if (order > PAGE_ALLOC_COSTLY_ORDER) 2652 if (order > PAGE_ALLOC_COSTLY_ORDER)
2653 goto nopage; 2653 goto nopage;
2654 /* 2654 /*
2655 * The oom killer is not called for lowmem 2655 * The oom killer is not called for lowmem
2656 * allocations to prevent needlessly killing 2656 * allocations to prevent needlessly killing
2657 * innocent tasks. 2657 * innocent tasks.
2658 */ 2658 */
2659 if (high_zoneidx < ZONE_NORMAL) 2659 if (high_zoneidx < ZONE_NORMAL)
2660 goto nopage; 2660 goto nopage;
2661 } 2661 }
2662 2662
2663 goto restart; 2663 goto restart;
2664 } 2664 }
2665 } 2665 }
2666 2666
2667 /* Check if we should retry the allocation */ 2667 /* Check if we should retry the allocation */
2668 pages_reclaimed += did_some_progress; 2668 pages_reclaimed += did_some_progress;
2669 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2669 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2670 pages_reclaimed)) { 2670 pages_reclaimed)) {
2671 /* Wait for some write requests to complete then retry */ 2671 /* Wait for some write requests to complete then retry */
2672 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2672 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2673 goto rebalance; 2673 goto rebalance;
2674 } else { 2674 } else {
2675 /* 2675 /*
2676 * High-order allocations do not necessarily loop after 2676 * High-order allocations do not necessarily loop after
2677 * direct reclaim and reclaim/compaction depends on compaction 2677 * direct reclaim and reclaim/compaction depends on compaction
2678 * being called after reclaim so call directly if necessary 2678 * being called after reclaim so call directly if necessary
2679 */ 2679 */
2680 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2680 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2681 high_zoneidx, nodemask, alloc_flags, 2681 high_zoneidx, nodemask, alloc_flags,
2682 preferred_zone, 2682 preferred_zone,
2683 classzone_idx, migratetype, 2683 classzone_idx, migratetype,
2684 migration_mode, &contended_compaction, 2684 migration_mode, &contended_compaction,
2685 &deferred_compaction, 2685 &deferred_compaction,
2686 &did_some_progress); 2686 &did_some_progress);
2687 if (page) 2687 if (page)
2688 goto got_pg; 2688 goto got_pg;
2689 } 2689 }
2690 2690
2691 nopage: 2691 nopage:
2692 warn_alloc_failed(gfp_mask, order, NULL); 2692 warn_alloc_failed(gfp_mask, order, NULL);
2693 return page; 2693 return page;
2694 got_pg: 2694 got_pg:
2695 if (kmemcheck_enabled) 2695 if (kmemcheck_enabled)
2696 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2696 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2697 2697
2698 return page; 2698 return page;
2699 } 2699 }
2700 2700
2701 /* 2701 /*
2702 * This is the 'heart' of the zoned buddy allocator. 2702 * This is the 'heart' of the zoned buddy allocator.
2703 */ 2703 */
2704 struct page * 2704 struct page *
2705 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2705 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2706 struct zonelist *zonelist, nodemask_t *nodemask) 2706 struct zonelist *zonelist, nodemask_t *nodemask)
2707 { 2707 {
2708 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2708 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2709 struct zone *preferred_zone; 2709 struct zone *preferred_zone;
2710 struct zoneref *preferred_zoneref; 2710 struct zoneref *preferred_zoneref;
2711 struct page *page = NULL; 2711 struct page *page = NULL;
2712 int migratetype = allocflags_to_migratetype(gfp_mask); 2712 int migratetype = allocflags_to_migratetype(gfp_mask);
2713 unsigned int cpuset_mems_cookie; 2713 unsigned int cpuset_mems_cookie;
2714 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2714 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2715 struct mem_cgroup *memcg = NULL; 2715 struct mem_cgroup *memcg = NULL;
2716 int classzone_idx; 2716 int classzone_idx;
2717 2717
2718 gfp_mask &= gfp_allowed_mask; 2718 gfp_mask &= gfp_allowed_mask;
2719 2719
2720 lockdep_trace_alloc(gfp_mask); 2720 lockdep_trace_alloc(gfp_mask);
2721 2721
2722 might_sleep_if(gfp_mask & __GFP_WAIT); 2722 might_sleep_if(gfp_mask & __GFP_WAIT);
2723 2723
2724 if (should_fail_alloc_page(gfp_mask, order)) 2724 if (should_fail_alloc_page(gfp_mask, order))
2725 return NULL; 2725 return NULL;
2726 2726
2727 /* 2727 /*
2728 * Check the zones suitable for the gfp_mask contain at least one 2728 * Check the zones suitable for the gfp_mask contain at least one
2729 * valid zone. It's possible to have an empty zonelist as a result 2729 * valid zone. It's possible to have an empty zonelist as a result
2730 * of GFP_THISNODE and a memoryless node 2730 * of GFP_THISNODE and a memoryless node
2731 */ 2731 */
2732 if (unlikely(!zonelist->_zonerefs->zone)) 2732 if (unlikely(!zonelist->_zonerefs->zone))
2733 return NULL; 2733 return NULL;
2734 2734
2735 /* 2735 /*
2736 * Will only have any effect when __GFP_KMEMCG is set. This is 2736 * Will only have any effect when __GFP_KMEMCG is set. This is
2737 * verified in the (always inline) callee 2737 * verified in the (always inline) callee
2738 */ 2738 */
2739 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) 2739 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2740 return NULL; 2740 return NULL;
2741 2741
2742 retry_cpuset: 2742 retry_cpuset:
2743 cpuset_mems_cookie = read_mems_allowed_begin(); 2743 cpuset_mems_cookie = read_mems_allowed_begin();
2744 2744
2745 /* The preferred zone is used for statistics later */ 2745 /* The preferred zone is used for statistics later */
2746 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2746 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2747 nodemask ? : &cpuset_current_mems_allowed, 2747 nodemask ? : &cpuset_current_mems_allowed,
2748 &preferred_zone); 2748 &preferred_zone);
2749 if (!preferred_zone) 2749 if (!preferred_zone)
2750 goto out; 2750 goto out;
2751 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2751 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2752 2752
2753 #ifdef CONFIG_CMA 2753 #ifdef CONFIG_CMA
2754 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2754 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2755 alloc_flags |= ALLOC_CMA; 2755 alloc_flags |= ALLOC_CMA;
2756 #endif 2756 #endif
2757 retry: 2757 retry:
2758 /* First allocation attempt */ 2758 /* First allocation attempt */
2759 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2759 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2760 zonelist, high_zoneidx, alloc_flags, 2760 zonelist, high_zoneidx, alloc_flags,
2761 preferred_zone, classzone_idx, migratetype); 2761 preferred_zone, classzone_idx, migratetype);
2762 if (unlikely(!page)) { 2762 if (unlikely(!page)) {
2763 /* 2763 /*
2764 * The first pass makes sure allocations are spread 2764 * The first pass makes sure allocations are spread
2765 * fairly within the local node. However, the local 2765 * fairly within the local node. However, the local
2766 * node might have free pages left after the fairness 2766 * node might have free pages left after the fairness
2767 * batches are exhausted, and remote zones haven't 2767 * batches are exhausted, and remote zones haven't
2768 * even been considered yet. Try once more without 2768 * even been considered yet. Try once more without
2769 * fairness, and include remote zones now, before 2769 * fairness, and include remote zones now, before
2770 * entering the slowpath and waking kswapd: prefer 2770 * entering the slowpath and waking kswapd: prefer
2771 * spilling to a remote zone over swapping locally. 2771 * spilling to a remote zone over swapping locally.
2772 */ 2772 */
2773 if (alloc_flags & ALLOC_FAIR) { 2773 if (alloc_flags & ALLOC_FAIR) {
2774 reset_alloc_batches(zonelist, high_zoneidx, 2774 reset_alloc_batches(zonelist, high_zoneidx,
2775 preferred_zone); 2775 preferred_zone);
2776 alloc_flags &= ~ALLOC_FAIR; 2776 alloc_flags &= ~ALLOC_FAIR;
2777 goto retry; 2777 goto retry;
2778 } 2778 }
2779 /* 2779 /*
2780 * Runtime PM, block IO and its error handling path 2780 * Runtime PM, block IO and its error handling path
2781 * can deadlock because I/O on the device might not 2781 * can deadlock because I/O on the device might not
2782 * complete. 2782 * complete.
2783 */ 2783 */
2784 gfp_mask = memalloc_noio_flags(gfp_mask); 2784 gfp_mask = memalloc_noio_flags(gfp_mask);
2785 page = __alloc_pages_slowpath(gfp_mask, order, 2785 page = __alloc_pages_slowpath(gfp_mask, order,
2786 zonelist, high_zoneidx, nodemask, 2786 zonelist, high_zoneidx, nodemask,
2787 preferred_zone, classzone_idx, migratetype); 2787 preferred_zone, classzone_idx, migratetype);
2788 } 2788 }
2789 2789
2790 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2790 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2791 2791
2792 out: 2792 out:
2793 /* 2793 /*
2794 * When updating a task's mems_allowed, it is possible to race with 2794 * When updating a task's mems_allowed, it is possible to race with
2795 * parallel threads in such a way that an allocation can fail while 2795 * parallel threads in such a way that an allocation can fail while
2796 * the mask is being updated. If a page allocation is about to fail, 2796 * the mask is being updated. If a page allocation is about to fail,
2797 * check if the cpuset changed during allocation and if so, retry. 2797 * check if the cpuset changed during allocation and if so, retry.
2798 */ 2798 */
2799 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2799 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2800 goto retry_cpuset; 2800 goto retry_cpuset;
2801 2801
2802 memcg_kmem_commit_charge(page, memcg, order); 2802 memcg_kmem_commit_charge(page, memcg, order);
2803 2803
2804 return page; 2804 return page;
2805 } 2805 }
2806 EXPORT_SYMBOL(__alloc_pages_nodemask); 2806 EXPORT_SYMBOL(__alloc_pages_nodemask);
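Note: a minimal usage sketch (not part of this patch) of how a caller typically reaches __alloc_pages_nodemask() through the alloc_pages() wrapper; the helper name and order below are illustrative only.

#include <linux/gfp.h>
#include <linux/mm.h>

/* Illustrative helper: grab two physically contiguous lowmem pages. */
static void *grab_two_pages(void)
{
	/* order 1 == 2^1 pages; GFP_KERNEL may sleep and enter the slowpath above */
	struct page *page = alloc_pages(GFP_KERNEL, 1);

	if (!page)
		return NULL;

	/* the caller must later release the block with __free_pages(page, 1) */
	return page_address(page);	/* lowmem, so no kmap() is needed */
}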
2807 2807
2808 /* 2808 /*
2809 * Common helper functions. 2809 * Common helper functions.
2810 */ 2810 */
2811 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 2811 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2812 { 2812 {
2813 struct page *page; 2813 struct page *page;
2814 2814
2815 /* 2815 /*
2816 * __get_free_pages() returns a 32-bit address, which cannot represent 2816 * __get_free_pages() returns a 32-bit address, which cannot represent
2817 * a highmem page 2817 * a highmem page
2818 */ 2818 */
2819 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 2819 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2820 2820
2821 page = alloc_pages(gfp_mask, order); 2821 page = alloc_pages(gfp_mask, order);
2822 if (!page) 2822 if (!page)
2823 return 0; 2823 return 0;
2824 return (unsigned long) page_address(page); 2824 return (unsigned long) page_address(page);
2825 } 2825 }
2826 EXPORT_SYMBOL(__get_free_pages); 2826 EXPORT_SYMBOL(__get_free_pages);
2827 2827
2828 unsigned long get_zeroed_page(gfp_t gfp_mask) 2828 unsigned long get_zeroed_page(gfp_t gfp_mask)
2829 { 2829 {
2830 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 2830 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2831 } 2831 }
2832 EXPORT_SYMBOL(get_zeroed_page); 2832 EXPORT_SYMBOL(get_zeroed_page);
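Note: a short usage sketch for the address-returning helpers above, assuming GFP_KERNEL allocations; the names and sizes are illustrative and error handling is intentionally minimal.

#include <linux/gfp.h>
#include <linux/errno.h>

static int scratch_buffers_demo(void)
{
	unsigned long buf  = __get_free_pages(GFP_KERNEL, 2);	/* 4 contiguous pages */
	unsigned long zero = get_zeroed_page(GFP_KERNEL);	/* 1 zero-filled page */

	if (!buf || !zero)
		goto fail;

	/* ... use the buffers ... */

	free_pages(buf, 2);	/* the order must match the allocation */
	free_page(zero);
	return 0;

fail:
	if (buf)
		free_pages(buf, 2);
	if (zero)
		free_page(zero);
	return -ENOMEM;
}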
2833 2833
2834 void __free_pages(struct page *page, unsigned int order) 2834 void __free_pages(struct page *page, unsigned int order)
2835 { 2835 {
2836 if (put_page_testzero(page)) { 2836 if (put_page_testzero(page)) {
2837 if (order == 0) 2837 if (order == 0)
2838 free_hot_cold_page(page, false); 2838 free_hot_cold_page(page, false);
2839 else 2839 else
2840 __free_pages_ok(page, order); 2840 __free_pages_ok(page, order);
2841 } 2841 }
2842 } 2842 }
2843 2843
2844 EXPORT_SYMBOL(__free_pages); 2844 EXPORT_SYMBOL(__free_pages);
2845 2845
2846 void free_pages(unsigned long addr, unsigned int order) 2846 void free_pages(unsigned long addr, unsigned int order)
2847 { 2847 {
2848 if (addr != 0) { 2848 if (addr != 0) {
2849 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2849 VM_BUG_ON(!virt_addr_valid((void *)addr));
2850 __free_pages(virt_to_page((void *)addr), order); 2850 __free_pages(virt_to_page((void *)addr), order);
2851 } 2851 }
2852 } 2852 }
2853 2853
2854 EXPORT_SYMBOL(free_pages); 2854 EXPORT_SYMBOL(free_pages);
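Note: __free_pages() only hands the page back to the buddy allocator once put_page_testzero() drops the final reference. A hypothetical sketch of that behaviour (real order-0 users would normally just call put_page()):

static void refcount_demo(void)
{
	struct page *page = alloc_pages(GFP_KERNEL, 0);

	if (!page)
		return;

	get_page(page);		/* refcount is now 2 */
	__free_pages(page, 0);	/* drops it to 1: nothing is freed yet */
	__free_pages(page, 0);	/* drops it to 0: the page is freed here */
}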
2855 2855
2856 /* 2856 /*
2857 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free 2857 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
2858 * pages allocated with __GFP_KMEMCG. 2858 * pages allocated with __GFP_KMEMCG.
2859 * 2859 *
2860 * Those pages are accounted to a particular memcg, embedded in the 2860 * Those pages are accounted to a particular memcg, embedded in the
2861 * corresponding page_cgroup. To avoid adding a hit in the allocator to search 2861 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
2862 * for that information only to find out that it is NULL for users who have no 2862 * for that information only to find out that it is NULL for users who have no
2863 * interest in that whatsoever, we provide these functions. 2863 * interest in that whatsoever, we provide these functions.
2864 * 2864 *
2865 * The caller knows better which flags it relies on. 2865 * The caller knows better which flags it relies on.
2866 */ 2866 */
2867 void __free_memcg_kmem_pages(struct page *page, unsigned int order) 2867 void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2868 { 2868 {
2869 memcg_kmem_uncharge_pages(page, order); 2869 memcg_kmem_uncharge_pages(page, order);
2870 __free_pages(page, order); 2870 __free_pages(page, order);
2871 } 2871 }
2872 2872
2873 void free_memcg_kmem_pages(unsigned long addr, unsigned int order) 2873 void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2874 { 2874 {
2875 if (addr != 0) { 2875 if (addr != 0) {
2876 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2876 VM_BUG_ON(!virt_addr_valid((void *)addr));
2877 __free_memcg_kmem_pages(virt_to_page((void *)addr), order); 2877 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2878 } 2878 }
2879 } 2879 }
2880 2880
2881 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2881 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2882 { 2882 {
2883 if (addr) { 2883 if (addr) {
2884 unsigned long alloc_end = addr + (PAGE_SIZE << order); 2884 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2885 unsigned long used = addr + PAGE_ALIGN(size); 2885 unsigned long used = addr + PAGE_ALIGN(size);
2886 2886
2887 split_page(virt_to_page((void *)addr), order); 2887 split_page(virt_to_page((void *)addr), order);
2888 while (used < alloc_end) { 2888 while (used < alloc_end) {
2889 free_page(used); 2889 free_page(used);
2890 used += PAGE_SIZE; 2890 used += PAGE_SIZE;
2891 } 2891 }
2892 } 2892 }
2893 return (void *)addr; 2893 return (void *)addr;
2894 } 2894 }
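Note: a worked example for make_alloc_exact(), assuming PAGE_SIZE == 4096; the numbers are illustrative.

/*
 * size                 = 10000 bytes
 * order = get_order()  = 2            -> an order-2 block of 4 pages (16384 bytes)
 * PAGE_ALIGN(size)     = 12288 bytes  -> only 3 pages are actually needed
 *
 * split_page() turns the order-2 block into four independent order-0
 * pages, and the while loop frees the single page between 12288 and
 * 16384, so the caller keeps exactly 3 contiguous pages.
 */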
2895 2895
2896 /** 2896 /**
2897 * alloc_pages_exact - allocate an exact number of physically-contiguous pages. 2897 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
2898 * @size: the number of bytes to allocate 2898 * @size: the number of bytes to allocate
2899 * @gfp_mask: GFP flags for the allocation 2899 * @gfp_mask: GFP flags for the allocation
2900 * 2900 *
2901 * This function is similar to alloc_pages(), except that it allocates the 2901 * This function is similar to alloc_pages(), except that it allocates the
2902 * minimum number of pages to satisfy the request. alloc_pages() can only 2902 * minimum number of pages to satisfy the request. alloc_pages() can only
2903 * allocate memory in power-of-two pages. 2903 * allocate memory in power-of-two pages.
2904 * 2904 *
2905 * This function is also limited by MAX_ORDER. 2905 * This function is also limited by MAX_ORDER.
2906 * 2906 *
2907 * Memory allocated by this function must be released by free_pages_exact(). 2907 * Memory allocated by this function must be released by free_pages_exact().
2908 */ 2908 */
2909 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 2909 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2910 { 2910 {
2911 unsigned int order = get_order(size); 2911 unsigned int order = get_order(size);
2912 unsigned long addr; 2912 unsigned long addr;
2913 2913
2914 addr = __get_free_pages(gfp_mask, order); 2914 addr = __get_free_pages(gfp_mask, order);
2915 return make_alloc_exact(addr, order, size); 2915 return make_alloc_exact(addr, order, size);
2916 } 2916 }
2917 EXPORT_SYMBOL(alloc_pages_exact); 2917 EXPORT_SYMBOL(alloc_pages_exact);
2918 2918
2919 /** 2919 /**
2920 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 2920 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2921 * pages on a node. 2921 * pages on a node.
2922 * @nid: the preferred node ID where memory should be allocated 2922 * @nid: the preferred node ID where memory should be allocated
2923 * @size: the number of bytes to allocate 2923 * @size: the number of bytes to allocate
2924 * @gfp_mask: GFP flags for the allocation 2924 * @gfp_mask: GFP flags for the allocation
2925 * 2925 *
2926 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 2926 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2927 * back. 2927 * back.
2928 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2928 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2929 * but is not exact. 2929 * but is not exact.
2930 */ 2930 */
2931 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2931 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2932 { 2932 {
2933 unsigned order = get_order(size); 2933 unsigned order = get_order(size);
2934 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2934 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2935 if (!p) 2935 if (!p)
2936 return NULL; 2936 return NULL;
2937 return make_alloc_exact((unsigned long)page_address(p), order, size); 2937 return make_alloc_exact((unsigned long)page_address(p), order, size);
2938 } 2938 }
2939 EXPORT_SYMBOL(alloc_pages_exact_nid); 2939 EXPORT_SYMBOL(alloc_pages_exact_nid);
2940 2940
2941 /** 2941 /**
2942 * free_pages_exact - release memory allocated via alloc_pages_exact() 2942 * free_pages_exact - release memory allocated via alloc_pages_exact()
2943 * @virt: the value returned by alloc_pages_exact. 2943 * @virt: the value returned by alloc_pages_exact.
2944 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2944 * @size: size of allocation, same value as passed to alloc_pages_exact().
2945 * 2945 *
2946 * Release the memory allocated by a previous call to alloc_pages_exact. 2946 * Release the memory allocated by a previous call to alloc_pages_exact.
2947 */ 2947 */
2948 void free_pages_exact(void *virt, size_t size) 2948 void free_pages_exact(void *virt, size_t size)
2949 { 2949 {
2950 unsigned long addr = (unsigned long)virt; 2950 unsigned long addr = (unsigned long)virt;
2951 unsigned long end = addr + PAGE_ALIGN(size); 2951 unsigned long end = addr + PAGE_ALIGN(size);
2952 2952
2953 while (addr < end) { 2953 while (addr < end) {
2954 free_page(addr); 2954 free_page(addr);
2955 addr += PAGE_SIZE; 2955 addr += PAGE_SIZE;
2956 } 2956 }
2957 } 2957 }
2958 EXPORT_SYMBOL(free_pages_exact); 2958 EXPORT_SYMBOL(free_pages_exact);
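Note: a sketch pairing alloc_pages_exact() with free_pages_exact(); the size, names and use of __GFP_ZERO are illustrative.

static void *table;

static int table_init(void)
{
	table = alloc_pages_exact(3 * PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
	return table ? 0 : -ENOMEM;
}

static void table_exit(void)
{
	free_pages_exact(table, 3 * PAGE_SIZE);	/* pass the same size as allocated */
	table = NULL;
}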
2959 2959
2960 /** 2960 /**
2961 * nr_free_zone_pages - count number of pages beyond high watermark 2961 * nr_free_zone_pages - count number of pages beyond high watermark
2962 * @offset: The zone index of the highest zone 2962 * @offset: The zone index of the highest zone
2963 * 2963 *
2964 * nr_free_zone_pages() counts the number of pages which are beyond the 2964 * nr_free_zone_pages() counts the number of pages which are beyond the
2965 * high watermark within all zones at or below a given zone index. For each 2965 * high watermark within all zones at or below a given zone index. For each
2966 * zone, the number of pages is calculated as: 2966 * zone, the number of pages is calculated as:
2967 * managed_pages - high_pages 2967 * managed_pages - high_pages
2968 */ 2968 */
2969 static unsigned long nr_free_zone_pages(int offset) 2969 static unsigned long nr_free_zone_pages(int offset)
2970 { 2970 {
2971 struct zoneref *z; 2971 struct zoneref *z;
2972 struct zone *zone; 2972 struct zone *zone;
2973 2973
2974 /* Just pick one node, since fallback list is circular */ 2974 /* Just pick one node, since fallback list is circular */
2975 unsigned long sum = 0; 2975 unsigned long sum = 0;
2976 2976
2977 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2977 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2978 2978
2979 for_each_zone_zonelist(zone, z, zonelist, offset) { 2979 for_each_zone_zonelist(zone, z, zonelist, offset) {
2980 unsigned long size = zone->managed_pages; 2980 unsigned long size = zone->managed_pages;
2981 unsigned long high = high_wmark_pages(zone); 2981 unsigned long high = high_wmark_pages(zone);
2982 if (size > high) 2982 if (size > high)
2983 sum += size - high; 2983 sum += size - high;
2984 } 2984 }
2985 2985
2986 return sum; 2986 return sum;
2987 } 2987 }
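Note: a hypothetical example of the calculation above: with ZONE_DMA32 at managed_pages = 4000 and a high watermark of 100, and ZONE_NORMAL at managed_pages = 200000 and a high watermark of 1200, nr_free_zone_pages() returns (4000 - 100) + (200000 - 1200) = 202700 pages; a zone whose managed_pages does not exceed its high watermark contributes nothing.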
2988 2988
2989 /** 2989 /**
2990 * nr_free_buffer_pages - count number of pages beyond high watermark 2990 * nr_free_buffer_pages - count number of pages beyond high watermark
2991 * 2991 *
2992 * nr_free_buffer_pages() counts the number of pages which are beyond the high 2992 * nr_free_buffer_pages() counts the number of pages which are beyond the high
2993 * watermark within ZONE_DMA and ZONE_NORMAL. 2993 * watermark within ZONE_DMA and ZONE_NORMAL.
2994 */ 2994 */
2995 unsigned long nr_free_buffer_pages(void) 2995 unsigned long nr_free_buffer_pages(void)
2996 { 2996 {
2997 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2997 return nr_free_zone_pages(gfp_zone(GFP_USER));
2998 } 2998 }
2999 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2999 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
3000 3000
3001 /** 3001 /**
3002 * nr_free_pagecache_pages - count number of pages beyond high watermark 3002 * nr_free_pagecache_pages - count number of pages beyond high watermark
3003 * 3003 *
3004 * nr_free_pagecache_pages() counts the number of pages which are beyond the 3004 * nr_free_pagecache_pages() counts the number of pages which are beyond the
3005 * high watermark within all zones. 3005 * high watermark within all zones.
3006 */ 3006 */
3007 unsigned long nr_free_pagecache_pages(void) 3007 unsigned long nr_free_pagecache_pages(void)
3008 { 3008 {
3009 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 3009 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
3010 } 3010 }
3011 3011
3012 static inline void show_node(struct zone *zone) 3012 static inline void show_node(struct zone *zone)
3013 { 3013 {
3014 if (IS_ENABLED(CONFIG_NUMA)) 3014 if (IS_ENABLED(CONFIG_NUMA))
3015 printk("Node %d ", zone_to_nid(zone)); 3015 printk("Node %d ", zone_to_nid(zone));
3016 } 3016 }
3017 3017
3018 void si_meminfo(struct sysinfo *val) 3018 void si_meminfo(struct sysinfo *val)
3019 { 3019 {
3020 val->totalram = totalram_pages; 3020 val->totalram = totalram_pages;
3021 val->sharedram = 0; 3021 val->sharedram = 0;
3022 val->freeram = global_page_state(NR_FREE_PAGES); 3022 val->freeram = global_page_state(NR_FREE_PAGES);
3023 val->bufferram = nr_blockdev_pages(); 3023 val->bufferram = nr_blockdev_pages();
3024 val->totalhigh = totalhigh_pages; 3024 val->totalhigh = totalhigh_pages;
3025 val->freehigh = nr_free_highpages(); 3025 val->freehigh = nr_free_highpages();
3026 val->mem_unit = PAGE_SIZE; 3026 val->mem_unit = PAGE_SIZE;
3027 } 3027 }
3028 3028
3029 EXPORT_SYMBOL(si_meminfo); 3029 EXPORT_SYMBOL(si_meminfo);
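Note: a sketch of consuming si_meminfo() from elsewhere in the kernel; all counters are expressed in units of mem_unit (PAGE_SIZE here), so scale before printing. The function name is illustrative.

#include <linux/mm.h>
#include <linux/printk.h>

static void report_memory(void)
{
	struct sysinfo si;

	si_meminfo(&si);
	pr_info("total %lu kB, free %lu kB\n",
		si.totalram * (si.mem_unit / 1024),
		si.freeram * (si.mem_unit / 1024));
}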
3030 3030
3031 #ifdef CONFIG_NUMA 3031 #ifdef CONFIG_NUMA
3032 void si_meminfo_node(struct sysinfo *val, int nid) 3032 void si_meminfo_node(struct sysinfo *val, int nid)
3033 { 3033 {
3034 int zone_type; /* needs to be signed */ 3034 int zone_type; /* needs to be signed */
3035 unsigned long managed_pages = 0; 3035 unsigned long managed_pages = 0;
3036 pg_data_t *pgdat = NODE_DATA(nid); 3036 pg_data_t *pgdat = NODE_DATA(nid);
3037 3037
3038 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 3038 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
3039 managed_pages += pgdat->node_zones[zone_type].managed_pages; 3039 managed_pages += pgdat->node_zones[zone_type].managed_pages;
3040 val->totalram = managed_pages; 3040 val->totalram = managed_pages;
3041 val->freeram = node_page_state(nid, NR_FREE_PAGES); 3041 val->freeram = node_page_state(nid, NR_FREE_PAGES);
3042 #ifdef CONFIG_HIGHMEM 3042 #ifdef CONFIG_HIGHMEM
3043 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 3043 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
3044 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 3044 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
3045 NR_FREE_PAGES); 3045 NR_FREE_PAGES);
3046 #else 3046 #else
3047 val->totalhigh = 0; 3047 val->totalhigh = 0;
3048 val->freehigh = 0; 3048 val->freehigh = 0;
3049 #endif 3049 #endif
3050 val->mem_unit = PAGE_SIZE; 3050 val->mem_unit = PAGE_SIZE;
3051 } 3051 }
3052 #endif 3052 #endif
3053 3053
3054 /* 3054 /*
3055 * Determine whether the node should be displayed or not, depending on whether 3055 * Determine whether the node should be displayed or not, depending on whether
3056 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 3056 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
3057 */ 3057 */
3058 bool skip_free_areas_node(unsigned int flags, int nid) 3058 bool skip_free_areas_node(unsigned int flags, int nid)
3059 { 3059 {
3060 bool ret = false; 3060 bool ret = false;
3061 unsigned int cpuset_mems_cookie; 3061 unsigned int cpuset_mems_cookie;
3062 3062
3063 if (!(flags & SHOW_MEM_FILTER_NODES)) 3063 if (!(flags & SHOW_MEM_FILTER_NODES))
3064 goto out; 3064 goto out;
3065 3065
3066 do { 3066 do {
3067 cpuset_mems_cookie = read_mems_allowed_begin(); 3067 cpuset_mems_cookie = read_mems_allowed_begin();
3068 ret = !node_isset(nid, cpuset_current_mems_allowed); 3068 ret = !node_isset(nid, cpuset_current_mems_allowed);
3069 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 3069 } while (read_mems_allowed_retry(cpuset_mems_cookie));
3070 out: 3070 out:
3071 return ret; 3071 return ret;
3072 } 3072 }
3073 3073
3074 #define K(x) ((x) << (PAGE_SHIFT-10)) 3074 #define K(x) ((x) << (PAGE_SHIFT-10))
3075 3075
3076 static void show_migration_types(unsigned char type) 3076 static void show_migration_types(unsigned char type)
3077 { 3077 {
3078 static const char types[MIGRATE_TYPES] = { 3078 static const char types[MIGRATE_TYPES] = {
3079 [MIGRATE_UNMOVABLE] = 'U', 3079 [MIGRATE_UNMOVABLE] = 'U',
3080 [MIGRATE_RECLAIMABLE] = 'E', 3080 [MIGRATE_RECLAIMABLE] = 'E',
3081 [MIGRATE_MOVABLE] = 'M', 3081 [MIGRATE_MOVABLE] = 'M',
3082 [MIGRATE_RESERVE] = 'R', 3082 [MIGRATE_RESERVE] = 'R',
3083 #ifdef CONFIG_CMA 3083 #ifdef CONFIG_CMA
3084 [MIGRATE_CMA] = 'C', 3084 [MIGRATE_CMA] = 'C',
3085 #endif 3085 #endif
3086 #ifdef CONFIG_MEMORY_ISOLATION 3086 #ifdef CONFIG_MEMORY_ISOLATION
3087 [MIGRATE_ISOLATE] = 'I', 3087 [MIGRATE_ISOLATE] = 'I',
3088 #endif 3088 #endif
3089 }; 3089 };
3090 char tmp[MIGRATE_TYPES + 1]; 3090 char tmp[MIGRATE_TYPES + 1];
3091 char *p = tmp; 3091 char *p = tmp;
3092 int i; 3092 int i;
3093 3093
3094 for (i = 0; i < MIGRATE_TYPES; i++) { 3094 for (i = 0; i < MIGRATE_TYPES; i++) {
3095 if (type & (1 << i)) 3095 if (type & (1 << i))
3096 *p++ = types[i]; 3096 *p++ = types[i];
3097 } 3097 }
3098 3098
3099 *p = '\0'; 3099 *p = '\0';
3100 printk("(%s) ", tmp); 3100 printk("(%s) ", tmp);
3101 } 3101 }
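Note: as an example of the output format, a type value of (1 << MIGRATE_UNMOVABLE) | (1 << MIGRATE_MOVABLE) makes the loop emit 'U' and then 'M', so the fragment printed is "(UM) "; the letters always appear in enum order, not in the order the bits were set.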
3102 3102
3103 /* 3103 /*
3104 * Show free area list (used inside shift_scroll-lock stuff) 3104 * Show free area list (used inside shift_scroll-lock stuff)
3105 * We also calculate the percentage fragmentation. We do this by counting the 3105 * We also calculate the percentage fragmentation. We do this by counting the
3106 * memory on each free list with the exception of the first item on the list. 3106 * memory on each free list with the exception of the first item on the list.
3107 * Suppresses nodes that are not allowed by current's cpuset if 3107 * Suppresses nodes that are not allowed by current's cpuset if
3108 * SHOW_MEM_FILTER_NODES is passed. 3108 * SHOW_MEM_FILTER_NODES is passed.
3109 */ 3109 */
3110 void show_free_areas(unsigned int filter) 3110 void show_free_areas(unsigned int filter)
3111 { 3111 {
3112 int cpu; 3112 int cpu;
3113 struct zone *zone; 3113 struct zone *zone;
3114 3114
3115 for_each_populated_zone(zone) { 3115 for_each_populated_zone(zone) {
3116 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3116 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3117 continue; 3117 continue;
3118 show_node(zone); 3118 show_node(zone);
3119 printk("%s per-cpu:\n", zone->name); 3119 printk("%s per-cpu:\n", zone->name);
3120 3120
3121 for_each_online_cpu(cpu) { 3121 for_each_online_cpu(cpu) {
3122 struct per_cpu_pageset *pageset; 3122 struct per_cpu_pageset *pageset;
3123 3123
3124 pageset = per_cpu_ptr(zone->pageset, cpu); 3124 pageset = per_cpu_ptr(zone->pageset, cpu);
3125 3125
3126 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 3126 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
3127 cpu, pageset->pcp.high, 3127 cpu, pageset->pcp.high,
3128 pageset->pcp.batch, pageset->pcp.count); 3128 pageset->pcp.batch, pageset->pcp.count);
3129 } 3129 }
3130 } 3130 }
3131 3131
3132 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 3132 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
3133 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 3133 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
3134 " unevictable:%lu" 3134 " unevictable:%lu"
3135 " dirty:%lu writeback:%lu unstable:%lu\n" 3135 " dirty:%lu writeback:%lu unstable:%lu\n"
3136 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 3136 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
3137 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 3137 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
3138 " free_cma:%lu\n", 3138 " free_cma:%lu\n",
3139 global_page_state(NR_ACTIVE_ANON), 3139 global_page_state(NR_ACTIVE_ANON),
3140 global_page_state(NR_INACTIVE_ANON), 3140 global_page_state(NR_INACTIVE_ANON),
3141 global_page_state(NR_ISOLATED_ANON), 3141 global_page_state(NR_ISOLATED_ANON),
3142 global_page_state(NR_ACTIVE_FILE), 3142 global_page_state(NR_ACTIVE_FILE),
3143 global_page_state(NR_INACTIVE_FILE), 3143 global_page_state(NR_INACTIVE_FILE),
3144 global_page_state(NR_ISOLATED_FILE), 3144 global_page_state(NR_ISOLATED_FILE),
3145 global_page_state(NR_UNEVICTABLE), 3145 global_page_state(NR_UNEVICTABLE),
3146 global_page_state(NR_FILE_DIRTY), 3146 global_page_state(NR_FILE_DIRTY),
3147 global_page_state(NR_WRITEBACK), 3147 global_page_state(NR_WRITEBACK),
3148 global_page_state(NR_UNSTABLE_NFS), 3148 global_page_state(NR_UNSTABLE_NFS),
3149 global_page_state(NR_FREE_PAGES), 3149 global_page_state(NR_FREE_PAGES),
3150 global_page_state(NR_SLAB_RECLAIMABLE), 3150 global_page_state(NR_SLAB_RECLAIMABLE),
3151 global_page_state(NR_SLAB_UNRECLAIMABLE), 3151 global_page_state(NR_SLAB_UNRECLAIMABLE),
3152 global_page_state(NR_FILE_MAPPED), 3152 global_page_state(NR_FILE_MAPPED),
3153 global_page_state(NR_SHMEM), 3153 global_page_state(NR_SHMEM),
3154 global_page_state(NR_PAGETABLE), 3154 global_page_state(NR_PAGETABLE),
3155 global_page_state(NR_BOUNCE), 3155 global_page_state(NR_BOUNCE),
3156 global_page_state(NR_FREE_CMA_PAGES)); 3156 global_page_state(NR_FREE_CMA_PAGES));
3157 3157
3158 for_each_populated_zone(zone) { 3158 for_each_populated_zone(zone) {
3159 int i; 3159 int i;
3160 3160
3161 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3161 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3162 continue; 3162 continue;
3163 show_node(zone); 3163 show_node(zone);
3164 printk("%s" 3164 printk("%s"
3165 " free:%lukB" 3165 " free:%lukB"
3166 " min:%lukB" 3166 " min:%lukB"
3167 " low:%lukB" 3167 " low:%lukB"
3168 " high:%lukB" 3168 " high:%lukB"
3169 " active_anon:%lukB" 3169 " active_anon:%lukB"
3170 " inactive_anon:%lukB" 3170 " inactive_anon:%lukB"
3171 " active_file:%lukB" 3171 " active_file:%lukB"
3172 " inactive_file:%lukB" 3172 " inactive_file:%lukB"
3173 " unevictable:%lukB" 3173 " unevictable:%lukB"
3174 " isolated(anon):%lukB" 3174 " isolated(anon):%lukB"
3175 " isolated(file):%lukB" 3175 " isolated(file):%lukB"
3176 " present:%lukB" 3176 " present:%lukB"
3177 " managed:%lukB" 3177 " managed:%lukB"
3178 " mlocked:%lukB" 3178 " mlocked:%lukB"
3179 " dirty:%lukB" 3179 " dirty:%lukB"
3180 " writeback:%lukB" 3180 " writeback:%lukB"
3181 " mapped:%lukB" 3181 " mapped:%lukB"
3182 " shmem:%lukB" 3182 " shmem:%lukB"
3183 " slab_reclaimable:%lukB" 3183 " slab_reclaimable:%lukB"
3184 " slab_unreclaimable:%lukB" 3184 " slab_unreclaimable:%lukB"
3185 " kernel_stack:%lukB" 3185 " kernel_stack:%lukB"
3186 " pagetables:%lukB" 3186 " pagetables:%lukB"
3187 " unstable:%lukB" 3187 " unstable:%lukB"
3188 " bounce:%lukB" 3188 " bounce:%lukB"
3189 " free_cma:%lukB" 3189 " free_cma:%lukB"
3190 " writeback_tmp:%lukB" 3190 " writeback_tmp:%lukB"
3191 " pages_scanned:%lu" 3191 " pages_scanned:%lu"
3192 " all_unreclaimable? %s" 3192 " all_unreclaimable? %s"
3193 "\n", 3193 "\n",
3194 zone->name, 3194 zone->name,
3195 K(zone_page_state(zone, NR_FREE_PAGES)), 3195 K(zone_page_state(zone, NR_FREE_PAGES)),
3196 K(min_wmark_pages(zone)), 3196 K(min_wmark_pages(zone)),
3197 K(low_wmark_pages(zone)), 3197 K(low_wmark_pages(zone)),
3198 K(high_wmark_pages(zone)), 3198 K(high_wmark_pages(zone)),
3199 K(zone_page_state(zone, NR_ACTIVE_ANON)), 3199 K(zone_page_state(zone, NR_ACTIVE_ANON)),
3200 K(zone_page_state(zone, NR_INACTIVE_ANON)), 3200 K(zone_page_state(zone, NR_INACTIVE_ANON)),
3201 K(zone_page_state(zone, NR_ACTIVE_FILE)), 3201 K(zone_page_state(zone, NR_ACTIVE_FILE)),
3202 K(zone_page_state(zone, NR_INACTIVE_FILE)), 3202 K(zone_page_state(zone, NR_INACTIVE_FILE)),
3203 K(zone_page_state(zone, NR_UNEVICTABLE)), 3203 K(zone_page_state(zone, NR_UNEVICTABLE)),
3204 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3204 K(zone_page_state(zone, NR_ISOLATED_ANON)),
3205 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3205 K(zone_page_state(zone, NR_ISOLATED_FILE)),
3206 K(zone->present_pages), 3206 K(zone->present_pages),
3207 K(zone->managed_pages), 3207 K(zone->managed_pages),
3208 K(zone_page_state(zone, NR_MLOCK)), 3208 K(zone_page_state(zone, NR_MLOCK)),
3209 K(zone_page_state(zone, NR_FILE_DIRTY)), 3209 K(zone_page_state(zone, NR_FILE_DIRTY)),
3210 K(zone_page_state(zone, NR_WRITEBACK)), 3210 K(zone_page_state(zone, NR_WRITEBACK)),
3211 K(zone_page_state(zone, NR_FILE_MAPPED)), 3211 K(zone_page_state(zone, NR_FILE_MAPPED)),
3212 K(zone_page_state(zone, NR_SHMEM)), 3212 K(zone_page_state(zone, NR_SHMEM)),
3213 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 3213 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
3214 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 3214 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
3215 zone_page_state(zone, NR_KERNEL_STACK) * 3215 zone_page_state(zone, NR_KERNEL_STACK) *
3216 THREAD_SIZE / 1024, 3216 THREAD_SIZE / 1024,
3217 K(zone_page_state(zone, NR_PAGETABLE)), 3217 K(zone_page_state(zone, NR_PAGETABLE)),
3218 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 3218 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
3219 K(zone_page_state(zone, NR_BOUNCE)), 3219 K(zone_page_state(zone, NR_BOUNCE)),
3220 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3220 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3221 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3221 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3222 zone->pages_scanned, 3222 zone->pages_scanned,
3223 (!zone_reclaimable(zone) ? "yes" : "no") 3223 (!zone_reclaimable(zone) ? "yes" : "no")
3224 ); 3224 );
3225 printk("lowmem_reserve[]:"); 3225 printk("lowmem_reserve[]:");
3226 for (i = 0; i < MAX_NR_ZONES; i++) 3226 for (i = 0; i < MAX_NR_ZONES; i++)
3227 printk(" %lu", zone->lowmem_reserve[i]); 3227 printk(" %lu", zone->lowmem_reserve[i]);
3228 printk("\n"); 3228 printk("\n");
3229 } 3229 }
3230 3230
3231 for_each_populated_zone(zone) { 3231 for_each_populated_zone(zone) {
3232 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3232 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3233 unsigned char types[MAX_ORDER]; 3233 unsigned char types[MAX_ORDER];
3234 3234
3235 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3235 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3236 continue; 3236 continue;
3237 show_node(zone); 3237 show_node(zone);
3238 printk("%s: ", zone->name); 3238 printk("%s: ", zone->name);
3239 3239
3240 spin_lock_irqsave(&zone->lock, flags); 3240 spin_lock_irqsave(&zone->lock, flags);
3241 for (order = 0; order < MAX_ORDER; order++) { 3241 for (order = 0; order < MAX_ORDER; order++) {
3242 struct free_area *area = &zone->free_area[order]; 3242 struct free_area *area = &zone->free_area[order];
3243 int type; 3243 int type;
3244 3244
3245 nr[order] = area->nr_free; 3245 nr[order] = area->nr_free;
3246 total += nr[order] << order; 3246 total += nr[order] << order;
3247 3247
3248 types[order] = 0; 3248 types[order] = 0;
3249 for (type = 0; type < MIGRATE_TYPES; type++) { 3249 for (type = 0; type < MIGRATE_TYPES; type++) {
3250 if (!list_empty(&area->free_list[type])) 3250 if (!list_empty(&area->free_list[type]))
3251 types[order] |= 1 << type; 3251 types[order] |= 1 << type;
3252 } 3252 }
3253 } 3253 }
3254 spin_unlock_irqrestore(&zone->lock, flags); 3254 spin_unlock_irqrestore(&zone->lock, flags);
3255 for (order = 0; order < MAX_ORDER; order++) { 3255 for (order = 0; order < MAX_ORDER; order++) {
3256 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3256 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3257 if (nr[order]) 3257 if (nr[order])
3258 show_migration_types(types[order]); 3258 show_migration_types(types[order]);
3259 } 3259 }
3260 printk("= %lukB\n", K(total)); 3260 printk("= %lukB\n", K(total));
3261 } 3261 }
3262 3262
3263 hugetlb_show_meminfo(); 3263 hugetlb_show_meminfo();
3264 3264
3265 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 3265 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
3266 3266
3267 show_swap_cache_info(); 3267 show_swap_cache_info();
3268 } 3268 }
3269 3269
3270 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 3270 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3271 { 3271 {
3272 zoneref->zone = zone; 3272 zoneref->zone = zone;
3273 zoneref->zone_idx = zone_idx(zone); 3273 zoneref->zone_idx = zone_idx(zone);
3274 } 3274 }
3275 3275
3276 /* 3276 /*
3277 * Builds allocation fallback zone lists. 3277 * Builds allocation fallback zone lists.
3278 * 3278 *
3279 * Add all populated zones of a node to the zonelist. 3279 * Add all populated zones of a node to the zonelist.
3280 */ 3280 */
3281 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 3281 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3282 int nr_zones) 3282 int nr_zones)
3283 { 3283 {
3284 struct zone *zone; 3284 struct zone *zone;
3285 enum zone_type zone_type = MAX_NR_ZONES; 3285 enum zone_type zone_type = MAX_NR_ZONES;
3286 3286
3287 do { 3287 do {
3288 zone_type--; 3288 zone_type--;
3289 zone = pgdat->node_zones + zone_type; 3289 zone = pgdat->node_zones + zone_type;
3290 if (populated_zone(zone)) { 3290 if (populated_zone(zone)) {
3291 zoneref_set_zone(zone, 3291 zoneref_set_zone(zone,
3292 &zonelist->_zonerefs[nr_zones++]); 3292 &zonelist->_zonerefs[nr_zones++]);
3293 check_highest_zone(zone_type); 3293 check_highest_zone(zone_type);
3294 } 3294 }
3295 } while (zone_type); 3295 } while (zone_type);
3296 3296
3297 return nr_zones; 3297 return nr_zones;
3298 } 3298 }
3299 3299
3300 3300
3301 /* 3301 /*
3302 * zonelist_order: 3302 * zonelist_order:
3303 * 0 = automatic detection of better ordering. 3303 * 0 = automatic detection of better ordering.
3304 * 1 = order by ([node] distance, -zonetype) 3304 * 1 = order by ([node] distance, -zonetype)
3305 * 2 = order by (-zonetype, [node] distance) 3305 * 2 = order by (-zonetype, [node] distance)
3306 * 3306 *
3307 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 3307 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
3308 * the same zonelist. So only NUMA can configure this param. 3308 * the same zonelist. So only NUMA can configure this param.
3309 */ 3309 */
3310 #define ZONELIST_ORDER_DEFAULT 0 3310 #define ZONELIST_ORDER_DEFAULT 0
3311 #define ZONELIST_ORDER_NODE 1 3311 #define ZONELIST_ORDER_NODE 1
3312 #define ZONELIST_ORDER_ZONE 2 3312 #define ZONELIST_ORDER_ZONE 2
3313 3313
3314 /* zonelist order in the kernel. 3314 /* zonelist order in the kernel.
3315 * set_zonelist_order() will set this to NODE or ZONE. 3315 * set_zonelist_order() will set this to NODE or ZONE.
3316 */ 3316 */
3317 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 3317 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
3318 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 3318 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
3319 3319
3320 3320
3321 #ifdef CONFIG_NUMA 3321 #ifdef CONFIG_NUMA
3322 /* The value user specified ....changed by config */ 3322 /* The value user specified ....changed by config */
3323 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3323 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3324 /* string for sysctl */ 3324 /* string for sysctl */
3325 #define NUMA_ZONELIST_ORDER_LEN 16 3325 #define NUMA_ZONELIST_ORDER_LEN 16
3326 char numa_zonelist_order[16] = "default"; 3326 char numa_zonelist_order[16] = "default";
3327 3327
3328 /* 3328 /*
3329 * interface to configure zonelist ordering. 3329 * interface to configure zonelist ordering.
3330 * command line option "numa_zonelist_order" 3330 * command line option "numa_zonelist_order"
3331 * = "[dD]efault - default, automatic configuration. 3331 * = "[dD]efault - default, automatic configuration.
3332 * = "[nN]ode - order by node locality, then by zone within node 3332 * = "[nN]ode - order by node locality, then by zone within node
3333 * = "[zZ]one - order by zone, then by locality within zone 3333 * = "[zZ]one - order by zone, then by locality within zone
3334 */ 3334 */
3335 3335
3336 static int __parse_numa_zonelist_order(char *s) 3336 static int __parse_numa_zonelist_order(char *s)
3337 { 3337 {
3338 if (*s == 'd' || *s == 'D') { 3338 if (*s == 'd' || *s == 'D') {
3339 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3339 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3340 } else if (*s == 'n' || *s == 'N') { 3340 } else if (*s == 'n' || *s == 'N') {
3341 user_zonelist_order = ZONELIST_ORDER_NODE; 3341 user_zonelist_order = ZONELIST_ORDER_NODE;
3342 } else if (*s == 'z' || *s == 'Z') { 3342 } else if (*s == 'z' || *s == 'Z') {
3343 user_zonelist_order = ZONELIST_ORDER_ZONE; 3343 user_zonelist_order = ZONELIST_ORDER_ZONE;
3344 } else { 3344 } else {
3345 printk(KERN_WARNING 3345 printk(KERN_WARNING
3346 "Ignoring invalid numa_zonelist_order value: " 3346 "Ignoring invalid numa_zonelist_order value: "
3347 "%s\n", s); 3347 "%s\n", s);
3348 return -EINVAL; 3348 return -EINVAL;
3349 } 3349 }
3350 return 0; 3350 return 0;
3351 } 3351 }
3352 3352
3353 static __init int setup_numa_zonelist_order(char *s) 3353 static __init int setup_numa_zonelist_order(char *s)
3354 { 3354 {
3355 int ret; 3355 int ret;
3356 3356
3357 if (!s) 3357 if (!s)
3358 return 0; 3358 return 0;
3359 3359
3360 ret = __parse_numa_zonelist_order(s); 3360 ret = __parse_numa_zonelist_order(s);
3361 if (ret == 0) 3361 if (ret == 0)
3362 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 3362 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3363 3363
3364 return ret; 3364 return ret;
3365 } 3365 }
3366 early_param("numa_zonelist_order", setup_numa_zonelist_order); 3366 early_param("numa_zonelist_order", setup_numa_zonelist_order);
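Note: the ordering can be selected either at boot or at run time. As documented for kernels of this vintage, passing numa_zonelist_order=zone (or =node, =default) on the kernel command line goes through the early_param() hook above, while writing one of those strings to /proc/sys/vm/numa_zonelist_order goes through the sysctl handler that follows; both paths end up in __parse_numa_zonelist_order().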
3367 3367
3368 /* 3368 /*
3369 * sysctl handler for numa_zonelist_order 3369 * sysctl handler for numa_zonelist_order
3370 */ 3370 */
3371 int numa_zonelist_order_handler(ctl_table *table, int write, 3371 int numa_zonelist_order_handler(ctl_table *table, int write,
3372 void __user *buffer, size_t *length, 3372 void __user *buffer, size_t *length,
3373 loff_t *ppos) 3373 loff_t *ppos)
3374 { 3374 {
3375 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 3375 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3376 int ret; 3376 int ret;
3377 static DEFINE_MUTEX(zl_order_mutex); 3377 static DEFINE_MUTEX(zl_order_mutex);
3378 3378
3379 mutex_lock(&zl_order_mutex); 3379 mutex_lock(&zl_order_mutex);
3380 if (write) { 3380 if (write) {
3381 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { 3381 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
3382 ret = -EINVAL; 3382 ret = -EINVAL;
3383 goto out; 3383 goto out;
3384 } 3384 }
3385 strcpy(saved_string, (char *)table->data); 3385 strcpy(saved_string, (char *)table->data);
3386 } 3386 }
3387 ret = proc_dostring(table, write, buffer, length, ppos); 3387 ret = proc_dostring(table, write, buffer, length, ppos);
3388 if (ret) 3388 if (ret)
3389 goto out; 3389 goto out;
3390 if (write) { 3390 if (write) {
3391 int oldval = user_zonelist_order; 3391 int oldval = user_zonelist_order;
3392 3392
3393 ret = __parse_numa_zonelist_order((char *)table->data); 3393 ret = __parse_numa_zonelist_order((char *)table->data);
3394 if (ret) { 3394 if (ret) {
3395 /* 3395 /*
3396 * bogus value. restore saved string 3396 * bogus value. restore saved string
3397 */ 3397 */
3398 strncpy((char *)table->data, saved_string, 3398 strncpy((char *)table->data, saved_string,
3399 NUMA_ZONELIST_ORDER_LEN); 3399 NUMA_ZONELIST_ORDER_LEN);
3400 user_zonelist_order = oldval; 3400 user_zonelist_order = oldval;
3401 } else if (oldval != user_zonelist_order) { 3401 } else if (oldval != user_zonelist_order) {
3402 mutex_lock(&zonelists_mutex); 3402 mutex_lock(&zonelists_mutex);
3403 build_all_zonelists(NULL, NULL); 3403 build_all_zonelists(NULL, NULL);
3404 mutex_unlock(&zonelists_mutex); 3404 mutex_unlock(&zonelists_mutex);
3405 } 3405 }
3406 } 3406 }
3407 out: 3407 out:
3408 mutex_unlock(&zl_order_mutex); 3408 mutex_unlock(&zl_order_mutex);
3409 return ret; 3409 return ret;
3410 } 3410 }
3411 3411
3412 3412
3413 #define MAX_NODE_LOAD (nr_online_nodes) 3413 #define MAX_NODE_LOAD (nr_online_nodes)
3414 static int node_load[MAX_NUMNODES]; 3414 static int node_load[MAX_NUMNODES];
3415 3415
3416 /** 3416 /**
3417 * find_next_best_node - find the next node that should appear in a given node's fallback list 3417 * find_next_best_node - find the next node that should appear in a given node's fallback list
3418 * @node: node whose fallback list we're appending 3418 * @node: node whose fallback list we're appending
3419 * @used_node_mask: nodemask_t of already used nodes 3419 * @used_node_mask: nodemask_t of already used nodes
3420 * 3420 *
3421 * We use a number of factors to determine which is the next node that should 3421 * We use a number of factors to determine which is the next node that should
3422 * appear on a given node's fallback list. The node should not have appeared 3422 * appear on a given node's fallback list. The node should not have appeared
3423 * already in @node's fallback list, and it should be the next closest node 3423 * already in @node's fallback list, and it should be the next closest node
3424 * according to the distance array (which contains arbitrary distance values 3424 * according to the distance array (which contains arbitrary distance values
3425 * from each node to each node in the system), and should also prefer nodes 3425 * from each node to each node in the system), and should also prefer nodes
3426 * with no CPUs, since presumably they'll have very little allocation pressure 3426 * with no CPUs, since presumably they'll have very little allocation pressure
3427 * on them otherwise. 3427 * on them otherwise.
3428 * It returns -1 if no node is found. 3428 * It returns -1 if no node is found.
3429 */ 3429 */
3430 static int find_next_best_node(int node, nodemask_t *used_node_mask) 3430 static int find_next_best_node(int node, nodemask_t *used_node_mask)
3431 { 3431 {
3432 int n, val; 3432 int n, val;
3433 int min_val = INT_MAX; 3433 int min_val = INT_MAX;
3434 int best_node = NUMA_NO_NODE; 3434 int best_node = NUMA_NO_NODE;
3435 const struct cpumask *tmp = cpumask_of_node(0); 3435 const struct cpumask *tmp = cpumask_of_node(0);
3436 3436
3437 /* Use the local node if we haven't already */ 3437 /* Use the local node if we haven't already */
3438 if (!node_isset(node, *used_node_mask)) { 3438 if (!node_isset(node, *used_node_mask)) {
3439 node_set(node, *used_node_mask); 3439 node_set(node, *used_node_mask);
3440 return node; 3440 return node;
3441 } 3441 }
3442 3442
3443 for_each_node_state(n, N_MEMORY) { 3443 for_each_node_state(n, N_MEMORY) {
3444 3444
3445 /* Don't want a node to appear more than once */ 3445 /* Don't want a node to appear more than once */
3446 if (node_isset(n, *used_node_mask)) 3446 if (node_isset(n, *used_node_mask))
3447 continue; 3447 continue;
3448 3448
3449 /* Use the distance array to find the distance */ 3449 /* Use the distance array to find the distance */
3450 val = node_distance(node, n); 3450 val = node_distance(node, n);
3451 3451
3452 /* Penalize nodes under us ("prefer the next node") */ 3452 /* Penalize nodes under us ("prefer the next node") */
3453 val += (n < node); 3453 val += (n < node);
3454 3454
3455 /* Give preference to headless and unused nodes */ 3455 /* Give preference to headless and unused nodes */
3456 tmp = cpumask_of_node(n); 3456 tmp = cpumask_of_node(n);
3457 if (!cpumask_empty(tmp)) 3457 if (!cpumask_empty(tmp))
3458 val += PENALTY_FOR_NODE_WITH_CPUS; 3458 val += PENALTY_FOR_NODE_WITH_CPUS;
3459 3459
3460 /* Slight preference for less loaded node */ 3460 /* Slight preference for less loaded node */
3461 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 3461 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3462 val += node_load[n]; 3462 val += node_load[n];
3463 3463
3464 if (val < min_val) { 3464 if (val < min_val) {
3465 min_val = val; 3465 min_val = val;
3466 best_node = n; 3466 best_node = n;
3467 } 3467 }
3468 } 3468 }
3469 3469
3470 if (best_node >= 0) 3470 if (best_node >= 0)
3471 node_set(best_node, *used_node_mask); 3471 node_set(best_node, *used_node_mask);
3472 3472
3473 return best_node; 3473 return best_node;
3474 } 3474 }
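Note: a hypothetical scoring walk-through for find_next_best_node(), building node 0's fallback list; the distances and penalty below are made up for illustration.

/*
 *  candidate n   node_distance(0, n)   +(n < node)   has CPUs?
 *       1                20                 +0        yes (+PENALTY)
 *       2                40                 +0        no
 *
 * Each subtotal is then multiplied by MAX_NODE_LOAD * MAX_NUMNODES and
 * node_load[n] is added as a tie-breaker, so unless the CPU penalty is
 * very large the closer node 1 scores lower and is appended first.
 */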
3475 3475
3476 3476
3477 /* 3477 /*
3478 * Build zonelists ordered by node and zones within node. 3478 * Build zonelists ordered by node and zones within node.
3479 * This results in maximum locality--normal zone overflows into local 3479 * This results in maximum locality--normal zone overflows into local
3480 * DMA zone, if any--but risks exhausting DMA zone. 3480 * DMA zone, if any--but risks exhausting DMA zone.
3481 */ 3481 */
3482 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 3482 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3483 { 3483 {
3484 int j; 3484 int j;
3485 struct zonelist *zonelist; 3485 struct zonelist *zonelist;
3486 3486
3487 zonelist = &pgdat->node_zonelists[0]; 3487 zonelist = &pgdat->node_zonelists[0];
3488 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3488 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3489 ; 3489 ;
3490 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3490 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3491 zonelist->_zonerefs[j].zone = NULL; 3491 zonelist->_zonerefs[j].zone = NULL;
3492 zonelist->_zonerefs[j].zone_idx = 0; 3492 zonelist->_zonerefs[j].zone_idx = 0;
3493 } 3493 }
3494 3494
3495 /* 3495 /*
3496 * Build gfp_thisnode zonelists 3496 * Build gfp_thisnode zonelists
3497 */ 3497 */
3498 static void build_thisnode_zonelists(pg_data_t *pgdat) 3498 static void build_thisnode_zonelists(pg_data_t *pgdat)
3499 { 3499 {
3500 int j; 3500 int j;
3501 struct zonelist *zonelist; 3501 struct zonelist *zonelist;
3502 3502
3503 zonelist = &pgdat->node_zonelists[1]; 3503 zonelist = &pgdat->node_zonelists[1];
3504 j = build_zonelists_node(pgdat, zonelist, 0); 3504 j = build_zonelists_node(pgdat, zonelist, 0);
3505 zonelist->_zonerefs[j].zone = NULL; 3505 zonelist->_zonerefs[j].zone = NULL;
3506 zonelist->_zonerefs[j].zone_idx = 0; 3506 zonelist->_zonerefs[j].zone_idx = 0;
3507 } 3507 }
3508 3508
3509 /* 3509 /*
3510 * Build zonelists ordered by zone and nodes within zones. 3510 * Build zonelists ordered by zone and nodes within zones.
3511 * This results in conserving DMA zone[s] until all Normal memory is 3511 * This results in conserving DMA zone[s] until all Normal memory is
3512 * exhausted, but results in overflowing to remote node while memory 3512 * exhausted, but results in overflowing to remote node while memory
3513 * may still exist in local DMA zone. 3513 * may still exist in local DMA zone.
3514 */ 3514 */
3515 static int node_order[MAX_NUMNODES]; 3515 static int node_order[MAX_NUMNODES];
3516 3516
3517 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 3517 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3518 { 3518 {
3519 int pos, j, node; 3519 int pos, j, node;
3520 int zone_type; /* needs to be signed */ 3520 int zone_type; /* needs to be signed */
3521 struct zone *z; 3521 struct zone *z;
3522 struct zonelist *zonelist; 3522 struct zonelist *zonelist;
3523 3523
3524 zonelist = &pgdat->node_zonelists[0]; 3524 zonelist = &pgdat->node_zonelists[0];
3525 pos = 0; 3525 pos = 0;
3526 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 3526 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3527 for (j = 0; j < nr_nodes; j++) { 3527 for (j = 0; j < nr_nodes; j++) {
3528 node = node_order[j]; 3528 node = node_order[j];
3529 z = &NODE_DATA(node)->node_zones[zone_type]; 3529 z = &NODE_DATA(node)->node_zones[zone_type];
3530 if (populated_zone(z)) { 3530 if (populated_zone(z)) {
3531 zoneref_set_zone(z, 3531 zoneref_set_zone(z,
3532 &zonelist->_zonerefs[pos++]); 3532 &zonelist->_zonerefs[pos++]);
3533 check_highest_zone(zone_type); 3533 check_highest_zone(zone_type);
3534 } 3534 }
3535 } 3535 }
3536 } 3536 }
3537 zonelist->_zonerefs[pos].zone = NULL; 3537 zonelist->_zonerefs[pos].zone = NULL;
3538 zonelist->_zonerefs[pos].zone_idx = 0; 3538 zonelist->_zonerefs[pos].zone_idx = 0;
3539 } 3539 }
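Note: a hypothetical two-node machine, each node populated with Normal and DMA32 zones, makes the difference between the two orderings concrete.

/*
 * node order (build_zonelists_in_node_order):
 *   N0_Normal, N0_DMA32, N1_Normal, N1_DMA32
 * zone order (build_zonelists_in_zone_order):
 *   N0_Normal, N1_Normal, N0_DMA32, N1_DMA32
 *
 * Node order keeps allocations local even if that consumes the local
 * DMA32 zone; zone order preserves the low zones at the cost of
 * spilling off-node sooner.
 */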
3540 3540
3541 static int default_zonelist_order(void) 3541 static int default_zonelist_order(void)
3542 { 3542 {
3543 int nid, zone_type; 3543 int nid, zone_type;
3544 unsigned long low_kmem_size, total_size; 3544 unsigned long low_kmem_size, total_size;
3545 struct zone *z; 3545 struct zone *z;
3546 int average_size; 3546 int average_size;
3547 /* 3547 /*
3548 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system. 3548 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system.
3549 * If they are really small and used heavily, the system can fall 3549 * If they are really small and used heavily, the system can fall
3550 * into OOM very easily. 3550 * into OOM very easily.
3551 * This function detects the ZONE_DMA/DMA32 size and configures the zone order. 3551 * This function detects the ZONE_DMA/DMA32 size and configures the zone order.
3552 */ 3552 */
3553 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ 3553 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
3554 low_kmem_size = 0; 3554 low_kmem_size = 0;
3555 total_size = 0; 3555 total_size = 0;
3556 for_each_online_node(nid) { 3556 for_each_online_node(nid) {
3557 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3557 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3558 z = &NODE_DATA(nid)->node_zones[zone_type]; 3558 z = &NODE_DATA(nid)->node_zones[zone_type];
3559 if (populated_zone(z)) { 3559 if (populated_zone(z)) {
3560 if (zone_type < ZONE_NORMAL) 3560 if (zone_type < ZONE_NORMAL)
3561 low_kmem_size += z->managed_pages; 3561 low_kmem_size += z->managed_pages;
3562 total_size += z->managed_pages; 3562 total_size += z->managed_pages;
3563 } else if (zone_type == ZONE_NORMAL) { 3563 } else if (zone_type == ZONE_NORMAL) {
3564 /* 3564 /*
3565 * If any node has only lowmem, then node order 3565 * If any node has only lowmem, then node order
3566 * is preferred to allow kernel allocations 3566 * is preferred to allow kernel allocations
3567 * locally; otherwise, they can easily infringe 3567 * locally; otherwise, they can easily infringe
3568 * on other nodes when there is an abundance of 3568 * on other nodes when there is an abundance of
3569 * lowmem available to allocate from. 3569 * lowmem available to allocate from.
3570 */ 3570 */
3571 return ZONELIST_ORDER_NODE; 3571 return ZONELIST_ORDER_NODE;
3572 } 3572 }
3573 } 3573 }
3574 } 3574 }
3575 if (!low_kmem_size || /* there are no DMA area. */ 3575 if (!low_kmem_size || /* there are no DMA area. */
3576 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 3576 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3577 return ZONELIST_ORDER_NODE; 3577 return ZONELIST_ORDER_NODE;
3578 /* 3578 /*
3579 * look into each node's config. 3579 * look into each node's config.
3580 * If there is a node whose DMA/DMA32 memory is very big area on 3580 * If there is a node whose DMA/DMA32 memory is very big area on
3581 * local memory, NODE_ORDER may be suitable. 3581 * local memory, NODE_ORDER may be suitable.
3582 */ 3582 */
3583 average_size = total_size / 3583 average_size = total_size /
3584 (nodes_weight(node_states[N_MEMORY]) + 1); 3584 (nodes_weight(node_states[N_MEMORY]) + 1);
3585 for_each_online_node(nid) { 3585 for_each_online_node(nid) {
3586 low_kmem_size = 0; 3586 low_kmem_size = 0;
3587 total_size = 0; 3587 total_size = 0;
3588 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3588 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3589 z = &NODE_DATA(nid)->node_zones[zone_type]; 3589 z = &NODE_DATA(nid)->node_zones[zone_type];
3590 if (populated_zone(z)) { 3590 if (populated_zone(z)) {
3591 if (zone_type < ZONE_NORMAL) 3591 if (zone_type < ZONE_NORMAL)
3592 low_kmem_size += z->present_pages; 3592 low_kmem_size += z->present_pages;
3593 total_size += z->present_pages; 3593 total_size += z->present_pages;
3594 } 3594 }
3595 } 3595 }
3596 if (low_kmem_size && 3596 if (low_kmem_size &&
3597 total_size > average_size && /* ignore small node */ 3597 total_size > average_size && /* ignore small node */
3598 low_kmem_size > total_size * 70/100) 3598 low_kmem_size > total_size * 70/100)
3599 return ZONELIST_ORDER_NODE; 3599 return ZONELIST_ORDER_NODE;
3600 } 3600 }
3601 return ZONELIST_ORDER_ZONE; 3601 return ZONELIST_ORDER_ZONE;
3602 } 3602 }
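Note: a hypothetical example of the heuristic above: a node with 4,000,000 managed pages of which 3,200,000 live in ZONE_DMA/DMA32 has 80% of its memory in low zones, so if it is also larger than the per-node average the second loop returns ZONELIST_ORDER_NODE; if no node crosses the 70% mark (and the earlier global check did not already pick node order), the function falls through to ZONELIST_ORDER_ZONE.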
3603 3603
3604 static void set_zonelist_order(void) 3604 static void set_zonelist_order(void)
3605 { 3605 {
3606 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 3606 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3607 current_zonelist_order = default_zonelist_order(); 3607 current_zonelist_order = default_zonelist_order();
3608 else 3608 else
3609 current_zonelist_order = user_zonelist_order; 3609 current_zonelist_order = user_zonelist_order;
3610 } 3610 }
3611 3611
3612 static void build_zonelists(pg_data_t *pgdat) 3612 static void build_zonelists(pg_data_t *pgdat)
3613 { 3613 {
3614 int j, node, load; 3614 int j, node, load;
3615 enum zone_type i; 3615 enum zone_type i;
3616 nodemask_t used_mask; 3616 nodemask_t used_mask;
3617 int local_node, prev_node; 3617 int local_node, prev_node;
3618 struct zonelist *zonelist; 3618 struct zonelist *zonelist;
3619 int order = current_zonelist_order; 3619 int order = current_zonelist_order;
3620 3620
3621 /* initialize zonelists */ 3621 /* initialize zonelists */
3622 for (i = 0; i < MAX_ZONELISTS; i++) { 3622 for (i = 0; i < MAX_ZONELISTS; i++) {
3623 zonelist = pgdat->node_zonelists + i; 3623 zonelist = pgdat->node_zonelists + i;
3624 zonelist->_zonerefs[0].zone = NULL; 3624 zonelist->_zonerefs[0].zone = NULL;
3625 zonelist->_zonerefs[0].zone_idx = 0; 3625 zonelist->_zonerefs[0].zone_idx = 0;
3626 } 3626 }
3627 3627
3628 /* NUMA-aware ordering of nodes */ 3628 /* NUMA-aware ordering of nodes */
3629 local_node = pgdat->node_id; 3629 local_node = pgdat->node_id;
3630 load = nr_online_nodes; 3630 load = nr_online_nodes;
3631 prev_node = local_node; 3631 prev_node = local_node;
3632 nodes_clear(used_mask); 3632 nodes_clear(used_mask);
3633 3633
3634 memset(node_order, 0, sizeof(node_order)); 3634 memset(node_order, 0, sizeof(node_order));
3635 j = 0; 3635 j = 0;
3636 3636
3637 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3637 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3638 /* 3638 /*
3639 * We don't want to pressure a particular node. 3639 * We don't want to pressure a particular node.
3640 * So we add a penalty to the first node in the same 3640 * So we add a penalty to the first node in the same
3641 * distance group to make it round-robin. 3641 * distance group to make it round-robin.
3642 */ 3642 */
3643 if (node_distance(local_node, node) != 3643 if (node_distance(local_node, node) !=
3644 node_distance(local_node, prev_node)) 3644 node_distance(local_node, prev_node))
3645 node_load[node] = load; 3645 node_load[node] = load;
3646 3646
3647 prev_node = node; 3647 prev_node = node;
3648 load--; 3648 load--;
3649 if (order == ZONELIST_ORDER_NODE) 3649 if (order == ZONELIST_ORDER_NODE)
3650 build_zonelists_in_node_order(pgdat, node); 3650 build_zonelists_in_node_order(pgdat, node);
3651 else 3651 else
3652 node_order[j++] = node; /* remember order */ 3652 node_order[j++] = node; /* remember order */
3653 } 3653 }
3654 3654
3655 if (order == ZONELIST_ORDER_ZONE) { 3655 if (order == ZONELIST_ORDER_ZONE) {
3656 /* calculate node order -- i.e., DMA last! */ 3656 /* calculate node order -- i.e., DMA last! */
3657 build_zonelists_in_zone_order(pgdat, j); 3657 build_zonelists_in_zone_order(pgdat, j);
3658 } 3658 }
3659 3659
3660 build_thisnode_zonelists(pgdat); 3660 build_thisnode_zonelists(pgdat);
3661 } 3661 }
3662 3662
3663 /* Construct the zonelist performance cache - see further mmzone.h */ 3663 /* Construct the zonelist performance cache - see further mmzone.h */
3664 static void build_zonelist_cache(pg_data_t *pgdat) 3664 static void build_zonelist_cache(pg_data_t *pgdat)
3665 { 3665 {
3666 struct zonelist *zonelist; 3666 struct zonelist *zonelist;
3667 struct zonelist_cache *zlc; 3667 struct zonelist_cache *zlc;
3668 struct zoneref *z; 3668 struct zoneref *z;
3669 3669
3670 zonelist = &pgdat->node_zonelists[0]; 3670 zonelist = &pgdat->node_zonelists[0];
3671 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 3671 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3672 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 3672 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3673 for (z = zonelist->_zonerefs; z->zone; z++) 3673 for (z = zonelist->_zonerefs; z->zone; z++)
3674 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 3674 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3675 } 3675 }
3676 3676
3677 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3677 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3678 /* 3678 /*
3679 * Return node id of node used for "local" allocations. 3679 * Return node id of node used for "local" allocations.
3680 * I.e., first node id of first zone in arg node's generic zonelist. 3680 * I.e., first node id of first zone in arg node's generic zonelist.
3681 * Used for initializing percpu 'numa_mem', which is used primarily 3681 * Used for initializing percpu 'numa_mem', which is used primarily
3682 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3682 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3683 */ 3683 */
3684 int local_memory_node(int node) 3684 int local_memory_node(int node)
3685 { 3685 {
3686 struct zone *zone; 3686 struct zone *zone;
3687 3687
3688 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3688 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3689 gfp_zone(GFP_KERNEL), 3689 gfp_zone(GFP_KERNEL),
3690 NULL, 3690 NULL,
3691 &zone); 3691 &zone);
3692 return zone->node; 3692 return zone->node;
3693 } 3693 }
3694 #endif 3694 #endif
3695 3695
3696 #else /* CONFIG_NUMA */ 3696 #else /* CONFIG_NUMA */
3697 3697
3698 static void set_zonelist_order(void) 3698 static void set_zonelist_order(void)
3699 { 3699 {
3700 current_zonelist_order = ZONELIST_ORDER_ZONE; 3700 current_zonelist_order = ZONELIST_ORDER_ZONE;
3701 } 3701 }
3702 3702
3703 static void build_zonelists(pg_data_t *pgdat) 3703 static void build_zonelists(pg_data_t *pgdat)
3704 { 3704 {
3705 int node, local_node; 3705 int node, local_node;
3706 enum zone_type j; 3706 enum zone_type j;
3707 struct zonelist *zonelist; 3707 struct zonelist *zonelist;
3708 3708
3709 local_node = pgdat->node_id; 3709 local_node = pgdat->node_id;
3710 3710
3711 zonelist = &pgdat->node_zonelists[0]; 3711 zonelist = &pgdat->node_zonelists[0];
3712 j = build_zonelists_node(pgdat, zonelist, 0); 3712 j = build_zonelists_node(pgdat, zonelist, 0);
3713 3713
3714 /* 3714 /*
3715 * Now we build the zonelist so that it contains the zones 3715 * Now we build the zonelist so that it contains the zones
3716 * of all the other nodes. 3716 * of all the other nodes.
3717 * We don't want to pressure a particular node, so when 3717 * We don't want to pressure a particular node, so when
3718 * building the zones for node N, we make sure that the 3718 * building the zones for node N, we make sure that the
3719 * zones coming right after the local ones are those from 3719 * zones coming right after the local ones are those from
3720 * node N+1 (modulo the number of nodes) 3720 * node N+1 (modulo the number of nodes)
3721 */ 3721 */
3722 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3722 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3723 if (!node_online(node)) 3723 if (!node_online(node))
3724 continue; 3724 continue;
3725 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3725 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3726 } 3726 }
3727 for (node = 0; node < local_node; node++) { 3727 for (node = 0; node < local_node; node++) {
3728 if (!node_online(node)) 3728 if (!node_online(node))
3729 continue; 3729 continue;
3730 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3730 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3731 } 3731 }
3732 3732
3733 zonelist->_zonerefs[j].zone = NULL; 3733 zonelist->_zonerefs[j].zone = NULL;
3734 zonelist->_zonerefs[j].zone_idx = 0; 3734 zonelist->_zonerefs[j].zone_idx = 0;
3735 } 3735 }
3736 3736
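The two loops above simply start numbering at the node after the local one and wrap around. A minimal userspace sketch of the resulting visiting order, with an invented four-node machine where every node is online:

#include <stdio.h>

#define MAX_NUMNODES 4

/* Pretend all four nodes are online. */
static int node_online(int node) { return node >= 0 && node < MAX_NUMNODES; }

/* Print nodes in the order the non-NUMA build_zonelists() visits them. */
static void print_node_order(int local_node)
{
	int node;

	printf("local node %d:", local_node);
	printf(" %d", local_node);			/* own zones first */
	for (node = local_node + 1; node < MAX_NUMNODES; node++)
		if (node_online(node))
			printf(" %d", node);
	for (node = 0; node < local_node; node++)
		if (node_online(node))
			printf(" %d", node);
	printf("\n");
}

int main(void)
{
	print_node_order(2);	/* prints: local node 2: 2 3 0 1 */
	return 0;
}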
3737 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3737 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3738 static void build_zonelist_cache(pg_data_t *pgdat) 3738 static void build_zonelist_cache(pg_data_t *pgdat)
3739 { 3739 {
3740 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3740 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3741 } 3741 }
3742 3742
3743 #endif /* CONFIG_NUMA */ 3743 #endif /* CONFIG_NUMA */
3744 3744
3745 /* 3745 /*
3746 * Boot pageset table. One per cpu which is going to be used for all 3746 * Boot pageset table. One per cpu which is going to be used for all
3747 * zones and all nodes. The parameters will be set in such a way 3747 * zones and all nodes. The parameters will be set in such a way
3748 * that an item put on a list will immediately be handed over to 3748 * that an item put on a list will immediately be handed over to
3749 * the buddy list. This is safe since pageset manipulation is done 3749 * the buddy list. This is safe since pageset manipulation is done
3750 * with interrupts disabled. 3750 * with interrupts disabled.
3751 * 3751 *
3752 * The boot_pagesets must be kept even after bootup is complete for 3752 * The boot_pagesets must be kept even after bootup is complete for
3753 * unused processors and/or zones. They do play a role for bootstrapping 3753 * unused processors and/or zones. They do play a role for bootstrapping
3754 * hotplugged processors. 3754 * hotplugged processors.
3755 * 3755 *
3756 * zoneinfo_show() and maybe other functions do 3756 * zoneinfo_show() and maybe other functions do
3757 * not check if the processor is online before following the pageset pointer. 3757 * not check if the processor is online before following the pageset pointer.
3758 * Other parts of the kernel may not check if the zone is available. 3758 * Other parts of the kernel may not check if the zone is available.
3759 */ 3759 */
3760 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3760 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3761 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3761 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3762 static void setup_zone_pageset(struct zone *zone); 3762 static void setup_zone_pageset(struct zone *zone);
3763 3763
3764 /* 3764 /*
3765 * Global mutex to protect against size modification of zonelists 3765 * Global mutex to protect against size modification of zonelists
3766 * as well as to serialize pageset setup for the new populated zone. 3766 * as well as to serialize pageset setup for the new populated zone.
3767 */ 3767 */
3768 DEFINE_MUTEX(zonelists_mutex); 3768 DEFINE_MUTEX(zonelists_mutex);
3769 3769
3770 /* return value is int just for stop_machine() */ 3770 /* return value is int just for stop_machine() */
3771 static int __build_all_zonelists(void *data) 3771 static int __build_all_zonelists(void *data)
3772 { 3772 {
3773 int nid; 3773 int nid;
3774 int cpu; 3774 int cpu;
3775 pg_data_t *self = data; 3775 pg_data_t *self = data;
3776 3776
3777 #ifdef CONFIG_NUMA 3777 #ifdef CONFIG_NUMA
3778 memset(node_load, 0, sizeof(node_load)); 3778 memset(node_load, 0, sizeof(node_load));
3779 #endif 3779 #endif
3780 3780
3781 if (self && !node_online(self->node_id)) { 3781 if (self && !node_online(self->node_id)) {
3782 build_zonelists(self); 3782 build_zonelists(self);
3783 build_zonelist_cache(self); 3783 build_zonelist_cache(self);
3784 } 3784 }
3785 3785
3786 for_each_online_node(nid) { 3786 for_each_online_node(nid) {
3787 pg_data_t *pgdat = NODE_DATA(nid); 3787 pg_data_t *pgdat = NODE_DATA(nid);
3788 3788
3789 build_zonelists(pgdat); 3789 build_zonelists(pgdat);
3790 build_zonelist_cache(pgdat); 3790 build_zonelist_cache(pgdat);
3791 } 3791 }
3792 3792
3793 /* 3793 /*
3794 * Initialize the boot_pagesets that are going to be used 3794 * Initialize the boot_pagesets that are going to be used
3795 * for bootstrapping processors. The real pagesets for 3795 * for bootstrapping processors. The real pagesets for
3796 * each zone will be allocated later when the per cpu 3796 * each zone will be allocated later when the per cpu
3797 * allocator is available. 3797 * allocator is available.
3798 * 3798 *
3799 * boot_pagesets are used also for bootstrapping offline 3799 * boot_pagesets are used also for bootstrapping offline
3800 * cpus if the system is already booted because the pagesets 3800 * cpus if the system is already booted because the pagesets
3801 * are needed to initialize allocators on a specific cpu too. 3801 * are needed to initialize allocators on a specific cpu too.
3802 * F.e. the percpu allocator needs the page allocator which 3802 * F.e. the percpu allocator needs the page allocator which
3803 * needs the percpu allocator in order to allocate its pagesets 3803 * needs the percpu allocator in order to allocate its pagesets
3804 * (a chicken-egg dilemma). 3804 * (a chicken-egg dilemma).
3805 */ 3805 */
3806 for_each_possible_cpu(cpu) { 3806 for_each_possible_cpu(cpu) {
3807 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3807 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3808 3808
3809 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3809 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3810 /* 3810 /*
3811 * We now know the "local memory node" for each node-- 3811 * We now know the "local memory node" for each node--
3812 * i.e., the node of the first zone in the generic zonelist. 3812 * i.e., the node of the first zone in the generic zonelist.
3813 * Set up numa_mem percpu variable for on-line cpus. During 3813 * Set up numa_mem percpu variable for on-line cpus. During
3814 * boot, only the boot cpu should be on-line; we'll init the 3814 * boot, only the boot cpu should be on-line; we'll init the
3815 * secondary cpus' numa_mem as they come on-line. During 3815 * secondary cpus' numa_mem as they come on-line. During
3816 * node/memory hotplug, we'll fixup all on-line cpus. 3816 * node/memory hotplug, we'll fixup all on-line cpus.
3817 */ 3817 */
3818 if (cpu_online(cpu)) 3818 if (cpu_online(cpu))
3819 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3819 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3820 #endif 3820 #endif
3821 } 3821 }
3822 3822
3823 return 0; 3823 return 0;
3824 } 3824 }
3825 3825
3826 /* 3826 /*
3827 * Called with zonelists_mutex held always 3827 * Called with zonelists_mutex held always
3828 * unless system_state == SYSTEM_BOOTING. 3828 * unless system_state == SYSTEM_BOOTING.
3829 */ 3829 */
3830 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3830 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3831 { 3831 {
3832 set_zonelist_order(); 3832 set_zonelist_order();
3833 3833
3834 if (system_state == SYSTEM_BOOTING) { 3834 if (system_state == SYSTEM_BOOTING) {
3835 __build_all_zonelists(NULL); 3835 __build_all_zonelists(NULL);
3836 mminit_verify_zonelist(); 3836 mminit_verify_zonelist();
3837 cpuset_init_current_mems_allowed(); 3837 cpuset_init_current_mems_allowed();
3838 } else { 3838 } else {
3839 #ifdef CONFIG_MEMORY_HOTPLUG 3839 #ifdef CONFIG_MEMORY_HOTPLUG
3840 if (zone) 3840 if (zone)
3841 setup_zone_pageset(zone); 3841 setup_zone_pageset(zone);
3842 #endif 3842 #endif
3843 /* we have to stop all cpus to guarantee there is no user 3843 /* we have to stop all cpus to guarantee there is no user
3844 of zonelist */ 3844 of zonelist */
3845 stop_machine(__build_all_zonelists, pgdat, NULL); 3845 stop_machine(__build_all_zonelists, pgdat, NULL);
3846 /* cpuset refresh routine should be here */ 3846 /* cpuset refresh routine should be here */
3847 } 3847 }
3848 vm_total_pages = nr_free_pagecache_pages(); 3848 vm_total_pages = nr_free_pagecache_pages();
3849 /* 3849 /*
3850 * Disable grouping by mobility if the number of pages in the 3850 * Disable grouping by mobility if the number of pages in the
3851 * system is too low to allow the mechanism to work. It would be 3851 * system is too low to allow the mechanism to work. It would be
3852 * more accurate, but expensive to check per-zone. This check is 3852 * more accurate, but expensive to check per-zone. This check is
3853 * made on memory hot-add so a system can start with mobility 3853 * made on memory hot-add so a system can start with mobility
3854 * disabled and enable it later. 3854 * disabled and enable it later.
3855 */ 3855 */
3856 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3856 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3857 page_group_by_mobility_disabled = 1; 3857 page_group_by_mobility_disabled = 1;
3858 else 3858 else
3859 page_group_by_mobility_disabled = 0; 3859 page_group_by_mobility_disabled = 0;
3860 3860
3861 printk("Built %i zonelists in %s order, mobility grouping %s. " 3861 printk("Built %i zonelists in %s order, mobility grouping %s. "
3862 "Total pages: %ld\n", 3862 "Total pages: %ld\n",
3863 nr_online_nodes, 3863 nr_online_nodes,
3864 zonelist_order_name[current_zonelist_order], 3864 zonelist_order_name[current_zonelist_order],
3865 page_group_by_mobility_disabled ? "off" : "on", 3865 page_group_by_mobility_disabled ? "off" : "on",
3866 vm_total_pages); 3866 vm_total_pages);
3867 #ifdef CONFIG_NUMA 3867 #ifdef CONFIG_NUMA
3868 printk("Policy zone: %s\n", zone_names[policy_zone]); 3868 printk("Policy zone: %s\n", zone_names[policy_zone]);
3869 #endif 3869 #endif
3870 } 3870 }
3871 3871
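For scale, the pageblock_nr_pages * MIGRATE_TYPES threshold above works out to only a few megabytes on common configurations. A quick sketch of the arithmetic, assuming 2MB pageblocks of 4K pages and 6 migrate types (typical x86-64 values, not taken from this commit):

#include <stdio.h>

int main(void)
{
	/* Assumed values: 2MB pageblocks of 4K pages, 6 migratetypes. */
	unsigned long pageblock_nr_pages = 512;
	unsigned long migrate_types = 6;
	unsigned long threshold_pages = pageblock_nr_pages * migrate_types;

	/* 3072 pages == 12MB: only very small systems disable grouping. */
	printf("threshold: %lu pages (%lu MB)\n",
	       threshold_pages, threshold_pages * 4 / 1024);
	return 0;
}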
3872 /* 3872 /*
3873 * Helper functions to size the waitqueue hash table. 3873 * Helper functions to size the waitqueue hash table.
3874 * Essentially these want to choose hash table sizes sufficiently 3874 * Essentially these want to choose hash table sizes sufficiently
3875 * large so that collisions trying to wait on pages are rare. 3875 * large so that collisions trying to wait on pages are rare.
3876 * But in fact, the number of active page waitqueues on typical 3876 * But in fact, the number of active page waitqueues on typical
3877 * systems is ridiculously low, less than 200. So this is quite 3877 * systems is ridiculously low, less than 200. So this is quite
3878 * conservative, even though it seems large. 3878 * conservative, even though it seems large.
3879 * 3879 *
3880 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3880 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3881 * waitqueues, i.e. the size of the waitq table given the number of pages. 3881 * waitqueues, i.e. the size of the waitq table given the number of pages.
3882 */ 3882 */
3883 #define PAGES_PER_WAITQUEUE 256 3883 #define PAGES_PER_WAITQUEUE 256
3884 3884
3885 #ifndef CONFIG_MEMORY_HOTPLUG 3885 #ifndef CONFIG_MEMORY_HOTPLUG
3886 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3886 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3887 { 3887 {
3888 unsigned long size = 1; 3888 unsigned long size = 1;
3889 3889
3890 pages /= PAGES_PER_WAITQUEUE; 3890 pages /= PAGES_PER_WAITQUEUE;
3891 3891
3892 while (size < pages) 3892 while (size < pages)
3893 size <<= 1; 3893 size <<= 1;
3894 3894
3895 /* 3895 /*
3896 * Once we have dozens or even hundreds of threads sleeping 3896 * Once we have dozens or even hundreds of threads sleeping
3897 * on IO we've got bigger problems than wait queue collision. 3897 * on IO we've got bigger problems than wait queue collision.
3898 * Limit the size of the wait table to a reasonable size. 3898 * Limit the size of the wait table to a reasonable size.
3899 */ 3899 */
3900 size = min(size, 4096UL); 3900 size = min(size, 4096UL);
3901 3901
3902 return max(size, 4UL); 3902 return max(size, 4UL);
3903 } 3903 }
3904 #else 3904 #else
3905 /* 3905 /*
3906 * A zone's size might be changed by hot-add, so it is not possible to determine 3906 * A zone's size might be changed by hot-add, so it is not possible to determine
3907 * a suitable size for its wait_table. So we use the maximum size now. 3907 * a suitable size for its wait_table. So we use the maximum size now.
3908 * 3908 *
3909 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3909 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3910 * 3910 *
3911 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3911 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3912 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3912 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3913 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3913 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3914 * 3914 *
3915 * The maximum number of entries is used when a zone's memory is (512K + 256) pages 3915 * The maximum number of entries is used when a zone's memory is (512K + 256) pages
3916 * or more, computed the traditional way (see above). That corresponds to: 3916 * or more, computed the traditional way (see above). That corresponds to:
3917 * 3917 *
3918 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3918 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3919 * ia64(16K page size) : = ( 8G + 4M)byte. 3919 * ia64(16K page size) : = ( 8G + 4M)byte.
3920 * powerpc (64K page size) : = (32G +16M)byte. 3920 * powerpc (64K page size) : = (32G +16M)byte.
3921 */ 3921 */
3922 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3922 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3923 { 3923 {
3924 return 4096UL; 3924 return 4096UL;
3925 } 3925 }
3926 #endif 3926 #endif
3927 3927
3928 /* 3928 /*
3929 * This is an integer logarithm so that shifts can be used later 3929 * This is an integer logarithm so that shifts can be used later
3930 * to extract the more random high bits from the multiplicative 3930 * to extract the more random high bits from the multiplicative
3931 * hash function before the remainder is taken. 3931 * hash function before the remainder is taken.
3932 */ 3932 */
3933 static inline unsigned long wait_table_bits(unsigned long size) 3933 static inline unsigned long wait_table_bits(unsigned long size)
3934 { 3934 {
3935 return ffz(~size); 3935 return ffz(~size);
3936 } 3936 }
3937 3937
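Putting the two helpers above together: the zone size is divided by PAGES_PER_WAITQUEUE, rounded up to a power of two, clamped to [4, 4096], and wait_table_bits() then yields the log2 of that size. A standalone sketch with an invented 4GB zone of 4K pages, reimplementing the log2 step with a simple loop rather than ffz():

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

/* Same shape as wait_table_hash_nr_entries() for !CONFIG_MEMORY_HOTPLUG. */
static unsigned long table_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096UL)
		size = 4096UL;
	return size < 4UL ? 4UL : size;
}

/* Equivalent result to wait_table_bits(): log2 of a power-of-two size. */
static unsigned long table_bits(unsigned long size)
{
	unsigned long bits = 0;

	while (size > 1) {
		size >>= 1;
		bits++;
	}
	return bits;
}

int main(void)
{
	unsigned long pages = 1048576;			/* 4GB of 4K pages */
	unsigned long entries = table_entries(pages);	/* 4096 */

	/* prints: 4096 entries, 12 bits */
	printf("%lu entries, %lu bits\n", entries, table_bits(entries));
	return 0;
}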
3938 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3938 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3939 3939
3940 /* 3940 /*
3941 * Check if a pageblock contains reserved pages 3941 * Check if a pageblock contains reserved pages
3942 */ 3942 */
3943 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3943 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3944 { 3944 {
3945 unsigned long pfn; 3945 unsigned long pfn;
3946 3946
3947 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3947 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3948 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3948 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3949 return 1; 3949 return 1;
3950 } 3950 }
3951 return 0; 3951 return 0;
3952 } 3952 }
3953 3953
3954 /* 3954 /*
3955 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3955 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3956 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3956 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3957 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3957 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3958 * higher will lead to a bigger reserve which will get freed as contiguous 3958 * higher will lead to a bigger reserve which will get freed as contiguous
3959 * blocks as reclaim kicks in 3959 * blocks as reclaim kicks in
3960 */ 3960 */
3961 static void setup_zone_migrate_reserve(struct zone *zone) 3961 static void setup_zone_migrate_reserve(struct zone *zone)
3962 { 3962 {
3963 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3963 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3964 struct page *page; 3964 struct page *page;
3965 unsigned long block_migratetype; 3965 unsigned long block_migratetype;
3966 int reserve; 3966 int reserve;
3967 int old_reserve; 3967 int old_reserve;
3968 3968
3969 /* 3969 /*
3970 * Get the start pfn, end pfn and the number of blocks to reserve 3970 * Get the start pfn, end pfn and the number of blocks to reserve
3971 * We have to be careful to be aligned to pageblock_nr_pages to 3971 * We have to be careful to be aligned to pageblock_nr_pages to
3972 * make sure that we always check pfn_valid for the first page in 3972 * make sure that we always check pfn_valid for the first page in
3973 * the block. 3973 * the block.
3974 */ 3974 */
3975 start_pfn = zone->zone_start_pfn; 3975 start_pfn = zone->zone_start_pfn;
3976 end_pfn = zone_end_pfn(zone); 3976 end_pfn = zone_end_pfn(zone);
3977 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3977 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3978 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3978 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3979 pageblock_order; 3979 pageblock_order;
3980 3980
3981 /* 3981 /*
3982 * Reserve blocks are generally in place to help high-order atomic 3982 * Reserve blocks are generally in place to help high-order atomic
3983 * allocations that are short-lived. A min_free_kbytes value that 3983 * allocations that are short-lived. A min_free_kbytes value that
3984 * would result in more than 2 reserve blocks for atomic allocations 3984 * would result in more than 2 reserve blocks for atomic allocations
3985 * is assumed to be in place to help anti-fragmentation for the 3985 * is assumed to be in place to help anti-fragmentation for the
3986 * future allocation of hugepages at runtime. 3986 * future allocation of hugepages at runtime.
3987 */ 3987 */
3988 reserve = min(2, reserve); 3988 reserve = min(2, reserve);
3989 old_reserve = zone->nr_migrate_reserve_block; 3989 old_reserve = zone->nr_migrate_reserve_block;
3990 3990
3991 /* On memory hot-add, we almost always need to do nothing */ 3991 /* On memory hot-add, we almost always need to do nothing */
3992 if (reserve == old_reserve) 3992 if (reserve == old_reserve)
3993 return; 3993 return;
3994 zone->nr_migrate_reserve_block = reserve; 3994 zone->nr_migrate_reserve_block = reserve;
3995 3995
3996 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3996 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3997 if (!pfn_valid(pfn)) 3997 if (!pfn_valid(pfn))
3998 continue; 3998 continue;
3999 page = pfn_to_page(pfn); 3999 page = pfn_to_page(pfn);
4000 4000
4001 /* Watch out for overlapping nodes */ 4001 /* Watch out for overlapping nodes */
4002 if (page_to_nid(page) != zone_to_nid(zone)) 4002 if (page_to_nid(page) != zone_to_nid(zone))
4003 continue; 4003 continue;
4004 4004
4005 block_migratetype = get_pageblock_migratetype(page); 4005 block_migratetype = get_pageblock_migratetype(page);
4006 4006
4007 /* Only test what is necessary when the reserves are not met */ 4007 /* Only test what is necessary when the reserves are not met */
4008 if (reserve > 0) { 4008 if (reserve > 0) {
4009 /* 4009 /*
4010 * Blocks with reserved pages will never be freed, skip 4010 * Blocks with reserved pages will never be freed, skip
4011 * them. 4011 * them.
4012 */ 4012 */
4013 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 4013 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
4014 if (pageblock_is_reserved(pfn, block_end_pfn)) 4014 if (pageblock_is_reserved(pfn, block_end_pfn))
4015 continue; 4015 continue;
4016 4016
4017 /* If this block is reserved, account for it */ 4017 /* If this block is reserved, account for it */
4018 if (block_migratetype == MIGRATE_RESERVE) { 4018 if (block_migratetype == MIGRATE_RESERVE) {
4019 reserve--; 4019 reserve--;
4020 continue; 4020 continue;
4021 } 4021 }
4022 4022
4023 /* Suitable for reserving if this block is movable */ 4023 /* Suitable for reserving if this block is movable */
4024 if (block_migratetype == MIGRATE_MOVABLE) { 4024 if (block_migratetype == MIGRATE_MOVABLE) {
4025 set_pageblock_migratetype(page, 4025 set_pageblock_migratetype(page,
4026 MIGRATE_RESERVE); 4026 MIGRATE_RESERVE);
4027 move_freepages_block(zone, page, 4027 move_freepages_block(zone, page,
4028 MIGRATE_RESERVE); 4028 MIGRATE_RESERVE);
4029 reserve--; 4029 reserve--;
4030 continue; 4030 continue;
4031 } 4031 }
4032 } else if (!old_reserve) { 4032 } else if (!old_reserve) {
4033 /* 4033 /*
4034 * At boot time we don't need to scan the whole zone 4034 * At boot time we don't need to scan the whole zone
4035 * for turning off MIGRATE_RESERVE. 4035 * for turning off MIGRATE_RESERVE.
4036 */ 4036 */
4037 break; 4037 break;
4038 } 4038 }
4039 4039
4040 /* 4040 /*
4041 * If the reserve is met and this is a previously reserved block, 4041 * If the reserve is met and this is a previously reserved block,
4042 * take it back 4042 * take it back
4043 */ 4043 */
4044 if (block_migratetype == MIGRATE_RESERVE) { 4044 if (block_migratetype == MIGRATE_RESERVE) {
4045 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4045 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4046 move_freepages_block(zone, page, MIGRATE_MOVABLE); 4046 move_freepages_block(zone, page, MIGRATE_MOVABLE);
4047 } 4047 }
4048 } 4048 }
4049 } 4049 }
4050 4050
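The reserve count above is just the min watermark rounded up to whole pageblocks and clamped to two blocks. A worked sketch with invented values (pageblock_order 9, i.e. 512-page blocks, and a min watermark of 1500 pages):

#include <stdio.h>

int main(void)
{
	/* Invented values: pageblock_order 9 (512 pages), min watermark 1500. */
	unsigned long pageblock_nr_pages = 1UL << 9;
	unsigned long min_wmark = 1500;
	unsigned long reserve;

	/* roundup(1500, 512) = 1536, and 1536 >> 9 = 3 blocks before clamping */
	reserve = ((min_wmark + pageblock_nr_pages - 1) /
		   pageblock_nr_pages) * pageblock_nr_pages >> 9;
	if (reserve > 2)
		reserve = 2;		/* min(2, reserve), as in the code above */

	printf("%lu reserve blocks\n", reserve);	/* prints 2 */
	return 0;
}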
4051 /* 4051 /*
4052 * Initially all pages are reserved - free ones are freed 4052 * Initially all pages are reserved - free ones are freed
4053 * up by free_all_bootmem() once the early boot process is 4053 * up by free_all_bootmem() once the early boot process is
4054 * done. Non-atomic initialization, single-pass. 4054 * done. Non-atomic initialization, single-pass.
4055 */ 4055 */
4056 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 4056 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4057 unsigned long start_pfn, enum memmap_context context) 4057 unsigned long start_pfn, enum memmap_context context)
4058 { 4058 {
4059 struct page *page; 4059 struct page *page;
4060 unsigned long end_pfn = start_pfn + size; 4060 unsigned long end_pfn = start_pfn + size;
4061 unsigned long pfn; 4061 unsigned long pfn;
4062 struct zone *z; 4062 struct zone *z;
4063 4063
4064 if (highest_memmap_pfn < end_pfn - 1) 4064 if (highest_memmap_pfn < end_pfn - 1)
4065 highest_memmap_pfn = end_pfn - 1; 4065 highest_memmap_pfn = end_pfn - 1;
4066 4066
4067 z = &NODE_DATA(nid)->node_zones[zone]; 4067 z = &NODE_DATA(nid)->node_zones[zone];
4068 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 4068 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
4069 /* 4069 /*
4070 * There can be holes in boot-time mem_map[]s 4070 * There can be holes in boot-time mem_map[]s
4071 * handed to this function. They do not 4071 * handed to this function. They do not
4072 * exist on hotplugged memory. 4072 * exist on hotplugged memory.
4073 */ 4073 */
4074 if (context == MEMMAP_EARLY) { 4074 if (context == MEMMAP_EARLY) {
4075 if (!early_pfn_valid(pfn)) 4075 if (!early_pfn_valid(pfn))
4076 continue; 4076 continue;
4077 if (!early_pfn_in_nid(pfn, nid)) 4077 if (!early_pfn_in_nid(pfn, nid))
4078 continue; 4078 continue;
4079 } 4079 }
4080 page = pfn_to_page(pfn); 4080 page = pfn_to_page(pfn);
4081 set_page_links(page, zone, nid, pfn); 4081 set_page_links(page, zone, nid, pfn);
4082 mminit_verify_page_links(page, zone, nid, pfn); 4082 mminit_verify_page_links(page, zone, nid, pfn);
4083 init_page_count(page); 4083 init_page_count(page);
4084 page_mapcount_reset(page); 4084 page_mapcount_reset(page);
4085 page_nid_reset_last(page); 4085 page_nid_reset_last(page);
4086 SetPageReserved(page); 4086 SetPageReserved(page);
4087 /* 4087 /*
4088 * Mark the block movable so that blocks are reserved for 4088 * Mark the block movable so that blocks are reserved for
4089 * movable at startup. This will force kernel allocations 4089 * movable at startup. This will force kernel allocations
4090 * to reserve their blocks rather than leaking throughout 4090 * to reserve their blocks rather than leaking throughout
4091 * the address space during boot when many long-lived 4091 * the address space during boot when many long-lived
4092 * kernel allocations are made. Later some blocks near 4092 * kernel allocations are made. Later some blocks near
4093 * the start are marked MIGRATE_RESERVE by 4093 * the start are marked MIGRATE_RESERVE by
4094 * setup_zone_migrate_reserve() 4094 * setup_zone_migrate_reserve()
4095 * 4095 *
4096 * The bitmap is created for the zone's valid pfn range, but the memmap 4096 * The bitmap is created for the zone's valid pfn range, but the memmap
4097 * can be created for invalid pages (for alignment). Check here 4097 * can be created for invalid pages (for alignment). Check here
4098 * so that set_pageblock_migratetype() is not called against a 4098 * so that set_pageblock_migratetype() is not called against a
4099 * pfn out of the zone. 4099 * pfn out of the zone.
4100 */ 4100 */
4101 if ((z->zone_start_pfn <= pfn) 4101 if ((z->zone_start_pfn <= pfn)
4102 && (pfn < zone_end_pfn(z)) 4102 && (pfn < zone_end_pfn(z))
4103 && !(pfn & (pageblock_nr_pages - 1))) 4103 && !(pfn & (pageblock_nr_pages - 1)))
4104 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4104 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4105 4105
4106 INIT_LIST_HEAD(&page->lru); 4106 INIT_LIST_HEAD(&page->lru);
4107 #ifdef WANT_PAGE_VIRTUAL 4107 #ifdef WANT_PAGE_VIRTUAL
4108 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 4108 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
4109 if (!is_highmem_idx(zone)) 4109 if (!is_highmem_idx(zone))
4110 set_page_address(page, __va(pfn << PAGE_SHIFT)); 4110 set_page_address(page, __va(pfn << PAGE_SHIFT));
4111 #endif 4111 #endif
4112 } 4112 }
4113 } 4113 }
4114 4114
4115 static void __meminit zone_init_free_lists(struct zone *zone) 4115 static void __meminit zone_init_free_lists(struct zone *zone)
4116 { 4116 {
4117 unsigned int order, t; 4117 unsigned int order, t;
4118 for_each_migratetype_order(order, t) { 4118 for_each_migratetype_order(order, t) {
4119 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 4119 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
4120 zone->free_area[order].nr_free = 0; 4120 zone->free_area[order].nr_free = 0;
4121 } 4121 }
4122 } 4122 }
4123 4123
4124 #ifndef __HAVE_ARCH_MEMMAP_INIT 4124 #ifndef __HAVE_ARCH_MEMMAP_INIT
4125 #define memmap_init(size, nid, zone, start_pfn) \ 4125 #define memmap_init(size, nid, zone, start_pfn) \
4126 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 4126 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
4127 #endif 4127 #endif
4128 4128
4129 static int zone_batchsize(struct zone *zone) 4129 static int zone_batchsize(struct zone *zone)
4130 { 4130 {
4131 #ifdef CONFIG_MMU 4131 #ifdef CONFIG_MMU
4132 int batch; 4132 int batch;
4133 4133
4134 /* 4134 /*
4135 * The per-cpu-pages pools are set to around 1/1000th of the 4135 * The per-cpu-pages pools are set to around 1/1000th of the
4136 * size of the zone. But no more than 1/2 of a meg. 4136 * size of the zone. But no more than 1/2 of a meg.
4137 * 4137 *
4138 * OK, so we don't know how big the cache is. So guess. 4138 * OK, so we don't know how big the cache is. So guess.
4139 */ 4139 */
4140 batch = zone->managed_pages / 1024; 4140 batch = zone->managed_pages / 1024;
4141 if (batch * PAGE_SIZE > 512 * 1024) 4141 if (batch * PAGE_SIZE > 512 * 1024)
4142 batch = (512 * 1024) / PAGE_SIZE; 4142 batch = (512 * 1024) / PAGE_SIZE;
4143 batch /= 4; /* We effectively *= 4 below */ 4143 batch /= 4; /* We effectively *= 4 below */
4144 if (batch < 1) 4144 if (batch < 1)
4145 batch = 1; 4145 batch = 1;
4146 4146
4147 /* 4147 /*
4148 * Clamp the batch to a 2^n - 1 value. Having a power 4148 * Clamp the batch to a 2^n - 1 value. Having a power
4149 * of 2 value was found to be more likely to have 4149 * of 2 value was found to be more likely to have
4150 * suboptimal cache aliasing properties in some cases. 4150 * suboptimal cache aliasing properties in some cases.
4151 * 4151 *
4152 * For example if 2 tasks are alternately allocating 4152 * For example if 2 tasks are alternately allocating
4153 * batches of pages, one task can end up with a lot 4153 * batches of pages, one task can end up with a lot
4154 * of pages of one half of the possible page colors 4154 * of pages of one half of the possible page colors
4155 * and the other with pages of the other colors. 4155 * and the other with pages of the other colors.
4156 */ 4156 */
4157 batch = rounddown_pow_of_two(batch + batch/2) - 1; 4157 batch = rounddown_pow_of_two(batch + batch/2) - 1;
4158 4158
4159 return batch; 4159 return batch;
4160 4160
4161 #else 4161 #else
4162 /* The deferral and batching of frees should be suppressed under NOMMU 4162 /* The deferral and batching of frees should be suppressed under NOMMU
4163 * conditions. 4163 * conditions.
4164 * 4164 *
4165 * The problem is that NOMMU needs to be able to allocate large chunks 4165 * The problem is that NOMMU needs to be able to allocate large chunks
4166 * of contiguous memory as there's no hardware page translation to 4166 * of contiguous memory as there's no hardware page translation to
4167 * assemble apparent contiguous memory from discontiguous pages. 4167 * assemble apparent contiguous memory from discontiguous pages.
4168 * 4168 *
4169 * Queueing large contiguous runs of pages for batching, however, 4169 * Queueing large contiguous runs of pages for batching, however,
4170 * causes the pages to actually be freed in smaller chunks. As there 4170 * causes the pages to actually be freed in smaller chunks. As there
4171 * can be a significant delay between the individual batches being 4171 * can be a significant delay between the individual batches being
4172 * recycled, this leads to the once large chunks of space being 4172 * recycled, this leads to the once large chunks of space being
4173 * fragmented and becoming unavailable for high-order allocations. 4173 * fragmented and becoming unavailable for high-order allocations.
4174 */ 4174 */
4175 return 0; 4175 return 0;
4176 #endif 4176 #endif
4177 } 4177 }
4178 4178
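Tracing the CONFIG_MMU branch above for a hypothetical 1GB zone of 4K pages (illustrative numbers only): 262144 / 1024 = 256, capped to 512K/4K = 128, divided by 4 gives 32, and rounddown_pow_of_two(32 + 16) - 1 leaves a batch of 31. The same steps as a standalone sketch:

#include <stdio.h>

/* Round down to the nearest power of two (v >= 1). */
static unsigned long rounddown_pow_of_two(unsigned long v)
{
	unsigned long p = 1;

	while (p * 2 <= v)
		p *= 2;
	return p;
}

/* Same steps as the CONFIG_MMU branch of zone_batchsize(). */
static int batchsize(unsigned long managed_pages, unsigned long page_size)
{
	unsigned long batch = managed_pages / 1024;

	if (batch * page_size > 512 * 1024)
		batch = (512 * 1024) / page_size;
	batch /= 4;
	if (batch < 1)
		batch = 1;
	return rounddown_pow_of_two(batch + batch / 2) - 1;
}

int main(void)
{
	/* 1GB zone of 4K pages => 262144 managed pages => batch of 31 */
	printf("%d\n", batchsize(262144, 4096));
	return 0;
}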
4179 /* 4179 /*
4180 * pcp->high and pcp->batch values are related and dependent on one another: 4180 * pcp->high and pcp->batch values are related and dependent on one another:
4181 * ->batch must never be higher than ->high. 4181 * ->batch must never be higher than ->high.
4182 * The following function updates them in a safe manner without read side 4182 * The following function updates them in a safe manner without read side
4183 * locking. 4183 * locking.
4184 * 4184 *
4185 * Any new users of pcp->batch and pcp->high should ensure they can cope with 4185 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4186 * those fields changing asynchronously (according to the above rule). 4186 * those fields changing asynchronously (according to the above rule).
4187 * 4187 *
4188 * mutex_is_locked(&pcp_batch_high_lock) is required when calling this function 4188 * mutex_is_locked(&pcp_batch_high_lock) is required when calling this function
4189 * outside of boot time (or some other assurance that no concurrent updaters 4189 * outside of boot time (or some other assurance that no concurrent updaters
4190 * exist). 4190 * exist).
4191 */ 4191 */
4192 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 4192 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4193 unsigned long batch) 4193 unsigned long batch)
4194 { 4194 {
4195 /* start with a fail safe value for batch */ 4195 /* start with a fail safe value for batch */
4196 pcp->batch = 1; 4196 pcp->batch = 1;
4197 smp_wmb(); 4197 smp_wmb();
4198 4198
4199 /* Update high, then batch, in order */ 4199 /* Update high, then batch, in order */
4200 pcp->high = high; 4200 pcp->high = high;
4201 smp_wmb(); 4201 smp_wmb();
4202 4202
4203 pcp->batch = batch; 4203 pcp->batch = batch;
4204 } 4204 }
4205 4205
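The store order above means only three combinations ever become visible during an update: the old high with the fail-safe batch of 1, the new high with batch 1, and finally the new pair. A userspace sketch that just prints those intermediate states with invented values (readers must still tolerate the fields changing asynchronously, as the comment above says):

#include <stdio.h>

struct pcp_sketch { unsigned long high, batch; };

/* Mirror of the store order in pageset_update(); the smp_wmb() calls in the
 * kernel force these three states to become visible in this order. */
static void update(struct pcp_sketch *p, unsigned long high, unsigned long batch)
{
	p->batch = 1;		/* fail-safe value first        */
	printf("(high=%lu, batch=%lu)\n", p->high, p->batch);
	p->high = high;		/* then the new high watermark  */
	printf("(high=%lu, batch=%lu)\n", p->high, p->batch);
	p->batch = batch;	/* finally the real batch value */
	printf("(high=%lu, batch=%lu)\n", p->high, p->batch);
}

int main(void)
{
	struct pcp_sketch p = { .high = 186, .batch = 31 };

	/* Shrink the pageset: the visible batch never exceeds the visible high. */
	update(&p, 8, 2);	/* prints (186,1), (8,1), (8,2) */
	return 0;
}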
4206 /* a companion to pageset_set_high() */ 4206 /* a companion to pageset_set_high() */
4207 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) 4207 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4208 { 4208 {
4209 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); 4209 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4210 } 4210 }
4211 4211
4212 static void pageset_init(struct per_cpu_pageset *p) 4212 static void pageset_init(struct per_cpu_pageset *p)
4213 { 4213 {
4214 struct per_cpu_pages *pcp; 4214 struct per_cpu_pages *pcp;
4215 int migratetype; 4215 int migratetype;
4216 4216
4217 memset(p, 0, sizeof(*p)); 4217 memset(p, 0, sizeof(*p));
4218 4218
4219 pcp = &p->pcp; 4219 pcp = &p->pcp;
4220 pcp->count = 0; 4220 pcp->count = 0;
4221 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4221 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4222 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4222 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4223 } 4223 }
4224 4224
4225 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4225 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4226 { 4226 {
4227 pageset_init(p); 4227 pageset_init(p);
4228 pageset_set_batch(p, batch); 4228 pageset_set_batch(p, batch);
4229 } 4229 }
4230 4230
4231 /* 4231 /*
4232 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist 4232 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
4233 * to the value high for the pageset p. 4233 * to the value high for the pageset p.
4234 */ 4234 */
4235 static void pageset_set_high(struct per_cpu_pageset *p, 4235 static void pageset_set_high(struct per_cpu_pageset *p,
4236 unsigned long high) 4236 unsigned long high)
4237 { 4237 {
4238 unsigned long batch = max(1UL, high / 4); 4238 unsigned long batch = max(1UL, high / 4);
4239 if ((high / 4) > (PAGE_SHIFT * 8)) 4239 if ((high / 4) > (PAGE_SHIFT * 8))
4240 batch = PAGE_SHIFT * 8; 4240 batch = PAGE_SHIFT * 8;
4241 4241
4242 pageset_update(&p->pcp, high, batch); 4242 pageset_update(&p->pcp, high, batch);
4243 } 4243 }
4244 4244
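So batch follows high / 4 but never drops below 1 and never exceeds PAGE_SHIFT * 8, which is 96 with the 4K pages assumed here. A small sketch of the clamp:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed 4K pages */

/* Same clamp as pageset_set_high(): batch = high/4, at least 1, at most 96. */
static unsigned long batch_for_high(unsigned long high)
{
	unsigned long batch = high / 4 ? high / 4 : 1;

	if (high / 4 > PAGE_SHIFT * 8)
		batch = PAGE_SHIFT * 8;
	return batch;
}

int main(void)
{
	printf("%lu %lu %lu\n",
	       batch_for_high(2),	/* 1  */
	       batch_for_high(200),	/* 50 */
	       batch_for_high(1000));	/* 96 */
	return 0;
}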
4245 static void pageset_set_high_and_batch(struct zone *zone, 4245 static void pageset_set_high_and_batch(struct zone *zone,
4246 struct per_cpu_pageset *pcp) 4246 struct per_cpu_pageset *pcp)
4247 { 4247 {
4248 if (percpu_pagelist_fraction) 4248 if (percpu_pagelist_fraction)
4249 pageset_set_high(pcp, 4249 pageset_set_high(pcp,
4250 (zone->managed_pages / 4250 (zone->managed_pages /
4251 percpu_pagelist_fraction)); 4251 percpu_pagelist_fraction));
4252 else 4252 else
4253 pageset_set_batch(pcp, zone_batchsize(zone)); 4253 pageset_set_batch(pcp, zone_batchsize(zone));
4254 } 4254 }
4255 4255
4256 static void __meminit zone_pageset_init(struct zone *zone, int cpu) 4256 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4257 { 4257 {
4258 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4258 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4259 4259
4260 pageset_init(pcp); 4260 pageset_init(pcp);
4261 pageset_set_high_and_batch(zone, pcp); 4261 pageset_set_high_and_batch(zone, pcp);
4262 } 4262 }
4263 4263
4264 static void __meminit setup_zone_pageset(struct zone *zone) 4264 static void __meminit setup_zone_pageset(struct zone *zone)
4265 { 4265 {
4266 int cpu; 4266 int cpu;
4267 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4267 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4268 for_each_possible_cpu(cpu) 4268 for_each_possible_cpu(cpu)
4269 zone_pageset_init(zone, cpu); 4269 zone_pageset_init(zone, cpu);
4270 } 4270 }
4271 4271
4272 /* 4272 /*
4273 * Allocate per cpu pagesets and initialize them. 4273 * Allocate per cpu pagesets and initialize them.
4274 * Before this call only boot pagesets were available. 4274 * Before this call only boot pagesets were available.
4275 */ 4275 */
4276 void __init setup_per_cpu_pageset(void) 4276 void __init setup_per_cpu_pageset(void)
4277 { 4277 {
4278 struct zone *zone; 4278 struct zone *zone;
4279 4279
4280 for_each_populated_zone(zone) 4280 for_each_populated_zone(zone)
4281 setup_zone_pageset(zone); 4281 setup_zone_pageset(zone);
4282 } 4282 }
4283 4283
4284 static noinline __init_refok 4284 static noinline __init_refok
4285 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4285 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4286 { 4286 {
4287 int i; 4287 int i;
4288 struct pglist_data *pgdat = zone->zone_pgdat; 4288 struct pglist_data *pgdat = zone->zone_pgdat;
4289 size_t alloc_size; 4289 size_t alloc_size;
4290 4290
4291 /* 4291 /*
4292 * The per-page waitqueue mechanism uses hashed waitqueues 4292 * The per-page waitqueue mechanism uses hashed waitqueues
4293 * per zone. 4293 * per zone.
4294 */ 4294 */
4295 zone->wait_table_hash_nr_entries = 4295 zone->wait_table_hash_nr_entries =
4296 wait_table_hash_nr_entries(zone_size_pages); 4296 wait_table_hash_nr_entries(zone_size_pages);
4297 zone->wait_table_bits = 4297 zone->wait_table_bits =
4298 wait_table_bits(zone->wait_table_hash_nr_entries); 4298 wait_table_bits(zone->wait_table_hash_nr_entries);
4299 alloc_size = zone->wait_table_hash_nr_entries 4299 alloc_size = zone->wait_table_hash_nr_entries
4300 * sizeof(wait_queue_head_t); 4300 * sizeof(wait_queue_head_t);
4301 4301
4302 if (!slab_is_available()) { 4302 if (!slab_is_available()) {
4303 zone->wait_table = (wait_queue_head_t *) 4303 zone->wait_table = (wait_queue_head_t *)
4304 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4304 alloc_bootmem_node_nopanic(pgdat, alloc_size);
4305 } else { 4305 } else {
4306 /* 4306 /*
4307 * This case means that a zone whose size was 0 gets new memory 4307 * This case means that a zone whose size was 0 gets new memory
4308 * via memory hot-add. 4308 * via memory hot-add.
4309 * But it may be the case that a new node was hot-added. In 4309 * But it may be the case that a new node was hot-added. In
4310 * this case vmalloc() will not be able to use this new node's 4310 * this case vmalloc() will not be able to use this new node's
4311 * memory - this wait_table must be initialized to use this new 4311 * memory - this wait_table must be initialized to use this new
4312 * node itself as well. 4312 * node itself as well.
4313 * To use this new node's memory, further consideration will be 4313 * To use this new node's memory, further consideration will be
4314 * necessary. 4314 * necessary.
4315 */ 4315 */
4316 zone->wait_table = vmalloc(alloc_size); 4316 zone->wait_table = vmalloc(alloc_size);
4317 } 4317 }
4318 if (!zone->wait_table) 4318 if (!zone->wait_table)
4319 return -ENOMEM; 4319 return -ENOMEM;
4320 4320
4321 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4321 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4322 init_waitqueue_head(zone->wait_table + i); 4322 init_waitqueue_head(zone->wait_table + i);
4323 4323
4324 return 0; 4324 return 0;
4325 } 4325 }
4326 4326
4327 static __meminit void zone_pcp_init(struct zone *zone) 4327 static __meminit void zone_pcp_init(struct zone *zone)
4328 { 4328 {
4329 /* 4329 /*
4330 * per cpu subsystem is not up at this point. The following code 4330 * per cpu subsystem is not up at this point. The following code
4331 * relies on the ability of the linker to provide the 4331 * relies on the ability of the linker to provide the
4332 * offset of a (static) per cpu variable into the per cpu area. 4332 * offset of a (static) per cpu variable into the per cpu area.
4333 */ 4333 */
4334 zone->pageset = &boot_pageset; 4334 zone->pageset = &boot_pageset;
4335 4335
4336 if (zone->present_pages) 4336 if (zone->present_pages)
4337 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4337 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4338 zone->name, zone->present_pages, 4338 zone->name, zone->present_pages,
4339 zone_batchsize(zone)); 4339 zone_batchsize(zone));
4340 } 4340 }
4341 4341
4342 int __meminit init_currently_empty_zone(struct zone *zone, 4342 int __meminit init_currently_empty_zone(struct zone *zone,
4343 unsigned long zone_start_pfn, 4343 unsigned long zone_start_pfn,
4344 unsigned long size, 4344 unsigned long size,
4345 enum memmap_context context) 4345 enum memmap_context context)
4346 { 4346 {
4347 struct pglist_data *pgdat = zone->zone_pgdat; 4347 struct pglist_data *pgdat = zone->zone_pgdat;
4348 int ret; 4348 int ret;
4349 ret = zone_wait_table_init(zone, size); 4349 ret = zone_wait_table_init(zone, size);
4350 if (ret) 4350 if (ret)
4351 return ret; 4351 return ret;
4352 pgdat->nr_zones = zone_idx(zone) + 1; 4352 pgdat->nr_zones = zone_idx(zone) + 1;
4353 4353
4354 zone->zone_start_pfn = zone_start_pfn; 4354 zone->zone_start_pfn = zone_start_pfn;
4355 4355
4356 mminit_dprintk(MMINIT_TRACE, "memmap_init", 4356 mminit_dprintk(MMINIT_TRACE, "memmap_init",
4357 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 4357 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
4358 pgdat->node_id, 4358 pgdat->node_id,
4359 (unsigned long)zone_idx(zone), 4359 (unsigned long)zone_idx(zone),
4360 zone_start_pfn, (zone_start_pfn + size)); 4360 zone_start_pfn, (zone_start_pfn + size));
4361 4361
4362 zone_init_free_lists(zone); 4362 zone_init_free_lists(zone);
4363 4363
4364 return 0; 4364 return 0;
4365 } 4365 }
4366 4366
4367 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4367 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4368 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4368 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4369 /* 4369 /*
4370 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4370 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
4371 * Architectures may implement their own version but if add_active_range() 4371 * Architectures may implement their own version but if add_active_range()
4372 * was used and there are no special requirements, this is a convenient 4372 * was used and there are no special requirements, this is a convenient
4373 * alternative 4373 * alternative
4374 */ 4374 */
4375 int __meminit __early_pfn_to_nid(unsigned long pfn) 4375 int __meminit __early_pfn_to_nid(unsigned long pfn)
4376 { 4376 {
4377 unsigned long start_pfn, end_pfn; 4377 unsigned long start_pfn, end_pfn;
4378 int nid; 4378 int nid;
4379 /* 4379 /*
4380 * NOTE: The following SMP-unsafe globals are only used early in boot 4380 * NOTE: The following SMP-unsafe globals are only used early in boot
4381 * when the kernel is running single-threaded. 4381 * when the kernel is running single-threaded.
4382 */ 4382 */
4383 static unsigned long __meminitdata last_start_pfn, last_end_pfn; 4383 static unsigned long __meminitdata last_start_pfn, last_end_pfn;
4384 static int __meminitdata last_nid; 4384 static int __meminitdata last_nid;
4385 4385
4386 if (last_start_pfn <= pfn && pfn < last_end_pfn) 4386 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4387 return last_nid; 4387 return last_nid;
4388 4388
4389 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 4389 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
4390 if (nid != -1) { 4390 if (nid != -1) {
4391 last_start_pfn = start_pfn; 4391 last_start_pfn = start_pfn;
4392 last_end_pfn = end_pfn; 4392 last_end_pfn = end_pfn;
4393 last_nid = nid; 4393 last_nid = nid;
4394 } 4394 }
4395 4395
4396 return nid; 4396 return nid;
4397 } 4397 }
4398 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4398 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
4399 4399
4400 int __meminit early_pfn_to_nid(unsigned long pfn) 4400 int __meminit early_pfn_to_nid(unsigned long pfn)
4401 { 4401 {
4402 int nid; 4402 int nid;
4403 4403
4404 nid = __early_pfn_to_nid(pfn); 4404 nid = __early_pfn_to_nid(pfn);
4405 if (nid >= 0) 4405 if (nid >= 0)
4406 return nid; 4406 return nid;
4407 /* just returns 0 */ 4407 /* just returns 0 */
4408 return 0; 4408 return 0;
4409 } 4409 }
4410 4410
4411 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4411 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4412 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4412 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4413 { 4413 {
4414 int nid; 4414 int nid;
4415 4415
4416 nid = __early_pfn_to_nid(pfn); 4416 nid = __early_pfn_to_nid(pfn);
4417 if (nid >= 0 && nid != node) 4417 if (nid >= 0 && nid != node)
4418 return false; 4418 return false;
4419 return true; 4419 return true;
4420 } 4420 }
4421 #endif 4421 #endif
4422 4422
4423 /** 4423 /**
4424 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4424 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4425 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4425 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4426 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4426 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4427 * 4427 *
4428 * If an architecture guarantees that all ranges registered with 4428 * If an architecture guarantees that all ranges registered with
4429 * add_active_ranges() contain no holes and may be freed, this 4429 * add_active_ranges() contain no holes and may be freed, this
4430 * function may be used instead of calling free_bootmem() manually. 4430 * function may be used instead of calling free_bootmem() manually.
4431 */ 4431 */
4432 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4432 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4433 { 4433 {
4434 unsigned long start_pfn, end_pfn; 4434 unsigned long start_pfn, end_pfn;
4435 int i, this_nid; 4435 int i, this_nid;
4436 4436
4437 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4437 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4438 start_pfn = min(start_pfn, max_low_pfn); 4438 start_pfn = min(start_pfn, max_low_pfn);
4439 end_pfn = min(end_pfn, max_low_pfn); 4439 end_pfn = min(end_pfn, max_low_pfn);
4440 4440
4441 if (start_pfn < end_pfn) 4441 if (start_pfn < end_pfn)
4442 free_bootmem_node(NODE_DATA(this_nid), 4442 free_bootmem_node(NODE_DATA(this_nid),
4443 PFN_PHYS(start_pfn), 4443 PFN_PHYS(start_pfn),
4444 (end_pfn - start_pfn) << PAGE_SHIFT); 4444 (end_pfn - start_pfn) << PAGE_SHIFT);
4445 } 4445 }
4446 } 4446 }
4447 4447
4448 /** 4448 /**
4449 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4449 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4450 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4450 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4451 * 4451 *
4452 * If an architecture guarantees that all ranges registered with 4452 * If an architecture guarantees that all ranges registered with
4453 * add_active_ranges() contain no holes and may be freed, this 4453 * add_active_ranges() contain no holes and may be freed, this
4454 * function may be used instead of calling memory_present() manually. 4454 * function may be used instead of calling memory_present() manually.
4455 */ 4455 */
4456 void __init sparse_memory_present_with_active_regions(int nid) 4456 void __init sparse_memory_present_with_active_regions(int nid)
4457 { 4457 {
4458 unsigned long start_pfn, end_pfn; 4458 unsigned long start_pfn, end_pfn;
4459 int i, this_nid; 4459 int i, this_nid;
4460 4460
4461 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4461 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4462 memory_present(this_nid, start_pfn, end_pfn); 4462 memory_present(this_nid, start_pfn, end_pfn);
4463 } 4463 }
4464 4464
4465 /** 4465 /**
4466 * get_pfn_range_for_nid - Return the start and end page frames for a node 4466 * get_pfn_range_for_nid - Return the start and end page frames for a node
4467 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4467 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4468 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4468 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4469 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4469 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4470 * 4470 *
4471 * It returns the start and end page frame of a node based on information 4471 * It returns the start and end page frame of a node based on information
4472 * provided by an arch calling add_active_range(). If called for a node 4472 * provided by an arch calling add_active_range(). If called for a node
4473 * with no available memory, a warning is printed and the start and end 4473 * with no available memory, a warning is printed and the start and end
4474 * PFNs will be 0. 4474 * PFNs will be 0.
4475 */ 4475 */
4476 void __meminit get_pfn_range_for_nid(unsigned int nid, 4476 void __meminit get_pfn_range_for_nid(unsigned int nid,
4477 unsigned long *start_pfn, unsigned long *end_pfn) 4477 unsigned long *start_pfn, unsigned long *end_pfn)
4478 { 4478 {
4479 unsigned long this_start_pfn, this_end_pfn; 4479 unsigned long this_start_pfn, this_end_pfn;
4480 int i; 4480 int i;
4481 4481
4482 *start_pfn = -1UL; 4482 *start_pfn = -1UL;
4483 *end_pfn = 0; 4483 *end_pfn = 0;
4484 4484
4485 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4485 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4486 *start_pfn = min(*start_pfn, this_start_pfn); 4486 *start_pfn = min(*start_pfn, this_start_pfn);
4487 *end_pfn = max(*end_pfn, this_end_pfn); 4487 *end_pfn = max(*end_pfn, this_end_pfn);
4488 } 4488 }
4489 4489
4490 if (*start_pfn == -1UL) 4490 if (*start_pfn == -1UL)
4491 *start_pfn = 0; 4491 *start_pfn = 0;
4492 } 4492 }
4493 4493
4494 /* 4494 /*
4495 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4495 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4496 * assumption is made that zones within a node are ordered in monotonically 4496 * assumption is made that zones within a node are ordered in monotonically
4497 * increasing memory addresses so that the "highest" populated zone is used 4497 * increasing memory addresses so that the "highest" populated zone is used
4498 */ 4498 */
4499 static void __init find_usable_zone_for_movable(void) 4499 static void __init find_usable_zone_for_movable(void)
4500 { 4500 {
4501 int zone_index; 4501 int zone_index;
4502 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4502 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4503 if (zone_index == ZONE_MOVABLE) 4503 if (zone_index == ZONE_MOVABLE)
4504 continue; 4504 continue;
4505 4505
4506 if (arch_zone_highest_possible_pfn[zone_index] > 4506 if (arch_zone_highest_possible_pfn[zone_index] >
4507 arch_zone_lowest_possible_pfn[zone_index]) 4507 arch_zone_lowest_possible_pfn[zone_index])
4508 break; 4508 break;
4509 } 4509 }
4510 4510
4511 VM_BUG_ON(zone_index == -1); 4511 VM_BUG_ON(zone_index == -1);
4512 movable_zone = zone_index; 4512 movable_zone = zone_index;
4513 } 4513 }
4514 4514
4515 /* 4515 /*
4516 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4516 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4517 * because it is sized independent of architecture. Unlike the other zones, 4517 * because it is sized independent of architecture. Unlike the other zones,
4518 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4518 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4519 * in each node depending on the size of each node and how evenly kernelcore 4519 * in each node depending on the size of each node and how evenly kernelcore
4520 * is distributed. This helper function adjusts the zone ranges 4520 * is distributed. This helper function adjusts the zone ranges
4521 * provided by the architecture for a given node by using the end of the 4521 * provided by the architecture for a given node by using the end of the
4522 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4522 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4523 * zones within a node are in order of monotonically increasing memory addresses 4523 * zones within a node are in order of monotonically increasing memory addresses
4524 */ 4524 */
4525 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4525 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4526 unsigned long zone_type, 4526 unsigned long zone_type,
4527 unsigned long node_start_pfn, 4527 unsigned long node_start_pfn,
4528 unsigned long node_end_pfn, 4528 unsigned long node_end_pfn,
4529 unsigned long *zone_start_pfn, 4529 unsigned long *zone_start_pfn,
4530 unsigned long *zone_end_pfn) 4530 unsigned long *zone_end_pfn)
4531 { 4531 {
4532 /* Only adjust if ZONE_MOVABLE is on this node */ 4532 /* Only adjust if ZONE_MOVABLE is on this node */
4533 if (zone_movable_pfn[nid]) { 4533 if (zone_movable_pfn[nid]) {
4534 /* Size ZONE_MOVABLE */ 4534 /* Size ZONE_MOVABLE */
4535 if (zone_type == ZONE_MOVABLE) { 4535 if (zone_type == ZONE_MOVABLE) {
4536 *zone_start_pfn = zone_movable_pfn[nid]; 4536 *zone_start_pfn = zone_movable_pfn[nid];
4537 *zone_end_pfn = min(node_end_pfn, 4537 *zone_end_pfn = min(node_end_pfn,
4538 arch_zone_highest_possible_pfn[movable_zone]); 4538 arch_zone_highest_possible_pfn[movable_zone]);
4539 4539
4540 /* Adjust for ZONE_MOVABLE starting within this range */ 4540 /* Adjust for ZONE_MOVABLE starting within this range */
4541 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4541 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4542 *zone_end_pfn > zone_movable_pfn[nid]) { 4542 *zone_end_pfn > zone_movable_pfn[nid]) {
4543 *zone_end_pfn = zone_movable_pfn[nid]; 4543 *zone_end_pfn = zone_movable_pfn[nid];
4544 4544
4545 /* Check if this whole range is within ZONE_MOVABLE */ 4545 /* Check if this whole range is within ZONE_MOVABLE */
4546 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4546 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4547 *zone_start_pfn = *zone_end_pfn; 4547 *zone_start_pfn = *zone_end_pfn;
4548 } 4548 }
4549 } 4549 }
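adjust_zone_range_for_zone_movable() clips a zone's PFN range against zone_movable_pfn[nid]: ZONE_MOVABLE itself is sized from that boundary up, a zone straddling the boundary is truncated at it, and a zone lying entirely above it becomes empty. A minimal userspace sketch of the same overlap handling follows; the helper name and sample PFNs are invented for illustration and this is not kernel code.

/*
 * Illustrative sketch of the overlap cases handled above.
 */
#include <stdio.h>

static void clip_against_movable(unsigned long movable_start,
                                 unsigned long *zone_start,
                                 unsigned long *zone_end)
{
        if (*zone_start < movable_start && *zone_end > movable_start)
                *zone_end = movable_start;      /* zone straddles the boundary */
        else if (*zone_start >= movable_start)
                *zone_start = *zone_end;        /* zone is entirely movable: empty it */
}

int main(void)
{
        unsigned long start = 0x1000, end = 0x8000;

        clip_against_movable(0x4000, &start, &end);
        printf("clipped zone: [%#lx, %#lx)\n", start, end);     /* [0x1000, 0x4000) */
        return 0;
}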
4550 4550
4551 /* 4551 /*
4552 * Return the number of pages a zone spans in a node, including holes 4552 * Return the number of pages a zone spans in a node, including holes
4553 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4553 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4554 */ 4554 */
4555 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4555 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4556 unsigned long zone_type, 4556 unsigned long zone_type,
4557 unsigned long node_start_pfn, 4557 unsigned long node_start_pfn,
4558 unsigned long node_end_pfn, 4558 unsigned long node_end_pfn,
4559 unsigned long *ignored) 4559 unsigned long *ignored)
4560 { 4560 {
4561 unsigned long zone_start_pfn, zone_end_pfn; 4561 unsigned long zone_start_pfn, zone_end_pfn;
4562 4562
4563 /* Get the start and end of the zone */ 4563 /* Get the start and end of the zone */
4564 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4564 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4565 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4565 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4566 adjust_zone_range_for_zone_movable(nid, zone_type, 4566 adjust_zone_range_for_zone_movable(nid, zone_type,
4567 node_start_pfn, node_end_pfn, 4567 node_start_pfn, node_end_pfn,
4568 &zone_start_pfn, &zone_end_pfn); 4568 &zone_start_pfn, &zone_end_pfn);
4569 4569
4570 /* Check that this node has pages within the zone's required range */ 4570 /* Check that this node has pages within the zone's required range */
4571 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4571 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4572 return 0; 4572 return 0;
4573 4573
4574 /* Move the zone boundaries inside the node if necessary */ 4574 /* Move the zone boundaries inside the node if necessary */
4575 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4575 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4576 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4576 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4577 4577
4578 /* Return the spanned pages */ 4578 /* Return the spanned pages */
4579 return zone_end_pfn - zone_start_pfn; 4579 return zone_end_pfn - zone_start_pfn;
4580 } 4580 }
4581 4581
4582 /* 4582 /*
4583 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4583 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4584 * then all holes in the requested range will be accounted for. 4584 * then all holes in the requested range will be accounted for.
4585 */ 4585 */
4586 unsigned long __meminit __absent_pages_in_range(int nid, 4586 unsigned long __meminit __absent_pages_in_range(int nid,
4587 unsigned long range_start_pfn, 4587 unsigned long range_start_pfn,
4588 unsigned long range_end_pfn) 4588 unsigned long range_end_pfn)
4589 { 4589 {
4590 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4590 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4591 unsigned long start_pfn, end_pfn; 4591 unsigned long start_pfn, end_pfn;
4592 int i; 4592 int i;
4593 4593
4594 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4594 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4595 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4595 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4596 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4596 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4597 nr_absent -= end_pfn - start_pfn; 4597 nr_absent -= end_pfn - start_pfn;
4598 } 4598 }
4599 return nr_absent; 4599 return nr_absent;
4600 } 4600 }
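__absent_pages_in_range() assumes the whole requested range is a hole and then subtracts every piece of present memory after clamping it into the range. A self-contained userspace sketch of that accounting; the pfn_range type and the sample ranges are invented stand-ins for memblock regions.

/*
 * Hole counting sketch: [0, 300) with memory at [0, 100) and [200, 300)
 * leaves a 100-page hole.
 */
#include <stdio.h>

struct pfn_range { unsigned long start, end; };  /* stand-in for memblock regions */

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

static unsigned long absent_pages(const struct pfn_range *mem, int n,
                                  unsigned long range_start, unsigned long range_end)
{
        unsigned long nr_absent = range_end - range_start;

        for (int i = 0; i < n; i++) {
                unsigned long s = clamp_ul(mem[i].start, range_start, range_end);
                unsigned long e = clamp_ul(mem[i].end, range_start, range_end);

                nr_absent -= e - s;     /* present pages are not holes */
        }
        return nr_absent;
}

int main(void)
{
        struct pfn_range mem[] = { { 0, 100 }, { 200, 300 } };

        printf("absent pages: %lu\n", absent_pages(mem, 2, 0, 300));
        return 0;
}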
4601 4601
4602 /** 4602 /**
4603 * absent_pages_in_range - Return number of page frames in holes within a range 4603 * absent_pages_in_range - Return number of page frames in holes within a range
4604 * @start_pfn: The start PFN to start searching for holes 4604 * @start_pfn: The start PFN to start searching for holes
4605 * @end_pfn: The end PFN to stop searching for holes 4605 * @end_pfn: The end PFN to stop searching for holes
4606 * 4606 *
4607 * It returns the number of page frames in memory holes within a range. 4607 * It returns the number of page frames in memory holes within a range.
4608 */ 4608 */
4609 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4609 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4610 unsigned long end_pfn) 4610 unsigned long end_pfn)
4611 { 4611 {
4612 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4612 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4613 } 4613 }
4614 4614
4615 /* Return the number of page frames in holes in a zone on a node */ 4615 /* Return the number of page frames in holes in a zone on a node */
4616 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4616 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4617 unsigned long zone_type, 4617 unsigned long zone_type,
4618 unsigned long node_start_pfn, 4618 unsigned long node_start_pfn,
4619 unsigned long node_end_pfn, 4619 unsigned long node_end_pfn,
4620 unsigned long *ignored) 4620 unsigned long *ignored)
4621 { 4621 {
4622 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4622 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4623 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4623 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4624 unsigned long zone_start_pfn, zone_end_pfn; 4624 unsigned long zone_start_pfn, zone_end_pfn;
4625 4625
4626 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4626 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4627 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4627 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4628 4628
4629 adjust_zone_range_for_zone_movable(nid, zone_type, 4629 adjust_zone_range_for_zone_movable(nid, zone_type,
4630 node_start_pfn, node_end_pfn, 4630 node_start_pfn, node_end_pfn,
4631 &zone_start_pfn, &zone_end_pfn); 4631 &zone_start_pfn, &zone_end_pfn);
4632 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4632 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4633 } 4633 }
4634 4634
4635 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4635 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4636 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4636 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4637 unsigned long zone_type, 4637 unsigned long zone_type,
4638 unsigned long node_start_pfn, 4638 unsigned long node_start_pfn,
4639 unsigned long node_end_pfn, 4639 unsigned long node_end_pfn,
4640 unsigned long *zones_size) 4640 unsigned long *zones_size)
4641 { 4641 {
4642 return zones_size[zone_type]; 4642 return zones_size[zone_type];
4643 } 4643 }
4644 4644
4645 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4645 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4646 unsigned long zone_type, 4646 unsigned long zone_type,
4647 unsigned long node_start_pfn, 4647 unsigned long node_start_pfn,
4648 unsigned long node_end_pfn, 4648 unsigned long node_end_pfn,
4649 unsigned long *zholes_size) 4649 unsigned long *zholes_size)
4650 { 4650 {
4651 if (!zholes_size) 4651 if (!zholes_size)
4652 return 0; 4652 return 0;
4653 4653
4654 return zholes_size[zone_type]; 4654 return zholes_size[zone_type];
4655 } 4655 }
4656 4656
4657 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4657 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4658 4658
4659 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4659 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4660 unsigned long node_start_pfn, 4660 unsigned long node_start_pfn,
4661 unsigned long node_end_pfn, 4661 unsigned long node_end_pfn,
4662 unsigned long *zones_size, 4662 unsigned long *zones_size,
4663 unsigned long *zholes_size) 4663 unsigned long *zholes_size)
4664 { 4664 {
4665 unsigned long realtotalpages, totalpages = 0; 4665 unsigned long realtotalpages, totalpages = 0;
4666 enum zone_type i; 4666 enum zone_type i;
4667 4667
4668 for (i = 0; i < MAX_NR_ZONES; i++) 4668 for (i = 0; i < MAX_NR_ZONES; i++)
4669 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4669 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4670 node_start_pfn, 4670 node_start_pfn,
4671 node_end_pfn, 4671 node_end_pfn,
4672 zones_size); 4672 zones_size);
4673 pgdat->node_spanned_pages = totalpages; 4673 pgdat->node_spanned_pages = totalpages;
4674 4674
4675 realtotalpages = totalpages; 4675 realtotalpages = totalpages;
4676 for (i = 0; i < MAX_NR_ZONES; i++) 4676 for (i = 0; i < MAX_NR_ZONES; i++)
4677 realtotalpages -= 4677 realtotalpages -=
4678 zone_absent_pages_in_node(pgdat->node_id, i, 4678 zone_absent_pages_in_node(pgdat->node_id, i,
4679 node_start_pfn, node_end_pfn, 4679 node_start_pfn, node_end_pfn,
4680 zholes_size); 4680 zholes_size);
4681 pgdat->node_present_pages = realtotalpages; 4681 pgdat->node_present_pages = realtotalpages;
4682 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4682 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4683 realtotalpages); 4683 realtotalpages);
4684 } 4684 }
4685 4685
4686 #ifndef CONFIG_SPARSEMEM 4686 #ifndef CONFIG_SPARSEMEM
4687 /* 4687 /*
4688 * Calculate the size of the zone->blockflags rounded to an unsigned long 4688 * Calculate the size of the zone->blockflags rounded to an unsigned long
4689 * Start by making sure zonesize is a multiple of pageblock_order by rounding 4689 * Start by making sure zonesize is a multiple of pageblock_order by rounding
4690 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock and, finally, 4690 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock and, finally,
4691 * round what is now in bits up to the nearest long in bits, then return it in 4691 * round what is now in bits up to the nearest long in bits, then return it in
4692 * bytes. 4692 * bytes.
4693 */ 4693 */
4694 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 4694 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4695 { 4695 {
4696 unsigned long usemapsize; 4696 unsigned long usemapsize;
4697 4697
4698 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 4698 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4699 usemapsize = roundup(zonesize, pageblock_nr_pages); 4699 usemapsize = roundup(zonesize, pageblock_nr_pages);
4700 usemapsize = usemapsize >> pageblock_order; 4700 usemapsize = usemapsize >> pageblock_order;
4701 usemapsize *= NR_PAGEBLOCK_BITS; 4701 usemapsize *= NR_PAGEBLOCK_BITS;
4702 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4702 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4703 4703
4704 return usemapsize / 8; 4704 return usemapsize / 8;
4705 } 4705 }
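The arithmetic in usemap_size() is easier to follow with concrete numbers. The sketch below repeats it in userspace with assumed constants (4K pages, order-9 pageblocks, 4 pageblock bits); the real values depend on the kernel configuration.

/*
 * usemap_size() worked example: 262144 pfns -> 512 pageblocks -> 2048 bits
 * -> 256 bytes. Constants below are assumptions for the example.
 */
#include <stdio.h>

#define PAGEBLOCK_ORDER         9
#define PAGEBLOCK_NR_PAGES      (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS       4

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
        return ((x + to - 1) / to) * to;
}

static unsigned long usemap_size_example(unsigned long zone_start_pfn, unsigned long zonesize)
{
        unsigned long usemapsize;

        zonesize += zone_start_pfn & (PAGEBLOCK_NR_PAGES - 1);  /* account for a misaligned start */
        usemapsize = roundup_ul(zonesize, PAGEBLOCK_NR_PAGES);
        usemapsize >>= PAGEBLOCK_ORDER;                         /* number of pageblocks */
        usemapsize *= NR_PAGEBLOCK_BITS;                        /* bits needed */
        usemapsize = roundup_ul(usemapsize, 8 * sizeof(unsigned long));

        return usemapsize / 8;                                  /* bytes */
}

int main(void)
{
        /* a 1GiB zone with 4K pages spans 262144 pfns */
        printf("usemap bytes: %lu\n", usemap_size_example(0, 262144));
        return 0;
}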
4706 4706
4707 static void __init setup_usemap(struct pglist_data *pgdat, 4707 static void __init setup_usemap(struct pglist_data *pgdat,
4708 struct zone *zone, 4708 struct zone *zone,
4709 unsigned long zone_start_pfn, 4709 unsigned long zone_start_pfn,
4710 unsigned long zonesize) 4710 unsigned long zonesize)
4711 { 4711 {
4712 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4712 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4713 zone->pageblock_flags = NULL; 4713 zone->pageblock_flags = NULL;
4714 if (usemapsize) 4714 if (usemapsize)
4715 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4715 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4716 usemapsize); 4716 usemapsize);
4717 } 4717 }
4718 #else 4718 #else
4719 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4719 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4720 unsigned long zone_start_pfn, unsigned long zonesize) {} 4720 unsigned long zone_start_pfn, unsigned long zonesize) {}
4721 #endif /* CONFIG_SPARSEMEM */ 4721 #endif /* CONFIG_SPARSEMEM */
4722 4722
4723 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4723 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4724 4724
4725 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4725 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4726 void __paginginit set_pageblock_order(void) 4726 void __paginginit set_pageblock_order(void)
4727 { 4727 {
4728 unsigned int order; 4728 unsigned int order;
4729 4729
4730 /* Check that pageblock_nr_pages has not already been set up */ 4730 /* Check that pageblock_nr_pages has not already been set up */
4731 if (pageblock_order) 4731 if (pageblock_order)
4732 return; 4732 return;
4733 4733
4734 if (HPAGE_SHIFT > PAGE_SHIFT) 4734 if (HPAGE_SHIFT > PAGE_SHIFT)
4735 order = HUGETLB_PAGE_ORDER; 4735 order = HUGETLB_PAGE_ORDER;
4736 else 4736 else
4737 order = MAX_ORDER - 1; 4737 order = MAX_ORDER - 1;
4738 4738
4739 /* 4739 /*
4740 * Assume the largest contiguous order of interest is a huge page. 4740 * Assume the largest contiguous order of interest is a huge page.
4741 * This value may be variable depending on boot parameters on IA64 and 4741 * This value may be variable depending on boot parameters on IA64 and
4742 * powerpc. 4742 * powerpc.
4743 */ 4743 */
4744 pageblock_order = order; 4744 pageblock_order = order;
4745 } 4745 }
4746 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4746 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4747 4747
4748 /* 4748 /*
4749 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4749 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4750 * is unused as pageblock_order is set at compile-time. See 4750 * is unused as pageblock_order is set at compile-time. See
4751 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4751 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4752 * the kernel config 4752 * the kernel config
4753 */ 4753 */
4754 void __paginginit set_pageblock_order(void) 4754 void __paginginit set_pageblock_order(void)
4755 { 4755 {
4756 } 4756 }
4757 4757
4758 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4758 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4759 4759
4760 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 4760 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4761 unsigned long present_pages) 4761 unsigned long present_pages)
4762 { 4762 {
4763 unsigned long pages = spanned_pages; 4763 unsigned long pages = spanned_pages;
4764 4764
4765 /* 4765 /*
4766 * Provide a more accurate estimation if there are holes within 4766 * Provide a more accurate estimation if there are holes within
4767 * the zone and SPARSEMEM is in use. If there are holes within the 4767 * the zone and SPARSEMEM is in use. If there are holes within the
4768 * zone, each populated memory region may cost us one or two extra 4768 * zone, each populated memory region may cost us one or two extra
4769 * memmap pages due to alignment because memmap pages for each 4769 * memmap pages due to alignment because memmap pages for each
4770 * populated region may not be naturally aligned on a page boundary. 4770 * populated region may not be naturally aligned on a page boundary.
4771 * So the (present_pages >> 4) heuristic is a tradeoff for that. 4771 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4772 */ 4772 */
4773 if (spanned_pages > present_pages + (present_pages >> 4) && 4773 if (spanned_pages > present_pages + (present_pages >> 4) &&
4774 IS_ENABLED(CONFIG_SPARSEMEM)) 4774 IS_ENABLED(CONFIG_SPARSEMEM))
4775 pages = present_pages; 4775 pages = present_pages;
4776 4776
4777 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 4777 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4778 } 4778 }
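calc_memmap_size() normally sizes the memmap from spanned pages, but with SPARSEMEM and a zone whose holes exceed present_pages/16 it falls back to present pages. A userspace sketch of the heuristic; sizeof(struct page) is assumed to be 64 bytes purely for the example.

/*
 * memmap sizing sketch: a hole-ridden zone is estimated from present pages.
 */
#include <stdio.h>

#define PAGE_SIZE        4096UL
#define PAGE_SHIFT       12
#define STRUCT_PAGE_SIZE 64UL   /* assumed value for illustration */

static unsigned long page_align(unsigned long x)
{
        return (x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

static unsigned long memmap_pages(unsigned long spanned, unsigned long present, int sparsemem)
{
        unsigned long pages = spanned;

        if (sparsemem && spanned > present + (present >> 4))
                pages = present;        /* zone is mostly holes: use present pages */

        return page_align(pages * STRUCT_PAGE_SIZE) >> PAGE_SHIFT;
}

int main(void)
{
        /* 1M spanned pfns of which only 512K are present */
        printf("memmap pages: %lu\n", memmap_pages(1UL << 20, 1UL << 19, 1));
        return 0;
}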
4779 4779
4780 /* 4780 /*
4781 * Set up the zone data structures: 4781 * Set up the zone data structures:
4782 * - mark all pages reserved 4782 * - mark all pages reserved
4783 * - mark all memory queues empty 4783 * - mark all memory queues empty
4784 * - clear the memory bitmaps 4784 * - clear the memory bitmaps
4785 * 4785 *
4786 * NOTE: pgdat should get zeroed by caller. 4786 * NOTE: pgdat should get zeroed by caller.
4787 */ 4787 */
4788 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4788 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4789 unsigned long node_start_pfn, unsigned long node_end_pfn, 4789 unsigned long node_start_pfn, unsigned long node_end_pfn,
4790 unsigned long *zones_size, unsigned long *zholes_size) 4790 unsigned long *zones_size, unsigned long *zholes_size)
4791 { 4791 {
4792 enum zone_type j; 4792 enum zone_type j;
4793 int nid = pgdat->node_id; 4793 int nid = pgdat->node_id;
4794 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4794 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4795 int ret; 4795 int ret;
4796 4796
4797 pgdat_resize_init(pgdat); 4797 pgdat_resize_init(pgdat);
4798 #ifdef CONFIG_NUMA_BALANCING 4798 #ifdef CONFIG_NUMA_BALANCING
4799 spin_lock_init(&pgdat->numabalancing_migrate_lock); 4799 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4800 pgdat->numabalancing_migrate_nr_pages = 0; 4800 pgdat->numabalancing_migrate_nr_pages = 0;
4801 pgdat->numabalancing_migrate_next_window = jiffies; 4801 pgdat->numabalancing_migrate_next_window = jiffies;
4802 #endif 4802 #endif
4803 init_waitqueue_head(&pgdat->kswapd_wait); 4803 init_waitqueue_head(&pgdat->kswapd_wait);
4804 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4804 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4805 pgdat_page_cgroup_init(pgdat); 4805 pgdat_page_cgroup_init(pgdat);
4806 4806
4807 for (j = 0; j < MAX_NR_ZONES; j++) { 4807 for (j = 0; j < MAX_NR_ZONES; j++) {
4808 struct zone *zone = pgdat->node_zones + j; 4808 struct zone *zone = pgdat->node_zones + j;
4809 unsigned long size, realsize, freesize, memmap_pages; 4809 unsigned long size, realsize, freesize, memmap_pages;
4810 4810
4811 size = zone_spanned_pages_in_node(nid, j, node_start_pfn, 4811 size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
4812 node_end_pfn, zones_size); 4812 node_end_pfn, zones_size);
4813 realsize = freesize = size - zone_absent_pages_in_node(nid, j, 4813 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4814 node_start_pfn, 4814 node_start_pfn,
4815 node_end_pfn, 4815 node_end_pfn,
4816 zholes_size); 4816 zholes_size);
4817 4817
4818 /* 4818 /*
4819 * Adjust freesize so that it accounts for how much memory 4819 * Adjust freesize so that it accounts for how much memory
4820 * is used by this zone for memmap. This affects the watermark 4820 * is used by this zone for memmap. This affects the watermark
4821 * and per-cpu initialisations 4821 * and per-cpu initialisations
4822 */ 4822 */
4823 memmap_pages = calc_memmap_size(size, realsize); 4823 memmap_pages = calc_memmap_size(size, realsize);
4824 if (freesize >= memmap_pages) { 4824 if (freesize >= memmap_pages) {
4825 freesize -= memmap_pages; 4825 freesize -= memmap_pages;
4826 if (memmap_pages) 4826 if (memmap_pages)
4827 printk(KERN_DEBUG 4827 printk(KERN_DEBUG
4828 " %s zone: %lu pages used for memmap\n", 4828 " %s zone: %lu pages used for memmap\n",
4829 zone_names[j], memmap_pages); 4829 zone_names[j], memmap_pages);
4830 } else 4830 } else
4831 printk(KERN_WARNING 4831 printk(KERN_WARNING
4832 " %s zone: %lu pages exceeds freesize %lu\n", 4832 " %s zone: %lu pages exceeds freesize %lu\n",
4833 zone_names[j], memmap_pages, freesize); 4833 zone_names[j], memmap_pages, freesize);
4834 4834
4835 /* Account for reserved pages */ 4835 /* Account for reserved pages */
4836 if (j == 0 && freesize > dma_reserve) { 4836 if (j == 0 && freesize > dma_reserve) {
4837 freesize -= dma_reserve; 4837 freesize -= dma_reserve;
4838 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4838 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4839 zone_names[0], dma_reserve); 4839 zone_names[0], dma_reserve);
4840 } 4840 }
4841 4841
4842 if (!is_highmem_idx(j)) 4842 if (!is_highmem_idx(j))
4843 nr_kernel_pages += freesize; 4843 nr_kernel_pages += freesize;
4844 /* Charge for highmem memmap if there are enough kernel pages */ 4844 /* Charge for highmem memmap if there are enough kernel pages */
4845 else if (nr_kernel_pages > memmap_pages * 2) 4845 else if (nr_kernel_pages > memmap_pages * 2)
4846 nr_kernel_pages -= memmap_pages; 4846 nr_kernel_pages -= memmap_pages;
4847 nr_all_pages += freesize; 4847 nr_all_pages += freesize;
4848 4848
4849 zone->spanned_pages = size; 4849 zone->spanned_pages = size;
4850 zone->present_pages = realsize; 4850 zone->present_pages = realsize;
4851 /* 4851 /*
4852 * Set an approximate value for lowmem here, it will be adjusted 4852 * Set an approximate value for lowmem here, it will be adjusted
4853 * when the bootmem allocator frees pages into the buddy system. 4853 * when the bootmem allocator frees pages into the buddy system.
4854 * And all highmem pages will be managed by the buddy system. 4854 * And all highmem pages will be managed by the buddy system.
4855 */ 4855 */
4856 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; 4856 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4857 #ifdef CONFIG_NUMA 4857 #ifdef CONFIG_NUMA
4858 zone->node = nid; 4858 zone->node = nid;
4859 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) 4859 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4860 / 100; 4860 / 100;
4861 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; 4861 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4862 #endif 4862 #endif
4863 zone->name = zone_names[j]; 4863 zone->name = zone_names[j];
4864 spin_lock_init(&zone->lock); 4864 spin_lock_init(&zone->lock);
4865 spin_lock_init(&zone->lru_lock); 4865 spin_lock_init(&zone->lru_lock);
4866 zone_seqlock_init(zone); 4866 zone_seqlock_init(zone);
4867 zone->zone_pgdat = pgdat; 4867 zone->zone_pgdat = pgdat;
4868 zone_pcp_init(zone); 4868 zone_pcp_init(zone);
4869 4869
4870 /* For bootup, initialized properly in watermark setup */ 4870 /* For bootup, initialized properly in watermark setup */
4871 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); 4871 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
4872 4872
4873 lruvec_init(&zone->lruvec); 4873 lruvec_init(&zone->lruvec);
4874 if (!size) 4874 if (!size)
4875 continue; 4875 continue;
4876 4876
4877 set_pageblock_order(); 4877 set_pageblock_order();
4878 setup_usemap(pgdat, zone, zone_start_pfn, size); 4878 setup_usemap(pgdat, zone, zone_start_pfn, size);
4879 ret = init_currently_empty_zone(zone, zone_start_pfn, 4879 ret = init_currently_empty_zone(zone, zone_start_pfn,
4880 size, MEMMAP_EARLY); 4880 size, MEMMAP_EARLY);
4881 BUG_ON(ret); 4881 BUG_ON(ret);
4882 memmap_init(size, nid, j, zone_start_pfn); 4882 memmap_init(size, nid, j, zone_start_pfn);
4883 zone_start_pfn += size; 4883 zone_start_pfn += size;
4884 } 4884 }
4885 } 4885 }
4886 4886
4887 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4887 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4888 { 4888 {
4889 /* Skip empty nodes */ 4889 /* Skip empty nodes */
4890 if (!pgdat->node_spanned_pages) 4890 if (!pgdat->node_spanned_pages)
4891 return; 4891 return;
4892 4892
4893 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4893 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4894 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4894 /* ia64 gets its own node_mem_map, before this, without bootmem */
4895 if (!pgdat->node_mem_map) { 4895 if (!pgdat->node_mem_map) {
4896 unsigned long size, start, end; 4896 unsigned long size, start, end;
4897 struct page *map; 4897 struct page *map;
4898 4898
4899 /* 4899 /*
4900 * The zone's endpoints aren't required to be MAX_ORDER 4900 * The zone's endpoints aren't required to be MAX_ORDER
4901 * aligned but the node_mem_map endpoints must be in order 4901 * aligned but the node_mem_map endpoints must be in order
4902 * for the buddy allocator to function correctly. 4902 * for the buddy allocator to function correctly.
4903 */ 4903 */
4904 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4904 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4905 end = pgdat_end_pfn(pgdat); 4905 end = pgdat_end_pfn(pgdat);
4906 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4906 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4907 size = (end - start) * sizeof(struct page); 4907 size = (end - start) * sizeof(struct page);
4908 map = alloc_remap(pgdat->node_id, size); 4908 map = alloc_remap(pgdat->node_id, size);
4909 if (!map) 4909 if (!map)
4910 map = alloc_bootmem_node_nopanic(pgdat, size); 4910 map = alloc_bootmem_node_nopanic(pgdat, size);
4911 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4911 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4912 } 4912 }
4913 #ifndef CONFIG_NEED_MULTIPLE_NODES 4913 #ifndef CONFIG_NEED_MULTIPLE_NODES
4914 /* 4914 /*
4915 * With no DISCONTIG, the global mem_map is just set as node 0's 4915 * With no DISCONTIG, the global mem_map is just set as node 0's
4916 */ 4916 */
4917 if (pgdat == NODE_DATA(0)) { 4917 if (pgdat == NODE_DATA(0)) {
4918 mem_map = NODE_DATA(0)->node_mem_map; 4918 mem_map = NODE_DATA(0)->node_mem_map;
4919 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4919 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4920 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4920 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4921 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4921 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4922 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4922 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4923 } 4923 }
4924 #endif 4924 #endif
4925 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4925 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4926 } 4926 }
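alloc_node_mem_map() widens the node's PFN range to MAX_ORDER_NR_PAGES boundaries before sizing node_mem_map, so every potential buddy of a page in the node has a struct page behind it. The sketch below shows just that rounding, with invented constants and PFNs.

/*
 * Endpoint rounding sketch: round the first pfn down and the last pfn up to
 * MAX_ORDER-sized blocks. MAX_ORDER_NR_PAGES here is an assumed value.
 */
#include <stdio.h>

#define MAX_ORDER_NR_PAGES      (1UL << 10)     /* e.g. 1024-page buddy blocks */

int main(void)
{
        unsigned long node_start_pfn = 1234567, node_end_pfn = 2345678;
        unsigned long start = node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
        unsigned long end = (node_end_pfn + MAX_ORDER_NR_PAGES - 1) &
                            ~(MAX_ORDER_NR_PAGES - 1);

        printf("mem_map covers pfns [%lu, %lu), %lu entries\n",
               start, end, end - start);
        return 0;
}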
4927 4927
4928 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4928 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4929 unsigned long node_start_pfn, unsigned long *zholes_size) 4929 unsigned long node_start_pfn, unsigned long *zholes_size)
4930 { 4930 {
4931 pg_data_t *pgdat = NODE_DATA(nid); 4931 pg_data_t *pgdat = NODE_DATA(nid);
4932 unsigned long start_pfn = 0; 4932 unsigned long start_pfn = 0;
4933 unsigned long end_pfn = 0; 4933 unsigned long end_pfn = 0;
4934 4934
4935 /* pg_data_t should be reset to zero when it's allocated */ 4935 /* pg_data_t should be reset to zero when it's allocated */
4936 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 4936 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4937 4937
4938 pgdat->node_id = nid; 4938 pgdat->node_id = nid;
4939 pgdat->node_start_pfn = node_start_pfn; 4939 pgdat->node_start_pfn = node_start_pfn;
4940 if (node_state(nid, N_MEMORY)) 4940 if (node_state(nid, N_MEMORY))
4941 init_zone_allows_reclaim(nid); 4941 init_zone_allows_reclaim(nid);
4942 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4942 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4943 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 4943 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4944 #endif 4944 #endif
4945 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 4945 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4946 zones_size, zholes_size); 4946 zones_size, zholes_size);
4947 4947
4948 alloc_node_mem_map(pgdat); 4948 alloc_node_mem_map(pgdat);
4949 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4949 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4950 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4950 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4951 nid, (unsigned long)pgdat, 4951 nid, (unsigned long)pgdat,
4952 (unsigned long)pgdat->node_mem_map); 4952 (unsigned long)pgdat->node_mem_map);
4953 #endif 4953 #endif
4954 4954
4955 free_area_init_core(pgdat, start_pfn, end_pfn, 4955 free_area_init_core(pgdat, start_pfn, end_pfn,
4956 zones_size, zholes_size); 4956 zones_size, zholes_size);
4957 } 4957 }
4958 4958
4959 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4959 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4960 4960
4961 #if MAX_NUMNODES > 1 4961 #if MAX_NUMNODES > 1
4962 /* 4962 /*
4963 * Figure out the number of possible node ids. 4963 * Figure out the number of possible node ids.
4964 */ 4964 */
4965 void __init setup_nr_node_ids(void) 4965 void __init setup_nr_node_ids(void)
4966 { 4966 {
4967 unsigned int node; 4967 unsigned int node;
4968 unsigned int highest = 0; 4968 unsigned int highest = 0;
4969 4969
4970 for_each_node_mask(node, node_possible_map) 4970 for_each_node_mask(node, node_possible_map)
4971 highest = node; 4971 highest = node;
4972 nr_node_ids = highest + 1; 4972 nr_node_ids = highest + 1;
4973 } 4973 }
4974 #endif 4974 #endif
4975 4975
4976 /** 4976 /**
4977 * node_map_pfn_alignment - determine the maximum internode alignment 4977 * node_map_pfn_alignment - determine the maximum internode alignment
4978 * 4978 *
4979 * This function should be called after node map is populated and sorted. 4979 * This function should be called after node map is populated and sorted.
4980 * It calculates the maximum power of two alignment which can distinguish 4980 * It calculates the maximum power of two alignment which can distinguish
4981 * all the nodes. 4981 * all the nodes.
4982 * 4982 *
4983 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4983 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4984 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4984 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4985 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 4985 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4986 * shifted, 1GiB is enough and this function will indicate so. 4986 * shifted, 1GiB is enough and this function will indicate so.
4987 * 4987 *
4988 * This is used to test whether pfn -> nid mapping of the chosen memory 4988 * This is used to test whether pfn -> nid mapping of the chosen memory
4989 * model has fine enough granularity to avoid incorrect mapping for the 4989 * model has fine enough granularity to avoid incorrect mapping for the
4990 * populated node map. 4990 * populated node map.
4991 * 4991 *
4992 * Returns the determined alignment in PFNs. 0 if there is no alignment 4992 * Returns the determined alignment in PFNs. 0 if there is no alignment
4993 * requirement (single node). 4993 * requirement (single node).
4994 */ 4994 */
4995 unsigned long __init node_map_pfn_alignment(void) 4995 unsigned long __init node_map_pfn_alignment(void)
4996 { 4996 {
4997 unsigned long accl_mask = 0, last_end = 0; 4997 unsigned long accl_mask = 0, last_end = 0;
4998 unsigned long start, end, mask; 4998 unsigned long start, end, mask;
4999 int last_nid = -1; 4999 int last_nid = -1;
5000 int i, nid; 5000 int i, nid;
5001 5001
5002 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 5002 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
5003 if (!start || last_nid < 0 || last_nid == nid) { 5003 if (!start || last_nid < 0 || last_nid == nid) {
5004 last_nid = nid; 5004 last_nid = nid;
5005 last_end = end; 5005 last_end = end;
5006 continue; 5006 continue;
5007 } 5007 }
5008 5008
5009 /* 5009 /*
5010 * Start with a mask granular enough to pin-point to the 5010 * Start with a mask granular enough to pin-point to the
5011 * start pfn and tick off bits one-by-one until it becomes 5011 * start pfn and tick off bits one-by-one until it becomes
5012 * too coarse to separate the current node from the last. 5012 * too coarse to separate the current node from the last.
5013 */ 5013 */
5014 mask = ~((1 << __ffs(start)) - 1); 5014 mask = ~((1 << __ffs(start)) - 1);
5015 while (mask && last_end <= (start & (mask << 1))) 5015 while (mask && last_end <= (start & (mask << 1)))
5016 mask <<= 1; 5016 mask <<= 1;
5017 5017
5018 /* accumulate all internode masks */ 5018 /* accumulate all internode masks */
5019 accl_mask |= mask; 5019 accl_mask |= mask;
5020 } 5020 }
5021 5021
5022 /* convert mask to number of pages */ 5022 /* convert mask to number of pages */
5023 return ~accl_mask + 1; 5023 return ~accl_mask + 1;
5024 } 5024 }
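The mask manipulation in node_map_pfn_alignment() starts from the finest alignment that pins down a region's start PFN and coarsens it while the previous node's end still lands inside the same aligned block. A small userspace reproduction of that loop with made-up PFNs (the kernel's __ffs() is stood in for by a compiler builtin):

/*
 * Internode mask sketch: previous node ends at pfn 0x40000, next starts at
 * 0x50000, so 0x40000-pfn alignment is enough to tell them apart.
 */
#include <stdio.h>

static unsigned long internode_mask(unsigned long last_end, unsigned long start)
{
        /* __builtin_ctzl() plays the role of __ffs(); start must be non-zero */
        unsigned long mask = ~((1UL << __builtin_ctzl(start)) - 1);

        while (mask && last_end <= (start & (mask << 1)))
                mask <<= 1;     /* coarsen while the nodes stay separable */
        return mask;
}

int main(void)
{
        unsigned long mask = internode_mask(0x40000, 0x50000);

        printf("alignment: %lu pfns\n", ~mask + 1);
        return 0;
}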
5025 5025
5026 /* Find the lowest pfn for a node */ 5026 /* Find the lowest pfn for a node */
5027 static unsigned long __init find_min_pfn_for_node(int nid) 5027 static unsigned long __init find_min_pfn_for_node(int nid)
5028 { 5028 {
5029 unsigned long min_pfn = ULONG_MAX; 5029 unsigned long min_pfn = ULONG_MAX;
5030 unsigned long start_pfn; 5030 unsigned long start_pfn;
5031 int i; 5031 int i;
5032 5032
5033 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 5033 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
5034 min_pfn = min(min_pfn, start_pfn); 5034 min_pfn = min(min_pfn, start_pfn);
5035 5035
5036 if (min_pfn == ULONG_MAX) { 5036 if (min_pfn == ULONG_MAX) {
5037 printk(KERN_WARNING 5037 printk(KERN_WARNING
5038 "Could not find start_pfn for node %d\n", nid); 5038 "Could not find start_pfn for node %d\n", nid);
5039 return 0; 5039 return 0;
5040 } 5040 }
5041 5041
5042 return min_pfn; 5042 return min_pfn;
5043 } 5043 }
5044 5044
5045 /** 5045 /**
5046 * find_min_pfn_with_active_regions - Find the minimum PFN registered 5046 * find_min_pfn_with_active_regions - Find the minimum PFN registered
5047 * 5047 *
5048 * It returns the minimum PFN based on information provided via 5048 * It returns the minimum PFN based on information provided via
5049 * add_active_range(). 5049 * add_active_range().
5050 */ 5050 */
5051 unsigned long __init find_min_pfn_with_active_regions(void) 5051 unsigned long __init find_min_pfn_with_active_regions(void)
5052 { 5052 {
5053 return find_min_pfn_for_node(MAX_NUMNODES); 5053 return find_min_pfn_for_node(MAX_NUMNODES);
5054 } 5054 }
5055 5055
5056 /* 5056 /*
5057 * early_calculate_totalpages() 5057 * early_calculate_totalpages()
5058 * Sum pages in active regions for movable zone. 5058 * Sum pages in active regions for movable zone.
5059 * Populate N_MEMORY for calculating usable_nodes. 5059 * Populate N_MEMORY for calculating usable_nodes.
5060 */ 5060 */
5061 static unsigned long __init early_calculate_totalpages(void) 5061 static unsigned long __init early_calculate_totalpages(void)
5062 { 5062 {
5063 unsigned long totalpages = 0; 5063 unsigned long totalpages = 0;
5064 unsigned long start_pfn, end_pfn; 5064 unsigned long start_pfn, end_pfn;
5065 int i, nid; 5065 int i, nid;
5066 5066
5067 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 5067 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
5068 unsigned long pages = end_pfn - start_pfn; 5068 unsigned long pages = end_pfn - start_pfn;
5069 5069
5070 totalpages += pages; 5070 totalpages += pages;
5071 if (pages) 5071 if (pages)
5072 node_set_state(nid, N_MEMORY); 5072 node_set_state(nid, N_MEMORY);
5073 } 5073 }
5074 return totalpages; 5074 return totalpages;
5075 } 5075 }
5076 5076
5077 /* 5077 /*
5078 * Find the PFN the Movable zone begins in each node. Kernel memory 5078 * Find the PFN the Movable zone begins in each node. Kernel memory
5079 * is spread evenly between nodes as long as the nodes have enough 5079 * is spread evenly between nodes as long as the nodes have enough
5080 * memory. When they don't, some nodes will have more kernelcore than 5080 * memory. When they don't, some nodes will have more kernelcore than
5081 * others 5081 * others
5082 */ 5082 */
5083 static void __init find_zone_movable_pfns_for_nodes(void) 5083 static void __init find_zone_movable_pfns_for_nodes(void)
5084 { 5084 {
5085 int i, nid; 5085 int i, nid;
5086 unsigned long usable_startpfn; 5086 unsigned long usable_startpfn;
5087 unsigned long kernelcore_node, kernelcore_remaining; 5087 unsigned long kernelcore_node, kernelcore_remaining;
5088 /* save the state before borrowing the nodemask */ 5088 /* save the state before borrowing the nodemask */
5089 nodemask_t saved_node_state = node_states[N_MEMORY]; 5089 nodemask_t saved_node_state = node_states[N_MEMORY];
5090 unsigned long totalpages = early_calculate_totalpages(); 5090 unsigned long totalpages = early_calculate_totalpages();
5091 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5091 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
5092 5092
5093 /* 5093 /*
5094 * If movablecore was specified, calculate what size of 5094 * If movablecore was specified, calculate what size of
5095 * kernelcore that corresponds so that memory usable for 5095 * kernelcore that corresponds so that memory usable for
5096 * any allocation type is evenly spread. If both kernelcore 5096 * any allocation type is evenly spread. If both kernelcore
5097 * and movablecore are specified, then the value of kernelcore 5097 * and movablecore are specified, then the value of kernelcore
5098 * will be used for required_kernelcore if it's greater than 5098 * will be used for required_kernelcore if it's greater than
5099 * what movablecore would have allowed. 5099 * what movablecore would have allowed.
5100 */ 5100 */
5101 if (required_movablecore) { 5101 if (required_movablecore) {
5102 unsigned long corepages; 5102 unsigned long corepages;
5103 5103
5104 /* 5104 /*
5105 * Round-up so that ZONE_MOVABLE is at least as large as what 5105 * Round-up so that ZONE_MOVABLE is at least as large as what
5106 * was requested by the user 5106 * was requested by the user
5107 */ 5107 */
5108 required_movablecore = 5108 required_movablecore =
5109 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 5109 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
5110 corepages = totalpages - required_movablecore; 5110 corepages = totalpages - required_movablecore;
5111 5111
5112 required_kernelcore = max(required_kernelcore, corepages); 5112 required_kernelcore = max(required_kernelcore, corepages);
5113 } 5113 }
5114 5114
5115 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 5115 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
5116 if (!required_kernelcore) 5116 if (!required_kernelcore)
5117 goto out; 5117 goto out;
5118 5118
5119 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5119 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
5120 find_usable_zone_for_movable(); 5120 find_usable_zone_for_movable();
5121 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5121 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
5122 5122
5123 restart: 5123 restart:
5124 /* Spread kernelcore memory as evenly as possible throughout nodes */ 5124 /* Spread kernelcore memory as evenly as possible throughout nodes */
5125 kernelcore_node = required_kernelcore / usable_nodes; 5125 kernelcore_node = required_kernelcore / usable_nodes;
5126 for_each_node_state(nid, N_MEMORY) { 5126 for_each_node_state(nid, N_MEMORY) {
5127 unsigned long start_pfn, end_pfn; 5127 unsigned long start_pfn, end_pfn;
5128 5128
5129 /* 5129 /*
5130 * Recalculate kernelcore_node if the division per node 5130 * Recalculate kernelcore_node if the division per node
5131 * now exceeds what is necessary to satisfy the requested 5131 * now exceeds what is necessary to satisfy the requested
5132 * amount of memory for the kernel 5132 * amount of memory for the kernel
5133 */ 5133 */
5134 if (required_kernelcore < kernelcore_node) 5134 if (required_kernelcore < kernelcore_node)
5135 kernelcore_node = required_kernelcore / usable_nodes; 5135 kernelcore_node = required_kernelcore / usable_nodes;
5136 5136
5137 /* 5137 /*
5138 * As the map is walked, we track how much memory is usable 5138 * As the map is walked, we track how much memory is usable
5139 * by the kernel using kernelcore_remaining. When it is 5139 * by the kernel using kernelcore_remaining. When it is
5140 * 0, the rest of the node is usable by ZONE_MOVABLE 5140 * 0, the rest of the node is usable by ZONE_MOVABLE
5141 */ 5141 */
5142 kernelcore_remaining = kernelcore_node; 5142 kernelcore_remaining = kernelcore_node;
5143 5143
5144 /* Go through each range of PFNs within this node */ 5144 /* Go through each range of PFNs within this node */
5145 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 5145 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
5146 unsigned long size_pages; 5146 unsigned long size_pages;
5147 5147
5148 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 5148 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
5149 if (start_pfn >= end_pfn) 5149 if (start_pfn >= end_pfn)
5150 continue; 5150 continue;
5151 5151
5152 /* Account for what is only usable for kernelcore */ 5152 /* Account for what is only usable for kernelcore */
5153 if (start_pfn < usable_startpfn) { 5153 if (start_pfn < usable_startpfn) {
5154 unsigned long kernel_pages; 5154 unsigned long kernel_pages;
5155 kernel_pages = min(end_pfn, usable_startpfn) 5155 kernel_pages = min(end_pfn, usable_startpfn)
5156 - start_pfn; 5156 - start_pfn;
5157 5157
5158 kernelcore_remaining -= min(kernel_pages, 5158 kernelcore_remaining -= min(kernel_pages,
5159 kernelcore_remaining); 5159 kernelcore_remaining);
5160 required_kernelcore -= min(kernel_pages, 5160 required_kernelcore -= min(kernel_pages,
5161 required_kernelcore); 5161 required_kernelcore);
5162 5162
5163 /* Continue if range is now fully accounted */ 5163 /* Continue if range is now fully accounted */
5164 if (end_pfn <= usable_startpfn) { 5164 if (end_pfn <= usable_startpfn) {
5165 5165
5166 /* 5166 /*
5167 * Push zone_movable_pfn to the end so 5167 * Push zone_movable_pfn to the end so
5168 * that if we have to rebalance 5168 * that if we have to rebalance
5169 * kernelcore across nodes, we will 5169 * kernelcore across nodes, we will
5170 * not double account here 5170 * not double account here
5171 */ 5171 */
5172 zone_movable_pfn[nid] = end_pfn; 5172 zone_movable_pfn[nid] = end_pfn;
5173 continue; 5173 continue;
5174 } 5174 }
5175 start_pfn = usable_startpfn; 5175 start_pfn = usable_startpfn;
5176 } 5176 }
5177 5177
5178 /* 5178 /*
5179 * The usable PFN range for ZONE_MOVABLE is from 5179 * The usable PFN range for ZONE_MOVABLE is from
5180 * start_pfn->end_pfn. Calculate size_pages as the 5180 * start_pfn->end_pfn. Calculate size_pages as the
5181 * number of pages used as kernelcore 5181 * number of pages used as kernelcore
5182 */ 5182 */
5183 size_pages = end_pfn - start_pfn; 5183 size_pages = end_pfn - start_pfn;
5184 if (size_pages > kernelcore_remaining) 5184 if (size_pages > kernelcore_remaining)
5185 size_pages = kernelcore_remaining; 5185 size_pages = kernelcore_remaining;
5186 zone_movable_pfn[nid] = start_pfn + size_pages; 5186 zone_movable_pfn[nid] = start_pfn + size_pages;
5187 5187
5188 /* 5188 /*
5189 * Some kernelcore has been met, update counts and 5189 * Some kernelcore has been met, update counts and
5190 * break if the kernelcore for this node has been 5190 * break if the kernelcore for this node has been
5191 * satisfied 5191 * satisfied
5192 */ 5192 */
5193 required_kernelcore -= min(required_kernelcore, 5193 required_kernelcore -= min(required_kernelcore,
5194 size_pages); 5194 size_pages);
5195 kernelcore_remaining -= size_pages; 5195 kernelcore_remaining -= size_pages;
5196 if (!kernelcore_remaining) 5196 if (!kernelcore_remaining)
5197 break; 5197 break;
5198 } 5198 }
5199 } 5199 }
5200 5200
5201 /* 5201 /*
5202 * If there is still required_kernelcore, we do another pass with one 5202 * If there is still required_kernelcore, we do another pass with one
5203 * less node in the count. This will push zone_movable_pfn[nid] further 5203 * less node in the count. This will push zone_movable_pfn[nid] further
5204 * along on the nodes that still have memory until kernelcore is 5204 * along on the nodes that still have memory until kernelcore is
5205 * satisfied 5205 * satisfied
5206 */ 5206 */
5207 usable_nodes--; 5207 usable_nodes--;
5208 if (usable_nodes && required_kernelcore > usable_nodes) 5208 if (usable_nodes && required_kernelcore > usable_nodes)
5209 goto restart; 5209 goto restart;
5210 5210
5211 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5211 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5212 for (nid = 0; nid < MAX_NUMNODES; nid++) 5212 for (nid = 0; nid < MAX_NUMNODES; nid++)
5213 zone_movable_pfn[nid] = 5213 zone_movable_pfn[nid] =
5214 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 5214 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
5215 5215
5216 out: 5216 out:
5217 /* restore the node_state */ 5217 /* restore the node_state */
5218 node_states[N_MEMORY] = saved_node_state; 5218 node_states[N_MEMORY] = saved_node_state;
5219 } 5219 }
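The pass structure above is easier to see with two uneven nodes. The toy model below (plain userspace C, deliberately simplified to per-node page counts instead of PFN ranges, so it is only an approximation of the real loop) divides the remaining kernelcore evenly, lets a small node contribute everything it has, and reruns with one node fewer until the requirement is met.

/*
 * Kernelcore spreading toy model: nodes of 1000 and 3000 pages with a
 * kernelcore requirement of 2400 pages. All values are invented.
 */
#include <stdio.h>

int main(void)
{
        unsigned long node_pages[] = { 1000, 3000 };
        unsigned long required_kernelcore = 2400;
        int usable_nodes = 2;

        while (required_kernelcore && usable_nodes) {
                unsigned long kernelcore_node = required_kernelcore / usable_nodes;
                unsigned long satisfied = 0;

                for (int nid = 0; nid < 2; nid++) {
                        unsigned long take = node_pages[nid] < kernelcore_node ?
                                             node_pages[nid] : kernelcore_node;

                        node_pages[nid] -= take;        /* taken as kernelcore, no longer available */
                        satisfied += take;
                }
                required_kernelcore -= satisfied;
                usable_nodes--;         /* mimic the "restart with one less node" rule */
                printf("pass done, still required: %lu\n", required_kernelcore);
        }
        return 0;
}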
5220 5220
5221 /* Any regular or high memory on that node ? */ 5221 /* Any regular or high memory on that node ? */
5222 static void check_for_memory(pg_data_t *pgdat, int nid) 5222 static void check_for_memory(pg_data_t *pgdat, int nid)
5223 { 5223 {
5224 enum zone_type zone_type; 5224 enum zone_type zone_type;
5225 5225
5226 if (N_MEMORY == N_NORMAL_MEMORY) 5226 if (N_MEMORY == N_NORMAL_MEMORY)
5227 return; 5227 return;
5228 5228
5229 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 5229 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
5230 struct zone *zone = &pgdat->node_zones[zone_type]; 5230 struct zone *zone = &pgdat->node_zones[zone_type];
5231 if (zone->present_pages) { 5231 if (zone->present_pages) {
5232 node_set_state(nid, N_HIGH_MEMORY); 5232 node_set_state(nid, N_HIGH_MEMORY);
5233 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 5233 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
5234 zone_type <= ZONE_NORMAL) 5234 zone_type <= ZONE_NORMAL)
5235 node_set_state(nid, N_NORMAL_MEMORY); 5235 node_set_state(nid, N_NORMAL_MEMORY);
5236 break; 5236 break;
5237 } 5237 }
5238 } 5238 }
5239 } 5239 }
5240 5240
5241 /** 5241 /**
5242 * free_area_init_nodes - Initialise all pg_data_t and zone data 5242 * free_area_init_nodes - Initialise all pg_data_t and zone data
5243 * @max_zone_pfn: an array of max PFNs for each zone 5243 * @max_zone_pfn: an array of max PFNs for each zone
5244 * 5244 *
5245 * This will call free_area_init_node() for each active node in the system. 5245 * This will call free_area_init_node() for each active node in the system.
5246 * Using the page ranges provided by add_active_range(), the size of each 5246 * Using the page ranges provided by add_active_range(), the size of each
5247 * zone in each node and of its holes is calculated. If the maximum PFNs 5247 * zone in each node and of its holes is calculated. If the maximum PFNs
5248 * of two adjacent zones match, it is assumed that the zone is empty. 5248 * of two adjacent zones match, it is assumed that the zone is empty.
5249 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 5249 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
5250 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 5250 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
5251 * starts where the previous one ended. For example, ZONE_DMA32 starts 5251 * starts where the previous one ended. For example, ZONE_DMA32 starts
5252 * at arch_max_dma_pfn. 5252 * at arch_max_dma_pfn.
5253 */ 5253 */
5254 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 5254 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5255 { 5255 {
5256 unsigned long start_pfn, end_pfn; 5256 unsigned long start_pfn, end_pfn;
5257 int i, nid; 5257 int i, nid;
5258 5258
5259 /* Record where the zone boundaries are */ 5259 /* Record where the zone boundaries are */
5260 memset(arch_zone_lowest_possible_pfn, 0, 5260 memset(arch_zone_lowest_possible_pfn, 0,
5261 sizeof(arch_zone_lowest_possible_pfn)); 5261 sizeof(arch_zone_lowest_possible_pfn));
5262 memset(arch_zone_highest_possible_pfn, 0, 5262 memset(arch_zone_highest_possible_pfn, 0,
5263 sizeof(arch_zone_highest_possible_pfn)); 5263 sizeof(arch_zone_highest_possible_pfn));
5264 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 5264 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
5265 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 5265 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
5266 for (i = 1; i < MAX_NR_ZONES; i++) { 5266 for (i = 1; i < MAX_NR_ZONES; i++) {
5267 if (i == ZONE_MOVABLE) 5267 if (i == ZONE_MOVABLE)
5268 continue; 5268 continue;
5269 arch_zone_lowest_possible_pfn[i] = 5269 arch_zone_lowest_possible_pfn[i] =
5270 arch_zone_highest_possible_pfn[i-1]; 5270 arch_zone_highest_possible_pfn[i-1];
5271 arch_zone_highest_possible_pfn[i] = 5271 arch_zone_highest_possible_pfn[i] =
5272 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 5272 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
5273 } 5273 }
5274 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 5274 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
5275 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 5275 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
5276 5276
5277 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5277 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
5278 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5278 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
5279 find_zone_movable_pfns_for_nodes(); 5279 find_zone_movable_pfns_for_nodes();
5280 5280
5281 /* Print out the zone ranges */ 5281 /* Print out the zone ranges */
5282 printk("Zone ranges:\n"); 5282 printk("Zone ranges:\n");
5283 for (i = 0; i < MAX_NR_ZONES; i++) { 5283 for (i = 0; i < MAX_NR_ZONES; i++) {
5284 if (i == ZONE_MOVABLE) 5284 if (i == ZONE_MOVABLE)
5285 continue; 5285 continue;
5286 printk(KERN_CONT " %-8s ", zone_names[i]); 5286 printk(KERN_CONT " %-8s ", zone_names[i]);
5287 if (arch_zone_lowest_possible_pfn[i] == 5287 if (arch_zone_lowest_possible_pfn[i] ==
5288 arch_zone_highest_possible_pfn[i]) 5288 arch_zone_highest_possible_pfn[i])
5289 printk(KERN_CONT "empty\n"); 5289 printk(KERN_CONT "empty\n");
5290 else 5290 else
5291 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5291 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
5292 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5292 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5293 (arch_zone_highest_possible_pfn[i] 5293 (arch_zone_highest_possible_pfn[i]
5294 << PAGE_SHIFT) - 1); 5294 << PAGE_SHIFT) - 1);
5295 } 5295 }
5296 5296
5297 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5297 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5298 printk("Movable zone start for each node\n"); 5298 printk("Movable zone start for each node\n");
5299 for (i = 0; i < MAX_NUMNODES; i++) { 5299 for (i = 0; i < MAX_NUMNODES; i++) {
5300 if (zone_movable_pfn[i]) 5300 if (zone_movable_pfn[i])
5301 printk(" Node %d: %#010lx\n", i, 5301 printk(" Node %d: %#010lx\n", i,
5302 zone_movable_pfn[i] << PAGE_SHIFT); 5302 zone_movable_pfn[i] << PAGE_SHIFT);
5303 } 5303 }
5304 5304
5305 /* Print out the early node map */ 5305 /* Print out the early node map */
5306 printk("Early memory node ranges\n"); 5306 printk("Early memory node ranges\n");
5307 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5307 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5308 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5308 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5309 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5309 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5310 5310
5311 /* Initialise every node */ 5311 /* Initialise every node */
5312 mminit_verify_pageflags_layout(); 5312 mminit_verify_pageflags_layout();
5313 setup_nr_node_ids(); 5313 setup_nr_node_ids();
5314 for_each_online_node(nid) { 5314 for_each_online_node(nid) {
5315 pg_data_t *pgdat = NODE_DATA(nid); 5315 pg_data_t *pgdat = NODE_DATA(nid);
5316 free_area_init_node(nid, NULL, 5316 free_area_init_node(nid, NULL,
5317 find_min_pfn_for_node(nid), NULL); 5317 find_min_pfn_for_node(nid), NULL);
5318 5318
5319 /* Any memory on that node */ 5319 /* Any memory on that node */
5320 if (pgdat->node_present_pages) 5320 if (pgdat->node_present_pages)
5321 node_set_state(nid, N_MEMORY); 5321 node_set_state(nid, N_MEMORY);
5322 check_for_memory(pgdat, nid); 5322 check_for_memory(pgdat, nid);
5323 } 5323 }
5324 } 5324 }
5325 5325
5326 static int __init cmdline_parse_core(char *p, unsigned long *core) 5326 static int __init cmdline_parse_core(char *p, unsigned long *core)
5327 { 5327 {
5328 unsigned long long coremem; 5328 unsigned long long coremem;
5329 if (!p) 5329 if (!p)
5330 return -EINVAL; 5330 return -EINVAL;
5331 5331
5332 coremem = memparse(p, &p); 5332 coremem = memparse(p, &p);
5333 *core = coremem >> PAGE_SHIFT; 5333 *core = coremem >> PAGE_SHIFT;
5334 5334
5335 /* Paranoid check that UL is enough for the coremem value */ 5335 /* Paranoid check that UL is enough for the coremem value */
5336 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 5336 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
5337 5337
5338 return 0; 5338 return 0;
5339 } 5339 }
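cmdline_parse_core() relies on memparse() to turn a string such as "512M" into a byte count before shifting it down to pages. The userspace stand-in below only approximates that behaviour (it handles the common K/M/G suffixes) and exists just to make the kernelcore=/movablecore= arithmetic concrete; it is not the kernel's parser.

/*
 * Size-suffix parsing sketch: "512M" -> 131072 pages with 4K pages.
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12   /* assume 4K pages for the example */

static unsigned long long parse_size(const char *p)
{
        char *end;
        unsigned long long v = strtoull(p, &end, 0);

        switch (*end) {
        case 'G': case 'g': v <<= 10;   /* fall through */
        case 'M': case 'm': v <<= 10;   /* fall through */
        case 'K': case 'k': v <<= 10;   break;
        default: break;
        }
        return v;
}

int main(void)
{
        const char *arg = "512M";       /* e.g. kernelcore=512M */

        printf("%s -> %llu pages\n", arg, parse_size(arg) >> PAGE_SHIFT);
        return 0;
}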
5340 5340
5341 /* 5341 /*
5342 * kernelcore=size sets the amount of memory for use for allocations that 5342 * kernelcore=size sets the amount of memory for use for allocations that
5343 * cannot be reclaimed or migrated. 5343 * cannot be reclaimed or migrated.
5344 */ 5344 */
5345 static int __init cmdline_parse_kernelcore(char *p) 5345 static int __init cmdline_parse_kernelcore(char *p)
5346 { 5346 {
5347 return cmdline_parse_core(p, &required_kernelcore); 5347 return cmdline_parse_core(p, &required_kernelcore);
5348 } 5348 }
5349 5349
5350 /* 5350 /*
5351 * movablecore=size sets the amount of memory for use for allocations that 5351 * movablecore=size sets the amount of memory for use for allocations that
5352 * can be reclaimed or migrated. 5352 * can be reclaimed or migrated.
5353 */ 5353 */
5354 static int __init cmdline_parse_movablecore(char *p) 5354 static int __init cmdline_parse_movablecore(char *p)
5355 { 5355 {
5356 return cmdline_parse_core(p, &required_movablecore); 5356 return cmdline_parse_core(p, &required_movablecore);
5357 } 5357 }
5358 5358
5359 early_param("kernelcore", cmdline_parse_kernelcore); 5359 early_param("kernelcore", cmdline_parse_kernelcore);
5360 early_param("movablecore", cmdline_parse_movablecore); 5360 early_param("movablecore", cmdline_parse_movablecore);
5361 5361
5362 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5362 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5363 5363
5364 void adjust_managed_page_count(struct page *page, long count) 5364 void adjust_managed_page_count(struct page *page, long count)
5365 { 5365 {
5366 spin_lock(&managed_page_count_lock); 5366 spin_lock(&managed_page_count_lock);
5367 page_zone(page)->managed_pages += count; 5367 page_zone(page)->managed_pages += count;
5368 totalram_pages += count; 5368 totalram_pages += count;
5369 #ifdef CONFIG_HIGHMEM 5369 #ifdef CONFIG_HIGHMEM
5370 if (PageHighMem(page)) 5370 if (PageHighMem(page))
5371 totalhigh_pages += count; 5371 totalhigh_pages += count;
5372 #endif 5372 #endif
5373 spin_unlock(&managed_page_count_lock); 5373 spin_unlock(&managed_page_count_lock);
5374 } 5374 }
5375 EXPORT_SYMBOL(adjust_managed_page_count); 5375 EXPORT_SYMBOL(adjust_managed_page_count);
5376 5376
5377 unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 5377 unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
5378 { 5378 {
5379 void *pos; 5379 void *pos;
5380 unsigned long pages = 0; 5380 unsigned long pages = 0;
5381 5381
5382 start = (void *)PAGE_ALIGN((unsigned long)start); 5382 start = (void *)PAGE_ALIGN((unsigned long)start);
5383 end = (void *)((unsigned long)end & PAGE_MASK); 5383 end = (void *)((unsigned long)end & PAGE_MASK);
5384 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 5384 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5385 if ((unsigned int)poison <= 0xFF) 5385 if ((unsigned int)poison <= 0xFF)
5386 memset(pos, poison, PAGE_SIZE); 5386 memset(pos, poison, PAGE_SIZE);
5387 free_reserved_page(virt_to_page(pos)); 5387 free_reserved_page(virt_to_page(pos));
5388 } 5388 }
5389 5389
5390 if (pages && s) 5390 if (pages && s)
5391 pr_info("Freeing %s memory: %ldK (%p - %p)\n", 5391 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
5392 s, pages << (PAGE_SHIFT - 10), start, end); 5392 s, pages << (PAGE_SHIFT - 10), start, end);
5393 5393
5394 return pages; 5394 return pages;
5395 } 5395 }
5396 EXPORT_SYMBOL(free_reserved_area); 5396 EXPORT_SYMBOL(free_reserved_area);
5397 5397
5398 #ifdef CONFIG_HIGHMEM 5398 #ifdef CONFIG_HIGHMEM
5399 void free_highmem_page(struct page *page) 5399 void free_highmem_page(struct page *page)
5400 { 5400 {
5401 __free_reserved_page(page); 5401 __free_reserved_page(page);
5402 totalram_pages++; 5402 totalram_pages++;
5403 page_zone(page)->managed_pages++; 5403 page_zone(page)->managed_pages++;
5404 totalhigh_pages++; 5404 totalhigh_pages++;
5405 } 5405 }
5406 #endif 5406 #endif
5407 5407
5408 5408
5409 void __init mem_init_print_info(const char *str) 5409 void __init mem_init_print_info(const char *str)
5410 { 5410 {
5411 unsigned long physpages, codesize, datasize, rosize, bss_size; 5411 unsigned long physpages, codesize, datasize, rosize, bss_size;
5412 unsigned long init_code_size, init_data_size; 5412 unsigned long init_code_size, init_data_size;
5413 5413
5414 physpages = get_num_physpages(); 5414 physpages = get_num_physpages();
5415 codesize = _etext - _stext; 5415 codesize = _etext - _stext;
5416 datasize = _edata - _sdata; 5416 datasize = _edata - _sdata;
5417 rosize = __end_rodata - __start_rodata; 5417 rosize = __end_rodata - __start_rodata;
5418 bss_size = __bss_stop - __bss_start; 5418 bss_size = __bss_stop - __bss_start;
5419 init_data_size = __init_end - __init_begin; 5419 init_data_size = __init_end - __init_begin;
5420 init_code_size = _einittext - _sinittext; 5420 init_code_size = _einittext - _sinittext;
5421 5421
5422 /* 5422 /*
5423 * Detect special cases and adjust section sizes accordingly: 5423 * Detect special cases and adjust section sizes accordingly:
5424 * 1) .init.* may be embedded into .data sections 5424 * 1) .init.* may be embedded into .data sections
5425 * 2) .init.text.* may be out of [__init_begin, __init_end], 5425 * 2) .init.text.* may be out of [__init_begin, __init_end],
5426 * please refer to arch/tile/kernel/vmlinux.lds.S. 5426 * please refer to arch/tile/kernel/vmlinux.lds.S.
5427 * 3) .rodata.* may be embedded into .text or .data sections. 5427 * 3) .rodata.* may be embedded into .text or .data sections.
5428 */ 5428 */
5429 #define adj_init_size(start, end, size, pos, adj) \ 5429 #define adj_init_size(start, end, size, pos, adj) \
5430 do { \ 5430 do { \
5431 if (start <= pos && pos < end && size > adj) \ 5431 if (start <= pos && pos < end && size > adj) \
5432 size -= adj; \ 5432 size -= adj; \
5433 } while (0) 5433 } while (0)
5434 5434
5435 adj_init_size(__init_begin, __init_end, init_data_size, 5435 adj_init_size(__init_begin, __init_end, init_data_size,
5436 _sinittext, init_code_size); 5436 _sinittext, init_code_size);
5437 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 5437 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5438 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 5438 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5439 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 5439 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5440 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 5440 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5441 5441
5442 #undef adj_init_size 5442 #undef adj_init_size
5443 5443
5444 printk("Memory: %luK/%luK available " 5444 printk("Memory: %luK/%luK available "
5445 "(%luK kernel code, %luK rwdata, %luK rodata, " 5445 "(%luK kernel code, %luK rwdata, %luK rodata, "
5446 "%luK init, %luK bss, %luK reserved" 5446 "%luK init, %luK bss, %luK reserved"
5447 #ifdef CONFIG_HIGHMEM 5447 #ifdef CONFIG_HIGHMEM
5448 ", %luK highmem" 5448 ", %luK highmem"
5449 #endif 5449 #endif
5450 "%s%s)\n", 5450 "%s%s)\n",
5451 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), 5451 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5452 codesize >> 10, datasize >> 10, rosize >> 10, 5452 codesize >> 10, datasize >> 10, rosize >> 10,
5453 (init_data_size + init_code_size) >> 10, bss_size >> 10, 5453 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5454 (physpages - totalram_pages) << (PAGE_SHIFT-10), 5454 (physpages - totalram_pages) << (PAGE_SHIFT-10),
5455 #ifdef CONFIG_HIGHMEM 5455 #ifdef CONFIG_HIGHMEM
5456 totalhigh_pages << (PAGE_SHIFT-10), 5456 totalhigh_pages << (PAGE_SHIFT-10),
5457 #endif 5457 #endif
5458 str ? ", " : "", str ? str : ""); 5458 str ? ", " : "", str ? str : "");
5459 } 5459 }
5460 5460
5461 /** 5461 /**
5462 * set_dma_reserve - set the specified number of pages reserved in the first zone 5462 * set_dma_reserve - set the specified number of pages reserved in the first zone
5463 * @new_dma_reserve: The number of pages to mark reserved 5463 * @new_dma_reserve: The number of pages to mark reserved
5464 * 5464 *
5465 * The per-cpu batchsize and zone watermarks are determined by present_pages. 5465 * The per-cpu batchsize and zone watermarks are determined by present_pages.
5466 * In the DMA zone, a significant percentage may be consumed by kernel image 5466 * In the DMA zone, a significant percentage may be consumed by kernel image
5467 * and other unfreeable allocations which can skew the watermarks badly. This 5467 * and other unfreeable allocations which can skew the watermarks badly. This
5468 * function may optionally be used to account for unfreeable pages in the 5468 * function may optionally be used to account for unfreeable pages in the
5469 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 5469 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
5470 * smaller per-cpu batchsize. 5470 * smaller per-cpu batchsize.
5471 */ 5471 */
5472 void __init set_dma_reserve(unsigned long new_dma_reserve) 5472 void __init set_dma_reserve(unsigned long new_dma_reserve)
5473 { 5473 {
5474 dma_reserve = new_dma_reserve; 5474 dma_reserve = new_dma_reserve;
5475 } 5475 }
5476 5476
5477 void __init free_area_init(unsigned long *zones_size) 5477 void __init free_area_init(unsigned long *zones_size)
5478 { 5478 {
5479 free_area_init_node(0, zones_size, 5479 free_area_init_node(0, zones_size,
5480 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 5480 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
5481 } 5481 }
5482 5482
5483 static int page_alloc_cpu_notify(struct notifier_block *self, 5483 static int page_alloc_cpu_notify(struct notifier_block *self,
5484 unsigned long action, void *hcpu) 5484 unsigned long action, void *hcpu)
5485 { 5485 {
5486 int cpu = (unsigned long)hcpu; 5486 int cpu = (unsigned long)hcpu;
5487 5487
5488 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 5488 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
5489 lru_add_drain_cpu(cpu); 5489 lru_add_drain_cpu(cpu);
5490 drain_pages(cpu); 5490 drain_pages(cpu);
5491 5491
5492 /* 5492 /*
5493 * Spill the event counters of the dead processor 5493 * Spill the event counters of the dead processor
5494 * into the current processor's event counters. 5494 * into the current processor's event counters.
5495 * This artificially elevates the count of the current 5495 * This artificially elevates the count of the current
5496 * processor. 5496 * processor.
5497 */ 5497 */
5498 vm_events_fold_cpu(cpu); 5498 vm_events_fold_cpu(cpu);
5499 5499
5500 /* 5500 /*
5501 * Zero the differential counters of the dead processor 5501 * Zero the differential counters of the dead processor
5502 * so that the vm statistics are consistent. 5502 * so that the vm statistics are consistent.
5503 * 5503 *
5504 * This is only okay since the processor is dead and cannot 5504 * This is only okay since the processor is dead and cannot
5505 * race with what we are doing. 5505 * race with what we are doing.
5506 */ 5506 */
5507 cpu_vm_stats_fold(cpu); 5507 cpu_vm_stats_fold(cpu);
5508 } 5508 }
5509 return NOTIFY_OK; 5509 return NOTIFY_OK;
5510 } 5510 }
5511 5511
5512 void __init page_alloc_init(void) 5512 void __init page_alloc_init(void)
5513 { 5513 {
5514 hotcpu_notifier(page_alloc_cpu_notify, 0); 5514 hotcpu_notifier(page_alloc_cpu_notify, 0);
5515 } 5515 }
5516 5516
5517 /* 5517 /*
5518 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 5518 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
5519 * or min_free_kbytes changes. 5519 * or min_free_kbytes changes.
5520 */ 5520 */
5521 static void calculate_totalreserve_pages(void) 5521 static void calculate_totalreserve_pages(void)
5522 { 5522 {
5523 struct pglist_data *pgdat; 5523 struct pglist_data *pgdat;
5524 unsigned long reserve_pages = 0; 5524 unsigned long reserve_pages = 0;
5525 enum zone_type i, j; 5525 enum zone_type i, j;
5526 5526
5527 for_each_online_pgdat(pgdat) { 5527 for_each_online_pgdat(pgdat) {
5528 for (i = 0; i < MAX_NR_ZONES; i++) { 5528 for (i = 0; i < MAX_NR_ZONES; i++) {
5529 struct zone *zone = pgdat->node_zones + i; 5529 struct zone *zone = pgdat->node_zones + i;
5530 unsigned long max = 0; 5530 unsigned long max = 0;
5531 5531
5532 /* Find valid and maximum lowmem_reserve in the zone */ 5532 /* Find valid and maximum lowmem_reserve in the zone */
5533 for (j = i; j < MAX_NR_ZONES; j++) { 5533 for (j = i; j < MAX_NR_ZONES; j++) {
5534 if (zone->lowmem_reserve[j] > max) 5534 if (zone->lowmem_reserve[j] > max)
5535 max = zone->lowmem_reserve[j]; 5535 max = zone->lowmem_reserve[j];
5536 } 5536 }
5537 5537
5538 /* we treat the high watermark as reserved pages. */ 5538 /* we treat the high watermark as reserved pages. */
5539 max += high_wmark_pages(zone); 5539 max += high_wmark_pages(zone);
5540 5540
5541 if (max > zone->managed_pages) 5541 if (max > zone->managed_pages)
5542 max = zone->managed_pages; 5542 max = zone->managed_pages;
5543 reserve_pages += max; 5543 reserve_pages += max;
5544 /* 5544 /*
5545 * Lowmem reserves are not available to 5545 * Lowmem reserves are not available to
5546 * GFP_HIGHUSER page cache allocations and 5546 * GFP_HIGHUSER page cache allocations and
5547 * kswapd tries to balance zones to their high 5547 * kswapd tries to balance zones to their high
5548 * watermark. As a result, neither should be 5548 * watermark. As a result, neither should be
5549 * regarded as dirtyable memory, to prevent a 5549 * regarded as dirtyable memory, to prevent a
5550 * situation where reclaim has to clean pages 5550 * situation where reclaim has to clean pages
5551 * in order to balance the zones. 5551 * in order to balance the zones.
5552 */ 5552 */
5553 zone->dirty_balance_reserve = max; 5553 zone->dirty_balance_reserve = max;
5554 } 5554 }
5555 } 5555 }
5556 dirty_balance_reserve = reserve_pages; 5556 dirty_balance_reserve = reserve_pages;
5557 totalreserve_pages = reserve_pages; 5557 totalreserve_pages = reserve_pages;
5558 } 5558 }
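
A minimal userspace sketch of the accounting above, using invented zone values rather than real pglist_data/zone structures: for each zone it takes the largest lowmem_reserve entry, adds the high watermark, caps the result at managed_pages and sums everything into totalreserve_pages.

#include <stdio.h>

#define NR_TOY_ZONES 3

struct toy_zone {
        unsigned long managed_pages;
        unsigned long lowmem_reserve[NR_TOY_ZONES];
        unsigned long high_wmark;
};

int main(void)
{
        /* Hypothetical DMA/Normal/HighMem-like zones with arbitrary numbers. */
        struct toy_zone zones[NR_TOY_ZONES] = {
                { .managed_pages =   4000, .lowmem_reserve = { 0, 1900, 3800 }, .high_wmark =  48 },
                { .managed_pages = 200000, .lowmem_reserve = { 0,    0, 6100 }, .high_wmark = 512 },
                { .managed_pages =  50000, .lowmem_reserve = { 0,    0,    0 }, .high_wmark = 128 },
        };
        unsigned long reserve_pages = 0;

        for (int i = 0; i < NR_TOY_ZONES; i++) {
                unsigned long max = 0;

                /* Find the largest lowmem_reserve for this zone... */
                for (int j = i; j < NR_TOY_ZONES; j++)
                        if (zones[i].lowmem_reserve[j] > max)
                                max = zones[i].lowmem_reserve[j];

                /* ...treat the high watermark as reserved as well... */
                max += zones[i].high_wmark;

                /* ...but never account more than the zone actually manages. */
                if (max > zones[i].managed_pages)
                        max = zones[i].managed_pages;
                reserve_pages += max;
        }
        printf("totalreserve_pages = %lu\n", reserve_pages);
        return 0;
}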
5559 5559
5560 /* 5560 /*
5561 * setup_per_zone_lowmem_reserve - called whenever 5561 * setup_per_zone_lowmem_reserve - called whenever
5562 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 5562 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
5563 * has a correct pages reserved value, so an adequate number of 5563 * has a correct pages reserved value, so an adequate number of
5564 * pages are left in the zone after a successful __alloc_pages(). 5564 * pages are left in the zone after a successful __alloc_pages().
5565 */ 5565 */
5566 static void setup_per_zone_lowmem_reserve(void) 5566 static void setup_per_zone_lowmem_reserve(void)
5567 { 5567 {
5568 struct pglist_data *pgdat; 5568 struct pglist_data *pgdat;
5569 enum zone_type j, idx; 5569 enum zone_type j, idx;
5570 5570
5571 for_each_online_pgdat(pgdat) { 5571 for_each_online_pgdat(pgdat) {
5572 for (j = 0; j < MAX_NR_ZONES; j++) { 5572 for (j = 0; j < MAX_NR_ZONES; j++) {
5573 struct zone *zone = pgdat->node_zones + j; 5573 struct zone *zone = pgdat->node_zones + j;
5574 unsigned long managed_pages = zone->managed_pages; 5574 unsigned long managed_pages = zone->managed_pages;
5575 5575
5576 zone->lowmem_reserve[j] = 0; 5576 zone->lowmem_reserve[j] = 0;
5577 5577
5578 idx = j; 5578 idx = j;
5579 while (idx) { 5579 while (idx) {
5580 struct zone *lower_zone; 5580 struct zone *lower_zone;
5581 5581
5582 idx--; 5582 idx--;
5583 5583
5584 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5584 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5585 sysctl_lowmem_reserve_ratio[idx] = 1; 5585 sysctl_lowmem_reserve_ratio[idx] = 1;
5586 5586
5587 lower_zone = pgdat->node_zones + idx; 5587 lower_zone = pgdat->node_zones + idx;
5588 lower_zone->lowmem_reserve[j] = managed_pages / 5588 lower_zone->lowmem_reserve[j] = managed_pages /
5589 sysctl_lowmem_reserve_ratio[idx]; 5589 sysctl_lowmem_reserve_ratio[idx];
5590 managed_pages += lower_zone->managed_pages; 5590 managed_pages += lower_zone->managed_pages;
5591 } 5591 }
5592 } 5592 }
5593 } 5593 }
5594 5594
5595 /* update totalreserve_pages */ 5595 /* update totalreserve_pages */
5596 calculate_totalreserve_pages(); 5596 calculate_totalreserve_pages();
5597 } 5597 }
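
To make the descending idx loop concrete: each lower zone's lowmem_reserve[j] is the number of pages managed by the zones above it (accumulated as the loop walks down), divided by sysctl_lowmem_reserve_ratio for that lower zone. A standalone sketch with invented zone sizes and assumed ratios (256/256/32 are typical defaults, not values taken from this file):

#include <stdio.h>

#define NR_TOY_ZONES 3  /* pretend: 0 = DMA, 1 = DMA32, 2 = Normal */

int main(void)
{
        unsigned long managed[NR_TOY_ZONES] = { 4000, 250000, 1000000 };
        long ratio[NR_TOY_ZONES] = { 256, 256, 32 };    /* assumed ratios */
        unsigned long reserve[NR_TOY_ZONES][NR_TOY_ZONES] = { { 0 } };

        for (int j = 0; j < NR_TOY_ZONES; j++) {
                unsigned long managed_pages = managed[j];
                int idx = j;

                reserve[j][j] = 0;
                while (idx) {
                        idx--;
                        if (ratio[idx] < 1)
                                ratio[idx] = 1;
                        /* zone idx reserves this many pages against zone-j allocations */
                        reserve[idx][j] = managed_pages / ratio[idx];
                        managed_pages += managed[idx];
                }
        }

        for (int i = 0; i < NR_TOY_ZONES; i++)
                for (int j = i; j < NR_TOY_ZONES; j++)
                        printf("zone %d lowmem_reserve[%d] = %lu\n",
                               i, j, reserve[i][j]);
        return 0;
}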
5598 5598
5599 static void __setup_per_zone_wmarks(void) 5599 static void __setup_per_zone_wmarks(void)
5600 { 5600 {
5601 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5601 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5602 unsigned long lowmem_pages = 0; 5602 unsigned long lowmem_pages = 0;
5603 struct zone *zone; 5603 struct zone *zone;
5604 unsigned long flags; 5604 unsigned long flags;
5605 5605
5606 /* Calculate total number of !ZONE_HIGHMEM pages */ 5606 /* Calculate total number of !ZONE_HIGHMEM pages */
5607 for_each_zone(zone) { 5607 for_each_zone(zone) {
5608 if (!is_highmem(zone)) 5608 if (!is_highmem(zone))
5609 lowmem_pages += zone->managed_pages; 5609 lowmem_pages += zone->managed_pages;
5610 } 5610 }
5611 5611
5612 for_each_zone(zone) { 5612 for_each_zone(zone) {
5613 u64 tmp; 5613 u64 tmp;
5614 5614
5615 spin_lock_irqsave(&zone->lock, flags); 5615 spin_lock_irqsave(&zone->lock, flags);
5616 tmp = (u64)pages_min * zone->managed_pages; 5616 tmp = (u64)pages_min * zone->managed_pages;
5617 do_div(tmp, lowmem_pages); 5617 do_div(tmp, lowmem_pages);
5618 if (is_highmem(zone)) { 5618 if (is_highmem(zone)) {
5619 /* 5619 /*
5620 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5620 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5621 * need highmem pages, so cap pages_min to a small 5621 * need highmem pages, so cap pages_min to a small
5622 * value here. 5622 * value here.
5623 * 5623 *
5624 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5624 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5625 * deltas control async page reclaim, and so should 5625 * deltas control async page reclaim, and so should
5626 * not be capped for highmem. 5626 * not be capped for highmem.
5627 */ 5627 */
5628 unsigned long min_pages; 5628 unsigned long min_pages;
5629 5629
5630 min_pages = zone->managed_pages / 1024; 5630 min_pages = zone->managed_pages / 1024;
5631 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 5631 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
5632 zone->watermark[WMARK_MIN] = min_pages; 5632 zone->watermark[WMARK_MIN] = min_pages;
5633 } else { 5633 } else {
5634 /* 5634 /*
5635 * If it's a lowmem zone, reserve a number of pages 5635 * If it's a lowmem zone, reserve a number of pages
5636 * proportionate to the zone's size. 5636 * proportionate to the zone's size.
5637 */ 5637 */
5638 zone->watermark[WMARK_MIN] = tmp; 5638 zone->watermark[WMARK_MIN] = tmp;
5639 } 5639 }
5640 5640
5641 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5641 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5642 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5642 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5643 5643
5644 __mod_zone_page_state(zone, NR_ALLOC_BATCH, 5644 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
5645 high_wmark_pages(zone) - 5645 high_wmark_pages(zone) -
5646 low_wmark_pages(zone) - 5646 low_wmark_pages(zone) -
5647 zone_page_state(zone, NR_ALLOC_BATCH)); 5647 zone_page_state(zone, NR_ALLOC_BATCH));
5648 5648
5649 setup_zone_migrate_reserve(zone); 5649 setup_zone_migrate_reserve(zone);
5650 spin_unlock_irqrestore(&zone->lock, flags); 5650 spin_unlock_irqrestore(&zone->lock, flags);
5651 } 5651 }
5652 5652
5653 /* update totalreserve_pages */ 5653 /* update totalreserve_pages */
5654 calculate_totalreserve_pages(); 5654 calculate_totalreserve_pages();
5655 } 5655 }
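
The arithmetic above is easy to reproduce by hand: each zone's share is tmp = pages_min * managed_pages / lowmem_pages; lowmem zones use that directly as WMARK_MIN, highmem zones clamp WMARK_MIN to a small value instead, and both then get WMARK_LOW = WMARK_MIN + tmp/4 and WMARK_HIGH = WMARK_MIN + tmp/2. A hedged userspace illustration with invented sizes; it assumes 4K pages, min_free_kbytes = 4096 and SWAP_CLUSTER_MAX = 32, none of which come from the code above:

#include <stdio.h>

/* Mirror of clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 32 is an assumption. */
static unsigned long clampul(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        /* Assumed: min_free_kbytes = 4096, PAGE_SHIFT = 12. */
        unsigned long pages_min = 4096 >> (12 - 10);
        unsigned long managed[2] = { 1000000, 200000 };  /* lowmem, highmem */
        int is_highmem[2] = { 0, 1 };
        unsigned long lowmem_pages = managed[0];         /* only !highmem zones count */

        for (int z = 0; z < 2; z++) {
                unsigned long long tmp = (unsigned long long)pages_min * managed[z];
                unsigned long wmark_min, wmark_low, wmark_high;

                tmp /= lowmem_pages;
                if (is_highmem[z])
                        wmark_min = clampul(managed[z] / 1024, 32, 128);
                else
                        wmark_min = (unsigned long)tmp;

                wmark_low  = wmark_min + (unsigned long)(tmp >> 2);
                wmark_high = wmark_min + (unsigned long)(tmp >> 1);
                printf("zone %d: min=%lu low=%lu high=%lu\n",
                       z, wmark_min, wmark_low, wmark_high);
        }
        return 0;
}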
5656 5656
5657 /** 5657 /**
5658 * setup_per_zone_wmarks - called when min_free_kbytes changes 5658 * setup_per_zone_wmarks - called when min_free_kbytes changes
5659 * or when memory is hot-{added|removed} 5659 * or when memory is hot-{added|removed}
5660 * 5660 *
5661 * Ensures that the watermark[min,low,high] values for each zone are set 5661 * Ensures that the watermark[min,low,high] values for each zone are set
5662 * correctly with respect to min_free_kbytes. 5662 * correctly with respect to min_free_kbytes.
5663 */ 5663 */
5664 void setup_per_zone_wmarks(void) 5664 void setup_per_zone_wmarks(void)
5665 { 5665 {
5666 mutex_lock(&zonelists_mutex); 5666 mutex_lock(&zonelists_mutex);
5667 __setup_per_zone_wmarks(); 5667 __setup_per_zone_wmarks();
5668 mutex_unlock(&zonelists_mutex); 5668 mutex_unlock(&zonelists_mutex);
5669 } 5669 }
5670 5670
5671 /* 5671 /*
5672 * The inactive anon list should be small enough that the VM never has to 5672 * The inactive anon list should be small enough that the VM never has to
5673 * do too much work, but large enough that each inactive page has a chance 5673 * do too much work, but large enough that each inactive page has a chance
5674 * to be referenced again before it is swapped out. 5674 * to be referenced again before it is swapped out.
5675 * 5675 *
5676 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5676 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5677 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5677 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5678 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5678 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5679 * the anonymous pages are kept on the inactive list. 5679 * the anonymous pages are kept on the inactive list.
5680 * 5680 *
5681 * total target max 5681 * total target max
5682 * memory ratio inactive anon 5682 * memory ratio inactive anon
5683 * ------------------------------------- 5683 * -------------------------------------
5684 * 10MB 1 5MB 5684 * 10MB 1 5MB
5685 * 100MB 1 50MB 5685 * 100MB 1 50MB
5686 * 1GB 3 250MB 5686 * 1GB 3 250MB
5687 * 10GB 10 0.9GB 5687 * 10GB 10 0.9GB
5688 * 100GB 31 3GB 5688 * 100GB 31 3GB
5689 * 1TB 101 10GB 5689 * 1TB 101 10GB
5690 * 10TB 320 32GB 5690 * 10TB 320 32GB
5691 */ 5691 */
5692 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5692 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5693 { 5693 {
5694 unsigned int gb, ratio; 5694 unsigned int gb, ratio;
5695 5695
5696 /* Zone size in gigabytes */ 5696 /* Zone size in gigabytes */
5697 gb = zone->managed_pages >> (30 - PAGE_SHIFT); 5697 gb = zone->managed_pages >> (30 - PAGE_SHIFT);
5698 if (gb) 5698 if (gb)
5699 ratio = int_sqrt(10 * gb); 5699 ratio = int_sqrt(10 * gb);
5700 else 5700 else
5701 ratio = 1; 5701 ratio = 1;
5702 5702
5703 zone->inactive_ratio = ratio; 5703 zone->inactive_ratio = ratio;
5704 } 5704 }
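
The table in the comment block above falls straight out of ratio = int_sqrt(10 * gb), with a floor of 1 for zones below a gigabyte. A quick standalone check, with a naive integer square root standing in for the kernel's int_sqrt():

#include <stdio.h>

/* Integer square root: largest x with x * x <= n. */
static unsigned long int_sqrt(unsigned long n)
{
        unsigned long x = 0;

        while ((x + 1) * (x + 1) <= n)
                x++;
        return x;
}

int main(void)
{
        /* Zone sizes in gigabytes, as in the table above. */
        unsigned long sizes_gb[] = { 1, 10, 100, 1024, 10240 };

        for (unsigned i = 0; i < sizeof(sizes_gb) / sizeof(sizes_gb[0]); i++) {
                unsigned long gb = sizes_gb[i];
                unsigned long ratio = gb ? int_sqrt(10 * gb) : 1;

                printf("%6luGB -> inactive_ratio %lu\n", gb, ratio);
        }
        return 0;
}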
5705 5705
5706 static void __meminit setup_per_zone_inactive_ratio(void) 5706 static void __meminit setup_per_zone_inactive_ratio(void)
5707 { 5707 {
5708 struct zone *zone; 5708 struct zone *zone;
5709 5709
5710 for_each_zone(zone) 5710 for_each_zone(zone)
5711 calculate_zone_inactive_ratio(zone); 5711 calculate_zone_inactive_ratio(zone);
5712 } 5712 }
5713 5713
5714 /* 5714 /*
5715 * Initialise min_free_kbytes. 5715 * Initialise min_free_kbytes.
5716 * 5716 *
5717 * For small machines we want it small (128k min). For large machines 5717 * For small machines we want it small (128k min). For large machines
5718 * we want it large (64MB max). But it is not linear, because network 5718 * we want it large (64MB max). But it is not linear, because network
5719 * bandwidth does not increase linearly with machine size. We use 5719 * bandwidth does not increase linearly with machine size. We use
5720 * 5720 *
5721 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5721 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5722 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5722 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5723 * 5723 *
5724 * which yields 5724 * which yields
5725 * 5725 *
5726 * 16MB: 512k 5726 * 16MB: 512k
5727 * 32MB: 724k 5727 * 32MB: 724k
5728 * 64MB: 1024k 5728 * 64MB: 1024k
5729 * 128MB: 1448k 5729 * 128MB: 1448k
5730 * 256MB: 2048k 5730 * 256MB: 2048k
5731 * 512MB: 2896k 5731 * 512MB: 2896k
5732 * 1024MB: 4096k 5732 * 1024MB: 4096k
5733 * 2048MB: 5792k 5733 * 2048MB: 5792k
5734 * 4096MB: 8192k 5734 * 4096MB: 8192k
5735 * 8192MB: 11584k 5735 * 8192MB: 11584k
5736 * 16384MB: 16384k 5736 * 16384MB: 16384k
5737 */ 5737 */
5738 int __meminit init_per_zone_wmark_min(void) 5738 int __meminit init_per_zone_wmark_min(void)
5739 { 5739 {
5740 unsigned long lowmem_kbytes; 5740 unsigned long lowmem_kbytes;
5741 int new_min_free_kbytes; 5741 int new_min_free_kbytes;
5742 5742
5743 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5743 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5744 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5744 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5745 5745
5746 if (new_min_free_kbytes > user_min_free_kbytes) { 5746 if (new_min_free_kbytes > user_min_free_kbytes) {
5747 min_free_kbytes = new_min_free_kbytes; 5747 min_free_kbytes = new_min_free_kbytes;
5748 if (min_free_kbytes < 128) 5748 if (min_free_kbytes < 128)
5749 min_free_kbytes = 128; 5749 min_free_kbytes = 128;
5750 if (min_free_kbytes > 65536) 5750 if (min_free_kbytes > 65536)
5751 min_free_kbytes = 65536; 5751 min_free_kbytes = 65536;
5752 } else { 5752 } else {
5753 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 5753 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
5754 new_min_free_kbytes, user_min_free_kbytes); 5754 new_min_free_kbytes, user_min_free_kbytes);
5755 } 5755 }
5756 setup_per_zone_wmarks(); 5756 setup_per_zone_wmarks();
5757 refresh_zone_stat_thresholds(); 5757 refresh_zone_stat_thresholds();
5758 setup_per_zone_lowmem_reserve(); 5758 setup_per_zone_lowmem_reserve();
5759 setup_per_zone_inactive_ratio(); 5759 setup_per_zone_inactive_ratio();
5760 return 0; 5760 return 0;
5761 } 5761 }
5762 module_init(init_per_zone_wmark_min) 5762 module_init(init_per_zone_wmark_min)
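
Likewise, the min_free_kbytes table documented above init_per_zone_wmark_min() follows from int_sqrt(lowmem_kbytes * 16) clamped to [128, 65536]. A standalone spot-check over a few of the listed sizes, again with a naive integer square root in place of the kernel helper:

#include <stdio.h>

static unsigned long int_sqrt(unsigned long long n)
{
        unsigned long long x = 0;

        while ((x + 1) * (x + 1) <= n)
                x++;
        return (unsigned long)x;
}

int main(void)
{
        /* A few lowmem sizes from the table above, in megabytes. */
        unsigned long sizes_mb[] = { 16, 64, 256, 1024, 4096, 16384 };

        for (unsigned i = 0; i < sizeof(sizes_mb) / sizeof(sizes_mb[0]); i++) {
                unsigned long lowmem_kbytes = sizes_mb[i] * 1024;
                unsigned long min_free = int_sqrt((unsigned long long)lowmem_kbytes * 16);

                /* Same clamp as the function above. */
                if (min_free < 128)
                        min_free = 128;
                if (min_free > 65536)
                        min_free = 65536;
                printf("%6luMB lowmem -> min_free_kbytes %luk\n",
                       sizes_mb[i], min_free);
        }
        return 0;
}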
5763 5763
5764 /* 5764 /*
5765 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5765 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5766 * that we can call two helper functions whenever min_free_kbytes 5766 * that we can call two helper functions whenever min_free_kbytes
5767 * changes. 5767 * changes.
5768 */ 5768 */
5769 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5769 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5770 void __user *buffer, size_t *length, loff_t *ppos) 5770 void __user *buffer, size_t *length, loff_t *ppos)
5771 { 5771 {
5772 int rc; 5772 int rc;
5773 5773
5774 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5774 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5775 if (rc) 5775 if (rc)
5776 return rc; 5776 return rc;
5777 5777
5778 if (write) { 5778 if (write) {
5779 user_min_free_kbytes = min_free_kbytes; 5779 user_min_free_kbytes = min_free_kbytes;
5780 setup_per_zone_wmarks(); 5780 setup_per_zone_wmarks();
5781 } 5781 }
5782 return 0; 5782 return 0;
5783 } 5783 }
5784 5784
5785 #ifdef CONFIG_NUMA 5785 #ifdef CONFIG_NUMA
5786 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5786 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5787 void __user *buffer, size_t *length, loff_t *ppos) 5787 void __user *buffer, size_t *length, loff_t *ppos)
5788 { 5788 {
5789 struct zone *zone; 5789 struct zone *zone;
5790 int rc; 5790 int rc;
5791 5791
5792 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5792 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5793 if (rc) 5793 if (rc)
5794 return rc; 5794 return rc;
5795 5795
5796 for_each_zone(zone) 5796 for_each_zone(zone)
5797 zone->min_unmapped_pages = (zone->managed_pages * 5797 zone->min_unmapped_pages = (zone->managed_pages *
5798 sysctl_min_unmapped_ratio) / 100; 5798 sysctl_min_unmapped_ratio) / 100;
5799 return 0; 5799 return 0;
5800 } 5800 }
5801 5801
5802 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5802 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5803 void __user *buffer, size_t *length, loff_t *ppos) 5803 void __user *buffer, size_t *length, loff_t *ppos)
5804 { 5804 {
5805 struct zone *zone; 5805 struct zone *zone;
5806 int rc; 5806 int rc;
5807 5807
5808 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5808 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5809 if (rc) 5809 if (rc)
5810 return rc; 5810 return rc;
5811 5811
5812 for_each_zone(zone) 5812 for_each_zone(zone)
5813 zone->min_slab_pages = (zone->managed_pages * 5813 zone->min_slab_pages = (zone->managed_pages *
5814 sysctl_min_slab_ratio) / 100; 5814 sysctl_min_slab_ratio) / 100;
5815 return 0; 5815 return 0;
5816 } 5816 }
5817 #endif 5817 #endif
5818 5818
5819 /* 5819 /*
5820 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5820 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5821 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5821 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5822 * whenever sysctl_lowmem_reserve_ratio changes. 5822 * whenever sysctl_lowmem_reserve_ratio changes.
5823 * 5823 *
5824 * The reserve ratio obviously has absolutely no relation with the 5824 * The reserve ratio obviously has absolutely no relation with the
5825 * minimum watermarks. The lowmem reserve ratio can only make sense 5825 * minimum watermarks. The lowmem reserve ratio can only make sense
5826 * as a function of the boot time zone sizes. 5826 * as a function of the boot time zone sizes.
5827 */ 5827 */
5828 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5828 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5829 void __user *buffer, size_t *length, loff_t *ppos) 5829 void __user *buffer, size_t *length, loff_t *ppos)
5830 { 5830 {
5831 proc_dointvec_minmax(table, write, buffer, length, ppos); 5831 proc_dointvec_minmax(table, write, buffer, length, ppos);
5832 setup_per_zone_lowmem_reserve(); 5832 setup_per_zone_lowmem_reserve();
5833 return 0; 5833 return 0;
5834 } 5834 }
5835 5835
5836 /* 5836 /*
5837 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5837 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5838 * cpu. It is the fraction of total pages in each zone that a hot per-cpu 5838 * cpu. It is the fraction of total pages in each zone that a hot per-cpu
5839 * pagelist can have before it gets flushed back to the buddy allocator. 5839 * pagelist can have before it gets flushed back to the buddy allocator.
5840 */ 5840 */
5841 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5841 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5842 void __user *buffer, size_t *length, loff_t *ppos) 5842 void __user *buffer, size_t *length, loff_t *ppos)
5843 { 5843 {
5844 struct zone *zone; 5844 struct zone *zone;
5845 int old_percpu_pagelist_fraction; 5845 int old_percpu_pagelist_fraction;
5846 int ret; 5846 int ret;
5847 5847
5848 mutex_lock(&pcp_batch_high_lock); 5848 mutex_lock(&pcp_batch_high_lock);
5849 old_percpu_pagelist_fraction = percpu_pagelist_fraction; 5849 old_percpu_pagelist_fraction = percpu_pagelist_fraction;
5850 5850
5851 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5851 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5852 if (!write || ret < 0) 5852 if (!write || ret < 0)
5853 goto out; 5853 goto out;
5854 5854
5855 /* Sanity checking to avoid pcp imbalance */ 5855 /* Sanity checking to avoid pcp imbalance */
5856 if (percpu_pagelist_fraction && 5856 if (percpu_pagelist_fraction &&
5857 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { 5857 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
5858 percpu_pagelist_fraction = old_percpu_pagelist_fraction; 5858 percpu_pagelist_fraction = old_percpu_pagelist_fraction;
5859 ret = -EINVAL; 5859 ret = -EINVAL;
5860 goto out; 5860 goto out;
5861 } 5861 }
5862 5862
5863 /* No change? */ 5863 /* No change? */
5864 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) 5864 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
5865 goto out; 5865 goto out;
5866 5866
5867 for_each_populated_zone(zone) { 5867 for_each_populated_zone(zone) {
5868 unsigned int cpu; 5868 unsigned int cpu;
5869 5869
5870 for_each_possible_cpu(cpu) 5870 for_each_possible_cpu(cpu)
5871 pageset_set_high_and_batch(zone, 5871 pageset_set_high_and_batch(zone,
5872 per_cpu_ptr(zone->pageset, cpu)); 5872 per_cpu_ptr(zone->pageset, cpu));
5873 } 5873 }
5874 out: 5874 out:
5875 mutex_unlock(&pcp_batch_high_lock); 5875 mutex_unlock(&pcp_batch_high_lock);
5876 return ret; 5876 return ret;
5877 } 5877 }
5878 5878
5879 int hashdist = HASHDIST_DEFAULT; 5879 int hashdist = HASHDIST_DEFAULT;
5880 5880
5881 #ifdef CONFIG_NUMA 5881 #ifdef CONFIG_NUMA
5882 static int __init set_hashdist(char *str) 5882 static int __init set_hashdist(char *str)
5883 { 5883 {
5884 if (!str) 5884 if (!str)
5885 return 0; 5885 return 0;
5886 hashdist = simple_strtoul(str, &str, 0); 5886 hashdist = simple_strtoul(str, &str, 0);
5887 return 1; 5887 return 1;
5888 } 5888 }
5889 __setup("hashdist=", set_hashdist); 5889 __setup("hashdist=", set_hashdist);
5890 #endif 5890 #endif
5891 5891
5892 /* 5892 /*
5893 * allocate a large system hash table from bootmem 5893 * allocate a large system hash table from bootmem
5894 * - it is assumed that the hash table must contain an exact power-of-2 5894 * - it is assumed that the hash table must contain an exact power-of-2
5895 * quantity of entries 5895 * quantity of entries
5896 * - limit is the number of hash buckets, not the total allocation size 5896 * - limit is the number of hash buckets, not the total allocation size
5897 */ 5897 */
5898 void *__init alloc_large_system_hash(const char *tablename, 5898 void *__init alloc_large_system_hash(const char *tablename,
5899 unsigned long bucketsize, 5899 unsigned long bucketsize,
5900 unsigned long numentries, 5900 unsigned long numentries,
5901 int scale, 5901 int scale,
5902 int flags, 5902 int flags,
5903 unsigned int *_hash_shift, 5903 unsigned int *_hash_shift,
5904 unsigned int *_hash_mask, 5904 unsigned int *_hash_mask,
5905 unsigned long low_limit, 5905 unsigned long low_limit,
5906 unsigned long high_limit) 5906 unsigned long high_limit)
5907 { 5907 {
5908 unsigned long long max = high_limit; 5908 unsigned long long max = high_limit;
5909 unsigned long log2qty, size; 5909 unsigned long log2qty, size;
5910 void *table = NULL; 5910 void *table = NULL;
5911 5911
5912 /* allow the kernel cmdline to have a say */ 5912 /* allow the kernel cmdline to have a say */
5913 if (!numentries) { 5913 if (!numentries) {
5914 /* round applicable memory size up to nearest megabyte */ 5914 /* round applicable memory size up to nearest megabyte */
5915 numentries = nr_kernel_pages; 5915 numentries = nr_kernel_pages;
5916 5916
5917 /* It isn't necessary when PAGE_SIZE >= 1MB */ 5917 /* It isn't necessary when PAGE_SIZE >= 1MB */
5918 if (PAGE_SHIFT < 20) 5918 if (PAGE_SHIFT < 20)
5919 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 5919 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
5920 5920
5921 /* limit to 1 bucket per 2^scale bytes of low memory */ 5921 /* limit to 1 bucket per 2^scale bytes of low memory */
5922 if (scale > PAGE_SHIFT) 5922 if (scale > PAGE_SHIFT)
5923 numentries >>= (scale - PAGE_SHIFT); 5923 numentries >>= (scale - PAGE_SHIFT);
5924 else 5924 else
5925 numentries <<= (PAGE_SHIFT - scale); 5925 numentries <<= (PAGE_SHIFT - scale);
5926 5926
5927 /* Make sure we've got at least a 0-order allocation.. */ 5927 /* Make sure we've got at least a 0-order allocation.. */
5928 if (unlikely(flags & HASH_SMALL)) { 5928 if (unlikely(flags & HASH_SMALL)) {
5929 /* Makes no sense without HASH_EARLY */ 5929 /* Makes no sense without HASH_EARLY */
5930 WARN_ON(!(flags & HASH_EARLY)); 5930 WARN_ON(!(flags & HASH_EARLY));
5931 if (!(numentries >> *_hash_shift)) { 5931 if (!(numentries >> *_hash_shift)) {
5932 numentries = 1UL << *_hash_shift; 5932 numentries = 1UL << *_hash_shift;
5933 BUG_ON(!numentries); 5933 BUG_ON(!numentries);
5934 } 5934 }
5935 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5935 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5936 numentries = PAGE_SIZE / bucketsize; 5936 numentries = PAGE_SIZE / bucketsize;
5937 } 5937 }
5938 numentries = roundup_pow_of_two(numentries); 5938 numentries = roundup_pow_of_two(numentries);
5939 5939
5940 /* limit allocation size to 1/16 total memory by default */ 5940 /* limit allocation size to 1/16 total memory by default */
5941 if (max == 0) { 5941 if (max == 0) {
5942 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5942 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5943 do_div(max, bucketsize); 5943 do_div(max, bucketsize);
5944 } 5944 }
5945 max = min(max, 0x80000000ULL); 5945 max = min(max, 0x80000000ULL);
5946 5946
5947 if (numentries < low_limit) 5947 if (numentries < low_limit)
5948 numentries = low_limit; 5948 numentries = low_limit;
5949 if (numentries > max) 5949 if (numentries > max)
5950 numentries = max; 5950 numentries = max;
5951 5951
5952 log2qty = ilog2(numentries); 5952 log2qty = ilog2(numentries);
5953 5953
5954 do { 5954 do {
5955 size = bucketsize << log2qty; 5955 size = bucketsize << log2qty;
5956 if (flags & HASH_EARLY) 5956 if (flags & HASH_EARLY)
5957 table = alloc_bootmem_nopanic(size); 5957 table = alloc_bootmem_nopanic(size);
5958 else if (hashdist) 5958 else if (hashdist)
5959 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5959 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5960 else { 5960 else {
5961 /* 5961 /*
5962 * If bucketsize is not a power-of-two, we may free 5962 * If bucketsize is not a power-of-two, we may free
5963 * some pages at the end of the hash table, which 5963 * some pages at the end of the hash table, which
5964 * alloc_pages_exact() automatically does 5964 * alloc_pages_exact() automatically does
5965 */ 5965 */
5966 if (get_order(size) < MAX_ORDER) { 5966 if (get_order(size) < MAX_ORDER) {
5967 table = alloc_pages_exact(size, GFP_ATOMIC); 5967 table = alloc_pages_exact(size, GFP_ATOMIC);
5968 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5968 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5969 } 5969 }
5970 } 5970 }
5971 } while (!table && size > PAGE_SIZE && --log2qty); 5971 } while (!table && size > PAGE_SIZE && --log2qty);
5972 5972
5973 if (!table) 5973 if (!table)
5974 panic("Failed to allocate %s hash table\n", tablename); 5974 panic("Failed to allocate %s hash table\n", tablename);
5975 5975
5976 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5976 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5977 tablename, 5977 tablename,
5978 (1UL << log2qty), 5978 (1UL << log2qty),
5979 ilog2(size) - PAGE_SHIFT, 5979 ilog2(size) - PAGE_SHIFT,
5980 size); 5980 size);
5981 5981
5982 if (_hash_shift) 5982 if (_hash_shift)
5983 *_hash_shift = log2qty; 5983 *_hash_shift = log2qty;
5984 if (_hash_mask) 5984 if (_hash_mask)
5985 *_hash_mask = (1 << log2qty) - 1; 5985 *_hash_mask = (1 << log2qty) - 1;
5986 5986
5987 return table; 5987 return table;
5988 } 5988 }
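
Before any allocation happens, the sizing above is just integer arithmetic: scale the number of lowmem pages to one bucket per 2^scale bytes, round up to a power of two, and clamp between low_limit and roughly 1/16 of all memory divided by the bucket size. A userspace sketch of only that arithmetic, with invented machine and table parameters; it deliberately ignores the HASH_SMALL/HASH_EARLY paths and the bootmem/vmalloc fallbacks:

#include <stdio.h>

static unsigned long roundup_pow_of_two(unsigned long n)
{
        unsigned long p = 1;

        while (p < n)
                p <<= 1;
        return p;
}

static unsigned long ilog2_ul(unsigned long n)
{
        unsigned long log = 0;

        while (n >>= 1)
                log++;
        return log;
}

int main(void)
{
        /* Assumed machine: 4K pages, 2,000,000 lowmem pages (~7.6GB). */
        const unsigned long page_shift = 12;
        const unsigned long page_size = 1UL << page_shift;
        unsigned long nr_kernel_pages = 2000000;
        unsigned long nr_all_pages = 2000000;

        /* Assumed table parameters: 8-byte buckets, 1 bucket per 2^14 bytes. */
        unsigned long bucketsize = 8;
        unsigned long scale = 14;
        unsigned long low_limit = 16;

        unsigned long numentries = nr_kernel_pages;
        unsigned long long max;
        unsigned long log2qty, size;

        /* round applicable memory size up to the nearest megabyte */
        if (page_shift < 20) {
                unsigned long per_mb = (1UL << 20) / page_size;

                numentries = (numentries + per_mb - 1) / per_mb * per_mb;
        }

        /* limit to 1 bucket per 2^scale bytes of low memory */
        if (scale > page_shift)
                numentries >>= (scale - page_shift);
        else
                numentries <<= (page_shift - scale);

        numentries = roundup_pow_of_two(numentries);

        /* limit allocation size to 1/16 of total memory */
        max = ((unsigned long long)nr_all_pages << page_shift) >> 4;
        max /= bucketsize;
        if (max > 0x80000000ULL)
                max = 0x80000000ULL;

        if (numentries < low_limit)
                numentries = low_limit;
        if (numentries > (unsigned long)max)
                numentries = (unsigned long)max;

        log2qty = ilog2_ul(numentries);
        size = bucketsize << log2qty;
        printf("hash table entries: %lu (%lu bytes total)\n", numentries, size);
        return 0;
}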
5989 5989
5990 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5990 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5991 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5991 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5992 unsigned long pfn) 5992 unsigned long pfn)
5993 { 5993 {
5994 #ifdef CONFIG_SPARSEMEM 5994 #ifdef CONFIG_SPARSEMEM
5995 return __pfn_to_section(pfn)->pageblock_flags; 5995 return __pfn_to_section(pfn)->pageblock_flags;
5996 #else 5996 #else
5997 return zone->pageblock_flags; 5997 return zone->pageblock_flags;
5998 #endif /* CONFIG_SPARSEMEM */ 5998 #endif /* CONFIG_SPARSEMEM */
5999 } 5999 }
6000 6000
6001 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 6001 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
6002 { 6002 {
6003 #ifdef CONFIG_SPARSEMEM 6003 #ifdef CONFIG_SPARSEMEM
6004 pfn &= (PAGES_PER_SECTION-1); 6004 pfn &= (PAGES_PER_SECTION-1);
6005 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 6005 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
6006 #else 6006 #else
6007 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); 6007 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
6008 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 6008 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
6009 #endif /* CONFIG_SPARSEMEM */ 6009 #endif /* CONFIG_SPARSEMEM */
6010 } 6010 }
6011 6011
6012 /** 6012 /**
6013 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages 6013 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
6014 * @page: The page within the block of interest 6014 * @page: The page within the block of interest
6015 * @pfn: The target page frame number 6015 * @pfn: The target page frame number
6016 * @end_bitidx: The last bit of interest 6016 * @end_bitidx: The last bit of interest
6017 * returns pageblock_bits flags 6017 * returns pageblock_bits flags
6018 */ 6018 */
6019 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, 6019 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
6020 unsigned long end_bitidx, 6020 unsigned long end_bitidx,
6021 unsigned long mask) 6021 unsigned long mask)
6022 { 6022 {
6023 struct zone *zone; 6023 struct zone *zone;
6024 unsigned long *bitmap; 6024 unsigned long *bitmap;
6025 unsigned long bitidx, word_bitidx; 6025 unsigned long bitidx, word_bitidx;
6026 unsigned long word; 6026 unsigned long word;
6027 6027
6028 zone = page_zone(page); 6028 zone = page_zone(page);
6029 bitmap = get_pageblock_bitmap(zone, pfn); 6029 bitmap = get_pageblock_bitmap(zone, pfn);
6030 bitidx = pfn_to_bitidx(zone, pfn); 6030 bitidx = pfn_to_bitidx(zone, pfn);
6031 word_bitidx = bitidx / BITS_PER_LONG; 6031 word_bitidx = bitidx / BITS_PER_LONG;
6032 bitidx &= (BITS_PER_LONG-1); 6032 bitidx &= (BITS_PER_LONG-1);
6033 6033
6034 word = bitmap[word_bitidx]; 6034 word = bitmap[word_bitidx];
6035 bitidx += end_bitidx; 6035 bitidx += end_bitidx;
6036 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; 6036 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
6037 } 6037 }
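
Each pageblock owns NR_PAGEBLOCK_BITS (4) bits in the bitmap, and the shift in the return statement above extracts such a group from a single long, counting positions from the most-significant end. The round-trip below reuses the same shift expressions as the get/set helpers on one word, with assumed values (3-bit groups, 4 bits per block) purely for illustration:

#include <stdio.h>

#define BITS_PER_LONG   (8 * (int)sizeof(unsigned long))

/* Same shift scheme as the get/set helpers above, reduced to one word
 * and without the cmpxchg retry loop. */
static unsigned long get_group(unsigned long word, unsigned long bitidx,
                               unsigned long end_bitidx, unsigned long mask)
{
        bitidx += end_bitidx;
        return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
}

static unsigned long set_group(unsigned long word, unsigned long flags,
                               unsigned long bitidx,
                               unsigned long end_bitidx, unsigned long mask)
{
        bitidx += end_bitidx;
        mask <<= (BITS_PER_LONG - bitidx - 1);
        flags <<= (BITS_PER_LONG - bitidx - 1);
        return (word & ~mask) | flags;
}

int main(void)
{
        /* Assumed layout: 4 bits per pageblock, a 3-bit group is read/written,
         * so end_bitidx = 2 and mask = 0x7. */
        unsigned long end_bitidx = 2, mask = 0x7;
        unsigned long word = 0;

        /* Pretend blocks 0..3 live in this word (bitidx = block * 4). */
        for (unsigned long block = 0; block < 4; block++)
                word = set_group(word, block + 1, block * 4, end_bitidx, mask);

        for (unsigned long block = 0; block < 4; block++)
                printf("block %lu -> stored value %lu\n", block,
                       get_group(word, block * 4, end_bitidx, mask));
        return 0;
}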
6038 6038
6039 /** 6039 /**
6040 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages 6040 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
6041 * @page: The page within the block of interest 6041 * @page: The page within the block of interest
6042 * @pfn: The target page frame number 6042 * @pfn: The target page frame number
6043 * @end_bitidx: The last bit of interest 6043 * @end_bitidx: The last bit of interest
6044 * @flags: The flags to set 6044 * @flags: The flags to set
6045 */ 6045 */
6046 void set_pfnblock_flags_mask(struct page *page, unsigned long flags, 6046 void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
6047 unsigned long pfn, 6047 unsigned long pfn,
6048 unsigned long end_bitidx, 6048 unsigned long end_bitidx,
6049 unsigned long mask) 6049 unsigned long mask)
6050 { 6050 {
6051 struct zone *zone; 6051 struct zone *zone;
6052 unsigned long *bitmap; 6052 unsigned long *bitmap;
6053 unsigned long bitidx, word_bitidx; 6053 unsigned long bitidx, word_bitidx;
6054 unsigned long old_word, word; 6054 unsigned long old_word, word;
6055 6055
6056 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 6056 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
6057 6057
6058 zone = page_zone(page); 6058 zone = page_zone(page);
6059 bitmap = get_pageblock_bitmap(zone, pfn); 6059 bitmap = get_pageblock_bitmap(zone, pfn);
6060 bitidx = pfn_to_bitidx(zone, pfn); 6060 bitidx = pfn_to_bitidx(zone, pfn);
6061 word_bitidx = bitidx / BITS_PER_LONG; 6061 word_bitidx = bitidx / BITS_PER_LONG;
6062 bitidx &= (BITS_PER_LONG-1); 6062 bitidx &= (BITS_PER_LONG-1);
6063 6063
6064 VM_BUG_ON(!zone_spans_pfn(zone, pfn)); 6064 VM_BUG_ON(!zone_spans_pfn(zone, pfn));
6065 6065
6066 bitidx += end_bitidx; 6066 bitidx += end_bitidx;
6067 mask <<= (BITS_PER_LONG - bitidx - 1); 6067 mask <<= (BITS_PER_LONG - bitidx - 1);
6068 flags <<= (BITS_PER_LONG - bitidx - 1); 6068 flags <<= (BITS_PER_LONG - bitidx - 1);
6069 6069
6070 word = ACCESS_ONCE(bitmap[word_bitidx]); 6070 word = ACCESS_ONCE(bitmap[word_bitidx]);
6071 for (;;) { 6071 for (;;) {
6072 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 6072 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6073 if (word == old_word) 6073 if (word == old_word)
6074 break; 6074 break;
6075 word = old_word; 6075 word = old_word;
6076 } 6076 }
6077 } 6077 }
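
The for (;;) loop above is the usual lock-free read-modify-write: if another CPU updated the word between the read and the cmpxchg, retry with the freshly observed value. A userspace analogue using C11 atomics, offered only as an analogy (cmpxchg() and ACCESS_ONCE() are kernel primitives, not what is shown here):

#include <stdatomic.h>
#include <stdio.h>

/* Atomically replace the bits selected by mask with flags, mirroring the
 * retry loop in set_pfnblock_flags_mask(). */
static void set_bits_atomic(_Atomic unsigned long *word_p,
                            unsigned long flags, unsigned long mask)
{
        unsigned long word = atomic_load(word_p);

        for (;;) {
                unsigned long new_word = (word & ~mask) | flags;

                /* On failure, word is reloaded with the current value. */
                if (atomic_compare_exchange_weak(word_p, &word, new_word))
                        break;
        }
}

int main(void)
{
        _Atomic unsigned long word = 0xff00ff00UL;

        set_bits_atomic(&word, 0x00aa0000UL, 0x00ff0000UL);
        printf("word = %#lx\n", atomic_load(&word));    /* prints 0xffaaff00 */
        return 0;
}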
6078 6078
6079 /* 6079 /*
6080 * This function checks whether pageblock includes unmovable pages or not. 6080 * This function checks whether pageblock includes unmovable pages or not.
6081 * If @count is not zero, it is okay to include fewer than @count unmovable pages 6081 * If @count is not zero, it is okay to include fewer than @count unmovable pages
6082 * 6082 *
6083 * PageLRU check without isolation or lru_lock could race so that 6083 * PageLRU check without isolation or lru_lock could race so that
6084 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 6084 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
6085 * expect this function to be exact. 6085 * expect this function to be exact.
6086 */ 6086 */
6087 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 6087 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6088 bool skip_hwpoisoned_pages) 6088 bool skip_hwpoisoned_pages)
6089 { 6089 {
6090 unsigned long pfn, iter, found; 6090 unsigned long pfn, iter, found;
6091 int mt; 6091 int mt;
6092 6092
6093 /* 6093 /*
6094 * To avoid noisy data, lru_add_drain_all() should be called first. 6094 * To avoid noisy data, lru_add_drain_all() should be called first.
6095 * A ZONE_MOVABLE zone never contains unmovable pages. 6095 * A ZONE_MOVABLE zone never contains unmovable pages.
6096 */ 6096 */
6097 if (zone_idx(zone) == ZONE_MOVABLE) 6097 if (zone_idx(zone) == ZONE_MOVABLE)
6098 return false; 6098 return false;
6099 mt = get_pageblock_migratetype(page); 6099 mt = get_pageblock_migratetype(page);
6100 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 6100 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
6101 return false; 6101 return false;
6102 6102
6103 pfn = page_to_pfn(page); 6103 pfn = page_to_pfn(page);
6104 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 6104 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
6105 unsigned long check = pfn + iter; 6105 unsigned long check = pfn + iter;
6106 6106
6107 if (!pfn_valid_within(check)) 6107 if (!pfn_valid_within(check))
6108 continue; 6108 continue;
6109 6109
6110 page = pfn_to_page(check); 6110 page = pfn_to_page(check);
6111 6111
6112 /* 6112 /*
6113 * Hugepages are not in LRU lists, but they're movable. 6113 * Hugepages are not in LRU lists, but they're movable.
6114 * We need not scan over tail pages because we don't 6114 * We need not scan over tail pages because we don't
6115 * handle each tail page individually in migration. 6115 * handle each tail page individually in migration.
6116 */ 6116 */
6117 if (PageHuge(page)) { 6117 if (PageHuge(page)) {
6118 iter = round_up(iter + 1, 1<<compound_order(page)) - 1; 6118 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
6119 continue; 6119 continue;
6120 } 6120 }
6121 6121
6122 /* 6122 /*
6123 * We can't use page_count without pinning the page 6123 * We can't use page_count without pinning the page
6124 * because another CPU can free the compound page. 6124 * because another CPU can free the compound page.
6125 * This check already skips compound tails of THP 6125 * This check already skips compound tails of THP
6126 * because their page->_count is zero at all times. 6126 * because their page->_count is zero at all times.
6127 */ 6127 */
6128 if (!atomic_read(&page->_count)) { 6128 if (!atomic_read(&page->_count)) {
6129 if (PageBuddy(page)) 6129 if (PageBuddy(page))
6130 iter += (1 << page_order(page)) - 1; 6130 iter += (1 << page_order(page)) - 1;
6131 continue; 6131 continue;
6132 } 6132 }
6133 6133
6134 /* 6134 /*
6135 * The HWPoisoned page may not be in the buddy system, and 6135 * The HWPoisoned page may not be in the buddy system, and
6136 * page_count() is not 0. 6136 * page_count() is not 0.
6137 */ 6137 */
6138 if (skip_hwpoisoned_pages && PageHWPoison(page)) 6138 if (skip_hwpoisoned_pages && PageHWPoison(page))
6139 continue; 6139 continue;
6140 6140
6141 if (!PageLRU(page)) 6141 if (!PageLRU(page))
6142 found++; 6142 found++;
6143 /* 6143 /*
6144 * If there are RECLAIMABLE pages, we need to check them. 6144 * If there are RECLAIMABLE pages, we need to check them.
6145 * But for now, memory offline itself doesn't call shrink_slab() 6145 * But for now, memory offline itself doesn't call shrink_slab()
6146 * and this still needs to be fixed. 6146 * and this still needs to be fixed.
6147 */ 6147 */
6148 /* 6148 /*
6149 * If the page is not RAM, page_count() should be 0. 6149 * If the page is not RAM, page_count() should be 0.
6150 * We don't need further checks. This is a _used_, not-movable page. 6150 * We don't need further checks. This is a _used_, not-movable page.
6151 * 6151 *
6152 * The problematic thing here is PG_reserved pages. PG_reserved 6152 * The problematic thing here is PG_reserved pages. PG_reserved
6153 * is set on both a memory hole page and a _used_ kernel 6153 * is set on both a memory hole page and a _used_ kernel
6154 * page at boot. 6154 * page at boot.
6155 */ 6155 */
6156 if (found > count) 6156 if (found > count)
6157 return true; 6157 return true;
6158 } 6158 }
6159 return false; 6159 return false;
6160 } 6160 }
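
Two of the continue paths above step iter past an entire block instead of one pfn at a time: round_up(iter + 1, 1 << compound_order) - 1 lands on the last pfn of the aligned block a huge page spans, and iter += (1 << page_order) - 1 does the same for a free buddy block. A toy loop demonstrating just that jump arithmetic (the orders and positions are made up):

#include <stdio.h>

static unsigned long round_up_pow2(unsigned long x, unsigned long align)
{
        return (x + align - 1) & ~(align - 1);
}

int main(void)
{
        unsigned long pageblock_nr_pages = 512; /* assumed: 2MB blocks, 4K pages */
        unsigned long iter;

        for (iter = 0; iter < pageblock_nr_pages; iter++) {
                if (iter == 5) {
                        /* Pretend pfn+5 lands inside an order-3 huge page:
                         * skip to the last pfn of that aligned 8-page block. */
                        unsigned long order = 3;

                        printf("huge page at iter %lu, ", iter);
                        iter = round_up_pow2(iter + 1, 1UL << order) - 1;
                        printf("resuming after iter %lu\n", iter);
                        continue;
                }
                if (iter == 64) {
                        /* Pretend pfn+64 starts a free order-5 buddy block. */
                        unsigned long order = 5;

                        printf("buddy block at iter %lu, skipping %lu pages\n",
                               iter, 1UL << order);
                        iter += (1UL << order) - 1;
                        continue;
                }
                /* ...ordinary per-page checks would go here... */
        }
        return 0;
}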
6161 6161
6162 bool is_pageblock_removable_nolock(struct page *page) 6162 bool is_pageblock_removable_nolock(struct page *page)
6163 { 6163 {
6164 struct zone *zone; 6164 struct zone *zone;
6165 unsigned long pfn; 6165 unsigned long pfn;
6166 6166
6167 /* 6167 /*
6168 * We have to be careful here because we are iterating over memory 6168 * We have to be careful here because we are iterating over memory
6169 * sections which are not zone aware so we might end up outside of 6169 * sections which are not zone aware so we might end up outside of
6170 * the zone but still within the section. 6170 * the zone but still within the section.
6171 * We have to take care about the node as well. If the node is offline 6171 * We have to take care about the node as well. If the node is offline
6172 * its NODE_DATA will be NULL - see page_zone. 6172 * its NODE_DATA will be NULL - see page_zone.
6173 */ 6173 */
6174 if (!node_online(page_to_nid(page))) 6174 if (!node_online(page_to_nid(page)))
6175 return false; 6175 return false;
6176 6176
6177 zone = page_zone(page); 6177 zone = page_zone(page);
6178 pfn = page_to_pfn(page); 6178 pfn = page_to_pfn(page);
6179 if (!zone_spans_pfn(zone, pfn)) 6179 if (!zone_spans_pfn(zone, pfn))
6180 return false; 6180 return false;
6181 6181
6182 return !has_unmovable_pages(zone, page, 0, true); 6182 return !has_unmovable_pages(zone, page, 0, true);
6183 } 6183 }
6184 6184
6185 #ifdef CONFIG_CMA 6185 #ifdef CONFIG_CMA
6186 6186
6187 static unsigned long pfn_max_align_down(unsigned long pfn) 6187 static unsigned long pfn_max_align_down(unsigned long pfn)
6188 { 6188 {
6189 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 6189 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
6190 pageblock_nr_pages) - 1); 6190 pageblock_nr_pages) - 1);
6191 } 6191 }
6192 6192
6193 static unsigned long pfn_max_align_up(unsigned long pfn) 6193 static unsigned long pfn_max_align_up(unsigned long pfn)
6194 { 6194 {
6195 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 6195 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
6196 pageblock_nr_pages)); 6196 pageblock_nr_pages));
6197 } 6197 }
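
Both helpers align to the larger of MAX_ORDER_NR_PAGES and pageblock_nr_pages, masking down for the start and rounding up for the end. With the common x86-64 configuration assumed below (4K pages, MAX_ORDER = 11, 2MB pageblocks) that alignment is 1024 pages; the constants are assumptions, not values from this file:

#include <stdio.h>

#define MAX_ORDER_NR_PAGES      1024UL  /* assumed: MAX_ORDER = 11 */
#define PAGEBLOCK_NR_PAGES      512UL   /* assumed: 2MB blocks, 4K pages */

static unsigned long align_nr(void)
{
        return MAX_ORDER_NR_PAGES > PAGEBLOCK_NR_PAGES ?
               MAX_ORDER_NR_PAGES : PAGEBLOCK_NR_PAGES;
}

static unsigned long pfn_max_align_down(unsigned long pfn)
{
        return pfn & ~(align_nr() - 1);
}

static unsigned long pfn_max_align_up(unsigned long pfn)
{
        unsigned long a = align_nr();

        return (pfn + a - 1) & ~(a - 1);        /* ALIGN(pfn, a) */
}

int main(void)
{
        unsigned long start = 123456, end = 130000;

        printf("[%lu, %lu) isolates [%lu, %lu)\n", start, end,
               pfn_max_align_down(start), pfn_max_align_up(end));
        return 0;
}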
6198 6198
6199 /* [start, end) must belong to a single zone. */ 6199 /* [start, end) must belong to a single zone. */
6200 static int __alloc_contig_migrate_range(struct compact_control *cc, 6200 static int __alloc_contig_migrate_range(struct compact_control *cc,
6201 unsigned long start, unsigned long end) 6201 unsigned long start, unsigned long end)
6202 { 6202 {
6203 /* This function is based on compact_zone() from compaction.c. */ 6203 /* This function is based on compact_zone() from compaction.c. */
6204 unsigned long nr_reclaimed; 6204 unsigned long nr_reclaimed;
6205 unsigned long pfn = start; 6205 unsigned long pfn = start;
6206 unsigned int tries = 0; 6206 unsigned int tries = 0;
6207 int ret = 0; 6207 int ret = 0;
6208 6208
6209 migrate_prep(); 6209 migrate_prep();
6210 6210
6211 while (pfn < end || !list_empty(&cc->migratepages)) { 6211 while (pfn < end || !list_empty(&cc->migratepages)) {
6212 if (fatal_signal_pending(current)) { 6212 if (fatal_signal_pending(current)) {
6213 ret = -EINTR; 6213 ret = -EINTR;
6214 break; 6214 break;
6215 } 6215 }
6216 6216
6217 if (list_empty(&cc->migratepages)) { 6217 if (list_empty(&cc->migratepages)) {
6218 cc->nr_migratepages = 0; 6218 cc->nr_migratepages = 0;
6219 pfn = isolate_migratepages_range(cc->zone, cc, 6219 pfn = isolate_migratepages_range(cc->zone, cc,
6220 pfn, end, true); 6220 pfn, end, true);
6221 if (!pfn) { 6221 if (!pfn) {
6222 ret = -EINTR; 6222 ret = -EINTR;
6223 break; 6223 break;
6224 } 6224 }
6225 tries = 0; 6225 tries = 0;
6226 } else if (++tries == 5) { 6226 } else if (++tries == 5) {
6227 ret = ret < 0 ? ret : -EBUSY; 6227 ret = ret < 0 ? ret : -EBUSY;
6228 break; 6228 break;
6229 } 6229 }
6230 6230
6231 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 6231 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
6232 &cc->migratepages); 6232 &cc->migratepages);
6233 cc->nr_migratepages -= nr_reclaimed; 6233 cc->nr_migratepages -= nr_reclaimed;
6234 6234
6235 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 6235 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
6236 NULL, 0, cc->mode, MR_CMA); 6236 NULL, 0, cc->mode, MR_CMA);
6237 } 6237 }
6238 if (ret < 0) { 6238 if (ret < 0) {
6239 putback_movable_pages(&cc->migratepages); 6239 putback_movable_pages(&cc->migratepages);
6240 return ret; 6240 return ret;
6241 } 6241 }
6242 return 0; 6242 return 0;
6243 } 6243 }
6244 6244
6245 /** 6245 /**
6246 * alloc_contig_range() -- tries to allocate given range of pages 6246 * alloc_contig_range() -- tries to allocate given range of pages
6247 * @start: start PFN to allocate 6247 * @start: start PFN to allocate
6248 * @end: one-past-the-last PFN to allocate 6248 * @end: one-past-the-last PFN to allocate
6249 * @migratetype: migratetype of the underlying pageblocks (either 6249 * @migratetype: migratetype of the underlying pageblocks (either
6250 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 6250 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
6251 * in range must have the same migratetype and it must 6251 * in range must have the same migratetype and it must
6252 * be either of the two. 6252 * be either of the two.
6253 * 6253 *
6254 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 6254 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
6255 * aligned, however it's the caller's responsibility to guarantee that 6255 * aligned, however it's the caller's responsibility to guarantee that
6256 * we are the only thread that changes migrate type of pageblocks the 6256 * we are the only thread that changes migrate type of pageblocks the
6257 * pages fall in. 6257 * pages fall in.
6258 * 6258 *
6259 * The PFN range must belong to a single zone. 6259 * The PFN range must belong to a single zone.
6260 * 6260 *
6261 * Returns zero on success or negative error code. On success all 6261 * Returns zero on success or negative error code. On success all
6262 * pages whose PFN is in [start, end) are allocated for the caller and 6262 * pages whose PFN is in [start, end) are allocated for the caller and
6263 * need to be freed with free_contig_range(). 6263 * need to be freed with free_contig_range().
6264 */ 6264 */
6265 int alloc_contig_range(unsigned long start, unsigned long end, 6265 int alloc_contig_range(unsigned long start, unsigned long end,
6266 unsigned migratetype) 6266 unsigned migratetype)
6267 { 6267 {
6268 unsigned long outer_start, outer_end; 6268 unsigned long outer_start, outer_end;
6269 int ret = 0, order; 6269 int ret = 0, order;
6270 6270
6271 struct compact_control cc = { 6271 struct compact_control cc = {
6272 .nr_migratepages = 0, 6272 .nr_migratepages = 0,
6273 .order = -1, 6273 .order = -1,
6274 .zone = page_zone(pfn_to_page(start)), 6274 .zone = page_zone(pfn_to_page(start)),
6275 .mode = MIGRATE_SYNC, 6275 .mode = MIGRATE_SYNC,
6276 .ignore_skip_hint = true, 6276 .ignore_skip_hint = true,
6277 }; 6277 };
6278 INIT_LIST_HEAD(&cc.migratepages); 6278 INIT_LIST_HEAD(&cc.migratepages);
6279 6279
6280 /* 6280 /*
6281 * What we do here is we mark all pageblocks in range as 6281 * What we do here is we mark all pageblocks in range as
6282 * MIGRATE_ISOLATE. Because pageblock and max order pages may 6282 * MIGRATE_ISOLATE. Because pageblock and max order pages may
6283 * have different sizes, and due to the way the page allocator 6283 * have different sizes, and due to the way the page allocator
6284 * works, we align the range to the biggest of the two pages so 6284 * works, we align the range to the biggest of the two pages so
6285 * that page allocator won't try to merge buddies from 6285 * that page allocator won't try to merge buddies from
6286 * different pageblocks and change MIGRATE_ISOLATE to some 6286 * different pageblocks and change MIGRATE_ISOLATE to some
6287 * other migration type. 6287 * other migration type.
6288 * 6288 *
6289 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 6289 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
6290 * migrate the pages from an unaligned range (ie. pages that 6290 * migrate the pages from an unaligned range (ie. pages that
6291 * we are interested in). This will put all the pages in 6291 * we are interested in). This will put all the pages in
6292 * range back to page allocator as MIGRATE_ISOLATE. 6292 * range back to page allocator as MIGRATE_ISOLATE.
6293 * 6293 *
6294 * When this is done, we take the pages in range from page 6294 * When this is done, we take the pages in range from page
6295 * allocator removing them from the buddy system. This way 6295 * allocator removing them from the buddy system. This way
6296 * page allocator will never consider using them. 6296 * page allocator will never consider using them.
6297 * 6297 *
6298 * This lets us mark the pageblocks back as 6298 * This lets us mark the pageblocks back as
6299 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 6299 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
6300 * aligned range but not in the unaligned, original range are 6300 * aligned range but not in the unaligned, original range are
6301 * put back to page allocator so that buddy can use them. 6301 * put back to page allocator so that buddy can use them.
6302 */ 6302 */
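	/*
	 * A worked example of the alignment above, assuming
	 * pfn_max_align_down()/pfn_max_align_up() round out to the larger
	 * of MAX_ORDER_NR_PAGES and pageblock_nr_pages: on a common x86_64
	 * configuration (MAX_ORDER == 11, pageblock_order == 9) that is
	 * 1024 pages, so a request for pfns [1000, 1300) isolates the
	 * aligned range [0, 2048), migrates and allocates only
	 * [1000, 1300), and free pages elsewhere in the aligned range stay
	 * with (or go back to) the page allocator once their original
	 * migratetype is restored.
	 */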
6303 6303
6304 ret = start_isolate_page_range(pfn_max_align_down(start), 6304 ret = start_isolate_page_range(pfn_max_align_down(start),
6305 pfn_max_align_up(end), migratetype, 6305 pfn_max_align_up(end), migratetype,
6306 false); 6306 false);
6307 if (ret) 6307 if (ret)
6308 return ret; 6308 return ret;
6309 6309
6310 ret = __alloc_contig_migrate_range(&cc, start, end); 6310 ret = __alloc_contig_migrate_range(&cc, start, end);
6311 if (ret) 6311 if (ret)
6312 goto done; 6312 goto done;
6313 6313
6314 /* 6314 /*
6315 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 6315 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
6316 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 6316 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
6317 * more, all pages in [start, end) are free in page allocator. 6317 * more, all pages in [start, end) are free in page allocator.
6318 * What we are going to do is to allocate all pages from 6318 * What we are going to do is to allocate all pages from
6319 * [start, end) (that is remove them from page allocator). 6319 * [start, end) (that is remove them from page allocator).
6320 * 6320 *
6321 * The only problem is that pages at the beginning and at the 6321 * The only problem is that pages at the beginning and at the
6322 * end of the interesting range may not be aligned with pages that 6322 * end of the interesting range may not be aligned with pages that
6323 * page allocator holds, ie. they can be part of higher order 6323 * page allocator holds, ie. they can be part of higher order
6324 * pages. Because of this, we reserve the bigger range and 6324 * pages. Because of this, we reserve the bigger range and
6325 * once this is done free the pages we are not interested in. 6325 * once this is done free the pages we are not interested in.
6326 * 6326 *
6327 * We don't have to hold zone->lock here because the pages are 6327 * We don't have to hold zone->lock here because the pages are
6328 * isolated thus they won't get removed from buddy. 6328 * isolated thus they won't get removed from buddy.
6329 */ 6329 */
6330 6330
6331 lru_add_drain_all(); 6331 lru_add_drain_all();
6332 drain_all_pages(); 6332 drain_all_pages();
6333 6333
6334 order = 0; 6334 order = 0;
6335 outer_start = start; 6335 outer_start = start;
6336 while (!PageBuddy(pfn_to_page(outer_start))) { 6336 while (!PageBuddy(pfn_to_page(outer_start))) {
6337 if (++order >= MAX_ORDER) { 6337 if (++order >= MAX_ORDER) {
6338 ret = -EBUSY; 6338 ret = -EBUSY;
6339 goto done; 6339 goto done;
6340 } 6340 }
6341 outer_start &= ~0UL << order; 6341 outer_start &= ~0UL << order;
6342 } 6342 }
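	/*
	 * Example of the masking above: if start is pfn 0x12345 and the
	 * free buddy covering it is the order-3 page at pfn 0x12340, the
	 * loop clears one more low bit per pass (0x12344, 0x12344,
	 * 0x12340) until pfn_to_page(outer_start) is PageBuddy, i.e.
	 * outer_start lands on the head of the free buddy that contains
	 * start.
	 */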
6343 6343
6344 /* Make sure the range is really isolated. */ 6344 /* Make sure the range is really isolated. */
6345 if (test_pages_isolated(outer_start, end, false)) { 6345 if (test_pages_isolated(outer_start, end, false)) {
6346 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 6346 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
6347 outer_start, end); 6347 outer_start, end);
6348 ret = -EBUSY; 6348 ret = -EBUSY;
6349 goto done; 6349 goto done;
6350 } 6350 }
6351 6351
6352 6352
6353 /* Grab isolated pages from freelists. */ 6353 /* Grab isolated pages from freelists. */
6354 outer_end = isolate_freepages_range(&cc, outer_start, end); 6354 outer_end = isolate_freepages_range(&cc, outer_start, end);
6355 if (!outer_end) { 6355 if (!outer_end) {
6356 ret = -EBUSY; 6356 ret = -EBUSY;
6357 goto done; 6357 goto done;
6358 } 6358 }
6359 6359
6360 /* Free head and tail (if any) */ 6360 /* Free head and tail (if any) */
6361 if (start != outer_start) 6361 if (start != outer_start)
6362 free_contig_range(outer_start, start - outer_start); 6362 free_contig_range(outer_start, start - outer_start);
6363 if (end != outer_end) 6363 if (end != outer_end)
6364 free_contig_range(end, outer_end - end); 6364 free_contig_range(end, outer_end - end);
6365 6365
6366 done: 6366 done:
6367 undo_isolate_page_range(pfn_max_align_down(start), 6367 undo_isolate_page_range(pfn_max_align_down(start),
6368 pfn_max_align_up(end), migratetype); 6368 pfn_max_align_up(end), migratetype);
6369 return ret; 6369 return ret;
6370 } 6370 }
6371 6371
6372 void free_contig_range(unsigned long pfn, unsigned nr_pages) 6372 void free_contig_range(unsigned long pfn, unsigned nr_pages)
6373 { 6373 {
6374 unsigned int count = 0; 6374 unsigned int count = 0;
6375 6375
6376 for (; nr_pages--; pfn++) { 6376 for (; nr_pages--; pfn++) {
6377 struct page *page = pfn_to_page(pfn); 6377 struct page *page = pfn_to_page(pfn);
6378 6378
6379 count += page_count(page) != 1; 6379 count += page_count(page) != 1;
6380 __free_page(page); 6380 __free_page(page);
6381 } 6381 }
6382 WARN(count != 0, "%d pages are still in use!\n", count); 6382 WARN(count != 0, "%d pages are still in use!\n", count);
6383 } 6383 }
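/*
 * A minimal, hypothetical caller of the two functions above; base_pfn
 * and count are illustrative names, not taken from this file:
 *
 *	unsigned long base_pfn = 0x40000;	(start of a MIGRATE_CMA area)
 *	unsigned long count = 512;
 *	int err;
 *
 *	err = alloc_contig_range(base_pfn, base_pfn + count, MIGRATE_CMA);
 *	if (err)
 *		return err;			(typically -EBUSY or -EINTR)
 *
 *	(use pfn_to_page(base_pfn) .. pfn_to_page(base_pfn + count - 1))
 *
 *	free_contig_range(base_pfn, count);
 */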
6384 #endif 6384 #endif
6385 6385
6386 #ifdef CONFIG_MEMORY_HOTPLUG 6386 #ifdef CONFIG_MEMORY_HOTPLUG
6387 /* 6387 /*
6388 * The zone indicated has a new number of managed_pages; batch sizes and percpu 6388 * The zone indicated has a new number of managed_pages; batch sizes and percpu
6389 * page high values need to be recalculated. 6389 * page high values need to be recalculated.
6390 */ 6390 */
6391 void __meminit zone_pcp_update(struct zone *zone) 6391 void __meminit zone_pcp_update(struct zone *zone)
6392 { 6392 {
6393 unsigned cpu; 6393 unsigned cpu;
6394 mutex_lock(&pcp_batch_high_lock); 6394 mutex_lock(&pcp_batch_high_lock);
6395 for_each_possible_cpu(cpu) 6395 for_each_possible_cpu(cpu)
6396 pageset_set_high_and_batch(zone, 6396 pageset_set_high_and_batch(zone,
6397 per_cpu_ptr(zone->pageset, cpu)); 6397 per_cpu_ptr(zone->pageset, cpu));
6398 mutex_unlock(&pcp_batch_high_lock); 6398 mutex_unlock(&pcp_batch_high_lock);
6399 } 6399 }
6400 #endif 6400 #endif
6401 6401
6402 void zone_pcp_reset(struct zone *zone) 6402 void zone_pcp_reset(struct zone *zone)
6403 { 6403 {
6404 unsigned long flags; 6404 unsigned long flags;
6405 int cpu; 6405 int cpu;
6406 struct per_cpu_pageset *pset; 6406 struct per_cpu_pageset *pset;
6407 6407
6408 /* avoid races with drain_pages() */ 6408 /* avoid races with drain_pages() */
6409 local_irq_save(flags); 6409 local_irq_save(flags);
6410 if (zone->pageset != &boot_pageset) { 6410 if (zone->pageset != &boot_pageset) {
6411 for_each_online_cpu(cpu) { 6411 for_each_online_cpu(cpu) {
6412 pset = per_cpu_ptr(zone->pageset, cpu); 6412 pset = per_cpu_ptr(zone->pageset, cpu);
6413 drain_zonestat(zone, pset); 6413 drain_zonestat(zone, pset);
6414 } 6414 }
6415 free_percpu(zone->pageset); 6415 free_percpu(zone->pageset);
6416 zone->pageset = &boot_pageset; 6416 zone->pageset = &boot_pageset;
6417 } 6417 }
6418 local_irq_restore(flags); 6418 local_irq_restore(flags);
6419 } 6419 }
6420 6420
6421 #ifdef CONFIG_MEMORY_HOTREMOVE 6421 #ifdef CONFIG_MEMORY_HOTREMOVE
6422 /* 6422 /*
6423 * All pages in the range must be isolated before calling this. 6423 * All pages in the range must be isolated before calling this.
6424 */ 6424 */
6425 void 6425 void
6426 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 6426 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6427 { 6427 {
6428 struct page *page; 6428 struct page *page;
6429 struct zone *zone; 6429 struct zone *zone;
6430 unsigned int order, i; 6430 unsigned int order, i;
6431 unsigned long pfn; 6431 unsigned long pfn;
6432 unsigned long flags; 6432 unsigned long flags;
6433 /* find the first valid pfn */ 6433 /* find the first valid pfn */
6434 for (pfn = start_pfn; pfn < end_pfn; pfn++) 6434 for (pfn = start_pfn; pfn < end_pfn; pfn++)
6435 if (pfn_valid(pfn)) 6435 if (pfn_valid(pfn))
6436 break; 6436 break;
6437 if (pfn == end_pfn) 6437 if (pfn == end_pfn)
6438 return; 6438 return;
6439 zone = page_zone(pfn_to_page(pfn)); 6439 zone = page_zone(pfn_to_page(pfn));
6440 spin_lock_irqsave(&zone->lock, flags); 6440 spin_lock_irqsave(&zone->lock, flags);
6441 pfn = start_pfn; 6441 pfn = start_pfn;
6442 while (pfn < end_pfn) { 6442 while (pfn < end_pfn) {
6443 if (!pfn_valid(pfn)) { 6443 if (!pfn_valid(pfn)) {
6444 pfn++; 6444 pfn++;
6445 continue; 6445 continue;
6446 } 6446 }
6447 page = pfn_to_page(pfn); 6447 page = pfn_to_page(pfn);
6448 /* 6448 /*
6449 * The HWPoisoned page may not be in the buddy system, and 6449 * The HWPoisoned page may not be in the buddy system, and
6450 * page_count() is not 0. 6450 * page_count() is not 0.
6451 */ 6451 */
6452 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 6452 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6453 pfn++; 6453 pfn++;
6454 SetPageReserved(page); 6454 SetPageReserved(page);
6455 continue; 6455 continue;
6456 } 6456 }
6457 6457
6458 BUG_ON(page_count(page)); 6458 BUG_ON(page_count(page));
6459 BUG_ON(!PageBuddy(page)); 6459 BUG_ON(!PageBuddy(page));
6460 order = page_order(page); 6460 order = page_order(page);
6461 #ifdef CONFIG_DEBUG_VM 6461 #ifdef CONFIG_DEBUG_VM
6462 printk(KERN_INFO "remove from free list %lx %d %lx\n", 6462 printk(KERN_INFO "remove from free list %lx %d %lx\n",
6463 pfn, 1 << order, end_pfn); 6463 pfn, 1 << order, end_pfn);
6464 #endif 6464 #endif
6465 list_del(&page->lru); 6465 list_del(&page->lru);
6466 rmv_page_order(page); 6466 rmv_page_order(page);
6467 zone->free_area[order].nr_free--; 6467 zone->free_area[order].nr_free--;
6468 for (i = 0; i < (1 << order); i++) 6468 for (i = 0; i < (1 << order); i++)
6469 SetPageReserved((page+i)); 6469 SetPageReserved((page+i));
6470 pfn += (1 << order); 6470 pfn += (1 << order);
6471 } 6471 }
6472 spin_unlock_irqrestore(&zone->lock, flags); 6472 spin_unlock_irqrestore(&zone->lock, flags);
6473 } 6473 }
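/*
 * Summary of the walk above: with zone->lock held, every free buddy in
 * [start_pfn, end_pfn) is unlinked from its free list, all 1 << order
 * of its pages are marked PageReserved, and the scan advances by
 * 1 << order pfns; HWPoisoned pages that never entered the buddy
 * system are simply marked reserved and skipped one pfn at a time.
 */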
6474 #endif 6474 #endif
6475 6475
6476 #ifdef CONFIG_MEMORY_FAILURE 6476 #ifdef CONFIG_MEMORY_FAILURE
6477 bool is_free_buddy_page(struct page *page) 6477 bool is_free_buddy_page(struct page *page)
6478 { 6478 {
6479 struct zone *zone = page_zone(page); 6479 struct zone *zone = page_zone(page);
6480 unsigned long pfn = page_to_pfn(page); 6480 unsigned long pfn = page_to_pfn(page);
6481 unsigned long flags; 6481 unsigned long flags;
6482 unsigned int order; 6482 unsigned int order;
6483 6483
6484 spin_lock_irqsave(&zone->lock, flags); 6484 spin_lock_irqsave(&zone->lock, flags);
6485 for (order = 0; order < MAX_ORDER; order++) { 6485 for (order = 0; order < MAX_ORDER; order++) {
6486 struct page *page_head = page - (pfn & ((1 << order) - 1)); 6486 struct page *page_head = page - (pfn & ((1 << order) - 1));
6487 6487
6488 if (PageBuddy(page_head) && page_order(page_head) >= order) 6488 if (PageBuddy(page_head) && page_order(page_head) >= order)
6489 break; 6489 break;
6490 } 6490 }
6491 spin_unlock_irqrestore(&zone->lock, flags); 6491 spin_unlock_irqrestore(&zone->lock, flags);
6492 6492
6493 return order < MAX_ORDER; 6493 return order < MAX_ORDER;
6494 } 6494 }
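/*
 * Example of the head arithmetic above: for pfn 0x1234 checked at
 * order 3, pfn & ((1 << 3) - 1) is 4, so page_head is the page at pfn
 * 0x1230; if that page is a free buddy of order >= 3 it spans
 * [0x1230, 0x1238) and therefore contains pfn 0x1234, so the loop
 * stops and the page is reported as part of a free buddy.
 */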
6495 #endif 6495 #endif
6496 6496
6497 static const struct trace_print_flags pageflag_names[] = { 6497 static const struct trace_print_flags pageflag_names[] = {
6498 {1UL << PG_locked, "locked" }, 6498 {1UL << PG_locked, "locked" },
6499 {1UL << PG_error, "error" }, 6499 {1UL << PG_error, "error" },
6500 {1UL << PG_referenced, "referenced" }, 6500 {1UL << PG_referenced, "referenced" },
6501 {1UL << PG_uptodate, "uptodate" }, 6501 {1UL << PG_uptodate, "uptodate" },
6502 {1UL << PG_dirty, "dirty" }, 6502 {1UL << PG_dirty, "dirty" },
6503 {1UL << PG_lru, "lru" }, 6503 {1UL << PG_lru, "lru" },
6504 {1UL << PG_active, "active" }, 6504 {1UL << PG_active, "active" },
6505 {1UL << PG_slab, "slab" }, 6505 {1UL << PG_slab, "slab" },
6506 {1UL << PG_owner_priv_1, "owner_priv_1" }, 6506 {1UL << PG_owner_priv_1, "owner_priv_1" },
6507 {1UL << PG_arch_1, "arch_1" }, 6507 {1UL << PG_arch_1, "arch_1" },
6508 {1UL << PG_reserved, "reserved" }, 6508 {1UL << PG_reserved, "reserved" },
6509 {1UL << PG_private, "private" }, 6509 {1UL << PG_private, "private" },
6510 {1UL << PG_private_2, "private_2" }, 6510 {1UL << PG_private_2, "private_2" },
6511 {1UL << PG_writeback, "writeback" }, 6511 {1UL << PG_writeback, "writeback" },
6512 #ifdef CONFIG_PAGEFLAGS_EXTENDED 6512 #ifdef CONFIG_PAGEFLAGS_EXTENDED
6513 {1UL << PG_head, "head" }, 6513 {1UL << PG_head, "head" },
6514 {1UL << PG_tail, "tail" }, 6514 {1UL << PG_tail, "tail" },
6515 #else 6515 #else
6516 {1UL << PG_compound, "compound" }, 6516 {1UL << PG_compound, "compound" },
6517 #endif 6517 #endif
6518 {1UL << PG_swapcache, "swapcache" }, 6518 {1UL << PG_swapcache, "swapcache" },
6519 {1UL << PG_mappedtodisk, "mappedtodisk" }, 6519 {1UL << PG_mappedtodisk, "mappedtodisk" },
6520 {1UL << PG_reclaim, "reclaim" }, 6520 {1UL << PG_reclaim, "reclaim" },
6521 {1UL << PG_swapbacked, "swapbacked" }, 6521 {1UL << PG_swapbacked, "swapbacked" },
6522 {1UL << PG_unevictable, "unevictable" }, 6522 {1UL << PG_unevictable, "unevictable" },
6523 #ifdef CONFIG_MMU 6523 #ifdef CONFIG_MMU
6524 {1UL << PG_mlocked, "mlocked" }, 6524 {1UL << PG_mlocked, "mlocked" },
6525 #endif 6525 #endif
6526 #ifdef CONFIG_ARCH_USES_PG_UNCACHED 6526 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
6527 {1UL << PG_uncached, "uncached" }, 6527 {1UL << PG_uncached, "uncached" },
6528 #endif 6528 #endif
6529 #ifdef CONFIG_MEMORY_FAILURE 6529 #ifdef CONFIG_MEMORY_FAILURE
6530 {1UL << PG_hwpoison, "hwpoison" }, 6530 {1UL << PG_hwpoison, "hwpoison" },
6531 #endif 6531 #endif
6532 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6532 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6533 {1UL << PG_compound_lock, "compound_lock" }, 6533 {1UL << PG_compound_lock, "compound_lock" },
6534 #endif 6534 #endif
6535 }; 6535 };
6536 6536
6537 static void dump_page_flags(unsigned long flags) 6537 static void dump_page_flags(unsigned long flags)
6538 { 6538 {
6539 const char *delim = ""; 6539 const char *delim = "";
6540 unsigned long mask; 6540 unsigned long mask;
6541 int i; 6541 int i;
6542 6542
6543 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 6543 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6544 6544
6545 printk(KERN_ALERT "page flags: %#lx(", flags); 6545 printk(KERN_ALERT "page flags: %#lx(", flags);
6546 6546
6547 /* remove zone id */ 6547 /* remove zone id */
6548 flags &= (1UL << NR_PAGEFLAGS) - 1; 6548 flags &= (1UL << NR_PAGEFLAGS) - 1;
6549 6549
6550 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { 6550 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6551 6551
6552 mask = pageflag_names[i].mask; 6552 mask = pageflag_names[i].mask;
6553 if ((flags & mask) != mask) 6553 if ((flags & mask) != mask)
6554 continue; 6554 continue;
6555 6555
6556 flags &= ~mask; 6556 flags &= ~mask;
6557 printk("%s%s", delim, pageflag_names[i].name); 6557 printk("%s%s", delim, pageflag_names[i].name);
6558 delim = "|"; 6558 delim = "|";
6559 } 6559 }
6560 6560
6561 /* check for left over flags */ 6561 /* check for left over flags */
6562 if (flags) 6562 if (flags)
6563 printk("%s%#lx", delim, flags); 6563 printk("%s%#lx", delim, flags);
6564 6564
6565 printk(")\n"); 6565 printk(")\n");
6566 } 6566 }
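/*
 * Illustrative output of the helper above: for a locked, dirty page on
 * the LRU the decoded part reads "(locked|dirty|lru)" (names are
 * emitted in pageflag_names[] order), preceded by the raw flags value
 * and with any unnamed left-over bits appended inside the parentheses
 * as a hex mask.
 */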
6567 6567
6568 void dump_page(struct page *page) 6568 void dump_page(struct page *page)
6569 { 6569 {
6570 printk(KERN_ALERT 6570 printk(KERN_ALERT
6571 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6571 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6572 page, atomic_read(&page->_count), page_mapcount(page), 6572 page, atomic_read(&page->_count), page_mapcount(page),
6573 page->mapping, page->index); 6573 page->mapping, page->index);
6574 dump_page_flags(page->flags); 6574 dump_page_flags(page->flags);
6575 mem_cgroup_print_bad_page(page); 6575 mem_cgroup_print_bad_page(page);
6576 } 6576 }