Commit 35515de99f95c55f6c416fc7cc2b16832b9f58ee

Authored by Mel Gorman
Committed by Jiri Slaby
1 parent d6217476e1

mm: page_alloc: take the ALLOC_NO_WATERMARK check out of the fast path

commit 5dab29113ca56335c78be3f98bf5ddf2ef8eb6a6 upstream.

ALLOC_NO_WATERMARK is set in a few cases.  Always by kswapd, always for
__GFP_MEMALLOC, sometimes for swap-over-nfs, tasks etc.  Each of these
cases are relatively rare events but the ALLOC_NO_WATERMARK check is an
unlikely branch in the fast path.  This patch moves the check out of the
fast path and after it has been determined that the watermarks have not
been met.  This helps the common fast path at the cost of making the slow
path slower and hitting kswapd with a performance cost.  It's a reasonable
tradeoff.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>

Showing 1 changed file with 5 additions and 3 deletions Inline Diff

1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/compaction.h> 54 #include <linux/compaction.h>
55 #include <trace/events/kmem.h> 55 #include <trace/events/kmem.h>
56 #include <linux/ftrace_event.h> 56 #include <linux/ftrace_event.h>
57 #include <linux/memcontrol.h> 57 #include <linux/memcontrol.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/mm_inline.h> 59 #include <linux/mm_inline.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/page-debug-flags.h> 61 #include <linux/page-debug-flags.h>
62 #include <linux/hugetlb.h> 62 #include <linux/hugetlb.h>
63 #include <linux/sched/rt.h> 63 #include <linux/sched/rt.h>
64 64
65 #include <asm/sections.h> 65 #include <asm/sections.h>
66 #include <asm/tlbflush.h> 66 #include <asm/tlbflush.h>
67 #include <asm/div64.h> 67 #include <asm/div64.h>
68 #include "internal.h" 68 #include "internal.h"
69 69
70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
71 static DEFINE_MUTEX(pcp_batch_high_lock); 71 static DEFINE_MUTEX(pcp_batch_high_lock);
72 #define MIN_PERCPU_PAGELIST_FRACTION (8) 72 #define MIN_PERCPU_PAGELIST_FRACTION (8)
73 73
74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
75 DEFINE_PER_CPU(int, numa_node); 75 DEFINE_PER_CPU(int, numa_node);
76 EXPORT_PER_CPU_SYMBOL(numa_node); 76 EXPORT_PER_CPU_SYMBOL(numa_node);
77 #endif 77 #endif
78 78
79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
80 /* 80 /*
81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
84 * defined in <linux/topology.h>. 84 * defined in <linux/topology.h>.
85 */ 85 */
86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
87 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 87 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
88 #endif 88 #endif
89 89
90 /* 90 /*
91 * Array of node states. 91 * Array of node states.
92 */ 92 */
93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
94 [N_POSSIBLE] = NODE_MASK_ALL, 94 [N_POSSIBLE] = NODE_MASK_ALL,
95 [N_ONLINE] = { { [0] = 1UL } }, 95 [N_ONLINE] = { { [0] = 1UL } },
96 #ifndef CONFIG_NUMA 96 #ifndef CONFIG_NUMA
97 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 97 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
98 #ifdef CONFIG_HIGHMEM 98 #ifdef CONFIG_HIGHMEM
99 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 99 [N_HIGH_MEMORY] = { { [0] = 1UL } },
100 #endif 100 #endif
101 #ifdef CONFIG_MOVABLE_NODE 101 #ifdef CONFIG_MOVABLE_NODE
102 [N_MEMORY] = { { [0] = 1UL } }, 102 [N_MEMORY] = { { [0] = 1UL } },
103 #endif 103 #endif
104 [N_CPU] = { { [0] = 1UL } }, 104 [N_CPU] = { { [0] = 1UL } },
105 #endif /* NUMA */ 105 #endif /* NUMA */
106 }; 106 };
107 EXPORT_SYMBOL(node_states); 107 EXPORT_SYMBOL(node_states);
108 108
109 /* Protect totalram_pages and zone->managed_pages */ 109 /* Protect totalram_pages and zone->managed_pages */
110 static DEFINE_SPINLOCK(managed_page_count_lock); 110 static DEFINE_SPINLOCK(managed_page_count_lock);
111 111
112 unsigned long totalram_pages __read_mostly; 112 unsigned long totalram_pages __read_mostly;
113 unsigned long totalreserve_pages __read_mostly; 113 unsigned long totalreserve_pages __read_mostly;
114 /* 114 /*
115 * When calculating the number of globally allowed dirty pages, there 115 * When calculating the number of globally allowed dirty pages, there
116 * is a certain number of per-zone reserves that should not be 116 * is a certain number of per-zone reserves that should not be
117 * considered dirtyable memory. This is the sum of those reserves 117 * considered dirtyable memory. This is the sum of those reserves
118 * over all existing zones that contribute dirtyable memory. 118 * over all existing zones that contribute dirtyable memory.
119 */ 119 */
120 unsigned long dirty_balance_reserve __read_mostly; 120 unsigned long dirty_balance_reserve __read_mostly;
121 121
122 int percpu_pagelist_fraction; 122 int percpu_pagelist_fraction;
123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
124 124
125 #ifdef CONFIG_PM_SLEEP 125 #ifdef CONFIG_PM_SLEEP
126 /* 126 /*
127 * The following functions are used by the suspend/hibernate code to temporarily 127 * The following functions are used by the suspend/hibernate code to temporarily
128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
129 * while devices are suspended. To avoid races with the suspend/hibernate code, 129 * while devices are suspended. To avoid races with the suspend/hibernate code,
130 * they should always be called with pm_mutex held (gfp_allowed_mask also should 130 * they should always be called with pm_mutex held (gfp_allowed_mask also should
131 * only be modified with pm_mutex held, unless the suspend/hibernate code is 131 * only be modified with pm_mutex held, unless the suspend/hibernate code is
132 * guaranteed not to run in parallel with that modification). 132 * guaranteed not to run in parallel with that modification).
133 */ 133 */
134 134
135 static gfp_t saved_gfp_mask; 135 static gfp_t saved_gfp_mask;
136 136
137 void pm_restore_gfp_mask(void) 137 void pm_restore_gfp_mask(void)
138 { 138 {
139 WARN_ON(!mutex_is_locked(&pm_mutex)); 139 WARN_ON(!mutex_is_locked(&pm_mutex));
140 if (saved_gfp_mask) { 140 if (saved_gfp_mask) {
141 gfp_allowed_mask = saved_gfp_mask; 141 gfp_allowed_mask = saved_gfp_mask;
142 saved_gfp_mask = 0; 142 saved_gfp_mask = 0;
143 } 143 }
144 } 144 }
145 145
146 void pm_restrict_gfp_mask(void) 146 void pm_restrict_gfp_mask(void)
147 { 147 {
148 WARN_ON(!mutex_is_locked(&pm_mutex)); 148 WARN_ON(!mutex_is_locked(&pm_mutex));
149 WARN_ON(saved_gfp_mask); 149 WARN_ON(saved_gfp_mask);
150 saved_gfp_mask = gfp_allowed_mask; 150 saved_gfp_mask = gfp_allowed_mask;
151 gfp_allowed_mask &= ~GFP_IOFS; 151 gfp_allowed_mask &= ~GFP_IOFS;
152 } 152 }
153 153
154 bool pm_suspended_storage(void) 154 bool pm_suspended_storage(void)
155 { 155 {
156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
157 return false; 157 return false;
158 return true; 158 return true;
159 } 159 }
160 #endif /* CONFIG_PM_SLEEP */ 160 #endif /* CONFIG_PM_SLEEP */
161 161
162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
163 int pageblock_order __read_mostly; 163 int pageblock_order __read_mostly;
164 #endif 164 #endif
165 165
166 static void __free_pages_ok(struct page *page, unsigned int order); 166 static void __free_pages_ok(struct page *page, unsigned int order);
167 167
168 /* 168 /*
169 * results with 256, 32 in the lowmem_reserve sysctl: 169 * results with 256, 32 in the lowmem_reserve sysctl:
170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
171 * 1G machine -> (16M dma, 784M normal, 224M high) 171 * 1G machine -> (16M dma, 784M normal, 224M high)
172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
175 * 175 *
176 * TBD: should special case ZONE_DMA32 machines here - in those we normally 176 * TBD: should special case ZONE_DMA32 machines here - in those we normally
177 * don't need any ZONE_NORMAL reservation 177 * don't need any ZONE_NORMAL reservation
178 */ 178 */
179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
180 #ifdef CONFIG_ZONE_DMA 180 #ifdef CONFIG_ZONE_DMA
181 256, 181 256,
182 #endif 182 #endif
183 #ifdef CONFIG_ZONE_DMA32 183 #ifdef CONFIG_ZONE_DMA32
184 256, 184 256,
185 #endif 185 #endif
186 #ifdef CONFIG_HIGHMEM 186 #ifdef CONFIG_HIGHMEM
187 32, 187 32,
188 #endif 188 #endif
189 32, 189 32,
190 }; 190 };
191 191
192 EXPORT_SYMBOL(totalram_pages); 192 EXPORT_SYMBOL(totalram_pages);
193 193
194 static char * const zone_names[MAX_NR_ZONES] = { 194 static char * const zone_names[MAX_NR_ZONES] = {
195 #ifdef CONFIG_ZONE_DMA 195 #ifdef CONFIG_ZONE_DMA
196 "DMA", 196 "DMA",
197 #endif 197 #endif
198 #ifdef CONFIG_ZONE_DMA32 198 #ifdef CONFIG_ZONE_DMA32
199 "DMA32", 199 "DMA32",
200 #endif 200 #endif
201 "Normal", 201 "Normal",
202 #ifdef CONFIG_HIGHMEM 202 #ifdef CONFIG_HIGHMEM
203 "HighMem", 203 "HighMem",
204 #endif 204 #endif
205 "Movable", 205 "Movable",
206 }; 206 };
207 207
208 int min_free_kbytes = 1024; 208 int min_free_kbytes = 1024;
209 int user_min_free_kbytes; 209 int user_min_free_kbytes;
210 210
211 static unsigned long __meminitdata nr_kernel_pages; 211 static unsigned long __meminitdata nr_kernel_pages;
212 static unsigned long __meminitdata nr_all_pages; 212 static unsigned long __meminitdata nr_all_pages;
213 static unsigned long __meminitdata dma_reserve; 213 static unsigned long __meminitdata dma_reserve;
214 214
215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
218 static unsigned long __initdata required_kernelcore; 218 static unsigned long __initdata required_kernelcore;
219 static unsigned long __initdata required_movablecore; 219 static unsigned long __initdata required_movablecore;
220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
221 221
222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
223 int movable_zone; 223 int movable_zone;
224 EXPORT_SYMBOL(movable_zone); 224 EXPORT_SYMBOL(movable_zone);
225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
226 226
227 #if MAX_NUMNODES > 1 227 #if MAX_NUMNODES > 1
228 int nr_node_ids __read_mostly = MAX_NUMNODES; 228 int nr_node_ids __read_mostly = MAX_NUMNODES;
229 int nr_online_nodes __read_mostly = 1; 229 int nr_online_nodes __read_mostly = 1;
230 EXPORT_SYMBOL(nr_node_ids); 230 EXPORT_SYMBOL(nr_node_ids);
231 EXPORT_SYMBOL(nr_online_nodes); 231 EXPORT_SYMBOL(nr_online_nodes);
232 #endif 232 #endif
233 233
234 int page_group_by_mobility_disabled __read_mostly; 234 int page_group_by_mobility_disabled __read_mostly;
235 235
236 void set_pageblock_migratetype(struct page *page, int migratetype) 236 void set_pageblock_migratetype(struct page *page, int migratetype)
237 { 237 {
238 238
239 if (unlikely(page_group_by_mobility_disabled)) 239 if (unlikely(page_group_by_mobility_disabled))
240 migratetype = MIGRATE_UNMOVABLE; 240 migratetype = MIGRATE_UNMOVABLE;
241 241
242 set_pageblock_flags_group(page, (unsigned long)migratetype, 242 set_pageblock_flags_group(page, (unsigned long)migratetype,
243 PB_migrate, PB_migrate_end); 243 PB_migrate, PB_migrate_end);
244 } 244 }
245 245
246 bool oom_killer_disabled __read_mostly; 246 bool oom_killer_disabled __read_mostly;
247 247
248 #ifdef CONFIG_DEBUG_VM 248 #ifdef CONFIG_DEBUG_VM
249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
250 { 250 {
251 int ret = 0; 251 int ret = 0;
252 unsigned seq; 252 unsigned seq;
253 unsigned long pfn = page_to_pfn(page); 253 unsigned long pfn = page_to_pfn(page);
254 unsigned long sp, start_pfn; 254 unsigned long sp, start_pfn;
255 255
256 do { 256 do {
257 seq = zone_span_seqbegin(zone); 257 seq = zone_span_seqbegin(zone);
258 start_pfn = zone->zone_start_pfn; 258 start_pfn = zone->zone_start_pfn;
259 sp = zone->spanned_pages; 259 sp = zone->spanned_pages;
260 if (!zone_spans_pfn(zone, pfn)) 260 if (!zone_spans_pfn(zone, pfn))
261 ret = 1; 261 ret = 1;
262 } while (zone_span_seqretry(zone, seq)); 262 } while (zone_span_seqretry(zone, seq));
263 263
264 if (ret) 264 if (ret)
265 pr_err("page %lu outside zone [ %lu - %lu ]\n", 265 pr_err("page %lu outside zone [ %lu - %lu ]\n",
266 pfn, start_pfn, start_pfn + sp); 266 pfn, start_pfn, start_pfn + sp);
267 267
268 return ret; 268 return ret;
269 } 269 }
270 270
271 static int page_is_consistent(struct zone *zone, struct page *page) 271 static int page_is_consistent(struct zone *zone, struct page *page)
272 { 272 {
273 if (!pfn_valid_within(page_to_pfn(page))) 273 if (!pfn_valid_within(page_to_pfn(page)))
274 return 0; 274 return 0;
275 if (zone != page_zone(page)) 275 if (zone != page_zone(page))
276 return 0; 276 return 0;
277 277
278 return 1; 278 return 1;
279 } 279 }
280 /* 280 /*
281 * Temporary debugging check for pages not lying within a given zone. 281 * Temporary debugging check for pages not lying within a given zone.
282 */ 282 */
283 static int bad_range(struct zone *zone, struct page *page) 283 static int bad_range(struct zone *zone, struct page *page)
284 { 284 {
285 if (page_outside_zone_boundaries(zone, page)) 285 if (page_outside_zone_boundaries(zone, page))
286 return 1; 286 return 1;
287 if (!page_is_consistent(zone, page)) 287 if (!page_is_consistent(zone, page))
288 return 1; 288 return 1;
289 289
290 return 0; 290 return 0;
291 } 291 }
292 #else 292 #else
293 static inline int bad_range(struct zone *zone, struct page *page) 293 static inline int bad_range(struct zone *zone, struct page *page)
294 { 294 {
295 return 0; 295 return 0;
296 } 296 }
297 #endif 297 #endif
298 298
299 static void bad_page(struct page *page) 299 static void bad_page(struct page *page)
300 { 300 {
301 static unsigned long resume; 301 static unsigned long resume;
302 static unsigned long nr_shown; 302 static unsigned long nr_shown;
303 static unsigned long nr_unshown; 303 static unsigned long nr_unshown;
304 304
305 /* Don't complain about poisoned pages */ 305 /* Don't complain about poisoned pages */
306 if (PageHWPoison(page)) { 306 if (PageHWPoison(page)) {
307 page_mapcount_reset(page); /* remove PageBuddy */ 307 page_mapcount_reset(page); /* remove PageBuddy */
308 return; 308 return;
309 } 309 }
310 310
311 /* 311 /*
312 * Allow a burst of 60 reports, then keep quiet for that minute; 312 * Allow a burst of 60 reports, then keep quiet for that minute;
313 * or allow a steady drip of one report per second. 313 * or allow a steady drip of one report per second.
314 */ 314 */
315 if (nr_shown == 60) { 315 if (nr_shown == 60) {
316 if (time_before(jiffies, resume)) { 316 if (time_before(jiffies, resume)) {
317 nr_unshown++; 317 nr_unshown++;
318 goto out; 318 goto out;
319 } 319 }
320 if (nr_unshown) { 320 if (nr_unshown) {
321 printk(KERN_ALERT 321 printk(KERN_ALERT
322 "BUG: Bad page state: %lu messages suppressed\n", 322 "BUG: Bad page state: %lu messages suppressed\n",
323 nr_unshown); 323 nr_unshown);
324 nr_unshown = 0; 324 nr_unshown = 0;
325 } 325 }
326 nr_shown = 0; 326 nr_shown = 0;
327 } 327 }
328 if (nr_shown++ == 0) 328 if (nr_shown++ == 0)
329 resume = jiffies + 60 * HZ; 329 resume = jiffies + 60 * HZ;
330 330
331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
332 current->comm, page_to_pfn(page)); 332 current->comm, page_to_pfn(page));
333 dump_page(page); 333 dump_page(page);
334 334
335 print_modules(); 335 print_modules();
336 dump_stack(); 336 dump_stack();
337 out: 337 out:
338 /* Leave bad fields for debug, except PageBuddy could make trouble */ 338 /* Leave bad fields for debug, except PageBuddy could make trouble */
339 page_mapcount_reset(page); /* remove PageBuddy */ 339 page_mapcount_reset(page); /* remove PageBuddy */
340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
341 } 341 }
342 342
343 /* 343 /*
344 * Higher-order pages are called "compound pages". They are structured thusly: 344 * Higher-order pages are called "compound pages". They are structured thusly:
345 * 345 *
346 * The first PAGE_SIZE page is called the "head page". 346 * The first PAGE_SIZE page is called the "head page".
347 * 347 *
348 * The remaining PAGE_SIZE pages are called "tail pages". 348 * The remaining PAGE_SIZE pages are called "tail pages".
349 * 349 *
350 * All pages have PG_compound set. All tail pages have their ->first_page 350 * All pages have PG_compound set. All tail pages have their ->first_page
351 * pointing at the head page. 351 * pointing at the head page.
352 * 352 *
353 * The first tail page's ->lru.next holds the address of the compound page's 353 * The first tail page's ->lru.next holds the address of the compound page's
354 * put_page() function. Its ->lru.prev holds the order of allocation. 354 * put_page() function. Its ->lru.prev holds the order of allocation.
355 * This usage means that zero-order pages may not be compound. 355 * This usage means that zero-order pages may not be compound.
356 */ 356 */
357 357
358 static void free_compound_page(struct page *page) 358 static void free_compound_page(struct page *page)
359 { 359 {
360 __free_pages_ok(page, compound_order(page)); 360 __free_pages_ok(page, compound_order(page));
361 } 361 }
362 362
363 void prep_compound_page(struct page *page, unsigned long order) 363 void prep_compound_page(struct page *page, unsigned long order)
364 { 364 {
365 int i; 365 int i;
366 int nr_pages = 1 << order; 366 int nr_pages = 1 << order;
367 367
368 set_compound_page_dtor(page, free_compound_page); 368 set_compound_page_dtor(page, free_compound_page);
369 set_compound_order(page, order); 369 set_compound_order(page, order);
370 __SetPageHead(page); 370 __SetPageHead(page);
371 for (i = 1; i < nr_pages; i++) { 371 for (i = 1; i < nr_pages; i++) {
372 struct page *p = page + i; 372 struct page *p = page + i;
373 set_page_count(p, 0); 373 set_page_count(p, 0);
374 p->first_page = page; 374 p->first_page = page;
375 /* Make sure p->first_page is always valid for PageTail() */ 375 /* Make sure p->first_page is always valid for PageTail() */
376 smp_wmb(); 376 smp_wmb();
377 __SetPageTail(p); 377 __SetPageTail(p);
378 } 378 }
379 } 379 }
380 380
381 /* update __split_huge_page_refcount if you change this function */ 381 /* update __split_huge_page_refcount if you change this function */
382 static int destroy_compound_page(struct page *page, unsigned long order) 382 static int destroy_compound_page(struct page *page, unsigned long order)
383 { 383 {
384 int i; 384 int i;
385 int nr_pages = 1 << order; 385 int nr_pages = 1 << order;
386 int bad = 0; 386 int bad = 0;
387 387
388 if (unlikely(compound_order(page) != order)) { 388 if (unlikely(compound_order(page) != order)) {
389 bad_page(page); 389 bad_page(page);
390 bad++; 390 bad++;
391 } 391 }
392 392
393 __ClearPageHead(page); 393 __ClearPageHead(page);
394 394
395 for (i = 1; i < nr_pages; i++) { 395 for (i = 1; i < nr_pages; i++) {
396 struct page *p = page + i; 396 struct page *p = page + i;
397 397
398 if (unlikely(!PageTail(p) || (p->first_page != page))) { 398 if (unlikely(!PageTail(p) || (p->first_page != page))) {
399 bad_page(page); 399 bad_page(page);
400 bad++; 400 bad++;
401 } 401 }
402 __ClearPageTail(p); 402 __ClearPageTail(p);
403 } 403 }
404 404
405 return bad; 405 return bad;
406 } 406 }
407 407
408 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 408 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
409 { 409 {
410 int i; 410 int i;
411 411
412 /* 412 /*
413 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 413 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
414 * and __GFP_HIGHMEM from hard or soft interrupt context. 414 * and __GFP_HIGHMEM from hard or soft interrupt context.
415 */ 415 */
416 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 416 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
417 for (i = 0; i < (1 << order); i++) 417 for (i = 0; i < (1 << order); i++)
418 clear_highpage(page + i); 418 clear_highpage(page + i);
419 } 419 }
420 420
421 #ifdef CONFIG_DEBUG_PAGEALLOC 421 #ifdef CONFIG_DEBUG_PAGEALLOC
422 unsigned int _debug_guardpage_minorder; 422 unsigned int _debug_guardpage_minorder;
423 423
424 static int __init debug_guardpage_minorder_setup(char *buf) 424 static int __init debug_guardpage_minorder_setup(char *buf)
425 { 425 {
426 unsigned long res; 426 unsigned long res;
427 427
428 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 428 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
429 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 429 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
430 return 0; 430 return 0;
431 } 431 }
432 _debug_guardpage_minorder = res; 432 _debug_guardpage_minorder = res;
433 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 433 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
434 return 0; 434 return 0;
435 } 435 }
436 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 436 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
437 437
438 static inline void set_page_guard_flag(struct page *page) 438 static inline void set_page_guard_flag(struct page *page)
439 { 439 {
440 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 440 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
441 } 441 }
442 442
443 static inline void clear_page_guard_flag(struct page *page) 443 static inline void clear_page_guard_flag(struct page *page)
444 { 444 {
445 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 445 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
446 } 446 }
447 #else 447 #else
448 static inline void set_page_guard_flag(struct page *page) { } 448 static inline void set_page_guard_flag(struct page *page) { }
449 static inline void clear_page_guard_flag(struct page *page) { } 449 static inline void clear_page_guard_flag(struct page *page) { }
450 #endif 450 #endif
451 451
452 static inline void set_page_order(struct page *page, int order) 452 static inline void set_page_order(struct page *page, int order)
453 { 453 {
454 set_page_private(page, order); 454 set_page_private(page, order);
455 __SetPageBuddy(page); 455 __SetPageBuddy(page);
456 } 456 }
457 457
458 static inline void rmv_page_order(struct page *page) 458 static inline void rmv_page_order(struct page *page)
459 { 459 {
460 __ClearPageBuddy(page); 460 __ClearPageBuddy(page);
461 set_page_private(page, 0); 461 set_page_private(page, 0);
462 } 462 }
463 463
464 /* 464 /*
465 * Locate the struct page for both the matching buddy in our 465 * Locate the struct page for both the matching buddy in our
466 * pair (buddy1) and the combined O(n+1) page they form (page). 466 * pair (buddy1) and the combined O(n+1) page they form (page).
467 * 467 *
468 * 1) Any buddy B1 will have an order O twin B2 which satisfies 468 * 1) Any buddy B1 will have an order O twin B2 which satisfies
469 * the following equation: 469 * the following equation:
470 * B2 = B1 ^ (1 << O) 470 * B2 = B1 ^ (1 << O)
471 * For example, if the starting buddy (buddy2) is #8 its order 471 * For example, if the starting buddy (buddy2) is #8 its order
472 * 1 buddy is #10: 472 * 1 buddy is #10:
473 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 473 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
474 * 474 *
475 * 2) Any buddy B will have an order O+1 parent P which 475 * 2) Any buddy B will have an order O+1 parent P which
476 * satisfies the following equation: 476 * satisfies the following equation:
477 * P = B & ~(1 << O) 477 * P = B & ~(1 << O)
478 * 478 *
479 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 479 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
480 */ 480 */
481 static inline unsigned long 481 static inline unsigned long
482 __find_buddy_index(unsigned long page_idx, unsigned int order) 482 __find_buddy_index(unsigned long page_idx, unsigned int order)
483 { 483 {
484 return page_idx ^ (1 << order); 484 return page_idx ^ (1 << order);
485 } 485 }
486 486
487 /* 487 /*
488 * This function checks whether a page is free && is the buddy 488 * This function checks whether a page is free && is the buddy
489 * we can do coalesce a page and its buddy if 489 * we can do coalesce a page and its buddy if
490 * (a) the buddy is not in a hole && 490 * (a) the buddy is not in a hole &&
491 * (b) the buddy is in the buddy system && 491 * (b) the buddy is in the buddy system &&
492 * (c) a page and its buddy have the same order && 492 * (c) a page and its buddy have the same order &&
493 * (d) a page and its buddy are in the same zone. 493 * (d) a page and its buddy are in the same zone.
494 * 494 *
495 * For recording whether a page is in the buddy system, we set ->_mapcount 495 * For recording whether a page is in the buddy system, we set ->_mapcount
496 * PAGE_BUDDY_MAPCOUNT_VALUE. 496 * PAGE_BUDDY_MAPCOUNT_VALUE.
497 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is 497 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
498 * serialized by zone->lock. 498 * serialized by zone->lock.
499 * 499 *
500 * For recording page's order, we use page_private(page). 500 * For recording page's order, we use page_private(page).
501 */ 501 */
502 static inline int page_is_buddy(struct page *page, struct page *buddy, 502 static inline int page_is_buddy(struct page *page, struct page *buddy,
503 int order) 503 int order)
504 { 504 {
505 if (!pfn_valid_within(page_to_pfn(buddy))) 505 if (!pfn_valid_within(page_to_pfn(buddy)))
506 return 0; 506 return 0;
507 507
508 if (page_is_guard(buddy) && page_order(buddy) == order) { 508 if (page_is_guard(buddy) && page_order(buddy) == order) {
509 VM_BUG_ON(page_count(buddy) != 0); 509 VM_BUG_ON(page_count(buddy) != 0);
510 510
511 if (page_zone_id(page) != page_zone_id(buddy)) 511 if (page_zone_id(page) != page_zone_id(buddy))
512 return 0; 512 return 0;
513 513
514 return 1; 514 return 1;
515 } 515 }
516 516
517 if (PageBuddy(buddy) && page_order(buddy) == order) { 517 if (PageBuddy(buddy) && page_order(buddy) == order) {
518 VM_BUG_ON(page_count(buddy) != 0); 518 VM_BUG_ON(page_count(buddy) != 0);
519 519
520 /* 520 /*
521 * zone check is done late to avoid uselessly 521 * zone check is done late to avoid uselessly
522 * calculating zone/node ids for pages that could 522 * calculating zone/node ids for pages that could
523 * never merge. 523 * never merge.
524 */ 524 */
525 if (page_zone_id(page) != page_zone_id(buddy)) 525 if (page_zone_id(page) != page_zone_id(buddy))
526 return 0; 526 return 0;
527 527
528 return 1; 528 return 1;
529 } 529 }
530 return 0; 530 return 0;
531 } 531 }
532 532
533 /* 533 /*
534 * Freeing function for a buddy system allocator. 534 * Freeing function for a buddy system allocator.
535 * 535 *
536 * The concept of a buddy system is to maintain direct-mapped table 536 * The concept of a buddy system is to maintain direct-mapped table
537 * (containing bit values) for memory blocks of various "orders". 537 * (containing bit values) for memory blocks of various "orders".
538 * The bottom level table contains the map for the smallest allocatable 538 * The bottom level table contains the map for the smallest allocatable
539 * units of memory (here, pages), and each level above it describes 539 * units of memory (here, pages), and each level above it describes
540 * pairs of units from the levels below, hence, "buddies". 540 * pairs of units from the levels below, hence, "buddies".
541 * At a high level, all that happens here is marking the table entry 541 * At a high level, all that happens here is marking the table entry
542 * at the bottom level available, and propagating the changes upward 542 * at the bottom level available, and propagating the changes upward
543 * as necessary, plus some accounting needed to play nicely with other 543 * as necessary, plus some accounting needed to play nicely with other
544 * parts of the VM system. 544 * parts of the VM system.
545 * At each level, we keep a list of pages, which are heads of continuous 545 * At each level, we keep a list of pages, which are heads of continuous
546 * free pages of length of (1 << order) and marked with _mapcount 546 * free pages of length of (1 << order) and marked with _mapcount
547 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) 547 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
548 * field. 548 * field.
549 * So when we are allocating or freeing one, we can derive the state of the 549 * So when we are allocating or freeing one, we can derive the state of the
550 * other. That is, if we allocate a small block, and both were 550 * other. That is, if we allocate a small block, and both were
551 * free, the remainder of the region must be split into blocks. 551 * free, the remainder of the region must be split into blocks.
552 * If a block is freed, and its buddy is also free, then this 552 * If a block is freed, and its buddy is also free, then this
553 * triggers coalescing into a block of larger size. 553 * triggers coalescing into a block of larger size.
554 * 554 *
555 * -- nyc 555 * -- nyc
556 */ 556 */
557 557
558 static inline void __free_one_page(struct page *page, 558 static inline void __free_one_page(struct page *page,
559 struct zone *zone, unsigned int order, 559 struct zone *zone, unsigned int order,
560 int migratetype) 560 int migratetype)
561 { 561 {
562 unsigned long page_idx; 562 unsigned long page_idx;
563 unsigned long combined_idx; 563 unsigned long combined_idx;
564 unsigned long uninitialized_var(buddy_idx); 564 unsigned long uninitialized_var(buddy_idx);
565 struct page *buddy; 565 struct page *buddy;
566 566
567 VM_BUG_ON(!zone_is_initialized(zone)); 567 VM_BUG_ON(!zone_is_initialized(zone));
568 568
569 if (unlikely(PageCompound(page))) 569 if (unlikely(PageCompound(page)))
570 if (unlikely(destroy_compound_page(page, order))) 570 if (unlikely(destroy_compound_page(page, order)))
571 return; 571 return;
572 572
573 VM_BUG_ON(migratetype == -1); 573 VM_BUG_ON(migratetype == -1);
574 574
575 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 575 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
576 576
577 VM_BUG_ON(page_idx & ((1 << order) - 1)); 577 VM_BUG_ON(page_idx & ((1 << order) - 1));
578 VM_BUG_ON(bad_range(zone, page)); 578 VM_BUG_ON(bad_range(zone, page));
579 579
580 while (order < MAX_ORDER-1) { 580 while (order < MAX_ORDER-1) {
581 buddy_idx = __find_buddy_index(page_idx, order); 581 buddy_idx = __find_buddy_index(page_idx, order);
582 buddy = page + (buddy_idx - page_idx); 582 buddy = page + (buddy_idx - page_idx);
583 if (!page_is_buddy(page, buddy, order)) 583 if (!page_is_buddy(page, buddy, order))
584 break; 584 break;
585 /* 585 /*
586 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 586 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
587 * merge with it and move up one order. 587 * merge with it and move up one order.
588 */ 588 */
589 if (page_is_guard(buddy)) { 589 if (page_is_guard(buddy)) {
590 clear_page_guard_flag(buddy); 590 clear_page_guard_flag(buddy);
591 set_page_private(page, 0); 591 set_page_private(page, 0);
592 __mod_zone_freepage_state(zone, 1 << order, 592 __mod_zone_freepage_state(zone, 1 << order,
593 migratetype); 593 migratetype);
594 } else { 594 } else {
595 list_del(&buddy->lru); 595 list_del(&buddy->lru);
596 zone->free_area[order].nr_free--; 596 zone->free_area[order].nr_free--;
597 rmv_page_order(buddy); 597 rmv_page_order(buddy);
598 } 598 }
599 combined_idx = buddy_idx & page_idx; 599 combined_idx = buddy_idx & page_idx;
600 page = page + (combined_idx - page_idx); 600 page = page + (combined_idx - page_idx);
601 page_idx = combined_idx; 601 page_idx = combined_idx;
602 order++; 602 order++;
603 } 603 }
604 set_page_order(page, order); 604 set_page_order(page, order);
605 605
606 /* 606 /*
607 * If this is not the largest possible page, check if the buddy 607 * If this is not the largest possible page, check if the buddy
608 * of the next-highest order is free. If it is, it's possible 608 * of the next-highest order is free. If it is, it's possible
609 * that pages are being freed that will coalesce soon. In case, 609 * that pages are being freed that will coalesce soon. In case,
610 * that is happening, add the free page to the tail of the list 610 * that is happening, add the free page to the tail of the list
611 * so it's less likely to be used soon and more likely to be merged 611 * so it's less likely to be used soon and more likely to be merged
612 * as a higher order page 612 * as a higher order page
613 */ 613 */
614 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 614 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
615 struct page *higher_page, *higher_buddy; 615 struct page *higher_page, *higher_buddy;
616 combined_idx = buddy_idx & page_idx; 616 combined_idx = buddy_idx & page_idx;
617 higher_page = page + (combined_idx - page_idx); 617 higher_page = page + (combined_idx - page_idx);
618 buddy_idx = __find_buddy_index(combined_idx, order + 1); 618 buddy_idx = __find_buddy_index(combined_idx, order + 1);
619 higher_buddy = higher_page + (buddy_idx - combined_idx); 619 higher_buddy = higher_page + (buddy_idx - combined_idx);
620 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 620 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
621 list_add_tail(&page->lru, 621 list_add_tail(&page->lru,
622 &zone->free_area[order].free_list[migratetype]); 622 &zone->free_area[order].free_list[migratetype]);
623 goto out; 623 goto out;
624 } 624 }
625 } 625 }
626 626
627 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 627 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
628 out: 628 out:
629 zone->free_area[order].nr_free++; 629 zone->free_area[order].nr_free++;
630 } 630 }
631 631
632 static inline int free_pages_check(struct page *page) 632 static inline int free_pages_check(struct page *page)
633 { 633 {
634 if (unlikely(page_mapcount(page) | 634 if (unlikely(page_mapcount(page) |
635 (page->mapping != NULL) | 635 (page->mapping != NULL) |
636 (atomic_read(&page->_count) != 0) | 636 (atomic_read(&page->_count) != 0) |
637 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 637 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
638 (mem_cgroup_bad_page_check(page)))) { 638 (mem_cgroup_bad_page_check(page)))) {
639 bad_page(page); 639 bad_page(page);
640 return 1; 640 return 1;
641 } 641 }
642 page_nid_reset_last(page); 642 page_nid_reset_last(page);
643 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 643 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
644 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 644 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
645 return 0; 645 return 0;
646 } 646 }
647 647
648 /* 648 /*
649 * Frees a number of pages from the PCP lists 649 * Frees a number of pages from the PCP lists
650 * Assumes all pages on list are in same zone, and of same order. 650 * Assumes all pages on list are in same zone, and of same order.
651 * count is the number of pages to free. 651 * count is the number of pages to free.
652 * 652 *
653 * If the zone was previously in an "all pages pinned" state then look to 653 * If the zone was previously in an "all pages pinned" state then look to
654 * see if this freeing clears that state. 654 * see if this freeing clears that state.
655 * 655 *
656 * And clear the zone's pages_scanned counter, to hold off the "all pages are 656 * And clear the zone's pages_scanned counter, to hold off the "all pages are
657 * pinned" detection logic. 657 * pinned" detection logic.
658 */ 658 */
659 static void free_pcppages_bulk(struct zone *zone, int count, 659 static void free_pcppages_bulk(struct zone *zone, int count,
660 struct per_cpu_pages *pcp) 660 struct per_cpu_pages *pcp)
661 { 661 {
662 int migratetype = 0; 662 int migratetype = 0;
663 int batch_free = 0; 663 int batch_free = 0;
664 int to_free = count; 664 int to_free = count;
665 665
666 spin_lock(&zone->lock); 666 spin_lock(&zone->lock);
667 zone->pages_scanned = 0; 667 zone->pages_scanned = 0;
668 668
669 while (to_free) { 669 while (to_free) {
670 struct page *page; 670 struct page *page;
671 struct list_head *list; 671 struct list_head *list;
672 672
673 /* 673 /*
674 * Remove pages from lists in a round-robin fashion. A 674 * Remove pages from lists in a round-robin fashion. A
675 * batch_free count is maintained that is incremented when an 675 * batch_free count is maintained that is incremented when an
676 * empty list is encountered. This is so more pages are freed 676 * empty list is encountered. This is so more pages are freed
677 * off fuller lists instead of spinning excessively around empty 677 * off fuller lists instead of spinning excessively around empty
678 * lists 678 * lists
679 */ 679 */
680 do { 680 do {
681 batch_free++; 681 batch_free++;
682 if (++migratetype == MIGRATE_PCPTYPES) 682 if (++migratetype == MIGRATE_PCPTYPES)
683 migratetype = 0; 683 migratetype = 0;
684 list = &pcp->lists[migratetype]; 684 list = &pcp->lists[migratetype];
685 } while (list_empty(list)); 685 } while (list_empty(list));
686 686
687 /* This is the only non-empty list. Free them all. */ 687 /* This is the only non-empty list. Free them all. */
688 if (batch_free == MIGRATE_PCPTYPES) 688 if (batch_free == MIGRATE_PCPTYPES)
689 batch_free = to_free; 689 batch_free = to_free;
690 690
691 do { 691 do {
692 int mt; /* migratetype of the to-be-freed page */ 692 int mt; /* migratetype of the to-be-freed page */
693 693
694 page = list_entry(list->prev, struct page, lru); 694 page = list_entry(list->prev, struct page, lru);
695 /* must delete as __free_one_page list manipulates */ 695 /* must delete as __free_one_page list manipulates */
696 list_del(&page->lru); 696 list_del(&page->lru);
697 mt = get_freepage_migratetype(page); 697 mt = get_freepage_migratetype(page);
698 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 698 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
699 __free_one_page(page, zone, 0, mt); 699 __free_one_page(page, zone, 0, mt);
700 trace_mm_page_pcpu_drain(page, 0, mt); 700 trace_mm_page_pcpu_drain(page, 0, mt);
701 if (likely(!is_migrate_isolate_page(page))) { 701 if (likely(!is_migrate_isolate_page(page))) {
702 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 702 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
703 if (is_migrate_cma(mt)) 703 if (is_migrate_cma(mt))
704 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 704 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
705 } 705 }
706 } while (--to_free && --batch_free && !list_empty(list)); 706 } while (--to_free && --batch_free && !list_empty(list));
707 } 707 }
708 spin_unlock(&zone->lock); 708 spin_unlock(&zone->lock);
709 } 709 }
710 710
711 static void free_one_page(struct zone *zone, struct page *page, int order, 711 static void free_one_page(struct zone *zone, struct page *page, int order,
712 int migratetype) 712 int migratetype)
713 { 713 {
714 spin_lock(&zone->lock); 714 spin_lock(&zone->lock);
715 zone->pages_scanned = 0; 715 zone->pages_scanned = 0;
716 716
717 __free_one_page(page, zone, order, migratetype); 717 __free_one_page(page, zone, order, migratetype);
718 if (unlikely(!is_migrate_isolate(migratetype))) 718 if (unlikely(!is_migrate_isolate(migratetype)))
719 __mod_zone_freepage_state(zone, 1 << order, migratetype); 719 __mod_zone_freepage_state(zone, 1 << order, migratetype);
720 spin_unlock(&zone->lock); 720 spin_unlock(&zone->lock);
721 } 721 }
722 722
723 static bool free_pages_prepare(struct page *page, unsigned int order) 723 static bool free_pages_prepare(struct page *page, unsigned int order)
724 { 724 {
725 int i; 725 int i;
726 int bad = 0; 726 int bad = 0;
727 727
728 trace_mm_page_free(page, order); 728 trace_mm_page_free(page, order);
729 kmemcheck_free_shadow(page, order); 729 kmemcheck_free_shadow(page, order);
730 730
731 if (PageAnon(page)) 731 if (PageAnon(page))
732 page->mapping = NULL; 732 page->mapping = NULL;
733 for (i = 0; i < (1 << order); i++) 733 for (i = 0; i < (1 << order); i++)
734 bad += free_pages_check(page + i); 734 bad += free_pages_check(page + i);
735 if (bad) 735 if (bad)
736 return false; 736 return false;
737 737
738 if (!PageHighMem(page)) { 738 if (!PageHighMem(page)) {
739 debug_check_no_locks_freed(page_address(page), 739 debug_check_no_locks_freed(page_address(page),
740 PAGE_SIZE << order); 740 PAGE_SIZE << order);
741 debug_check_no_obj_freed(page_address(page), 741 debug_check_no_obj_freed(page_address(page),
742 PAGE_SIZE << order); 742 PAGE_SIZE << order);
743 } 743 }
744 arch_free_page(page, order); 744 arch_free_page(page, order);
745 kernel_map_pages(page, 1 << order, 0); 745 kernel_map_pages(page, 1 << order, 0);
746 746
747 return true; 747 return true;
748 } 748 }
749 749
750 static void __free_pages_ok(struct page *page, unsigned int order) 750 static void __free_pages_ok(struct page *page, unsigned int order)
751 { 751 {
752 unsigned long flags; 752 unsigned long flags;
753 int migratetype; 753 int migratetype;
754 754
755 if (!free_pages_prepare(page, order)) 755 if (!free_pages_prepare(page, order))
756 return; 756 return;
757 757
758 local_irq_save(flags); 758 local_irq_save(flags);
759 __count_vm_events(PGFREE, 1 << order); 759 __count_vm_events(PGFREE, 1 << order);
760 migratetype = get_pageblock_migratetype(page); 760 migratetype = get_pageblock_migratetype(page);
761 set_freepage_migratetype(page, migratetype); 761 set_freepage_migratetype(page, migratetype);
762 free_one_page(page_zone(page), page, order, migratetype); 762 free_one_page(page_zone(page), page, order, migratetype);
763 local_irq_restore(flags); 763 local_irq_restore(flags);
764 } 764 }
765 765
766 void __init __free_pages_bootmem(struct page *page, unsigned int order) 766 void __init __free_pages_bootmem(struct page *page, unsigned int order)
767 { 767 {
768 unsigned int nr_pages = 1 << order; 768 unsigned int nr_pages = 1 << order;
769 struct page *p = page; 769 struct page *p = page;
770 unsigned int loop; 770 unsigned int loop;
771 771
772 prefetchw(p); 772 prefetchw(p);
773 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 773 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
774 prefetchw(p + 1); 774 prefetchw(p + 1);
775 __ClearPageReserved(p); 775 __ClearPageReserved(p);
776 set_page_count(p, 0); 776 set_page_count(p, 0);
777 } 777 }
778 __ClearPageReserved(p); 778 __ClearPageReserved(p);
779 set_page_count(p, 0); 779 set_page_count(p, 0);
780 780
781 page_zone(page)->managed_pages += nr_pages; 781 page_zone(page)->managed_pages += nr_pages;
782 set_page_refcounted(page); 782 set_page_refcounted(page);
783 __free_pages(page, order); 783 __free_pages(page, order);
784 } 784 }
785 785
786 #ifdef CONFIG_CMA 786 #ifdef CONFIG_CMA
787 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 787 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
788 void __init init_cma_reserved_pageblock(struct page *page) 788 void __init init_cma_reserved_pageblock(struct page *page)
789 { 789 {
790 unsigned i = pageblock_nr_pages; 790 unsigned i = pageblock_nr_pages;
791 struct page *p = page; 791 struct page *p = page;
792 792
793 do { 793 do {
794 __ClearPageReserved(p); 794 __ClearPageReserved(p);
795 set_page_count(p, 0); 795 set_page_count(p, 0);
796 } while (++p, --i); 796 } while (++p, --i);
797 797
798 set_pageblock_migratetype(page, MIGRATE_CMA); 798 set_pageblock_migratetype(page, MIGRATE_CMA);
799 799
800 if (pageblock_order >= MAX_ORDER) { 800 if (pageblock_order >= MAX_ORDER) {
801 i = pageblock_nr_pages; 801 i = pageblock_nr_pages;
802 p = page; 802 p = page;
803 do { 803 do {
804 set_page_refcounted(p); 804 set_page_refcounted(p);
805 __free_pages(p, MAX_ORDER - 1); 805 __free_pages(p, MAX_ORDER - 1);
806 p += MAX_ORDER_NR_PAGES; 806 p += MAX_ORDER_NR_PAGES;
807 } while (i -= MAX_ORDER_NR_PAGES); 807 } while (i -= MAX_ORDER_NR_PAGES);
808 } else { 808 } else {
809 set_page_refcounted(page); 809 set_page_refcounted(page);
810 __free_pages(page, pageblock_order); 810 __free_pages(page, pageblock_order);
811 } 811 }
812 812
813 adjust_managed_page_count(page, pageblock_nr_pages); 813 adjust_managed_page_count(page, pageblock_nr_pages);
814 } 814 }
815 #endif 815 #endif
816 816
817 /* 817 /*
818 * The order of subdivision here is critical for the IO subsystem. 818 * The order of subdivision here is critical for the IO subsystem.
819 * Please do not alter this order without good reasons and regression 819 * Please do not alter this order without good reasons and regression
820 * testing. Specifically, as large blocks of memory are subdivided, 820 * testing. Specifically, as large blocks of memory are subdivided,
821 * the order in which smaller blocks are delivered depends on the order 821 * the order in which smaller blocks are delivered depends on the order
822 * they're subdivided in this function. This is the primary factor 822 * they're subdivided in this function. This is the primary factor
823 * influencing the order in which pages are delivered to the IO 823 * influencing the order in which pages are delivered to the IO
824 * subsystem according to empirical testing, and this is also justified 824 * subsystem according to empirical testing, and this is also justified
825 * by considering the behavior of a buddy system containing a single 825 * by considering the behavior of a buddy system containing a single
826 * large block of memory acted on by a series of small allocations. 826 * large block of memory acted on by a series of small allocations.
827 * This behavior is a critical factor in sglist merging's success. 827 * This behavior is a critical factor in sglist merging's success.
828 * 828 *
829 * -- nyc 829 * -- nyc
830 */ 830 */
831 static inline void expand(struct zone *zone, struct page *page, 831 static inline void expand(struct zone *zone, struct page *page,
832 int low, int high, struct free_area *area, 832 int low, int high, struct free_area *area,
833 int migratetype) 833 int migratetype)
834 { 834 {
835 unsigned long size = 1 << high; 835 unsigned long size = 1 << high;
836 836
837 while (high > low) { 837 while (high > low) {
838 area--; 838 area--;
839 high--; 839 high--;
840 size >>= 1; 840 size >>= 1;
841 VM_BUG_ON(bad_range(zone, &page[size])); 841 VM_BUG_ON(bad_range(zone, &page[size]));
842 842
843 #ifdef CONFIG_DEBUG_PAGEALLOC 843 #ifdef CONFIG_DEBUG_PAGEALLOC
844 if (high < debug_guardpage_minorder()) { 844 if (high < debug_guardpage_minorder()) {
845 /* 845 /*
846 * Mark as guard pages (or page), that will allow to 846 * Mark as guard pages (or page), that will allow to
847 * merge back to allocator when buddy will be freed. 847 * merge back to allocator when buddy will be freed.
848 * Corresponding page table entries will not be touched, 848 * Corresponding page table entries will not be touched,
849 * pages will stay not present in virtual address space 849 * pages will stay not present in virtual address space
850 */ 850 */
851 INIT_LIST_HEAD(&page[size].lru); 851 INIT_LIST_HEAD(&page[size].lru);
852 set_page_guard_flag(&page[size]); 852 set_page_guard_flag(&page[size]);
853 set_page_private(&page[size], high); 853 set_page_private(&page[size], high);
854 /* Guard pages are not available for any usage */ 854 /* Guard pages are not available for any usage */
855 __mod_zone_freepage_state(zone, -(1 << high), 855 __mod_zone_freepage_state(zone, -(1 << high),
856 migratetype); 856 migratetype);
857 continue; 857 continue;
858 } 858 }
859 #endif 859 #endif
860 list_add(&page[size].lru, &area->free_list[migratetype]); 860 list_add(&page[size].lru, &area->free_list[migratetype]);
861 area->nr_free++; 861 area->nr_free++;
862 set_page_order(&page[size], high); 862 set_page_order(&page[size], high);
863 } 863 }
864 } 864 }
865 865
866 /* 866 /*
867 * This page is about to be returned from the page allocator 867 * This page is about to be returned from the page allocator
868 */ 868 */
869 static inline int check_new_page(struct page *page) 869 static inline int check_new_page(struct page *page)
870 { 870 {
871 if (unlikely(page_mapcount(page) | 871 if (unlikely(page_mapcount(page) |
872 (page->mapping != NULL) | 872 (page->mapping != NULL) |
873 (atomic_read(&page->_count) != 0) | 873 (atomic_read(&page->_count) != 0) |
874 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 874 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
875 (mem_cgroup_bad_page_check(page)))) { 875 (mem_cgroup_bad_page_check(page)))) {
876 bad_page(page); 876 bad_page(page);
877 return 1; 877 return 1;
878 } 878 }
879 return 0; 879 return 0;
880 } 880 }
881 881
882 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 882 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
883 { 883 {
884 int i; 884 int i;
885 885
886 for (i = 0; i < (1 << order); i++) { 886 for (i = 0; i < (1 << order); i++) {
887 struct page *p = page + i; 887 struct page *p = page + i;
888 if (unlikely(check_new_page(p))) 888 if (unlikely(check_new_page(p)))
889 return 1; 889 return 1;
890 } 890 }
891 891
892 set_page_private(page, 0); 892 set_page_private(page, 0);
893 set_page_refcounted(page); 893 set_page_refcounted(page);
894 894
895 arch_alloc_page(page, order); 895 arch_alloc_page(page, order);
896 kernel_map_pages(page, 1 << order, 1); 896 kernel_map_pages(page, 1 << order, 1);
897 897
898 if (gfp_flags & __GFP_ZERO) 898 if (gfp_flags & __GFP_ZERO)
899 prep_zero_page(page, order, gfp_flags); 899 prep_zero_page(page, order, gfp_flags);
900 900
901 if (order && (gfp_flags & __GFP_COMP)) 901 if (order && (gfp_flags & __GFP_COMP))
902 prep_compound_page(page, order); 902 prep_compound_page(page, order);
903 903
904 return 0; 904 return 0;
905 } 905 }
906 906
907 /* 907 /*
908 * Go through the free lists for the given migratetype and remove 908 * Go through the free lists for the given migratetype and remove
909 * the smallest available page from the freelists 909 * the smallest available page from the freelists
910 */ 910 */
911 static inline 911 static inline
912 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 912 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
913 int migratetype) 913 int migratetype)
914 { 914 {
915 unsigned int current_order; 915 unsigned int current_order;
916 struct free_area *area; 916 struct free_area *area;
917 struct page *page; 917 struct page *page;
918 918
919 /* Find a page of the appropriate size in the preferred list */ 919 /* Find a page of the appropriate size in the preferred list */
920 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 920 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
921 area = &(zone->free_area[current_order]); 921 area = &(zone->free_area[current_order]);
922 if (list_empty(&area->free_list[migratetype])) 922 if (list_empty(&area->free_list[migratetype]))
923 continue; 923 continue;
924 924
925 page = list_entry(area->free_list[migratetype].next, 925 page = list_entry(area->free_list[migratetype].next,
926 struct page, lru); 926 struct page, lru);
927 list_del(&page->lru); 927 list_del(&page->lru);
928 rmv_page_order(page); 928 rmv_page_order(page);
929 area->nr_free--; 929 area->nr_free--;
930 expand(zone, page, order, current_order, area, migratetype); 930 expand(zone, page, order, current_order, area, migratetype);
931 set_freepage_migratetype(page, migratetype); 931 set_freepage_migratetype(page, migratetype);
932 return page; 932 return page;
933 } 933 }
934 934
935 return NULL; 935 return NULL;
936 } 936 }
937 937
938 938
939 /* 939 /*
940 * This array describes which free lists are fallen back to, in order, 940 * This array describes which free lists are fallen back to, in order,
941 * when the free lists for the desired migratetype are depleted 941 * when the free lists for the desired migratetype are depleted
942 */ 942 */
943 static int fallbacks[MIGRATE_TYPES][4] = { 943 static int fallbacks[MIGRATE_TYPES][4] = {
944 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 944 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
945 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 945 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
946 #ifdef CONFIG_CMA 946 #ifdef CONFIG_CMA
947 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 947 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
948 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 948 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
949 #else 949 #else
950 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 950 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
951 #endif 951 #endif
952 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 952 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
953 #ifdef CONFIG_MEMORY_ISOLATION 953 #ifdef CONFIG_MEMORY_ISOLATION
954 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 954 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
955 #endif 955 #endif
956 }; 956 };
957 957
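/*
 * Illustrative user-space sketch (not kernel code): walking a fallback table
 * like fallbacks[] above - try each alternative migratetype in row order
 * until one has free pages. The enum, table and availability counts are made
 * up for the example; MIGRATE_RESERVE is handled separately by the caller.
 */
#include <stdio.h>

enum { UNMOVABLE, RECLAIMABLE, MOVABLE, RESERVE, NR_TYPES };

static const int fallback_order[NR_TYPES][3] = {
        [UNMOVABLE]   = { RECLAIMABLE, MOVABLE, RESERVE },
        [RECLAIMABLE] = { UNMOVABLE, MOVABLE, RESERVE },
        [MOVABLE]     = { RECLAIMABLE, UNMOVABLE, RESERVE },
        [RESERVE]     = { RESERVE, RESERVE, RESERVE },
};

int main(void)
{
        int free_count[NR_TYPES] = { 0, 0, 7, 1 }; /* only MOVABLE and RESERVE have pages */
        int want = UNMOVABLE;

        for (int i = 0; i < 3; i++) {
                int alt = fallback_order[want][i];
                if (alt == RESERVE)
                        break;  /* reserve pages are handled later if necessary */
                if (free_count[alt] > 0) {
                        printf("fall back from type %d to type %d\n", want, alt);
                        return 0;
                }
        }
        printf("no fallback found, caller retries with the reserve type\n");
        return 0;
}
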
958 /* 958 /*
959 * Move the free pages in a range to the free lists of the requested type. 959 * Move the free pages in a range to the free lists of the requested type.
960 * Note that start_page and end_page are not aligned on a pageblock 960 * Note that start_page and end_page are not aligned on a pageblock
961 * boundary. If alignment is required, use move_freepages_block() 961 * boundary. If alignment is required, use move_freepages_block()
962 */ 962 */
963 int move_freepages(struct zone *zone, 963 int move_freepages(struct zone *zone,
964 struct page *start_page, struct page *end_page, 964 struct page *start_page, struct page *end_page,
965 int migratetype) 965 int migratetype)
966 { 966 {
967 struct page *page; 967 struct page *page;
968 unsigned long order; 968 unsigned long order;
969 int pages_moved = 0; 969 int pages_moved = 0;
970 970
971 #ifndef CONFIG_HOLES_IN_ZONE 971 #ifndef CONFIG_HOLES_IN_ZONE
972 /* 972 /*
973 * page_zone is not safe to call in this context when 973 * page_zone is not safe to call in this context when
974 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 974 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
975 * anyway as we check zone boundaries in move_freepages_block(). 975 * anyway as we check zone boundaries in move_freepages_block().
976 * Remove at a later date when no bug reports exist related to 976 * Remove at a later date when no bug reports exist related to
977 * grouping pages by mobility 977 * grouping pages by mobility
978 */ 978 */
979 BUG_ON(page_zone(start_page) != page_zone(end_page)); 979 BUG_ON(page_zone(start_page) != page_zone(end_page));
980 #endif 980 #endif
981 981
982 for (page = start_page; page <= end_page;) { 982 for (page = start_page; page <= end_page;) {
983 /* Make sure we are not inadvertently changing nodes */ 983 /* Make sure we are not inadvertently changing nodes */
984 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 984 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
985 985
986 if (!pfn_valid_within(page_to_pfn(page))) { 986 if (!pfn_valid_within(page_to_pfn(page))) {
987 page++; 987 page++;
988 continue; 988 continue;
989 } 989 }
990 990
991 if (!PageBuddy(page)) { 991 if (!PageBuddy(page)) {
992 page++; 992 page++;
993 continue; 993 continue;
994 } 994 }
995 995
996 order = page_order(page); 996 order = page_order(page);
997 list_move(&page->lru, 997 list_move(&page->lru,
998 &zone->free_area[order].free_list[migratetype]); 998 &zone->free_area[order].free_list[migratetype]);
999 set_freepage_migratetype(page, migratetype); 999 set_freepage_migratetype(page, migratetype);
1000 page += 1 << order; 1000 page += 1 << order;
1001 pages_moved += 1 << order; 1001 pages_moved += 1 << order;
1002 } 1002 }
1003 1003
1004 return pages_moved; 1004 return pages_moved;
1005 } 1005 }
1006 1006
1007 int move_freepages_block(struct zone *zone, struct page *page, 1007 int move_freepages_block(struct zone *zone, struct page *page,
1008 int migratetype) 1008 int migratetype)
1009 { 1009 {
1010 unsigned long start_pfn, end_pfn; 1010 unsigned long start_pfn, end_pfn;
1011 struct page *start_page, *end_page; 1011 struct page *start_page, *end_page;
1012 1012
1013 start_pfn = page_to_pfn(page); 1013 start_pfn = page_to_pfn(page);
1014 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 1014 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
1015 start_page = pfn_to_page(start_pfn); 1015 start_page = pfn_to_page(start_pfn);
1016 end_page = start_page + pageblock_nr_pages - 1; 1016 end_page = start_page + pageblock_nr_pages - 1;
1017 end_pfn = start_pfn + pageblock_nr_pages - 1; 1017 end_pfn = start_pfn + pageblock_nr_pages - 1;
1018 1018
1019 /* Do not cross zone boundaries */ 1019 /* Do not cross zone boundaries */
1020 if (!zone_spans_pfn(zone, start_pfn)) 1020 if (!zone_spans_pfn(zone, start_pfn))
1021 start_page = page; 1021 start_page = page;
1022 if (!zone_spans_pfn(zone, end_pfn)) 1022 if (!zone_spans_pfn(zone, end_pfn))
1023 return 0; 1023 return 0;
1024 1024
1025 return move_freepages(zone, start_page, end_page, migratetype); 1025 return move_freepages(zone, start_page, end_page, migratetype);
1026 } 1026 }
1027 1027
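/*
 * Illustrative sketch (not kernel code): the align-down mask used in
 * move_freepages_block() above. With a power-of-two pageblock size,
 * pfn & ~(size - 1) rounds down to the first pfn of the containing block.
 * The pageblock size and pfn below are arbitrary example values.
 */
#include <stdio.h>

int main(void)
{
        unsigned long pageblock_pages = 512;    /* e.g. 2MB blocks of 4KB pages */
        unsigned long pfn = 1234567;

        unsigned long start = pfn & ~(pageblock_pages - 1);
        unsigned long end   = start + pageblock_pages - 1;

        printf("pfn %lu lives in pageblock [%lu, %lu]\n", pfn, start, end);
        return 0;
}
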
1028 static void change_pageblock_range(struct page *pageblock_page, 1028 static void change_pageblock_range(struct page *pageblock_page,
1029 int start_order, int migratetype) 1029 int start_order, int migratetype)
1030 { 1030 {
1031 int nr_pageblocks = 1 << (start_order - pageblock_order); 1031 int nr_pageblocks = 1 << (start_order - pageblock_order);
1032 1032
1033 while (nr_pageblocks--) { 1033 while (nr_pageblocks--) {
1034 set_pageblock_migratetype(pageblock_page, migratetype); 1034 set_pageblock_migratetype(pageblock_page, migratetype);
1035 pageblock_page += pageblock_nr_pages; 1035 pageblock_page += pageblock_nr_pages;
1036 } 1036 }
1037 } 1037 }
1038 1038
1039 /* 1039 /*
1040 * If breaking a large block of pages, move all free pages to the preferred 1040 * If breaking a large block of pages, move all free pages to the preferred
1041 * allocation list. If falling back for a reclaimable kernel allocation, be 1041 * allocation list. If falling back for a reclaimable kernel allocation, be
1042 * more aggressive about taking ownership of free pages. 1042 * more aggressive about taking ownership of free pages.
1043 * 1043 *
1044 * On the other hand, never change migration type of MIGRATE_CMA pageblocks 1044 * On the other hand, never change migration type of MIGRATE_CMA pageblocks
1045 * nor move CMA pages to different free lists. We don't want unmovable pages 1045 * nor move CMA pages to different free lists. We don't want unmovable pages
1046 * to be allocated from MIGRATE_CMA areas. 1046 * to be allocated from MIGRATE_CMA areas.
1047 * 1047 *
1048 * Returns the new migratetype of the pageblock (or the same old migratetype 1048 * Returns the new migratetype of the pageblock (or the same old migratetype
1049 * if it was unchanged). 1049 * if it was unchanged).
1050 */ 1050 */
1051 static int try_to_steal_freepages(struct zone *zone, struct page *page, 1051 static int try_to_steal_freepages(struct zone *zone, struct page *page,
1052 int start_type, int fallback_type) 1052 int start_type, int fallback_type)
1053 { 1053 {
1054 int current_order = page_order(page); 1054 int current_order = page_order(page);
1055 1055
1056 /* 1056 /*
1057 * When borrowing from MIGRATE_CMA, we need to release the excess 1057 * When borrowing from MIGRATE_CMA, we need to release the excess
1058 * buddy pages to CMA itself. We also ensure the freepage_migratetype 1058 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1059 * is set to CMA so it is returned to the correct freelist in case 1059 * is set to CMA so it is returned to the correct freelist in case
1060 * the page ends up being not actually allocated from the pcp lists. 1060 * the page ends up being not actually allocated from the pcp lists.
1061 */ 1061 */
1062 if (is_migrate_cma(fallback_type)) 1062 if (is_migrate_cma(fallback_type))
1063 return fallback_type; 1063 return fallback_type;
1064 1064
1065 /* Take ownership for orders >= pageblock_order */ 1065 /* Take ownership for orders >= pageblock_order */
1066 if (current_order >= pageblock_order) { 1066 if (current_order >= pageblock_order) {
1067 change_pageblock_range(page, current_order, start_type); 1067 change_pageblock_range(page, current_order, start_type);
1068 return start_type; 1068 return start_type;
1069 } 1069 }
1070 1070
1071 if (current_order >= pageblock_order / 2 || 1071 if (current_order >= pageblock_order / 2 ||
1072 start_type == MIGRATE_RECLAIMABLE || 1072 start_type == MIGRATE_RECLAIMABLE ||
1073 page_group_by_mobility_disabled) { 1073 page_group_by_mobility_disabled) {
1074 int pages; 1074 int pages;
1075 1075
1076 pages = move_freepages_block(zone, page, start_type); 1076 pages = move_freepages_block(zone, page, start_type);
1077 1077
1078 /* Claim the whole block if over half of it is free */ 1078 /* Claim the whole block if over half of it is free */
1079 if (pages >= (1 << (pageblock_order-1)) || 1079 if (pages >= (1 << (pageblock_order-1)) ||
1080 page_group_by_mobility_disabled) { 1080 page_group_by_mobility_disabled) {
1081 1081
1082 set_pageblock_migratetype(page, start_type); 1082 set_pageblock_migratetype(page, start_type);
1083 return start_type; 1083 return start_type;
1084 } 1084 }
1085 1085
1086 } 1086 }
1087 1087
1088 return fallback_type; 1088 return fallback_type;
1089 } 1089 }
1090 1090
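/*
 * Illustrative sketch (not kernel code): the "claim the whole block if over
 * half of it is free" test in try_to_steal_freepages(). With an example
 * pageblock_order of 9 (512 pages per block), the threshold is 1 << 8 = 256
 * moved pages; the numbers below are invented.
 */
#include <stdio.h>

int main(void)
{
        int pageblock_order = 9;        /* example value */
        int pages_moved = 300;          /* pages move_freepages_block() moved */
        int threshold = 1 << (pageblock_order - 1);

        if (pages_moved >= threshold)
                printf("%d >= %d: take ownership of the whole pageblock\n",
                       pages_moved, threshold);
        else
                printf("%d < %d: leave the pageblock's migratetype alone\n",
                       pages_moved, threshold);
        return 0;
}
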
1091 /* Remove an element from the buddy allocator from the fallback list */ 1091 /* Remove an element from the buddy allocator from the fallback list */
1092 static inline struct page * 1092 static inline struct page *
1093 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 1093 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1094 { 1094 {
1095 struct free_area *area; 1095 struct free_area *area;
1096 int current_order; 1096 int current_order;
1097 struct page *page; 1097 struct page *page;
1098 int migratetype, new_type, i; 1098 int migratetype, new_type, i;
1099 1099
1100 /* Find the largest possible block of pages in the other list */ 1100 /* Find the largest possible block of pages in the other list */
1101 for (current_order = MAX_ORDER-1; current_order >= order; 1101 for (current_order = MAX_ORDER-1; current_order >= order;
1102 --current_order) { 1102 --current_order) {
1103 for (i = 0;; i++) { 1103 for (i = 0;; i++) {
1104 migratetype = fallbacks[start_migratetype][i]; 1104 migratetype = fallbacks[start_migratetype][i];
1105 1105
1106 /* MIGRATE_RESERVE handled later if necessary */ 1106 /* MIGRATE_RESERVE handled later if necessary */
1107 if (migratetype == MIGRATE_RESERVE) 1107 if (migratetype == MIGRATE_RESERVE)
1108 break; 1108 break;
1109 1109
1110 area = &(zone->free_area[current_order]); 1110 area = &(zone->free_area[current_order]);
1111 if (list_empty(&area->free_list[migratetype])) 1111 if (list_empty(&area->free_list[migratetype]))
1112 continue; 1112 continue;
1113 1113
1114 page = list_entry(area->free_list[migratetype].next, 1114 page = list_entry(area->free_list[migratetype].next,
1115 struct page, lru); 1115 struct page, lru);
1116 area->nr_free--; 1116 area->nr_free--;
1117 1117
1118 new_type = try_to_steal_freepages(zone, page, 1118 new_type = try_to_steal_freepages(zone, page,
1119 start_migratetype, 1119 start_migratetype,
1120 migratetype); 1120 migratetype);
1121 1121
1122 /* Remove the page from the freelists */ 1122 /* Remove the page from the freelists */
1123 list_del(&page->lru); 1123 list_del(&page->lru);
1124 rmv_page_order(page); 1124 rmv_page_order(page);
1125 1125
1126 expand(zone, page, order, current_order, area, 1126 expand(zone, page, order, current_order, area,
1127 new_type); 1127 new_type);
1128 /* The freepage_migratetype may differ from pageblock's 1128 /* The freepage_migratetype may differ from pageblock's
1129 * migratetype depending on the decisions in 1129 * migratetype depending on the decisions in
1130 * try_to_steal_freepages. This is OK as long as it does 1130 * try_to_steal_freepages. This is OK as long as it does
1131 * not differ for MIGRATE_CMA type. 1131 * not differ for MIGRATE_CMA type.
1132 */ 1132 */
1133 set_freepage_migratetype(page, new_type); 1133 set_freepage_migratetype(page, new_type);
1134 1134
1135 trace_mm_page_alloc_extfrag(page, order, current_order, 1135 trace_mm_page_alloc_extfrag(page, order, current_order,
1136 start_migratetype, migratetype, new_type); 1136 start_migratetype, migratetype, new_type);
1137 1137
1138 return page; 1138 return page;
1139 } 1139 }
1140 } 1140 }
1141 1141
1142 return NULL; 1142 return NULL;
1143 } 1143 }
1144 1144
1145 /* 1145 /*
1146 * Do the hard work of removing an element from the buddy allocator. 1146 * Do the hard work of removing an element from the buddy allocator.
1147 * Call me with the zone->lock already held. 1147 * Call me with the zone->lock already held.
1148 */ 1148 */
1149 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1149 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1150 int migratetype) 1150 int migratetype)
1151 { 1151 {
1152 struct page *page; 1152 struct page *page;
1153 1153
1154 retry_reserve: 1154 retry_reserve:
1155 page = __rmqueue_smallest(zone, order, migratetype); 1155 page = __rmqueue_smallest(zone, order, migratetype);
1156 1156
1157 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1157 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1158 page = __rmqueue_fallback(zone, order, migratetype); 1158 page = __rmqueue_fallback(zone, order, migratetype);
1159 1159
1160 /* 1160 /*
1161 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1161 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1162 * is used because __rmqueue_smallest is an inline function 1162 * is used because __rmqueue_smallest is an inline function
1163 * and we want just one call site 1163 * and we want just one call site
1164 */ 1164 */
1165 if (!page) { 1165 if (!page) {
1166 migratetype = MIGRATE_RESERVE; 1166 migratetype = MIGRATE_RESERVE;
1167 goto retry_reserve; 1167 goto retry_reserve;
1168 } 1168 }
1169 } 1169 }
1170 1170
1171 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1171 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1172 return page; 1172 return page;
1173 } 1173 }
1174 1174
1175 /* 1175 /*
1176 * Obtain a specified number of elements from the buddy allocator, all under 1176 * Obtain a specified number of elements from the buddy allocator, all under
1177 * a single hold of the lock, for efficiency. Add them to the supplied list. 1177 * a single hold of the lock, for efficiency. Add them to the supplied list.
1178 * Returns the number of new pages which were placed at *list. 1178 * Returns the number of new pages which were placed at *list.
1179 */ 1179 */
1180 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1180 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1181 unsigned long count, struct list_head *list, 1181 unsigned long count, struct list_head *list,
1182 int migratetype, int cold) 1182 int migratetype, int cold)
1183 { 1183 {
1184 int i; 1184 int i;
1185 1185
1186 spin_lock(&zone->lock); 1186 spin_lock(&zone->lock);
1187 for (i = 0; i < count; ++i) { 1187 for (i = 0; i < count; ++i) {
1188 struct page *page = __rmqueue(zone, order, migratetype); 1188 struct page *page = __rmqueue(zone, order, migratetype);
1189 if (unlikely(page == NULL)) 1189 if (unlikely(page == NULL))
1190 break; 1190 break;
1191 1191
1192 /* 1192 /*
1193 * Split buddy pages returned by expand() are received here 1193 * Split buddy pages returned by expand() are received here
1194 * in physical page order. The page is added to the callers and 1194 * in physical page order. The page is added to the callers and
1195 * list and the list head then moves forward. From the callers 1195 * list and the list head then moves forward. From the callers
1196 * perspective, the linked list is ordered by page number in 1196 * perspective, the linked list is ordered by page number in
1197 * some conditions. This is useful for IO devices that can 1197 * some conditions. This is useful for IO devices that can
1198 * merge IO requests if the physical pages are ordered 1198 * merge IO requests if the physical pages are ordered
1199 * properly. 1199 * properly.
1200 */ 1200 */
1201 if (likely(cold == 0)) 1201 if (likely(cold == 0))
1202 list_add(&page->lru, list); 1202 list_add(&page->lru, list);
1203 else 1203 else
1204 list_add_tail(&page->lru, list); 1204 list_add_tail(&page->lru, list);
1205 list = &page->lru; 1205 list = &page->lru;
1206 if (is_migrate_cma(get_freepage_migratetype(page))) 1206 if (is_migrate_cma(get_freepage_migratetype(page)))
1207 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1207 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1208 -(1 << order)); 1208 -(1 << order));
1209 } 1209 }
1210 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1210 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1211 spin_unlock(&zone->lock); 1211 spin_unlock(&zone->lock);
1212 return i; 1212 return i;
1213 } 1213 }
1214 1214
1215 #ifdef CONFIG_NUMA 1215 #ifdef CONFIG_NUMA
1216 /* 1216 /*
1217 * Called from the vmstat counter updater to drain pagesets of this 1217 * Called from the vmstat counter updater to drain pagesets of this
1218 * currently executing processor on remote nodes after they have 1218 * currently executing processor on remote nodes after they have
1219 * expired. 1219 * expired.
1220 * 1220 *
1221 * Note that this function must be called with the thread pinned to 1221 * Note that this function must be called with the thread pinned to
1222 * a single processor. 1222 * a single processor.
1223 */ 1223 */
1224 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1224 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1225 { 1225 {
1226 unsigned long flags; 1226 unsigned long flags;
1227 int to_drain; 1227 int to_drain;
1228 unsigned long batch; 1228 unsigned long batch;
1229 1229
1230 local_irq_save(flags); 1230 local_irq_save(flags);
1231 batch = ACCESS_ONCE(pcp->batch); 1231 batch = ACCESS_ONCE(pcp->batch);
1232 if (pcp->count >= batch) 1232 if (pcp->count >= batch)
1233 to_drain = batch; 1233 to_drain = batch;
1234 else 1234 else
1235 to_drain = pcp->count; 1235 to_drain = pcp->count;
1236 if (to_drain > 0) { 1236 if (to_drain > 0) {
1237 free_pcppages_bulk(zone, to_drain, pcp); 1237 free_pcppages_bulk(zone, to_drain, pcp);
1238 pcp->count -= to_drain; 1238 pcp->count -= to_drain;
1239 } 1239 }
1240 local_irq_restore(flags); 1240 local_irq_restore(flags);
1241 } 1241 }
1242 #endif 1242 #endif
1243 1243
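/*
 * Illustrative sketch (not kernel code): the drain decision made by
 * drain_zone_pages() above - drain at most one batch per call, or whatever
 * is left when the per-cpu list holds fewer pages than a batch. The batch
 * and count values are invented.
 */
#include <stdio.h>

int main(void)
{
        unsigned long batch = 31;       /* example pcp->batch */
        int count = 12;                 /* example pcp->count */

        int to_drain = (count >= (int)batch) ? (int)batch : count;
        printf("draining %d of %d cached pages\n", to_drain, count);
        return 0;
}
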
1244 /* 1244 /*
1245 * Drain pages of the indicated processor. 1245 * Drain pages of the indicated processor.
1246 * 1246 *
1247 * The processor must either be the current processor and the 1247 * The processor must either be the current processor and the
1248 * thread pinned to the current processor or a processor that 1248 * thread pinned to the current processor or a processor that
1249 * is not online. 1249 * is not online.
1250 */ 1250 */
1251 static void drain_pages(unsigned int cpu) 1251 static void drain_pages(unsigned int cpu)
1252 { 1252 {
1253 unsigned long flags; 1253 unsigned long flags;
1254 struct zone *zone; 1254 struct zone *zone;
1255 1255
1256 for_each_populated_zone(zone) { 1256 for_each_populated_zone(zone) {
1257 struct per_cpu_pageset *pset; 1257 struct per_cpu_pageset *pset;
1258 struct per_cpu_pages *pcp; 1258 struct per_cpu_pages *pcp;
1259 1259
1260 local_irq_save(flags); 1260 local_irq_save(flags);
1261 pset = per_cpu_ptr(zone->pageset, cpu); 1261 pset = per_cpu_ptr(zone->pageset, cpu);
1262 1262
1263 pcp = &pset->pcp; 1263 pcp = &pset->pcp;
1264 if (pcp->count) { 1264 if (pcp->count) {
1265 free_pcppages_bulk(zone, pcp->count, pcp); 1265 free_pcppages_bulk(zone, pcp->count, pcp);
1266 pcp->count = 0; 1266 pcp->count = 0;
1267 } 1267 }
1268 local_irq_restore(flags); 1268 local_irq_restore(flags);
1269 } 1269 }
1270 } 1270 }
1271 1271
1272 /* 1272 /*
1273 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1273 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1274 */ 1274 */
1275 void drain_local_pages(void *arg) 1275 void drain_local_pages(void *arg)
1276 { 1276 {
1277 drain_pages(smp_processor_id()); 1277 drain_pages(smp_processor_id());
1278 } 1278 }
1279 1279
1280 /* 1280 /*
1281 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1281 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1282 * 1282 *
1283 * Note that this code is protected against sending an IPI to an offline 1283 * Note that this code is protected against sending an IPI to an offline
1284 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1284 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1285 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1285 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1286 * nothing keeps CPUs from showing up after we populated the cpumask and 1286 * nothing keeps CPUs from showing up after we populated the cpumask and
1287 * before the call to on_each_cpu_mask(). 1287 * before the call to on_each_cpu_mask().
1288 */ 1288 */
1289 void drain_all_pages(void) 1289 void drain_all_pages(void)
1290 { 1290 {
1291 int cpu; 1291 int cpu;
1292 struct per_cpu_pageset *pcp; 1292 struct per_cpu_pageset *pcp;
1293 struct zone *zone; 1293 struct zone *zone;
1294 1294
1295 /* 1295 /*
1296 * Allocate in the BSS so we won't require allocation in 1296 * Allocate in the BSS so we won't require allocation in
1297 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1297 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1298 */ 1298 */
1299 static cpumask_t cpus_with_pcps; 1299 static cpumask_t cpus_with_pcps;
1300 1300
1301 /* 1301 /*
1302 * We don't care about racing with CPU hotplug event 1302 * We don't care about racing with CPU hotplug event
1303 * as offline notification will cause the notified 1303 * as offline notification will cause the notified
1304 * cpu to drain that CPU pcps and on_each_cpu_mask 1304 * cpu to drain that CPU pcps and on_each_cpu_mask
1305 * disables preemption as part of its processing 1305 * disables preemption as part of its processing
1306 */ 1306 */
1307 for_each_online_cpu(cpu) { 1307 for_each_online_cpu(cpu) {
1308 bool has_pcps = false; 1308 bool has_pcps = false;
1309 for_each_populated_zone(zone) { 1309 for_each_populated_zone(zone) {
1310 pcp = per_cpu_ptr(zone->pageset, cpu); 1310 pcp = per_cpu_ptr(zone->pageset, cpu);
1311 if (pcp->pcp.count) { 1311 if (pcp->pcp.count) {
1312 has_pcps = true; 1312 has_pcps = true;
1313 break; 1313 break;
1314 } 1314 }
1315 } 1315 }
1316 if (has_pcps) 1316 if (has_pcps)
1317 cpumask_set_cpu(cpu, &cpus_with_pcps); 1317 cpumask_set_cpu(cpu, &cpus_with_pcps);
1318 else 1318 else
1319 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1319 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1320 } 1320 }
1321 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1321 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1322 } 1322 }
1323 1323
1324 #ifdef CONFIG_HIBERNATION 1324 #ifdef CONFIG_HIBERNATION
1325 1325
1326 void mark_free_pages(struct zone *zone) 1326 void mark_free_pages(struct zone *zone)
1327 { 1327 {
1328 unsigned long pfn, max_zone_pfn; 1328 unsigned long pfn, max_zone_pfn;
1329 unsigned long flags; 1329 unsigned long flags;
1330 int order, t; 1330 int order, t;
1331 struct list_head *curr; 1331 struct list_head *curr;
1332 1332
1333 if (zone_is_empty(zone)) 1333 if (zone_is_empty(zone))
1334 return; 1334 return;
1335 1335
1336 spin_lock_irqsave(&zone->lock, flags); 1336 spin_lock_irqsave(&zone->lock, flags);
1337 1337
1338 max_zone_pfn = zone_end_pfn(zone); 1338 max_zone_pfn = zone_end_pfn(zone);
1339 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1339 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1340 if (pfn_valid(pfn)) { 1340 if (pfn_valid(pfn)) {
1341 struct page *page = pfn_to_page(pfn); 1341 struct page *page = pfn_to_page(pfn);
1342 1342
1343 if (!swsusp_page_is_forbidden(page)) 1343 if (!swsusp_page_is_forbidden(page))
1344 swsusp_unset_page_free(page); 1344 swsusp_unset_page_free(page);
1345 } 1345 }
1346 1346
1347 for_each_migratetype_order(order, t) { 1347 for_each_migratetype_order(order, t) {
1348 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1348 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1349 unsigned long i; 1349 unsigned long i;
1350 1350
1351 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1351 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1352 for (i = 0; i < (1UL << order); i++) 1352 for (i = 0; i < (1UL << order); i++)
1353 swsusp_set_page_free(pfn_to_page(pfn + i)); 1353 swsusp_set_page_free(pfn_to_page(pfn + i));
1354 } 1354 }
1355 } 1355 }
1356 spin_unlock_irqrestore(&zone->lock, flags); 1356 spin_unlock_irqrestore(&zone->lock, flags);
1357 } 1357 }
1358 #endif /* CONFIG_HIBERNATION */ 1358 #endif /* CONFIG_HIBERNATION */
1359 1359
1360 /* 1360 /*
1361 * Free a 0-order page 1361 * Free a 0-order page
1362 * cold == 1 ? free a cold page : free a hot page 1362 * cold == 1 ? free a cold page : free a hot page
1363 */ 1363 */
1364 void free_hot_cold_page(struct page *page, int cold) 1364 void free_hot_cold_page(struct page *page, int cold)
1365 { 1365 {
1366 struct zone *zone = page_zone(page); 1366 struct zone *zone = page_zone(page);
1367 struct per_cpu_pages *pcp; 1367 struct per_cpu_pages *pcp;
1368 unsigned long flags; 1368 unsigned long flags;
1369 int migratetype; 1369 int migratetype;
1370 1370
1371 if (!free_pages_prepare(page, 0)) 1371 if (!free_pages_prepare(page, 0))
1372 return; 1372 return;
1373 1373
1374 migratetype = get_pageblock_migratetype(page); 1374 migratetype = get_pageblock_migratetype(page);
1375 set_freepage_migratetype(page, migratetype); 1375 set_freepage_migratetype(page, migratetype);
1376 local_irq_save(flags); 1376 local_irq_save(flags);
1377 __count_vm_event(PGFREE); 1377 __count_vm_event(PGFREE);
1378 1378
1379 /* 1379 /*
1380 * We only track unmovable, reclaimable and movable on pcp lists. 1380 * We only track unmovable, reclaimable and movable on pcp lists.
1381 * Free ISOLATE pages back to the allocator because they are being 1381 * Free ISOLATE pages back to the allocator because they are being
1382 * offlined, but treat RESERVE as movable pages so we can get those 1382 * offlined, but treat RESERVE as movable pages so we can get those
1383 * areas back if necessary. Otherwise, we may have to free 1383 * areas back if necessary. Otherwise, we may have to free
1384 * excessively into the page allocator. 1384 * excessively into the page allocator.
1385 */ 1385 */
1386 if (migratetype >= MIGRATE_PCPTYPES) { 1386 if (migratetype >= MIGRATE_PCPTYPES) {
1387 if (unlikely(is_migrate_isolate(migratetype))) { 1387 if (unlikely(is_migrate_isolate(migratetype))) {
1388 free_one_page(zone, page, 0, migratetype); 1388 free_one_page(zone, page, 0, migratetype);
1389 goto out; 1389 goto out;
1390 } 1390 }
1391 migratetype = MIGRATE_MOVABLE; 1391 migratetype = MIGRATE_MOVABLE;
1392 } 1392 }
1393 1393
1394 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1394 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1395 if (cold) 1395 if (cold)
1396 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1396 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1397 else 1397 else
1398 list_add(&page->lru, &pcp->lists[migratetype]); 1398 list_add(&page->lru, &pcp->lists[migratetype]);
1399 pcp->count++; 1399 pcp->count++;
1400 if (pcp->count >= pcp->high) { 1400 if (pcp->count >= pcp->high) {
1401 unsigned long batch = ACCESS_ONCE(pcp->batch); 1401 unsigned long batch = ACCESS_ONCE(pcp->batch);
1402 free_pcppages_bulk(zone, batch, pcp); 1402 free_pcppages_bulk(zone, batch, pcp);
1403 pcp->count -= batch; 1403 pcp->count -= batch;
1404 } 1404 }
1405 1405
1406 out: 1406 out:
1407 local_irq_restore(flags); 1407 local_irq_restore(flags);
1408 } 1408 }
1409 1409
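/*
 * Illustrative sketch (not kernel code): the per-cpu list trimming at the end
 * of free_hot_cold_page(). Once the cached count reaches the high watermark,
 * one batch is handed back to the buddy allocator so a hot working set stays
 * cached. The high/batch values below are arbitrary examples.
 */
#include <stdio.h>

int main(void)
{
        int high = 186, batch = 31;     /* example pcp->high / pcp->batch */
        int count = 185;

        count++;                        /* a page was just freed to the pcp list */
        if (count >= high) {
                count -= batch;         /* bulk-free one batch to the buddy lists */
                printf("trimmed pcp list back to %d pages\n", count);
        }
        return 0;
}
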
1410 /* 1410 /*
1411 * Free a list of 0-order pages 1411 * Free a list of 0-order pages
1412 */ 1412 */
1413 void free_hot_cold_page_list(struct list_head *list, int cold) 1413 void free_hot_cold_page_list(struct list_head *list, int cold)
1414 { 1414 {
1415 struct page *page, *next; 1415 struct page *page, *next;
1416 1416
1417 list_for_each_entry_safe(page, next, list, lru) { 1417 list_for_each_entry_safe(page, next, list, lru) {
1418 trace_mm_page_free_batched(page, cold); 1418 trace_mm_page_free_batched(page, cold);
1419 free_hot_cold_page(page, cold); 1419 free_hot_cold_page(page, cold);
1420 } 1420 }
1421 } 1421 }
1422 1422
1423 /* 1423 /*
1424 * split_page takes a non-compound higher-order page, and splits it into 1424 * split_page takes a non-compound higher-order page, and splits it into
1425 * n (1<<order) sub-pages: page[0..n-1] 1425 * n (1<<order) sub-pages: page[0..n-1]
1426 * Each sub-page must be freed individually. 1426 * Each sub-page must be freed individually.
1427 * 1427 *
1428 * Note: this is probably too low level an operation for use in drivers. 1428 * Note: this is probably too low level an operation for use in drivers.
1429 * Please consult with lkml before using this in your driver. 1429 * Please consult with lkml before using this in your driver.
1430 */ 1430 */
1431 void split_page(struct page *page, unsigned int order) 1431 void split_page(struct page *page, unsigned int order)
1432 { 1432 {
1433 int i; 1433 int i;
1434 1434
1435 VM_BUG_ON(PageCompound(page)); 1435 VM_BUG_ON(PageCompound(page));
1436 VM_BUG_ON(!page_count(page)); 1436 VM_BUG_ON(!page_count(page));
1437 1437
1438 #ifdef CONFIG_KMEMCHECK 1438 #ifdef CONFIG_KMEMCHECK
1439 /* 1439 /*
1440 * Split shadow pages too, because free(page[0]) would 1440 * Split shadow pages too, because free(page[0]) would
1441 * otherwise free the whole shadow. 1441 * otherwise free the whole shadow.
1442 */ 1442 */
1443 if (kmemcheck_page_is_tracked(page)) 1443 if (kmemcheck_page_is_tracked(page))
1444 split_page(virt_to_page(page[0].shadow), order); 1444 split_page(virt_to_page(page[0].shadow), order);
1445 #endif 1445 #endif
1446 1446
1447 for (i = 1; i < (1 << order); i++) 1447 for (i = 1; i < (1 << order); i++)
1448 set_page_refcounted(page + i); 1448 set_page_refcounted(page + i);
1449 } 1449 }
1450 EXPORT_SYMBOL_GPL(split_page); 1450 EXPORT_SYMBOL_GPL(split_page);
1451 1451
1452 static int __isolate_free_page(struct page *page, unsigned int order) 1452 static int __isolate_free_page(struct page *page, unsigned int order)
1453 { 1453 {
1454 unsigned long watermark; 1454 unsigned long watermark;
1455 struct zone *zone; 1455 struct zone *zone;
1456 int mt; 1456 int mt;
1457 1457
1458 BUG_ON(!PageBuddy(page)); 1458 BUG_ON(!PageBuddy(page));
1459 1459
1460 zone = page_zone(page); 1460 zone = page_zone(page);
1461 mt = get_pageblock_migratetype(page); 1461 mt = get_pageblock_migratetype(page);
1462 1462
1463 if (!is_migrate_isolate(mt)) { 1463 if (!is_migrate_isolate(mt)) {
1464 /* Obey watermarks as if the page was being allocated */ 1464 /* Obey watermarks as if the page was being allocated */
1465 watermark = low_wmark_pages(zone) + (1 << order); 1465 watermark = low_wmark_pages(zone) + (1 << order);
1466 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1466 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1467 return 0; 1467 return 0;
1468 1468
1469 __mod_zone_freepage_state(zone, -(1UL << order), mt); 1469 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1470 } 1470 }
1471 1471
1472 /* Remove page from free list */ 1472 /* Remove page from free list */
1473 list_del(&page->lru); 1473 list_del(&page->lru);
1474 zone->free_area[order].nr_free--; 1474 zone->free_area[order].nr_free--;
1475 rmv_page_order(page); 1475 rmv_page_order(page);
1476 1476
1477 /* Set the pageblock migratetype if the isolated page is at least a pageblock */ 1477 /* Set the pageblock migratetype if the isolated page is at least a pageblock */
1478 if (order >= pageblock_order - 1) { 1478 if (order >= pageblock_order - 1) {
1479 struct page *endpage = page + (1 << order) - 1; 1479 struct page *endpage = page + (1 << order) - 1;
1480 for (; page < endpage; page += pageblock_nr_pages) { 1480 for (; page < endpage; page += pageblock_nr_pages) {
1481 int mt = get_pageblock_migratetype(page); 1481 int mt = get_pageblock_migratetype(page);
1482 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) 1482 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
1483 set_pageblock_migratetype(page, 1483 set_pageblock_migratetype(page,
1484 MIGRATE_MOVABLE); 1484 MIGRATE_MOVABLE);
1485 } 1485 }
1486 } 1486 }
1487 1487
1488 return 1UL << order; 1488 return 1UL << order;
1489 } 1489 }
1490 1490
1491 /* 1491 /*
1492 * Similar to split_page except the page is already free. As this is only 1492 * Similar to split_page except the page is already free. As this is only
1493 * being used for migration, the migratetype of the block also changes. 1493 * being used for migration, the migratetype of the block also changes.
1494 * As this is called with interrupts disabled, the caller is responsible 1494 * As this is called with interrupts disabled, the caller is responsible
1495 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1495 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1496 * are enabled. 1496 * are enabled.
1497 * 1497 *
1498 * Note: this is probably too low level an operation for use in drivers. 1498 * Note: this is probably too low level an operation for use in drivers.
1499 * Please consult with lkml before using this in your driver. 1499 * Please consult with lkml before using this in your driver.
1500 */ 1500 */
1501 int split_free_page(struct page *page) 1501 int split_free_page(struct page *page)
1502 { 1502 {
1503 unsigned int order; 1503 unsigned int order;
1504 int nr_pages; 1504 int nr_pages;
1505 1505
1506 order = page_order(page); 1506 order = page_order(page);
1507 1507
1508 nr_pages = __isolate_free_page(page, order); 1508 nr_pages = __isolate_free_page(page, order);
1509 if (!nr_pages) 1509 if (!nr_pages)
1510 return 0; 1510 return 0;
1511 1511
1512 /* Split into individual pages */ 1512 /* Split into individual pages */
1513 set_page_refcounted(page); 1513 set_page_refcounted(page);
1514 split_page(page, order); 1514 split_page(page, order);
1515 return nr_pages; 1515 return nr_pages;
1516 } 1516 }
1517 1517
1518 /* 1518 /*
1519 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1519 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1520 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1520 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1521 * or two. 1521 * or two.
1522 */ 1522 */
1523 static inline 1523 static inline
1524 struct page *buffered_rmqueue(struct zone *preferred_zone, 1524 struct page *buffered_rmqueue(struct zone *preferred_zone,
1525 struct zone *zone, int order, gfp_t gfp_flags, 1525 struct zone *zone, int order, gfp_t gfp_flags,
1526 int migratetype) 1526 int migratetype)
1527 { 1527 {
1528 unsigned long flags; 1528 unsigned long flags;
1529 struct page *page; 1529 struct page *page;
1530 int cold = !!(gfp_flags & __GFP_COLD); 1530 int cold = !!(gfp_flags & __GFP_COLD);
1531 1531
1532 again: 1532 again:
1533 if (likely(order == 0)) { 1533 if (likely(order == 0)) {
1534 struct per_cpu_pages *pcp; 1534 struct per_cpu_pages *pcp;
1535 struct list_head *list; 1535 struct list_head *list;
1536 1536
1537 local_irq_save(flags); 1537 local_irq_save(flags);
1538 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1538 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1539 list = &pcp->lists[migratetype]; 1539 list = &pcp->lists[migratetype];
1540 if (list_empty(list)) { 1540 if (list_empty(list)) {
1541 pcp->count += rmqueue_bulk(zone, 0, 1541 pcp->count += rmqueue_bulk(zone, 0,
1542 pcp->batch, list, 1542 pcp->batch, list,
1543 migratetype, cold); 1543 migratetype, cold);
1544 if (unlikely(list_empty(list))) 1544 if (unlikely(list_empty(list)))
1545 goto failed; 1545 goto failed;
1546 } 1546 }
1547 1547
1548 if (cold) 1548 if (cold)
1549 page = list_entry(list->prev, struct page, lru); 1549 page = list_entry(list->prev, struct page, lru);
1550 else 1550 else
1551 page = list_entry(list->next, struct page, lru); 1551 page = list_entry(list->next, struct page, lru);
1552 1552
1553 list_del(&page->lru); 1553 list_del(&page->lru);
1554 pcp->count--; 1554 pcp->count--;
1555 } else { 1555 } else {
1556 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1556 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1557 /* 1557 /*
1558 * __GFP_NOFAIL is not to be used in new code. 1558 * __GFP_NOFAIL is not to be used in new code.
1559 * 1559 *
1560 * All __GFP_NOFAIL callers should be fixed so that they 1560 * All __GFP_NOFAIL callers should be fixed so that they
1561 * properly detect and handle allocation failures. 1561 * properly detect and handle allocation failures.
1562 * 1562 *
1563 * We most definitely don't want callers attempting to 1563 * We most definitely don't want callers attempting to
1564 * allocate greater than order-1 page units with 1564 * allocate greater than order-1 page units with
1565 * __GFP_NOFAIL. 1565 * __GFP_NOFAIL.
1566 */ 1566 */
1567 WARN_ON_ONCE(order > 1); 1567 WARN_ON_ONCE(order > 1);
1568 } 1568 }
1569 spin_lock_irqsave(&zone->lock, flags); 1569 spin_lock_irqsave(&zone->lock, flags);
1570 page = __rmqueue(zone, order, migratetype); 1570 page = __rmqueue(zone, order, migratetype);
1571 spin_unlock(&zone->lock); 1571 spin_unlock(&zone->lock);
1572 if (!page) 1572 if (!page)
1573 goto failed; 1573 goto failed;
1574 __mod_zone_freepage_state(zone, -(1 << order), 1574 __mod_zone_freepage_state(zone, -(1 << order),
1575 get_freepage_migratetype(page)); 1575 get_freepage_migratetype(page));
1576 } 1576 }
1577 1577
1578 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1578 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1579 1579
1580 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1580 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1581 zone_statistics(preferred_zone, zone, gfp_flags); 1581 zone_statistics(preferred_zone, zone, gfp_flags);
1582 local_irq_restore(flags); 1582 local_irq_restore(flags);
1583 1583
1584 VM_BUG_ON(bad_range(zone, page)); 1584 VM_BUG_ON(bad_range(zone, page));
1585 if (prep_new_page(page, order, gfp_flags)) 1585 if (prep_new_page(page, order, gfp_flags))
1586 goto again; 1586 goto again;
1587 return page; 1587 return page;
1588 1588
1589 failed: 1589 failed:
1590 local_irq_restore(flags); 1590 local_irq_restore(flags);
1591 return NULL; 1591 return NULL;
1592 } 1592 }
1593 1593
1594 #ifdef CONFIG_FAIL_PAGE_ALLOC 1594 #ifdef CONFIG_FAIL_PAGE_ALLOC
1595 1595
1596 static struct { 1596 static struct {
1597 struct fault_attr attr; 1597 struct fault_attr attr;
1598 1598
1599 u32 ignore_gfp_highmem; 1599 u32 ignore_gfp_highmem;
1600 u32 ignore_gfp_wait; 1600 u32 ignore_gfp_wait;
1601 u32 min_order; 1601 u32 min_order;
1602 } fail_page_alloc = { 1602 } fail_page_alloc = {
1603 .attr = FAULT_ATTR_INITIALIZER, 1603 .attr = FAULT_ATTR_INITIALIZER,
1604 .ignore_gfp_wait = 1, 1604 .ignore_gfp_wait = 1,
1605 .ignore_gfp_highmem = 1, 1605 .ignore_gfp_highmem = 1,
1606 .min_order = 1, 1606 .min_order = 1,
1607 }; 1607 };
1608 1608
1609 static int __init setup_fail_page_alloc(char *str) 1609 static int __init setup_fail_page_alloc(char *str)
1610 { 1610 {
1611 return setup_fault_attr(&fail_page_alloc.attr, str); 1611 return setup_fault_attr(&fail_page_alloc.attr, str);
1612 } 1612 }
1613 __setup("fail_page_alloc=", setup_fail_page_alloc); 1613 __setup("fail_page_alloc=", setup_fail_page_alloc);
1614 1614
1615 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1615 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1616 { 1616 {
1617 if (order < fail_page_alloc.min_order) 1617 if (order < fail_page_alloc.min_order)
1618 return false; 1618 return false;
1619 if (gfp_mask & __GFP_NOFAIL) 1619 if (gfp_mask & __GFP_NOFAIL)
1620 return false; 1620 return false;
1621 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1621 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1622 return false; 1622 return false;
1623 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1623 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1624 return false; 1624 return false;
1625 1625
1626 return should_fail(&fail_page_alloc.attr, 1 << order); 1626 return should_fail(&fail_page_alloc.attr, 1 << order);
1627 } 1627 }
1628 1628
1629 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1629 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1630 1630
1631 static int __init fail_page_alloc_debugfs(void) 1631 static int __init fail_page_alloc_debugfs(void)
1632 { 1632 {
1633 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1633 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1634 struct dentry *dir; 1634 struct dentry *dir;
1635 1635
1636 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1636 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1637 &fail_page_alloc.attr); 1637 &fail_page_alloc.attr);
1638 if (IS_ERR(dir)) 1638 if (IS_ERR(dir))
1639 return PTR_ERR(dir); 1639 return PTR_ERR(dir);
1640 1640
1641 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1641 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1642 &fail_page_alloc.ignore_gfp_wait)) 1642 &fail_page_alloc.ignore_gfp_wait))
1643 goto fail; 1643 goto fail;
1644 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1644 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1645 &fail_page_alloc.ignore_gfp_highmem)) 1645 &fail_page_alloc.ignore_gfp_highmem))
1646 goto fail; 1646 goto fail;
1647 if (!debugfs_create_u32("min-order", mode, dir, 1647 if (!debugfs_create_u32("min-order", mode, dir,
1648 &fail_page_alloc.min_order)) 1648 &fail_page_alloc.min_order))
1649 goto fail; 1649 goto fail;
1650 1650
1651 return 0; 1651 return 0;
1652 fail: 1652 fail:
1653 debugfs_remove_recursive(dir); 1653 debugfs_remove_recursive(dir);
1654 1654
1655 return -ENOMEM; 1655 return -ENOMEM;
1656 } 1656 }
1657 1657
1658 late_initcall(fail_page_alloc_debugfs); 1658 late_initcall(fail_page_alloc_debugfs);
1659 1659
1660 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1660 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1661 1661
1662 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1662 #else /* CONFIG_FAIL_PAGE_ALLOC */
1663 1663
1664 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1664 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1665 { 1665 {
1666 return false; 1666 return false;
1667 } 1667 }
1668 1668
1669 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1669 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1670 1670
1671 /* 1671 /*
1672 * Return true if free pages are above 'mark'. This takes into account the order 1672 * Return true if free pages are above 'mark'. This takes into account the order
1673 * of the allocation. 1673 * of the allocation.
1674 */ 1674 */
1675 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1675 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1676 int classzone_idx, int alloc_flags, long free_pages) 1676 int classzone_idx, int alloc_flags, long free_pages)
1677 { 1677 {
1678 /* free_pages may go negative - that's OK */ 1678 /* free_pages may go negative - that's OK */
1679 long min = mark; 1679 long min = mark;
1680 long lowmem_reserve = z->lowmem_reserve[classzone_idx]; 1680 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1681 int o; 1681 int o;
1682 long free_cma = 0; 1682 long free_cma = 0;
1683 1683
1684 free_pages -= (1 << order) - 1; 1684 free_pages -= (1 << order) - 1;
1685 if (alloc_flags & ALLOC_HIGH) 1685 if (alloc_flags & ALLOC_HIGH)
1686 min -= min / 2; 1686 min -= min / 2;
1687 if (alloc_flags & ALLOC_HARDER) 1687 if (alloc_flags & ALLOC_HARDER)
1688 min -= min / 4; 1688 min -= min / 4;
1689 #ifdef CONFIG_CMA 1689 #ifdef CONFIG_CMA
1690 /* If allocation can't use CMA areas don't use free CMA pages */ 1690 /* If allocation can't use CMA areas don't use free CMA pages */
1691 if (!(alloc_flags & ALLOC_CMA)) 1691 if (!(alloc_flags & ALLOC_CMA))
1692 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); 1692 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
1693 #endif 1693 #endif
1694 1694
1695 if (free_pages - free_cma <= min + lowmem_reserve) 1695 if (free_pages - free_cma <= min + lowmem_reserve)
1696 return false; 1696 return false;
1697 for (o = 0; o < order; o++) { 1697 for (o = 0; o < order; o++) {
1698 /* At the next order, this order's pages become unavailable */ 1698 /* At the next order, this order's pages become unavailable */
1699 free_pages -= z->free_area[o].nr_free << o; 1699 free_pages -= z->free_area[o].nr_free << o;
1700 1700
1701 /* Require fewer higher order pages to be free */ 1701 /* Require fewer higher order pages to be free */
1702 min >>= 1; 1702 min >>= 1;
1703 1703
1704 if (free_pages <= min) 1704 if (free_pages <= min)
1705 return false; 1705 return false;
1706 } 1706 }
1707 return true; 1707 return true;
1708 } 1708 }
1709 1709
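/*
 * Illustrative sketch (not kernel code): the order-by-order walk in
 * __zone_watermark_ok() above, with made-up free_area counts. At each order
 * below the request, that order's free pages are discounted and the required
 * minimum is halved. The lowmem_reserve and CMA adjustments are omitted.
 */
#include <stdio.h>

int main(void)
{
        int order = 3;
        long min = 128;                         /* example watermark */
        long free_pages = 900;
        long nr_free[3] = { 200, 60, 25 };      /* free blocks at orders 0..2 */

        free_pages -= (1 << order) - 1;         /* be pessimistic about the request */
        for (int o = 0; o < order; o++) {
                free_pages -= nr_free[o] << o;  /* order-o pages can't satisfy us */
                min >>= 1;                      /* but demand less slack higher up */
                if (free_pages <= min) {
                        printf("order %d fails at o=%d (free %ld <= min %ld)\n",
                               order, o, free_pages, min);
                        return 0;
                }
        }
        printf("order %d allocation is above the watermark (free %ld)\n",
               order, free_pages);
        return 0;
}
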
1710 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1710 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1711 int classzone_idx, int alloc_flags) 1711 int classzone_idx, int alloc_flags)
1712 { 1712 {
1713 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1713 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1714 zone_page_state(z, NR_FREE_PAGES)); 1714 zone_page_state(z, NR_FREE_PAGES));
1715 } 1715 }
1716 1716
1717 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 1717 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1718 int classzone_idx, int alloc_flags) 1718 int classzone_idx, int alloc_flags)
1719 { 1719 {
1720 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1720 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1721 1721
1722 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1722 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1723 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1723 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1724 1724
1725 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1725 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1726 free_pages); 1726 free_pages);
1727 } 1727 }
1728 1728
1729 #ifdef CONFIG_NUMA 1729 #ifdef CONFIG_NUMA
1730 /* 1730 /*
1731 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1731 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1732 * skip over zones that are not allowed by the cpuset, or that have 1732 * skip over zones that are not allowed by the cpuset, or that have
1733 * been recently (in last second) found to be nearly full. See further 1733 * been recently (in last second) found to be nearly full. See further
1734 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1734 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1735 * that have to skip over a lot of full or unallowed zones. 1735 * that have to skip over a lot of full or unallowed zones.
1736 * 1736 *
1737 * If the zonelist cache is present in the passed in zonelist, then 1737 * If the zonelist cache is present in the passed in zonelist, then
1738 * returns a pointer to the allowed node mask (either the current 1738 * returns a pointer to the allowed node mask (either the current
1739 * task's mems_allowed, or node_states[N_MEMORY].) 1739 * task's mems_allowed, or node_states[N_MEMORY].)
1740 * 1740 *
1741 * If the zonelist cache is not available for this zonelist, does 1741 * If the zonelist cache is not available for this zonelist, does
1742 * nothing and returns NULL. 1742 * nothing and returns NULL.
1743 * 1743 *
1744 * If the fullzones BITMAP in the zonelist cache is stale (more than 1744 * If the fullzones BITMAP in the zonelist cache is stale (more than
1745 * a second since last zap'd) then we zap it out (clear its bits.) 1745 * a second since last zap'd) then we zap it out (clear its bits.)
1746 * 1746 *
1747 * We hold off even calling zlc_setup, until after we've checked the 1747 * We hold off even calling zlc_setup, until after we've checked the
1748 * first zone in the zonelist, on the theory that most allocations will 1748 * first zone in the zonelist, on the theory that most allocations will
1749 * be satisfied from that first zone, so best to examine that zone as 1749 * be satisfied from that first zone, so best to examine that zone as
1750 * quickly as we can. 1750 * quickly as we can.
1751 */ 1751 */
1752 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1752 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1753 { 1753 {
1754 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1754 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1755 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1755 nodemask_t *allowednodes; /* zonelist_cache approximation */
1756 1756
1757 zlc = zonelist->zlcache_ptr; 1757 zlc = zonelist->zlcache_ptr;
1758 if (!zlc) 1758 if (!zlc)
1759 return NULL; 1759 return NULL;
1760 1760
1761 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1761 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1762 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1762 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1763 zlc->last_full_zap = jiffies; 1763 zlc->last_full_zap = jiffies;
1764 } 1764 }
1765 1765
1766 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1766 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1767 &cpuset_current_mems_allowed : 1767 &cpuset_current_mems_allowed :
1768 &node_states[N_MEMORY]; 1768 &node_states[N_MEMORY];
1769 return allowednodes; 1769 return allowednodes;
1770 } 1770 }
1771 1771
1772 /* 1772 /*
1773 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1773 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1774 * if it is worth looking at further for free memory: 1774 * if it is worth looking at further for free memory:
1775 * 1) Check that the zone isn't thought to be full (doesn't have its 1775 * 1) Check that the zone isn't thought to be full (doesn't have its
1776 * bit set in the zonelist_cache fullzones BITMAP). 1776 * bit set in the zonelist_cache fullzones BITMAP).
1777 * 2) Check that the zone's node (obtained from the zonelist_cache 1777 * 2) Check that the zone's node (obtained from the zonelist_cache
1778 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1778 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1779 * Return true (non-zero) if zone is worth looking at further, or 1779 * Return true (non-zero) if zone is worth looking at further, or
1780 * else return false (zero) if it is not. 1780 * else return false (zero) if it is not.
1781 * 1781 *
1782 * This check -ignores- the distinction between various watermarks, 1782 * This check -ignores- the distinction between various watermarks,
1783 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1783 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1784 * found to be full for any variation of these watermarks, it will 1784 * found to be full for any variation of these watermarks, it will
1785 * be considered full for up to one second by all requests, unless 1785 * be considered full for up to one second by all requests, unless
1786 * we are so low on memory on all allowed nodes that we are forced 1786 * we are so low on memory on all allowed nodes that we are forced
1787 * into the second scan of the zonelist. 1787 * into the second scan of the zonelist.
1788 * 1788 *
1789 * In the second scan we ignore this zonelist cache and exactly 1789 * In the second scan we ignore this zonelist cache and exactly
1790 * apply the watermarks to all zones, even if it is slower to do so. 1790 * apply the watermarks to all zones, even if it is slower to do so.
1791 * We are low on memory in the second scan, and should leave no stone 1791 * We are low on memory in the second scan, and should leave no stone
1792 * unturned looking for a free page. 1792 * unturned looking for a free page.
1793 */ 1793 */
1794 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1794 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1795 nodemask_t *allowednodes) 1795 nodemask_t *allowednodes)
1796 { 1796 {
1797 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1797 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1798 int i; /* index of *z in zonelist zones */ 1798 int i; /* index of *z in zonelist zones */
1799 int n; /* node that zone *z is on */ 1799 int n; /* node that zone *z is on */
1800 1800
1801 zlc = zonelist->zlcache_ptr; 1801 zlc = zonelist->zlcache_ptr;
1802 if (!zlc) 1802 if (!zlc)
1803 return 1; 1803 return 1;
1804 1804
1805 i = z - zonelist->_zonerefs; 1805 i = z - zonelist->_zonerefs;
1806 n = zlc->z_to_n[i]; 1806 n = zlc->z_to_n[i];
1807 1807
1808 /* This zone is worth trying if it is allowed but not full */ 1808 /* This zone is worth trying if it is allowed but not full */
1809 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1809 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1810 } 1810 }
1811 1811
1812 /* 1812 /*
1813 * Given 'z' scanning a zonelist, set the corresponding bit in 1813 * Given 'z' scanning a zonelist, set the corresponding bit in
1814 * zlc->fullzones, so that subsequent attempts to allocate a page 1814 * zlc->fullzones, so that subsequent attempts to allocate a page
1815 * from that zone don't waste time re-examining it. 1815 * from that zone don't waste time re-examining it.
1816 */ 1816 */
1817 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1817 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1818 { 1818 {
1819 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1819 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1820 int i; /* index of *z in zonelist zones */ 1820 int i; /* index of *z in zonelist zones */
1821 1821
1822 zlc = zonelist->zlcache_ptr; 1822 zlc = zonelist->zlcache_ptr;
1823 if (!zlc) 1823 if (!zlc)
1824 return; 1824 return;
1825 1825
1826 i = z - zonelist->_zonerefs; 1826 i = z - zonelist->_zonerefs;
1827 1827
1828 set_bit(i, zlc->fullzones); 1828 set_bit(i, zlc->fullzones);
1829 } 1829 }
1830 1830
1831 /* 1831 /*
1832 * clear all zones full, called after direct reclaim makes progress so that 1832 * clear all zones full, called after direct reclaim makes progress so that
1833 * a zone that was recently full is not skipped over for up to a second 1833 * a zone that was recently full is not skipped over for up to a second
1834 */ 1834 */
1835 static void zlc_clear_zones_full(struct zonelist *zonelist) 1835 static void zlc_clear_zones_full(struct zonelist *zonelist)
1836 { 1836 {
1837 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1837 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1838 1838
1839 zlc = zonelist->zlcache_ptr; 1839 zlc = zonelist->zlcache_ptr;
1840 if (!zlc) 1840 if (!zlc)
1841 return; 1841 return;
1842 1842
1843 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1843 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1844 } 1844 }
1845 1845
1846 static bool zone_local(struct zone *local_zone, struct zone *zone) 1846 static bool zone_local(struct zone *local_zone, struct zone *zone)
1847 { 1847 {
1848 return local_zone->node == zone->node; 1848 return local_zone->node == zone->node;
1849 } 1849 }
1850 1850
1851 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1851 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1852 { 1852 {
1853 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1853 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1854 } 1854 }
1855 1855
1856 static void __paginginit init_zone_allows_reclaim(int nid) 1856 static void __paginginit init_zone_allows_reclaim(int nid)
1857 { 1857 {
1858 int i; 1858 int i;
1859 1859
1860 for_each_node_state(i, N_MEMORY) 1860 for_each_node_state(i, N_MEMORY)
1861 if (node_distance(nid, i) <= RECLAIM_DISTANCE) 1861 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1862 node_set(i, NODE_DATA(nid)->reclaim_nodes); 1862 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1863 else 1863 else
1864 zone_reclaim_mode = 1; 1864 zone_reclaim_mode = 1;
1865 } 1865 }
1866 1866
1867 #else /* CONFIG_NUMA */ 1867 #else /* CONFIG_NUMA */
1868 1868
1869 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1869 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1870 { 1870 {
1871 return NULL; 1871 return NULL;
1872 } 1872 }
1873 1873
1874 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1874 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1875 nodemask_t *allowednodes) 1875 nodemask_t *allowednodes)
1876 { 1876 {
1877 return 1; 1877 return 1;
1878 } 1878 }
1879 1879
1880 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1880 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1881 { 1881 {
1882 } 1882 }
1883 1883
1884 static void zlc_clear_zones_full(struct zonelist *zonelist) 1884 static void zlc_clear_zones_full(struct zonelist *zonelist)
1885 { 1885 {
1886 } 1886 }
1887 1887
1888 static bool zone_local(struct zone *local_zone, struct zone *zone) 1888 static bool zone_local(struct zone *local_zone, struct zone *zone)
1889 { 1889 {
1890 return true; 1890 return true;
1891 } 1891 }
1892 1892
1893 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1893 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1894 { 1894 {
1895 return true; 1895 return true;
1896 } 1896 }
1897 1897
1898 static inline void init_zone_allows_reclaim(int nid) 1898 static inline void init_zone_allows_reclaim(int nid)
1899 { 1899 {
1900 } 1900 }
1901 #endif /* CONFIG_NUMA */ 1901 #endif /* CONFIG_NUMA */
1902 1902
1903 /* 1903 /*
1904 * get_page_from_freelist goes through the zonelist trying to allocate 1904 * get_page_from_freelist goes through the zonelist trying to allocate
1905 * a page. 1905 * a page.
1906 */ 1906 */
1907 static struct page * 1907 static struct page *
1908 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1908 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1909 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1909 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1910 struct zone *preferred_zone, int classzone_idx, int migratetype) 1910 struct zone *preferred_zone, int classzone_idx, int migratetype)
1911 { 1911 {
1912 struct zoneref *z; 1912 struct zoneref *z;
1913 struct page *page = NULL; 1913 struct page *page = NULL;
1914 struct zone *zone; 1914 struct zone *zone;
1915 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1915 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1916 int zlc_active = 0; /* set if using zonelist_cache */ 1916 int zlc_active = 0; /* set if using zonelist_cache */
1917 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1917 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1918 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && 1918 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
1919 (gfp_mask & __GFP_WRITE); 1919 (gfp_mask & __GFP_WRITE);
1920 1920
1921 zonelist_scan: 1921 zonelist_scan:
1922 /* 1922 /*
1923 * Scan zonelist, looking for a zone with enough free. 1923 * Scan zonelist, looking for a zone with enough free.
1924 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 1924 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
1925 */ 1925 */
1926 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1926 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1927 high_zoneidx, nodemask) { 1927 high_zoneidx, nodemask) {
1928 unsigned long mark; 1928 unsigned long mark;
1929 1929
1930 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1930 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1931 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1931 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1932 continue; 1932 continue;
1933 if (cpusets_enabled() && 1933 if (cpusets_enabled() &&
1934 (alloc_flags & ALLOC_CPUSET) && 1934 (alloc_flags & ALLOC_CPUSET) &&
1935 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1935 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1936 continue; 1936 continue;
1937 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1938 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
1939 goto try_this_zone;
1940 /* 1937 /*
1941 * Distribute pages in proportion to the individual 1938 * Distribute pages in proportion to the individual
1942 * zone size to ensure fair page aging. The zone a 1939 * zone size to ensure fair page aging. The zone a
1943 * page was allocated in should have no effect on the 1940 * page was allocated in should have no effect on the
1944 * time the page has in memory before being reclaimed. 1941 * time the page has in memory before being reclaimed.
1945 */ 1942 */
1946 if (alloc_flags & ALLOC_FAIR) { 1943 if (alloc_flags & ALLOC_FAIR) {
1947 if (!zone_local(preferred_zone, zone)) 1944 if (!zone_local(preferred_zone, zone))
1948 continue; 1945 continue;
1949 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) 1946 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1950 continue; 1947 continue;
1951 } 1948 }
1952 /* 1949 /*
1953 * When allocating a page cache page for writing, we 1950 * When allocating a page cache page for writing, we
1954 * want to get it from a zone that is within its dirty 1951 * want to get it from a zone that is within its dirty
1955 * limit, such that no single zone holds more than its 1952 * limit, such that no single zone holds more than its
1956 * proportional share of globally allowed dirty pages. 1953 * proportional share of globally allowed dirty pages.
1957 * The dirty limits take into account the zone's 1954 * The dirty limits take into account the zone's
1958 * lowmem reserves and high watermark so that kswapd 1955 * lowmem reserves and high watermark so that kswapd
1959 * should be able to balance it without having to 1956 * should be able to balance it without having to
1960 * write pages from its LRU list. 1957 * write pages from its LRU list.
1961 * 1958 *
1962 * This may look like it could increase pressure on 1959 * This may look like it could increase pressure on
1963 * lower zones by failing allocations in higher zones 1960 * lower zones by failing allocations in higher zones
1964 * before they are full. But the pages that do spill 1961 * before they are full. But the pages that do spill
1965 * over are limited as the lower zones are protected 1962 * over are limited as the lower zones are protected
1966 * by this very same mechanism. It should not become 1963 * by this very same mechanism. It should not become
1967 * a practical burden to them. 1964 * a practical burden to them.
1968 * 1965 *
1969 * XXX: For now, allow allocations to potentially 1966 * XXX: For now, allow allocations to potentially
1970 * exceed the per-zone dirty limit in the slowpath 1967 * exceed the per-zone dirty limit in the slowpath
1971 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1968 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1972 * which is important when on a NUMA setup the allowed 1969 * which is important when on a NUMA setup the allowed
1973 * zones are together not big enough to reach the 1970 * zones are together not big enough to reach the
1974 * global limit. The proper fix for these situations 1971 * global limit. The proper fix for these situations
1975 * will require awareness of zones in the 1972 * will require awareness of zones in the
1976 * dirty-throttling and the flusher threads. 1973 * dirty-throttling and the flusher threads.
1977 */ 1974 */
1978 if (consider_zone_dirty && !zone_dirty_ok(zone)) 1975 if (consider_zone_dirty && !zone_dirty_ok(zone))
1979 continue; 1976 continue;
1980 1977
1981 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1978 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1982 if (!zone_watermark_ok(zone, order, mark, 1979 if (!zone_watermark_ok(zone, order, mark,
1983 classzone_idx, alloc_flags)) { 1980 classzone_idx, alloc_flags)) {
1984 int ret; 1981 int ret;
1982
1983 /* Checked here to keep the fast path fast */
1984 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1985 if (alloc_flags & ALLOC_NO_WATERMARKS)
1986 goto try_this_zone;
1985 1987
1986 if (IS_ENABLED(CONFIG_NUMA) && 1988 if (IS_ENABLED(CONFIG_NUMA) &&
1987 !did_zlc_setup && nr_online_nodes > 1) { 1989 !did_zlc_setup && nr_online_nodes > 1) {
1988 /* 1990 /*
1989 * we do zlc_setup if there are multiple nodes 1991 * we do zlc_setup if there are multiple nodes
1990 * and before considering the first zone allowed 1992 * and before considering the first zone allowed
1991 * by the cpuset. 1993 * by the cpuset.
1992 */ 1994 */
1993 allowednodes = zlc_setup(zonelist, alloc_flags); 1995 allowednodes = zlc_setup(zonelist, alloc_flags);
1994 zlc_active = 1; 1996 zlc_active = 1;
1995 did_zlc_setup = 1; 1997 did_zlc_setup = 1;
1996 } 1998 }
1997 1999
1998 if (zone_reclaim_mode == 0 || 2000 if (zone_reclaim_mode == 0 ||
1999 !zone_allows_reclaim(preferred_zone, zone)) 2001 !zone_allows_reclaim(preferred_zone, zone))
2000 goto this_zone_full; 2002 goto this_zone_full;
2001 2003
2002 /* 2004 /*
2003 * As we may have just activated ZLC, check if the first 2005 * As we may have just activated ZLC, check if the first
2004 * eligible zone has failed zone_reclaim recently. 2006 * eligible zone has failed zone_reclaim recently.
2005 */ 2007 */
2006 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 2008 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
2007 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 2009 !zlc_zone_worth_trying(zonelist, z, allowednodes))
2008 continue; 2010 continue;
2009 2011
2010 ret = zone_reclaim(zone, gfp_mask, order); 2012 ret = zone_reclaim(zone, gfp_mask, order);
2011 switch (ret) { 2013 switch (ret) {
2012 case ZONE_RECLAIM_NOSCAN: 2014 case ZONE_RECLAIM_NOSCAN:
2013 /* did not scan */ 2015 /* did not scan */
2014 continue; 2016 continue;
2015 case ZONE_RECLAIM_FULL: 2017 case ZONE_RECLAIM_FULL:
2016 /* scanned but unreclaimable */ 2018 /* scanned but unreclaimable */
2017 continue; 2019 continue;
2018 default: 2020 default:
2019 /* did we reclaim enough */ 2021 /* did we reclaim enough */
2020 if (zone_watermark_ok(zone, order, mark, 2022 if (zone_watermark_ok(zone, order, mark,
2021 classzone_idx, alloc_flags)) 2023 classzone_idx, alloc_flags))
2022 goto try_this_zone; 2024 goto try_this_zone;
2023 2025
2024 /* 2026 /*
2025 * Failed to reclaim enough to meet watermark. 2027 * Failed to reclaim enough to meet watermark.
2026 * Only mark the zone full if checking the min 2028 * Only mark the zone full if checking the min
2027 * watermark or if we failed to reclaim just 2029 * watermark or if we failed to reclaim just
2028 * 1<<order pages or else the page allocator 2030 * 1<<order pages or else the page allocator
2029 * fastpath will prematurely mark zones full 2031 * fastpath will prematurely mark zones full
2030 * when the watermark is between the low and 2032 * when the watermark is between the low and
2031 * min watermarks. 2033 * min watermarks.
2032 */ 2034 */
2033 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || 2035 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
2034 ret == ZONE_RECLAIM_SOME) 2036 ret == ZONE_RECLAIM_SOME)
2035 goto this_zone_full; 2037 goto this_zone_full;
2036 2038
2037 continue; 2039 continue;
2038 } 2040 }
2039 } 2041 }
2040 2042
2041 try_this_zone: 2043 try_this_zone:
2042 page = buffered_rmqueue(preferred_zone, zone, order, 2044 page = buffered_rmqueue(preferred_zone, zone, order,
2043 gfp_mask, migratetype); 2045 gfp_mask, migratetype);
2044 if (page) 2046 if (page)
2045 break; 2047 break;
2046 this_zone_full: 2048 this_zone_full:
2047 if (IS_ENABLED(CONFIG_NUMA) && zlc_active) 2049 if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
2048 zlc_mark_zone_full(zonelist, z); 2050 zlc_mark_zone_full(zonelist, z);
2049 } 2051 }
2050 2052
2051 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { 2053 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
2052 /* Disable zlc cache for second zonelist scan */ 2054 /* Disable zlc cache for second zonelist scan */
2053 zlc_active = 0; 2055 zlc_active = 0;
2054 goto zonelist_scan; 2056 goto zonelist_scan;
2055 } 2057 }
2056 2058
2057 if (page) 2059 if (page)
2058 /* 2060 /*
2059 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was 2061 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2060 * necessary to allocate the page. The expectation is 2062 * necessary to allocate the page. The expectation is
2061 * that the caller is taking steps that will free more 2063 * that the caller is taking steps that will free more
2062 * memory. The caller should avoid the page being used 2064 * memory. The caller should avoid the page being used
2063 * for !PFMEMALLOC purposes. 2065 * for !PFMEMALLOC purposes.
2064 */ 2066 */
2065 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 2067 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2066 2068
2067 return page; 2069 return page;
2068 } 2070 }
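The hunk above moves the ALLOC_NO_WATERMARKS test inside the branch taken only when zone_watermark_ok() fails. A minimal standalone sketch of that branch ordering, with purely illustrative names rather than kernel APIs, might look like this:

#include <stdbool.h>

#define SKETCH_NO_WATERMARKS (1 << 0)   /* hypothetical "ignore watermarks" flag */

static bool sketch_watermark_ok(unsigned long free_pages, unsigned long mark)
{
        return free_pages > mark;
}

static bool sketch_zone_usable(unsigned long free_pages, unsigned long mark,
                               int alloc_flags)
{
        /* Fast path: the rare flag is never consulted here. */
        if (sketch_watermark_ok(free_pages, mark))
                return true;

        /* Slow path only: callers permitted to dip below the watermark. */
        if (alloc_flags & SKETCH_NO_WATERMARKS)
                return true;

        return false;
}

Keeping the rare-flag test on the failure side means the common successful allocation pays no extra branch for it.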
2069 2071
2070 /* 2072 /*
2071 * Large machines with many possible nodes should not always dump per-node 2073 * Large machines with many possible nodes should not always dump per-node
2072 * meminfo in irq context. 2074 * meminfo in irq context.
2073 */ 2075 */
2074 static inline bool should_suppress_show_mem(void) 2076 static inline bool should_suppress_show_mem(void)
2075 { 2077 {
2076 bool ret = false; 2078 bool ret = false;
2077 2079
2078 #if NODES_SHIFT > 8 2080 #if NODES_SHIFT > 8
2079 ret = in_interrupt(); 2081 ret = in_interrupt();
2080 #endif 2082 #endif
2081 return ret; 2083 return ret;
2082 } 2084 }
2083 2085
2084 static DEFINE_RATELIMIT_STATE(nopage_rs, 2086 static DEFINE_RATELIMIT_STATE(nopage_rs,
2085 DEFAULT_RATELIMIT_INTERVAL, 2087 DEFAULT_RATELIMIT_INTERVAL,
2086 DEFAULT_RATELIMIT_BURST); 2088 DEFAULT_RATELIMIT_BURST);
2087 2089
2088 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 2090 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2089 { 2091 {
2090 unsigned int filter = SHOW_MEM_FILTER_NODES; 2092 unsigned int filter = SHOW_MEM_FILTER_NODES;
2091 2093
2092 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 2094 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
2093 debug_guardpage_minorder() > 0) 2095 debug_guardpage_minorder() > 0)
2094 return; 2096 return;
2095 2097
2096 /* 2098 /*
2097 * Walking all memory to count page types is very expensive and should 2099 * Walking all memory to count page types is very expensive and should
2098 * be inhibited in non-blockable contexts. 2100 * be inhibited in non-blockable contexts.
2099 */ 2101 */
2100 if (!(gfp_mask & __GFP_WAIT)) 2102 if (!(gfp_mask & __GFP_WAIT))
2101 filter |= SHOW_MEM_FILTER_PAGE_COUNT; 2103 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2102 2104
2103 /* 2105 /*
2104 * This documents exceptions given to allocations in certain 2106 * This documents exceptions given to allocations in certain
2105 * contexts that are allowed to allocate outside current's set 2107 * contexts that are allowed to allocate outside current's set
2106 * of allowed nodes. 2108 * of allowed nodes.
2107 */ 2109 */
2108 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2110 if (!(gfp_mask & __GFP_NOMEMALLOC))
2109 if (test_thread_flag(TIF_MEMDIE) || 2111 if (test_thread_flag(TIF_MEMDIE) ||
2110 (current->flags & (PF_MEMALLOC | PF_EXITING))) 2112 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2111 filter &= ~SHOW_MEM_FILTER_NODES; 2113 filter &= ~SHOW_MEM_FILTER_NODES;
2112 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 2114 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
2113 filter &= ~SHOW_MEM_FILTER_NODES; 2115 filter &= ~SHOW_MEM_FILTER_NODES;
2114 2116
2115 if (fmt) { 2117 if (fmt) {
2116 struct va_format vaf; 2118 struct va_format vaf;
2117 va_list args; 2119 va_list args;
2118 2120
2119 va_start(args, fmt); 2121 va_start(args, fmt);
2120 2122
2121 vaf.fmt = fmt; 2123 vaf.fmt = fmt;
2122 vaf.va = &args; 2124 vaf.va = &args;
2123 2125
2124 pr_warn("%pV", &vaf); 2126 pr_warn("%pV", &vaf);
2125 2127
2126 va_end(args); 2128 va_end(args);
2127 } 2129 }
2128 2130
2129 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 2131 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
2130 current->comm, order, gfp_mask); 2132 current->comm, order, gfp_mask);
2131 2133
2132 dump_stack(); 2134 dump_stack();
2133 if (!should_suppress_show_mem()) 2135 if (!should_suppress_show_mem())
2134 show_mem(filter); 2136 show_mem(filter);
2135 } 2137 }
2136 2138
2137 static inline int 2139 static inline int
2138 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 2140 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2139 unsigned long did_some_progress, 2141 unsigned long did_some_progress,
2140 unsigned long pages_reclaimed) 2142 unsigned long pages_reclaimed)
2141 { 2143 {
2142 /* Do not loop if specifically requested */ 2144 /* Do not loop if specifically requested */
2143 if (gfp_mask & __GFP_NORETRY) 2145 if (gfp_mask & __GFP_NORETRY)
2144 return 0; 2146 return 0;
2145 2147
2146 /* Always retry if specifically requested */ 2148 /* Always retry if specifically requested */
2147 if (gfp_mask & __GFP_NOFAIL) 2149 if (gfp_mask & __GFP_NOFAIL)
2148 return 1; 2150 return 1;
2149 2151
2150 /* 2152 /*
2151 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 2153 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2152 * making forward progress without invoking OOM. Suspend also disables 2154 * making forward progress without invoking OOM. Suspend also disables
2153 * storage devices so kswapd will not help. Bail if we are suspending. 2155 * storage devices so kswapd will not help. Bail if we are suspending.
2154 */ 2156 */
2155 if (!did_some_progress && pm_suspended_storage()) 2157 if (!did_some_progress && pm_suspended_storage())
2156 return 0; 2158 return 0;
2157 2159
2158 /* 2160 /*
2159 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 2161 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2160 * means __GFP_NOFAIL, but that may not be true in other 2162 * means __GFP_NOFAIL, but that may not be true in other
2161 * implementations. 2163 * implementations.
2162 */ 2164 */
2163 if (order <= PAGE_ALLOC_COSTLY_ORDER) 2165 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2164 return 1; 2166 return 1;
2165 2167
2166 /* 2168 /*
2167 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 2169 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2168 * specified, then we retry until we no longer reclaim any pages 2170 * specified, then we retry until we no longer reclaim any pages
2169 * (above), or we've reclaimed an order of pages at least as 2171 * (above), or we've reclaimed an order of pages at least as
2170 * large as the allocation's order. In both cases, if the 2172 * large as the allocation's order. In both cases, if the
2171 * allocation still fails, we stop retrying. 2173 * allocation still fails, we stop retrying.
2172 */ 2174 */
2173 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2175 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2174 return 1; 2176 return 1;
2175 2177
2176 return 0; 2178 return 0;
2177 } 2179 }
2178 2180
2179 static inline struct page * 2181 static inline struct page *
2180 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2182 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2181 struct zonelist *zonelist, enum zone_type high_zoneidx, 2183 struct zonelist *zonelist, enum zone_type high_zoneidx,
2182 nodemask_t *nodemask, struct zone *preferred_zone, 2184 nodemask_t *nodemask, struct zone *preferred_zone,
2183 int classzone_idx, int migratetype) 2185 int classzone_idx, int migratetype)
2184 { 2186 {
2185 struct page *page; 2187 struct page *page;
2186 2188
2187 /* Acquire the OOM killer lock for the zones in zonelist */ 2189 /* Acquire the OOM killer lock for the zones in zonelist */
2188 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2190 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2189 schedule_timeout_uninterruptible(1); 2191 schedule_timeout_uninterruptible(1);
2190 return NULL; 2192 return NULL;
2191 } 2193 }
2192 2194
2193 /* 2195 /*
2194 * Go through the zonelist yet one more time, keep very high watermark 2196 * Go through the zonelist yet one more time, keep very high watermark
2195 * here, this is only to catch a parallel oom killing, we must fail if 2197 * here, this is only to catch a parallel oom killing, we must fail if
2196 * we're still under heavy pressure. 2198 * we're still under heavy pressure.
2197 */ 2199 */
2198 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2200 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2199 order, zonelist, high_zoneidx, 2201 order, zonelist, high_zoneidx,
2200 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2202 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2201 preferred_zone, classzone_idx, migratetype); 2203 preferred_zone, classzone_idx, migratetype);
2202 if (page) 2204 if (page)
2203 goto out; 2205 goto out;
2204 2206
2205 if (!(gfp_mask & __GFP_NOFAIL)) { 2207 if (!(gfp_mask & __GFP_NOFAIL)) {
2206 /* The OOM killer will not help higher order allocs */ 2208 /* The OOM killer will not help higher order allocs */
2207 if (order > PAGE_ALLOC_COSTLY_ORDER) 2209 if (order > PAGE_ALLOC_COSTLY_ORDER)
2208 goto out; 2210 goto out;
2209 /* The OOM killer does not needlessly kill tasks for lowmem */ 2211 /* The OOM killer does not needlessly kill tasks for lowmem */
2210 if (high_zoneidx < ZONE_NORMAL) 2212 if (high_zoneidx < ZONE_NORMAL)
2211 goto out; 2213 goto out;
2212 /* 2214 /*
2213 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2215 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2214 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2216 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2215 * The caller should handle page allocation failure by itself if 2217 * The caller should handle page allocation failure by itself if
2216 * it specifies __GFP_THISNODE. 2218 * it specifies __GFP_THISNODE.
2217 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2219 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2218 */ 2220 */
2219 if (gfp_mask & __GFP_THISNODE) 2221 if (gfp_mask & __GFP_THISNODE)
2220 goto out; 2222 goto out;
2221 } 2223 }
2222 /* Exhausted what can be done so it's blamo time */ 2224 /* Exhausted what can be done so it's blamo time */
2223 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2225 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2224 2226
2225 out: 2227 out:
2226 clear_zonelist_oom(zonelist, gfp_mask); 2228 clear_zonelist_oom(zonelist, gfp_mask);
2227 return page; 2229 return page;
2228 } 2230 }
2229 2231
2230 #ifdef CONFIG_COMPACTION 2232 #ifdef CONFIG_COMPACTION
2231 /* Try memory compaction for high-order allocations before reclaim */ 2233 /* Try memory compaction for high-order allocations before reclaim */
2232 static struct page * 2234 static struct page *
2233 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2235 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2234 struct zonelist *zonelist, enum zone_type high_zoneidx, 2236 struct zonelist *zonelist, enum zone_type high_zoneidx,
2235 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2237 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2236 int classzone_idx, int migratetype, enum migrate_mode mode, 2238 int classzone_idx, int migratetype, enum migrate_mode mode,
2237 bool *contended_compaction, bool *deferred_compaction, 2239 bool *contended_compaction, bool *deferred_compaction,
2238 unsigned long *did_some_progress) 2240 unsigned long *did_some_progress)
2239 { 2241 {
2240 if (!order) 2242 if (!order)
2241 return NULL; 2243 return NULL;
2242 2244
2243 if (compaction_deferred(preferred_zone, order)) { 2245 if (compaction_deferred(preferred_zone, order)) {
2244 *deferred_compaction = true; 2246 *deferred_compaction = true;
2245 return NULL; 2247 return NULL;
2246 } 2248 }
2247 2249
2248 current->flags |= PF_MEMALLOC; 2250 current->flags |= PF_MEMALLOC;
2249 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2251 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2250 nodemask, mode, 2252 nodemask, mode,
2251 contended_compaction); 2253 contended_compaction);
2252 current->flags &= ~PF_MEMALLOC; 2254 current->flags &= ~PF_MEMALLOC;
2253 2255
2254 if (*did_some_progress != COMPACT_SKIPPED) { 2256 if (*did_some_progress != COMPACT_SKIPPED) {
2255 struct page *page; 2257 struct page *page;
2256 2258
2257 /* Page migration frees to the PCP lists but we want merging */ 2259 /* Page migration frees to the PCP lists but we want merging */
2258 drain_pages(get_cpu()); 2260 drain_pages(get_cpu());
2259 put_cpu(); 2261 put_cpu();
2260 2262
2261 page = get_page_from_freelist(gfp_mask, nodemask, 2263 page = get_page_from_freelist(gfp_mask, nodemask,
2262 order, zonelist, high_zoneidx, 2264 order, zonelist, high_zoneidx,
2263 alloc_flags & ~ALLOC_NO_WATERMARKS, 2265 alloc_flags & ~ALLOC_NO_WATERMARKS,
2264 preferred_zone, classzone_idx, migratetype); 2266 preferred_zone, classzone_idx, migratetype);
2265 if (page) { 2267 if (page) {
2266 preferred_zone->compact_blockskip_flush = false; 2268 preferred_zone->compact_blockskip_flush = false;
2267 compaction_defer_reset(preferred_zone, order, true); 2269 compaction_defer_reset(preferred_zone, order, true);
2268 count_vm_event(COMPACTSUCCESS); 2270 count_vm_event(COMPACTSUCCESS);
2269 return page; 2271 return page;
2270 } 2272 }
2271 2273
2272 /* 2274 /*
2273 * It's bad if a compaction run occurs and fails. 2275 * It's bad if a compaction run occurs and fails.
2274 * The most likely reason is that pages exist, 2276 * The most likely reason is that pages exist,
2275 * but not enough to satisfy watermarks. 2277 * but not enough to satisfy watermarks.
2276 */ 2278 */
2277 count_vm_event(COMPACTFAIL); 2279 count_vm_event(COMPACTFAIL);
2278 2280
2279 /* 2281 /*
2280 * As async compaction considers a subset of pageblocks, only 2282 * As async compaction considers a subset of pageblocks, only
2281 * defer if the failure was a sync compaction failure. 2283 * defer if the failure was a sync compaction failure.
2282 */ 2284 */
2283 if (mode != MIGRATE_ASYNC) 2285 if (mode != MIGRATE_ASYNC)
2284 defer_compaction(preferred_zone, order); 2286 defer_compaction(preferred_zone, order);
2285 2287
2286 cond_resched(); 2288 cond_resched();
2287 } 2289 }
2288 2290
2289 return NULL; 2291 return NULL;
2290 } 2292 }
2291 #else 2293 #else
2292 static inline struct page * 2294 static inline struct page *
2293 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2295 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2294 struct zonelist *zonelist, enum zone_type high_zoneidx, 2296 struct zonelist *zonelist, enum zone_type high_zoneidx,
2295 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2297 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2296 int classzone_idx, int migratetype, 2298 int classzone_idx, int migratetype,
2297 enum migrate_mode mode, bool *contended_compaction, 2299 enum migrate_mode mode, bool *contended_compaction,
2298 bool *deferred_compaction, unsigned long *did_some_progress) 2300 bool *deferred_compaction, unsigned long *did_some_progress)
2299 { 2301 {
2300 return NULL; 2302 return NULL;
2301 } 2303 }
2302 #endif /* CONFIG_COMPACTION */ 2304 #endif /* CONFIG_COMPACTION */
2303 2305
2304 /* Perform direct synchronous page reclaim */ 2306 /* Perform direct synchronous page reclaim */
2305 static int 2307 static int
2306 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2308 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2307 nodemask_t *nodemask) 2309 nodemask_t *nodemask)
2308 { 2310 {
2309 struct reclaim_state reclaim_state; 2311 struct reclaim_state reclaim_state;
2310 int progress; 2312 int progress;
2311 2313
2312 cond_resched(); 2314 cond_resched();
2313 2315
2314 /* We now go into synchronous reclaim */ 2316 /* We now go into synchronous reclaim */
2315 cpuset_memory_pressure_bump(); 2317 cpuset_memory_pressure_bump();
2316 current->flags |= PF_MEMALLOC; 2318 current->flags |= PF_MEMALLOC;
2317 lockdep_set_current_reclaim_state(gfp_mask); 2319 lockdep_set_current_reclaim_state(gfp_mask);
2318 reclaim_state.reclaimed_slab = 0; 2320 reclaim_state.reclaimed_slab = 0;
2319 current->reclaim_state = &reclaim_state; 2321 current->reclaim_state = &reclaim_state;
2320 2322
2321 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2323 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2322 2324
2323 current->reclaim_state = NULL; 2325 current->reclaim_state = NULL;
2324 lockdep_clear_current_reclaim_state(); 2326 lockdep_clear_current_reclaim_state();
2325 current->flags &= ~PF_MEMALLOC; 2327 current->flags &= ~PF_MEMALLOC;
2326 2328
2327 cond_resched(); 2329 cond_resched();
2328 2330
2329 return progress; 2331 return progress;
2330 } 2332 }
2331 2333
2332 /* The really slow allocator path where we enter direct reclaim */ 2334 /* The really slow allocator path where we enter direct reclaim */
2333 static inline struct page * 2335 static inline struct page *
2334 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2336 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2335 struct zonelist *zonelist, enum zone_type high_zoneidx, 2337 struct zonelist *zonelist, enum zone_type high_zoneidx,
2336 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2338 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2337 int classzone_idx, int migratetype, unsigned long *did_some_progress) 2339 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2338 { 2340 {
2339 struct page *page = NULL; 2341 struct page *page = NULL;
2340 bool drained = false; 2342 bool drained = false;
2341 2343
2342 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2344 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2343 nodemask); 2345 nodemask);
2344 if (unlikely(!(*did_some_progress))) 2346 if (unlikely(!(*did_some_progress)))
2345 return NULL; 2347 return NULL;
2346 2348
2347 /* After successful reclaim, reconsider all zones for allocation */ 2349 /* After successful reclaim, reconsider all zones for allocation */
2348 if (IS_ENABLED(CONFIG_NUMA)) 2350 if (IS_ENABLED(CONFIG_NUMA))
2349 zlc_clear_zones_full(zonelist); 2351 zlc_clear_zones_full(zonelist);
2350 2352
2351 retry: 2353 retry:
2352 page = get_page_from_freelist(gfp_mask, nodemask, order, 2354 page = get_page_from_freelist(gfp_mask, nodemask, order,
2353 zonelist, high_zoneidx, 2355 zonelist, high_zoneidx,
2354 alloc_flags & ~ALLOC_NO_WATERMARKS, 2356 alloc_flags & ~ALLOC_NO_WATERMARKS,
2355 preferred_zone, classzone_idx, 2357 preferred_zone, classzone_idx,
2356 migratetype); 2358 migratetype);
2357 2359
2358 /* 2360 /*
2359 * If an allocation failed after direct reclaim, it could be because 2361 * If an allocation failed after direct reclaim, it could be because
2360 * pages are pinned on the per-cpu lists. Drain them and try again 2362 * pages are pinned on the per-cpu lists. Drain them and try again
2361 */ 2363 */
2362 if (!page && !drained) { 2364 if (!page && !drained) {
2363 drain_all_pages(); 2365 drain_all_pages();
2364 drained = true; 2366 drained = true;
2365 goto retry; 2367 goto retry;
2366 } 2368 }
2367 2369
2368 return page; 2370 return page;
2369 } 2371 }
2370 2372
2371 /* 2373 /*
2372 * This is called in the allocator slow-path if the allocation request is of 2374 * This is called in the allocator slow-path if the allocation request is of
2373 * sufficient urgency to ignore watermarks and take other desperate measures 2375 * sufficient urgency to ignore watermarks and take other desperate measures
2374 */ 2376 */
2375 static inline struct page * 2377 static inline struct page *
2376 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2378 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2377 struct zonelist *zonelist, enum zone_type high_zoneidx, 2379 struct zonelist *zonelist, enum zone_type high_zoneidx,
2378 nodemask_t *nodemask, struct zone *preferred_zone, 2380 nodemask_t *nodemask, struct zone *preferred_zone,
2379 int classzone_idx, int migratetype) 2381 int classzone_idx, int migratetype)
2380 { 2382 {
2381 struct page *page; 2383 struct page *page;
2382 2384
2383 do { 2385 do {
2384 page = get_page_from_freelist(gfp_mask, nodemask, order, 2386 page = get_page_from_freelist(gfp_mask, nodemask, order,
2385 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2387 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2386 preferred_zone, classzone_idx, migratetype); 2388 preferred_zone, classzone_idx, migratetype);
2387 2389
2388 if (!page && gfp_mask & __GFP_NOFAIL) 2390 if (!page && gfp_mask & __GFP_NOFAIL)
2389 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2391 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2390 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2392 } while (!page && (gfp_mask & __GFP_NOFAIL));
2391 2393
2392 return page; 2394 return page;
2393 } 2395 }
2394 2396
2395 static void reset_alloc_batches(struct zonelist *zonelist, 2397 static void reset_alloc_batches(struct zonelist *zonelist,
2396 enum zone_type high_zoneidx, 2398 enum zone_type high_zoneidx,
2397 struct zone *preferred_zone) 2399 struct zone *preferred_zone)
2398 { 2400 {
2399 struct zoneref *z; 2401 struct zoneref *z;
2400 struct zone *zone; 2402 struct zone *zone;
2401 2403
2402 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 2404 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2403 /* 2405 /*
2404 * Only reset the batches of zones that were actually 2406 * Only reset the batches of zones that were actually
2405 * considered in the fairness pass, we don't want to 2407 * considered in the fairness pass, we don't want to
2406 * trash fairness information for zones that are not 2408 * trash fairness information for zones that are not
2407 * actually part of this zonelist's round-robin cycle. 2409 * actually part of this zonelist's round-robin cycle.
2408 */ 2410 */
2409 if (!zone_local(preferred_zone, zone)) 2411 if (!zone_local(preferred_zone, zone))
2410 continue; 2412 continue;
2411 mod_zone_page_state(zone, NR_ALLOC_BATCH, 2413 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2412 high_wmark_pages(zone) - low_wmark_pages(zone) - 2414 high_wmark_pages(zone) - low_wmark_pages(zone) -
2413 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); 2415 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
2414 } 2416 }
2415 } 2417 }
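The mod_zone_page_state() call above adds (high_wmark - low_wmark - current_batch) to the per-zone counter, which lands the batch at exactly high_wmark - low_wmark no matter how far it had been driven negative. A small sketch with made-up numbers, just to show the arithmetic:

#include <stdio.h>

int main(void)
{
        long high_wmark = 400, low_wmark = 300;
        long batch = -20;                       /* fairness batch already exhausted */

        /* Same adjustment as reset_alloc_batches() applies per zone. */
        batch += high_wmark - low_wmark - batch;

        printf("batch reset to %ld\n", batch);  /* prints 100 */
        return 0;
}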
2416 2418
2417 static void wake_all_kswapds(unsigned int order, 2419 static void wake_all_kswapds(unsigned int order,
2418 struct zonelist *zonelist, 2420 struct zonelist *zonelist,
2419 enum zone_type high_zoneidx, 2421 enum zone_type high_zoneidx,
2420 struct zone *preferred_zone) 2422 struct zone *preferred_zone)
2421 { 2423 {
2422 struct zoneref *z; 2424 struct zoneref *z;
2423 struct zone *zone; 2425 struct zone *zone;
2424 2426
2425 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2427 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2426 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2428 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2427 } 2429 }
2428 2430
2429 static inline int 2431 static inline int
2430 gfp_to_alloc_flags(gfp_t gfp_mask) 2432 gfp_to_alloc_flags(gfp_t gfp_mask)
2431 { 2433 {
2432 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2434 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2433 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); 2435 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
2434 2436
2435 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2437 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2436 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2438 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2437 2439
2438 /* 2440 /*
2439 * The caller may dip into page reserves a bit more if the caller 2441 * The caller may dip into page reserves a bit more if the caller
2440 * cannot run direct reclaim, or if the caller has realtime scheduling 2442 * cannot run direct reclaim, or if the caller has realtime scheduling
2441 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2443 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2442 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). 2444 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
2443 */ 2445 */
2444 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2446 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2445 2447
2446 if (atomic) { 2448 if (atomic) {
2447 /* 2449 /*
2448 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 2450 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
2449 * if it can't schedule. 2451 * if it can't schedule.
2450 */ 2452 */
2451 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2453 if (!(gfp_mask & __GFP_NOMEMALLOC))
2452 alloc_flags |= ALLOC_HARDER; 2454 alloc_flags |= ALLOC_HARDER;
2453 /* 2455 /*
2454 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 2456 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
2455 * comment for __cpuset_node_allowed_softwall(). 2457 * comment for __cpuset_node_allowed_softwall().
2456 */ 2458 */
2457 alloc_flags &= ~ALLOC_CPUSET; 2459 alloc_flags &= ~ALLOC_CPUSET;
2458 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2460 } else if (unlikely(rt_task(current)) && !in_interrupt())
2459 alloc_flags |= ALLOC_HARDER; 2461 alloc_flags |= ALLOC_HARDER;
2460 2462
2461 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2463 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2462 if (gfp_mask & __GFP_MEMALLOC) 2464 if (gfp_mask & __GFP_MEMALLOC)
2463 alloc_flags |= ALLOC_NO_WATERMARKS; 2465 alloc_flags |= ALLOC_NO_WATERMARKS;
2464 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 2466 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2465 alloc_flags |= ALLOC_NO_WATERMARKS; 2467 alloc_flags |= ALLOC_NO_WATERMARKS;
2466 else if (!in_interrupt() && 2468 else if (!in_interrupt() &&
2467 ((current->flags & PF_MEMALLOC) || 2469 ((current->flags & PF_MEMALLOC) ||
2468 unlikely(test_thread_flag(TIF_MEMDIE)))) 2470 unlikely(test_thread_flag(TIF_MEMDIE))))
2469 alloc_flags |= ALLOC_NO_WATERMARKS; 2471 alloc_flags |= ALLOC_NO_WATERMARKS;
2470 } 2472 }
2471 #ifdef CONFIG_CMA 2473 #ifdef CONFIG_CMA
2472 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2474 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2473 alloc_flags |= ALLOC_CMA; 2475 alloc_flags |= ALLOC_CMA;
2474 #endif 2476 #endif
2475 return alloc_flags; 2477 return alloc_flags;
2476 } 2478 }
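The BUILD_BUG_ON() near the top of gfp_to_alloc_flags() documents that __GFP_HIGH and ALLOC_HIGH share a bit value, so the flag can be copied with a mask rather than a conditional. A compile-time sketch of that trick, using illustrative flag names instead of the real kernel definitions:

#include <assert.h>

#define GFPISH_HIGH     0x20    /* stand-in for a request flag */
#define ALLOCISH_HIGH   0x20    /* stand-in for an internal allocator flag */

static int to_alloc_flags(int gfp)
{
        /* Fails the build if the two namespaces ever drift apart. */
        _Static_assert(GFPISH_HIGH == ALLOCISH_HIGH, "flag bits must match");

        return gfp & GFPISH_HIGH;       /* branch-free transfer of the bit */
}

int main(void)
{
        assert(to_alloc_flags(GFPISH_HIGH) == ALLOCISH_HIGH);
        return 0;
}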
2477 2479
2478 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 2480 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2479 { 2481 {
2480 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2482 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2481 } 2483 }
2482 2484
2483 static inline struct page * 2485 static inline struct page *
2484 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2486 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2485 struct zonelist *zonelist, enum zone_type high_zoneidx, 2487 struct zonelist *zonelist, enum zone_type high_zoneidx,
2486 nodemask_t *nodemask, struct zone *preferred_zone, 2488 nodemask_t *nodemask, struct zone *preferred_zone,
2487 int classzone_idx, int migratetype) 2489 int classzone_idx, int migratetype)
2488 { 2490 {
2489 const gfp_t wait = gfp_mask & __GFP_WAIT; 2491 const gfp_t wait = gfp_mask & __GFP_WAIT;
2490 struct page *page = NULL; 2492 struct page *page = NULL;
2491 int alloc_flags; 2493 int alloc_flags;
2492 unsigned long pages_reclaimed = 0; 2494 unsigned long pages_reclaimed = 0;
2493 unsigned long did_some_progress; 2495 unsigned long did_some_progress;
2494 enum migrate_mode migration_mode = MIGRATE_ASYNC; 2496 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2495 bool deferred_compaction = false; 2497 bool deferred_compaction = false;
2496 bool contended_compaction = false; 2498 bool contended_compaction = false;
2497 2499
2498 /* 2500 /*
2499 * In the slowpath, we sanity check order to avoid ever trying to 2501 * In the slowpath, we sanity check order to avoid ever trying to
2500 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2502 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2501 * be using allocators in order of preference for an area that is 2503 * be using allocators in order of preference for an area that is
2502 * too large. 2504 * too large.
2503 */ 2505 */
2504 if (order >= MAX_ORDER) { 2506 if (order >= MAX_ORDER) {
2505 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2507 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2506 return NULL; 2508 return NULL;
2507 } 2509 }
2508 2510
2509 /* 2511 /*
2510 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2512 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2511 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2513 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2512 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2514 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
2513 * using a larger set of nodes after it has established that the 2515 * using a larger set of nodes after it has established that the
2514 * allowed per node queues are empty and that nodes are 2516 * allowed per node queues are empty and that nodes are
2515 * over allocated. 2517 * over allocated.
2516 */ 2518 */
2517 if (IS_ENABLED(CONFIG_NUMA) && 2519 if (IS_ENABLED(CONFIG_NUMA) &&
2518 (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2520 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2519 goto nopage; 2521 goto nopage;
2520 2522
2521 restart: 2523 restart:
2522 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2524 if (!(gfp_mask & __GFP_NO_KSWAPD))
2523 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); 2525 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
2524 2526
2525 /* 2527 /*
2526 * OK, we're below the kswapd watermark and have kicked background 2528 * OK, we're below the kswapd watermark and have kicked background
2527 * reclaim. Now things get more complex, so set up alloc_flags according 2529 * reclaim. Now things get more complex, so set up alloc_flags according
2528 * to how we want to proceed. 2530 * to how we want to proceed.
2529 */ 2531 */
2530 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2532 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2531 2533
2532 /* 2534 /*
2533 * Find the true preferred zone if the allocation is unconstrained by 2535 * Find the true preferred zone if the allocation is unconstrained by
2534 * cpusets. 2536 * cpusets.
2535 */ 2537 */
2536 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { 2538 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
2537 struct zoneref *preferred_zoneref; 2539 struct zoneref *preferred_zoneref;
2538 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2540 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2539 NULL, 2541 NULL,
2540 &preferred_zone); 2542 &preferred_zone);
2541 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2543 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2542 } 2544 }
2543 2545
2544 rebalance: 2546 rebalance:
2545 /* This is the last chance, in general, before the goto nopage. */ 2547 /* This is the last chance, in general, before the goto nopage. */
2546 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2548 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2547 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2549 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2548 preferred_zone, classzone_idx, migratetype); 2550 preferred_zone, classzone_idx, migratetype);
2549 if (page) 2551 if (page)
2550 goto got_pg; 2552 goto got_pg;
2551 2553
2552 /* Allocate without watermarks if the context allows */ 2554 /* Allocate without watermarks if the context allows */
2553 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2555 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2554 /* 2556 /*
2555 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds 2557 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2556 * the allocation is high priority and these types of 2558 * the allocation is high priority and these types of
2557 * allocations are system rather than user oriented 2559 * allocations are system rather than user oriented
2558 */ 2560 */
2559 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2561 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2560 2562
2561 page = __alloc_pages_high_priority(gfp_mask, order, 2563 page = __alloc_pages_high_priority(gfp_mask, order,
2562 zonelist, high_zoneidx, nodemask, 2564 zonelist, high_zoneidx, nodemask,
2563 preferred_zone, classzone_idx, migratetype); 2565 preferred_zone, classzone_idx, migratetype);
2564 if (page) { 2566 if (page) {
2565 goto got_pg; 2567 goto got_pg;
2566 } 2568 }
2567 } 2569 }
2568 2570
2569 /* Atomic allocations - we can't balance anything */ 2571 /* Atomic allocations - we can't balance anything */
2570 if (!wait) 2572 if (!wait)
2571 goto nopage; 2573 goto nopage;
2572 2574
2573 /* Avoid recursion of direct reclaim */ 2575 /* Avoid recursion of direct reclaim */
2574 if (current->flags & PF_MEMALLOC) 2576 if (current->flags & PF_MEMALLOC)
2575 goto nopage; 2577 goto nopage;
2576 2578
2577 /* Avoid allocations with no watermarks from looping endlessly */ 2579 /* Avoid allocations with no watermarks from looping endlessly */
2578 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2580 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2579 goto nopage; 2581 goto nopage;
2580 2582
2581 /* 2583 /*
2582 * Try direct compaction. The first pass is asynchronous. Subsequent 2584 * Try direct compaction. The first pass is asynchronous. Subsequent
2583 * attempts after direct reclaim are synchronous 2585 * attempts after direct reclaim are synchronous
2584 */ 2586 */
2585 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2587 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2586 high_zoneidx, nodemask, alloc_flags, 2588 high_zoneidx, nodemask, alloc_flags,
2587 preferred_zone, 2589 preferred_zone,
2588 classzone_idx, migratetype, 2590 classzone_idx, migratetype,
2589 migration_mode, &contended_compaction, 2591 migration_mode, &contended_compaction,
2590 &deferred_compaction, 2592 &deferred_compaction,
2591 &did_some_progress); 2593 &did_some_progress);
2592 if (page) 2594 if (page)
2593 goto got_pg; 2595 goto got_pg;
2594 migration_mode = MIGRATE_SYNC_LIGHT; 2596 migration_mode = MIGRATE_SYNC_LIGHT;
2595 2597
2596 /* 2598 /*
2597 * If compaction is deferred for high-order allocations, it is because 2599 * If compaction is deferred for high-order allocations, it is because
2598 * sync compaction recently failed. If this is the case and the caller 2600 * sync compaction recently failed. If this is the case and the caller
2599 * requested a movable allocation that does not heavily disrupt the 2601 * requested a movable allocation that does not heavily disrupt the
2600 * system then fail the allocation instead of entering direct reclaim. 2602 * system then fail the allocation instead of entering direct reclaim.
2601 */ 2603 */
2602 if ((deferred_compaction || contended_compaction) && 2604 if ((deferred_compaction || contended_compaction) &&
2603 (gfp_mask & __GFP_NO_KSWAPD)) 2605 (gfp_mask & __GFP_NO_KSWAPD))
2604 goto nopage; 2606 goto nopage;
2605 2607
2606 /* Try direct reclaim and then allocating */ 2608 /* Try direct reclaim and then allocating */
2607 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2609 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2608 zonelist, high_zoneidx, 2610 zonelist, high_zoneidx,
2609 nodemask, 2611 nodemask,
2610 alloc_flags, preferred_zone, 2612 alloc_flags, preferred_zone,
2611 classzone_idx, migratetype, 2613 classzone_idx, migratetype,
2612 &did_some_progress); 2614 &did_some_progress);
2613 if (page) 2615 if (page)
2614 goto got_pg; 2616 goto got_pg;
2615 2617
2616 /* 2618 /*
2617 * If we failed to make any progress reclaiming, then we are 2619 * If we failed to make any progress reclaiming, then we are
2618 * running out of options and have to consider going OOM 2620 * running out of options and have to consider going OOM
2619 */ 2621 */
2620 if (!did_some_progress) { 2622 if (!did_some_progress) {
2621 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2623 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2622 if (oom_killer_disabled) 2624 if (oom_killer_disabled)
2623 goto nopage; 2625 goto nopage;
2624 /* Coredumps can quickly deplete all memory reserves */ 2626 /* Coredumps can quickly deplete all memory reserves */
2625 if ((current->flags & PF_DUMPCORE) && 2627 if ((current->flags & PF_DUMPCORE) &&
2626 !(gfp_mask & __GFP_NOFAIL)) 2628 !(gfp_mask & __GFP_NOFAIL))
2627 goto nopage; 2629 goto nopage;
2628 page = __alloc_pages_may_oom(gfp_mask, order, 2630 page = __alloc_pages_may_oom(gfp_mask, order,
2629 zonelist, high_zoneidx, 2631 zonelist, high_zoneidx,
2630 nodemask, preferred_zone, 2632 nodemask, preferred_zone,
2631 classzone_idx, migratetype); 2633 classzone_idx, migratetype);
2632 if (page) 2634 if (page)
2633 goto got_pg; 2635 goto got_pg;
2634 2636
2635 if (!(gfp_mask & __GFP_NOFAIL)) { 2637 if (!(gfp_mask & __GFP_NOFAIL)) {
2636 /* 2638 /*
2637 * The oom killer is not called for high-order 2639 * The oom killer is not called for high-order
2638 * allocations that may fail, so if no progress 2640 * allocations that may fail, so if no progress
2639 * is being made, there are no other options and 2641 * is being made, there are no other options and
2640 * retrying is unlikely to help. 2642 * retrying is unlikely to help.
2641 */ 2643 */
2642 if (order > PAGE_ALLOC_COSTLY_ORDER) 2644 if (order > PAGE_ALLOC_COSTLY_ORDER)
2643 goto nopage; 2645 goto nopage;
2644 /* 2646 /*
2645 * The oom killer is not called for lowmem 2647 * The oom killer is not called for lowmem
2646 * allocations to prevent needlessly killing 2648 * allocations to prevent needlessly killing
2647 * innocent tasks. 2649 * innocent tasks.
2648 */ 2650 */
2649 if (high_zoneidx < ZONE_NORMAL) 2651 if (high_zoneidx < ZONE_NORMAL)
2650 goto nopage; 2652 goto nopage;
2651 } 2653 }
2652 2654
2653 goto restart; 2655 goto restart;
2654 } 2656 }
2655 } 2657 }
2656 2658
2657 /* Check if we should retry the allocation */ 2659 /* Check if we should retry the allocation */
2658 pages_reclaimed += did_some_progress; 2660 pages_reclaimed += did_some_progress;
2659 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2661 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2660 pages_reclaimed)) { 2662 pages_reclaimed)) {
2661 /* Wait for some write requests to complete then retry */ 2663 /* Wait for some write requests to complete then retry */
2662 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2664 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2663 goto rebalance; 2665 goto rebalance;
2664 } else { 2666 } else {
2665 /* 2667 /*
2666 * High-order allocations do not necessarily loop after 2668 * High-order allocations do not necessarily loop after
2667 * direct reclaim and reclaim/compaction depends on compaction 2669 * direct reclaim and reclaim/compaction depends on compaction
2668 * being called after reclaim so call directly if necessary 2670 * being called after reclaim so call directly if necessary
2669 */ 2671 */
2670 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2672 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2671 high_zoneidx, nodemask, alloc_flags, 2673 high_zoneidx, nodemask, alloc_flags,
2672 preferred_zone, 2674 preferred_zone,
2673 classzone_idx, migratetype, 2675 classzone_idx, migratetype,
2674 migration_mode, &contended_compaction, 2676 migration_mode, &contended_compaction,
2675 &deferred_compaction, 2677 &deferred_compaction,
2676 &did_some_progress); 2678 &did_some_progress);
2677 if (page) 2679 if (page)
2678 goto got_pg; 2680 goto got_pg;
2679 } 2681 }
2680 2682
2681 nopage: 2683 nopage:
2682 warn_alloc_failed(gfp_mask, order, NULL); 2684 warn_alloc_failed(gfp_mask, order, NULL);
2683 return page; 2685 return page;
2684 got_pg: 2686 got_pg:
2685 if (kmemcheck_enabled) 2687 if (kmemcheck_enabled)
2686 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2688 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2687 2689
2688 return page; 2690 return page;
2689 } 2691 }
2690 2692
2691 /* 2693 /*
2692 * This is the 'heart' of the zoned buddy allocator. 2694 * This is the 'heart' of the zoned buddy allocator.
2693 */ 2695 */
2694 struct page * 2696 struct page *
2695 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2697 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2696 struct zonelist *zonelist, nodemask_t *nodemask) 2698 struct zonelist *zonelist, nodemask_t *nodemask)
2697 { 2699 {
2698 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2700 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2699 struct zone *preferred_zone; 2701 struct zone *preferred_zone;
2700 struct zoneref *preferred_zoneref; 2702 struct zoneref *preferred_zoneref;
2701 struct page *page = NULL; 2703 struct page *page = NULL;
2702 int migratetype = allocflags_to_migratetype(gfp_mask); 2704 int migratetype = allocflags_to_migratetype(gfp_mask);
2703 unsigned int cpuset_mems_cookie; 2705 unsigned int cpuset_mems_cookie;
2704 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2706 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2705 struct mem_cgroup *memcg = NULL; 2707 struct mem_cgroup *memcg = NULL;
2706 int classzone_idx; 2708 int classzone_idx;
2707 2709
2708 gfp_mask &= gfp_allowed_mask; 2710 gfp_mask &= gfp_allowed_mask;
2709 2711
2710 lockdep_trace_alloc(gfp_mask); 2712 lockdep_trace_alloc(gfp_mask);
2711 2713
2712 might_sleep_if(gfp_mask & __GFP_WAIT); 2714 might_sleep_if(gfp_mask & __GFP_WAIT);
2713 2715
2714 if (should_fail_alloc_page(gfp_mask, order)) 2716 if (should_fail_alloc_page(gfp_mask, order))
2715 return NULL; 2717 return NULL;
2716 2718
2717 /* 2719 /*
2718 * Check the zones suitable for the gfp_mask contain at least one 2720 * Check the zones suitable for the gfp_mask contain at least one
2719 * valid zone. It's possible to have an empty zonelist as a result 2721 * valid zone. It's possible to have an empty zonelist as a result
2720 * of GFP_THISNODE and a memoryless node 2722 * of GFP_THISNODE and a memoryless node
2721 */ 2723 */
2722 if (unlikely(!zonelist->_zonerefs->zone)) 2724 if (unlikely(!zonelist->_zonerefs->zone))
2723 return NULL; 2725 return NULL;
2724 2726
2725 /* 2727 /*
2726 * Will only have any effect when __GFP_KMEMCG is set. This is 2728 * Will only have any effect when __GFP_KMEMCG is set. This is
2727 * verified in the (always inline) callee 2729 * verified in the (always inline) callee
2728 */ 2730 */
2729 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) 2731 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2730 return NULL; 2732 return NULL;
2731 2733
2732 retry_cpuset: 2734 retry_cpuset:
2733 cpuset_mems_cookie = read_mems_allowed_begin(); 2735 cpuset_mems_cookie = read_mems_allowed_begin();
2734 2736
2735 /* The preferred zone is used for statistics later */ 2737 /* The preferred zone is used for statistics later */
2736 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2738 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2737 nodemask ? : &cpuset_current_mems_allowed, 2739 nodemask ? : &cpuset_current_mems_allowed,
2738 &preferred_zone); 2740 &preferred_zone);
2739 if (!preferred_zone) 2741 if (!preferred_zone)
2740 goto out; 2742 goto out;
2741 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2743 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2742 2744
2743 #ifdef CONFIG_CMA 2745 #ifdef CONFIG_CMA
2744 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2746 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2745 alloc_flags |= ALLOC_CMA; 2747 alloc_flags |= ALLOC_CMA;
2746 #endif 2748 #endif
2747 retry: 2749 retry:
2748 /* First allocation attempt */ 2750 /* First allocation attempt */
2749 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2751 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2750 zonelist, high_zoneidx, alloc_flags, 2752 zonelist, high_zoneidx, alloc_flags,
2751 preferred_zone, classzone_idx, migratetype); 2753 preferred_zone, classzone_idx, migratetype);
2752 if (unlikely(!page)) { 2754 if (unlikely(!page)) {
2753 /* 2755 /*
2754 * The first pass makes sure allocations are spread 2756 * The first pass makes sure allocations are spread
2755 * fairly within the local node. However, the local 2757 * fairly within the local node. However, the local
2756 * node might have free pages left after the fairness 2758 * node might have free pages left after the fairness
2757 * batches are exhausted, and remote zones haven't 2759 * batches are exhausted, and remote zones haven't
2758 * even been considered yet. Try once more without 2760 * even been considered yet. Try once more without
2759 * fairness, and include remote zones now, before 2761 * fairness, and include remote zones now, before
2760 * entering the slowpath and waking kswapd: prefer 2762 * entering the slowpath and waking kswapd: prefer
2761 * spilling to a remote zone over swapping locally. 2763 * spilling to a remote zone over swapping locally.
2762 */ 2764 */
2763 if (alloc_flags & ALLOC_FAIR) { 2765 if (alloc_flags & ALLOC_FAIR) {
2764 reset_alloc_batches(zonelist, high_zoneidx, 2766 reset_alloc_batches(zonelist, high_zoneidx,
2765 preferred_zone); 2767 preferred_zone);
2766 alloc_flags &= ~ALLOC_FAIR; 2768 alloc_flags &= ~ALLOC_FAIR;
2767 goto retry; 2769 goto retry;
2768 } 2770 }
2769 /* 2771 /*
2770 * Runtime PM, block IO and its error handling path 2772 * Runtime PM, block IO and its error handling path
2771 * can deadlock because I/O on the device might not 2773 * can deadlock because I/O on the device might not
2772 * complete. 2774 * complete.
2773 */ 2775 */
2774 gfp_mask = memalloc_noio_flags(gfp_mask); 2776 gfp_mask = memalloc_noio_flags(gfp_mask);
2775 page = __alloc_pages_slowpath(gfp_mask, order, 2777 page = __alloc_pages_slowpath(gfp_mask, order,
2776 zonelist, high_zoneidx, nodemask, 2778 zonelist, high_zoneidx, nodemask,
2777 preferred_zone, classzone_idx, migratetype); 2779 preferred_zone, classzone_idx, migratetype);
2778 } 2780 }
2779 2781
2780 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2782 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2781 2783
2782 out: 2784 out:
2783 /* 2785 /*
2784 * When updating a task's mems_allowed, it is possible to race with 2786 * When updating a task's mems_allowed, it is possible to race with
2785 * parallel threads in such a way that an allocation can fail while 2787 * parallel threads in such a way that an allocation can fail while
2786 * the mask is being updated. If a page allocation is about to fail, 2788 * the mask is being updated. If a page allocation is about to fail,
2787 * check if the cpuset changed during allocation and if so, retry. 2789 * check if the cpuset changed during allocation and if so, retry.
2788 */ 2790 */
2789 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2791 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2790 goto retry_cpuset; 2792 goto retry_cpuset;
2791 2793
2792 memcg_kmem_commit_charge(page, memcg, order); 2794 memcg_kmem_commit_charge(page, memcg, order);
2793 2795
2794 return page; 2796 return page;
2795 } 2797 }
2796 EXPORT_SYMBOL(__alloc_pages_nodemask); 2798 EXPORT_SYMBOL(__alloc_pages_nodemask);
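Callers normally reach __alloc_pages_nodemask() through wrappers such as alloc_pages() and alloc_pages_node() rather than invoking it directly. A minimal sketch of such a caller, assuming an ordinary process-context kernel setting; the function name and its body are illustrative only, not part of this file:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

/* Illustrative only: allocate one page for kernel use and release it. */
static int demo_alloc_one_page(void)
{
        struct page *page;

        /* GFP_KERNEL may sleep, so this must run in process context. */
        page = alloc_pages(GFP_KERNEL, 0);      /* order 0: a single page */
        if (!page)
                return -ENOMEM;

        /* page_address(page) gives the directly mapped kernel address. */

        __free_pages(page, 0);
        return 0;
}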
2797 2799
2798 /* 2800 /*
2799 * Common helper functions. 2801 * Common helper functions.
2800 */ 2802 */
2801 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 2803 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2802 { 2804 {
2803 struct page *page; 2805 struct page *page;
2804 2806
2805 /* 2807 /*
2806 * __get_free_pages() returns a kernel virtual address, which cannot represent 2808 * __get_free_pages() returns a kernel virtual address, which cannot represent
2807 * a highmem page 2809 * a highmem page
2808 */ 2810 */
2809 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 2811 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2810 2812
2811 page = alloc_pages(gfp_mask, order); 2813 page = alloc_pages(gfp_mask, order);
2812 if (!page) 2814 if (!page)
2813 return 0; 2815 return 0;
2814 return (unsigned long) page_address(page); 2816 return (unsigned long) page_address(page);
2815 } 2817 }
2816 EXPORT_SYMBOL(__get_free_pages); 2818 EXPORT_SYMBOL(__get_free_pages);
2817 2819
2818 unsigned long get_zeroed_page(gfp_t gfp_mask) 2820 unsigned long get_zeroed_page(gfp_t gfp_mask)
2819 { 2821 {
2820 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 2822 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2821 } 2823 }
2822 EXPORT_SYMBOL(get_zeroed_page); 2824 EXPORT_SYMBOL(get_zeroed_page);
2823 2825
2824 void __free_pages(struct page *page, unsigned int order) 2826 void __free_pages(struct page *page, unsigned int order)
2825 { 2827 {
2826 if (put_page_testzero(page)) { 2828 if (put_page_testzero(page)) {
2827 if (order == 0) 2829 if (order == 0)
2828 free_hot_cold_page(page, 0); 2830 free_hot_cold_page(page, 0);
2829 else 2831 else
2830 __free_pages_ok(page, order); 2832 __free_pages_ok(page, order);
2831 } 2833 }
2832 } 2834 }
2833 2835
2834 EXPORT_SYMBOL(__free_pages); 2836 EXPORT_SYMBOL(__free_pages);
2835 2837
2836 void free_pages(unsigned long addr, unsigned int order) 2838 void free_pages(unsigned long addr, unsigned int order)
2837 { 2839 {
2838 if (addr != 0) { 2840 if (addr != 0) {
2839 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2841 VM_BUG_ON(!virt_addr_valid((void *)addr));
2840 __free_pages(virt_to_page((void *)addr), order); 2842 __free_pages(virt_to_page((void *)addr), order);
2841 } 2843 }
2842 } 2844 }
2843 2845
2844 EXPORT_SYMBOL(free_pages); 2846 EXPORT_SYMBOL(free_pages);
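A hedged usage sketch for the address-based helpers above: __get_free_pages() and get_zeroed_page() hand back a kernel virtual address (hence the __GFP_HIGHMEM check), and the buffer is released with free_pages() at the same order. The function name below is illustrative only:

#include <linux/errno.h>
#include <linux/gfp.h>

static int demo_zeroed_scratch_page(void)
{
        unsigned long buf;

        buf = get_zeroed_page(GFP_KERNEL);      /* order-0, pre-zeroed page */
        if (!buf)
                return -ENOMEM;

        /* buf is a page-sized, zero-filled scratch area. */

        free_pages(buf, 0);                     /* order must match the allocation */
        return 0;
}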
2845 2847
2846 /* 2848 /*
2847 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free 2849 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
2848 * pages allocated with __GFP_KMEMCG. 2850 * pages allocated with __GFP_KMEMCG.
2849 * 2851 *
2850 * Those pages are accounted to a particular memcg, embedded in the 2852 * Those pages are accounted to a particular memcg, embedded in the
2851 * corresponding page_cgroup. To avoid adding a hit in the allocator to search 2853 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
2852 * for that information only to find out that it is NULL for users who have no 2854 * for that information only to find out that it is NULL for users who have no
2853 * interest in that whatsoever, we provide these functions. 2855 * interest in that whatsoever, we provide these functions.
2854 * 2856 *
2855 * The caller knows better which flags it relies on. 2857 * The caller knows better which flags it relies on.
2856 */ 2858 */
2857 void __free_memcg_kmem_pages(struct page *page, unsigned int order) 2859 void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2858 { 2860 {
2859 memcg_kmem_uncharge_pages(page, order); 2861 memcg_kmem_uncharge_pages(page, order);
2860 __free_pages(page, order); 2862 __free_pages(page, order);
2861 } 2863 }
2862 2864
2863 void free_memcg_kmem_pages(unsigned long addr, unsigned int order) 2865 void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2864 { 2866 {
2865 if (addr != 0) { 2867 if (addr != 0) {
2866 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2868 VM_BUG_ON(!virt_addr_valid((void *)addr));
2867 __free_memcg_kmem_pages(virt_to_page((void *)addr), order); 2869 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2868 } 2870 }
2869 } 2871 }
2870 2872
2871 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2873 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2872 { 2874 {
2873 if (addr) { 2875 if (addr) {
2874 unsigned long alloc_end = addr + (PAGE_SIZE << order); 2876 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2875 unsigned long used = addr + PAGE_ALIGN(size); 2877 unsigned long used = addr + PAGE_ALIGN(size);
2876 2878
2877 split_page(virt_to_page((void *)addr), order); 2879 split_page(virt_to_page((void *)addr), order);
2878 while (used < alloc_end) { 2880 while (used < alloc_end) {
2879 free_page(used); 2881 free_page(used);
2880 used += PAGE_SIZE; 2882 used += PAGE_SIZE;
2881 } 2883 }
2882 } 2884 }
2883 return (void *)addr; 2885 return (void *)addr;
2884 } 2886 }
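To make the trimming in make_alloc_exact() concrete: a request for 5 * PAGE_SIZE bytes is first satisfied by an order-3 allocation (8 pages, the smallest power of two that fits). split_page() then turns that block into eight independent order-0 pages, and the loop frees the three pages lying between addr + PAGE_ALIGN(size) and the end of the block, so the caller is left holding exactly the five pages it asked for.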
2885 2887
2886 /** 2888 /**
2887 * alloc_pages_exact - allocate an exact number of physically-contiguous pages. 2889 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
2888 * @size: the number of bytes to allocate 2890 * @size: the number of bytes to allocate
2889 * @gfp_mask: GFP flags for the allocation 2891 * @gfp_mask: GFP flags for the allocation
2890 * 2892 *
2891 * This function is similar to alloc_pages(), except that it allocates the 2893 * This function is similar to alloc_pages(), except that it allocates the
2892 * minimum number of pages to satisfy the request. alloc_pages() can only 2894 * minimum number of pages to satisfy the request. alloc_pages() can only
2893 * allocate memory in power-of-two pages. 2895 * allocate memory in power-of-two pages.
2894 * 2896 *
2895 * This function is also limited by MAX_ORDER. 2897 * This function is also limited by MAX_ORDER.
2896 * 2898 *
2897 * Memory allocated by this function must be released by free_pages_exact(). 2899 * Memory allocated by this function must be released by free_pages_exact().
2898 */ 2900 */
2899 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 2901 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2900 { 2902 {
2901 unsigned int order = get_order(size); 2903 unsigned int order = get_order(size);
2902 unsigned long addr; 2904 unsigned long addr;
2903 2905
2904 addr = __get_free_pages(gfp_mask, order); 2906 addr = __get_free_pages(gfp_mask, order);
2905 return make_alloc_exact(addr, order, size); 2907 return make_alloc_exact(addr, order, size);
2906 } 2908 }
2907 EXPORT_SYMBOL(alloc_pages_exact); 2909 EXPORT_SYMBOL(alloc_pages_exact);
2908 2910
2909 /** 2911 /**
2910 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 2912 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2911 * pages on a node. 2913 * pages on a node.
2912 * @nid: the preferred node ID where memory should be allocated 2914 * @nid: the preferred node ID where memory should be allocated
2913 * @size: the number of bytes to allocate 2915 * @size: the number of bytes to allocate
2914 * @gfp_mask: GFP flags for the allocation 2916 * @gfp_mask: GFP flags for the allocation
2915 * 2917 *
2916 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 2918 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2917 * back. 2919 * back.
2918 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2920 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2919 * but is not exact. 2921 * but is not exact.
2920 */ 2922 */
2921 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2923 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2922 { 2924 {
2923 unsigned order = get_order(size); 2925 unsigned order = get_order(size);
2924 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2926 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2925 if (!p) 2927 if (!p)
2926 return NULL; 2928 return NULL;
2927 return make_alloc_exact((unsigned long)page_address(p), order, size); 2929 return make_alloc_exact((unsigned long)page_address(p), order, size);
2928 } 2930 }
2929 EXPORT_SYMBOL(alloc_pages_exact_nid); 2931 EXPORT_SYMBOL(alloc_pages_exact_nid);
2930 2932
2931 /** 2933 /**
2932 * free_pages_exact - release memory allocated via alloc_pages_exact() 2934 * free_pages_exact - release memory allocated via alloc_pages_exact()
2933 * @virt: the value returned by alloc_pages_exact. 2935 * @virt: the value returned by alloc_pages_exact.
2934 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2936 * @size: size of allocation, same value as passed to alloc_pages_exact().
2935 * 2937 *
2936 * Release the memory allocated by a previous call to alloc_pages_exact. 2938 * Release the memory allocated by a previous call to alloc_pages_exact.
2937 */ 2939 */
2938 void free_pages_exact(void *virt, size_t size) 2940 void free_pages_exact(void *virt, size_t size)
2939 { 2941 {
2940 unsigned long addr = (unsigned long)virt; 2942 unsigned long addr = (unsigned long)virt;
2941 unsigned long end = addr + PAGE_ALIGN(size); 2943 unsigned long end = addr + PAGE_ALIGN(size);
2942 2944
2943 while (addr < end) { 2945 while (addr < end) {
2944 free_page(addr); 2946 free_page(addr);
2945 addr += PAGE_SIZE; 2947 addr += PAGE_SIZE;
2946 } 2948 }
2947 } 2949 }
2948 EXPORT_SYMBOL(free_pages_exact); 2950 EXPORT_SYMBOL(free_pages_exact);
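A minimal usage sketch for the exact-size pair above, assuming a kernel context; the constant and function name are illustrative only:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

#define DEMO_BYTES      (5 * PAGE_SIZE) /* not a power-of-two number of pages */

static int demo_pages_exact(void)
{
        void *buf;

        buf = alloc_pages_exact(DEMO_BYTES, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        /* Exactly DIV_ROUND_UP(DEMO_BYTES, PAGE_SIZE) pages are held here. */

        free_pages_exact(buf, DEMO_BYTES);      /* pass the same size back */
        return 0;
}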
2949 2951
2950 /** 2952 /**
2951 * nr_free_zone_pages - count number of pages beyond high watermark 2953 * nr_free_zone_pages - count number of pages beyond high watermark
2952 * @offset: The zone index of the highest zone 2954 * @offset: The zone index of the highest zone
2953 * 2955 *
2954 * nr_free_zone_pages() counts the number of pages which are beyond the 2956 * nr_free_zone_pages() counts the number of pages which are beyond the
2955 * high watermark within all zones at or below a given zone index. For each 2957 * high watermark within all zones at or below a given zone index. For each
2956 * zone, the number of pages is calculated as: 2958 * zone, the number of pages is calculated as:
2957 * managed_pages - high_pages 2959 * managed_pages - high_pages
2958 */ 2960 */
2959 static unsigned long nr_free_zone_pages(int offset) 2961 static unsigned long nr_free_zone_pages(int offset)
2960 { 2962 {
2961 struct zoneref *z; 2963 struct zoneref *z;
2962 struct zone *zone; 2964 struct zone *zone;
2963 2965
2964 /* Just pick one node, since fallback list is circular */ 2966 /* Just pick one node, since fallback list is circular */
2965 unsigned long sum = 0; 2967 unsigned long sum = 0;
2966 2968
2967 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2969 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2968 2970
2969 for_each_zone_zonelist(zone, z, zonelist, offset) { 2971 for_each_zone_zonelist(zone, z, zonelist, offset) {
2970 unsigned long size = zone->managed_pages; 2972 unsigned long size = zone->managed_pages;
2971 unsigned long high = high_wmark_pages(zone); 2973 unsigned long high = high_wmark_pages(zone);
2972 if (size > high) 2974 if (size > high)
2973 sum += size - high; 2975 sum += size - high;
2974 } 2976 }
2975 2977
2976 return sum; 2978 return sum;
2977 } 2979 }
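As a concrete (hypothetical) reading of the loop above: if the zonelist walk visits ZONE_DMA with managed_pages = 4000 and a high watermark of 500, and ZONE_NORMAL with managed_pages = 200000 and a high watermark of 4000, nr_free_zone_pages() returns (4000 - 500) + (200000 - 4000) = 199500; a zone whose managed_pages does not exceed its high watermark contributes nothing.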
2978 2980
2979 /** 2981 /**
2980 * nr_free_buffer_pages - count number of pages beyond high watermark 2982 * nr_free_buffer_pages - count number of pages beyond high watermark
2981 * 2983 *
2982 * nr_free_buffer_pages() counts the number of pages which are beyond the high 2984 * nr_free_buffer_pages() counts the number of pages which are beyond the high
2983 * watermark within ZONE_DMA and ZONE_NORMAL. 2985 * watermark within ZONE_DMA and ZONE_NORMAL.
2984 */ 2986 */
2985 unsigned long nr_free_buffer_pages(void) 2987 unsigned long nr_free_buffer_pages(void)
2986 { 2988 {
2987 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2989 return nr_free_zone_pages(gfp_zone(GFP_USER));
2988 } 2990 }
2989 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2991 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2990 2992
2991 /** 2993 /**
2992 * nr_free_pagecache_pages - count number of pages beyond high watermark 2994 * nr_free_pagecache_pages - count number of pages beyond high watermark
2993 * 2995 *
2994 * nr_free_pagecache_pages() counts the number of pages which are beyond the 2996 * nr_free_pagecache_pages() counts the number of pages which are beyond the
2995 * high watermark within all zones. 2997 * high watermark within all zones.
2996 */ 2998 */
2997 unsigned long nr_free_pagecache_pages(void) 2999 unsigned long nr_free_pagecache_pages(void)
2998 { 3000 {
2999 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 3001 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
3000 } 3002 }
3001 3003
3002 static inline void show_node(struct zone *zone) 3004 static inline void show_node(struct zone *zone)
3003 { 3005 {
3004 if (IS_ENABLED(CONFIG_NUMA)) 3006 if (IS_ENABLED(CONFIG_NUMA))
3005 printk("Node %d ", zone_to_nid(zone)); 3007 printk("Node %d ", zone_to_nid(zone));
3006 } 3008 }
3007 3009
3008 void si_meminfo(struct sysinfo *val) 3010 void si_meminfo(struct sysinfo *val)
3009 { 3011 {
3010 val->totalram = totalram_pages; 3012 val->totalram = totalram_pages;
3011 val->sharedram = 0; 3013 val->sharedram = 0;
3012 val->freeram = global_page_state(NR_FREE_PAGES); 3014 val->freeram = global_page_state(NR_FREE_PAGES);
3013 val->bufferram = nr_blockdev_pages(); 3015 val->bufferram = nr_blockdev_pages();
3014 val->totalhigh = totalhigh_pages; 3016 val->totalhigh = totalhigh_pages;
3015 val->freehigh = nr_free_highpages(); 3017 val->freehigh = nr_free_highpages();
3016 val->mem_unit = PAGE_SIZE; 3018 val->mem_unit = PAGE_SIZE;
3017 } 3019 }
3018 3020
3019 EXPORT_SYMBOL(si_meminfo); 3021 EXPORT_SYMBOL(si_meminfo);
3020 3022
3021 #ifdef CONFIG_NUMA 3023 #ifdef CONFIG_NUMA
3022 void si_meminfo_node(struct sysinfo *val, int nid) 3024 void si_meminfo_node(struct sysinfo *val, int nid)
3023 { 3025 {
3024 int zone_type; /* needs to be signed */ 3026 int zone_type; /* needs to be signed */
3025 unsigned long managed_pages = 0; 3027 unsigned long managed_pages = 0;
3026 pg_data_t *pgdat = NODE_DATA(nid); 3028 pg_data_t *pgdat = NODE_DATA(nid);
3027 3029
3028 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 3030 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
3029 managed_pages += pgdat->node_zones[zone_type].managed_pages; 3031 managed_pages += pgdat->node_zones[zone_type].managed_pages;
3030 val->totalram = managed_pages; 3032 val->totalram = managed_pages;
3031 val->freeram = node_page_state(nid, NR_FREE_PAGES); 3033 val->freeram = node_page_state(nid, NR_FREE_PAGES);
3032 #ifdef CONFIG_HIGHMEM 3034 #ifdef CONFIG_HIGHMEM
3033 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 3035 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
3034 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 3036 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
3035 NR_FREE_PAGES); 3037 NR_FREE_PAGES);
3036 #else 3038 #else
3037 val->totalhigh = 0; 3039 val->totalhigh = 0;
3038 val->freehigh = 0; 3040 val->freehigh = 0;
3039 #endif 3041 #endif
3040 val->mem_unit = PAGE_SIZE; 3042 val->mem_unit = PAGE_SIZE;
3041 } 3043 }
3042 #endif 3044 #endif
3043 3045
3044 /* 3046 /*
3045 * Determine whether the node should be displayed or not, depending on whether 3047 * Determine whether the node should be displayed or not, depending on whether
3046 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 3048 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
3047 */ 3049 */
3048 bool skip_free_areas_node(unsigned int flags, int nid) 3050 bool skip_free_areas_node(unsigned int flags, int nid)
3049 { 3051 {
3050 bool ret = false; 3052 bool ret = false;
3051 unsigned int cpuset_mems_cookie; 3053 unsigned int cpuset_mems_cookie;
3052 3054
3053 if (!(flags & SHOW_MEM_FILTER_NODES)) 3055 if (!(flags & SHOW_MEM_FILTER_NODES))
3054 goto out; 3056 goto out;
3055 3057
3056 do { 3058 do {
3057 cpuset_mems_cookie = read_mems_allowed_begin(); 3059 cpuset_mems_cookie = read_mems_allowed_begin();
3058 ret = !node_isset(nid, cpuset_current_mems_allowed); 3060 ret = !node_isset(nid, cpuset_current_mems_allowed);
3059 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 3061 } while (read_mems_allowed_retry(cpuset_mems_cookie));
3060 out: 3062 out:
3061 return ret; 3063 return ret;
3062 } 3064 }
3063 3065
3064 #define K(x) ((x) << (PAGE_SHIFT-10)) 3066 #define K(x) ((x) << (PAGE_SHIFT-10))
3065 3067
3066 static void show_migration_types(unsigned char type) 3068 static void show_migration_types(unsigned char type)
3067 { 3069 {
3068 static const char types[MIGRATE_TYPES] = { 3070 static const char types[MIGRATE_TYPES] = {
3069 [MIGRATE_UNMOVABLE] = 'U', 3071 [MIGRATE_UNMOVABLE] = 'U',
3070 [MIGRATE_RECLAIMABLE] = 'E', 3072 [MIGRATE_RECLAIMABLE] = 'E',
3071 [MIGRATE_MOVABLE] = 'M', 3073 [MIGRATE_MOVABLE] = 'M',
3072 [MIGRATE_RESERVE] = 'R', 3074 [MIGRATE_RESERVE] = 'R',
3073 #ifdef CONFIG_CMA 3075 #ifdef CONFIG_CMA
3074 [MIGRATE_CMA] = 'C', 3076 [MIGRATE_CMA] = 'C',
3075 #endif 3077 #endif
3076 #ifdef CONFIG_MEMORY_ISOLATION 3078 #ifdef CONFIG_MEMORY_ISOLATION
3077 [MIGRATE_ISOLATE] = 'I', 3079 [MIGRATE_ISOLATE] = 'I',
3078 #endif 3080 #endif
3079 }; 3081 };
3080 char tmp[MIGRATE_TYPES + 1]; 3082 char tmp[MIGRATE_TYPES + 1];
3081 char *p = tmp; 3083 char *p = tmp;
3082 int i; 3084 int i;
3083 3085
3084 for (i = 0; i < MIGRATE_TYPES; i++) { 3086 for (i = 0; i < MIGRATE_TYPES; i++) {
3085 if (type & (1 << i)) 3087 if (type & (1 << i))
3086 *p++ = types[i]; 3088 *p++ = types[i];
3087 } 3089 }
3088 3090
3089 *p = '\0'; 3091 *p = '\0';
3090 printk("(%s) ", tmp); 3092 printk("(%s) ", tmp);
3091 } 3093 }
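For example, if some order has free blocks on both the unmovable and movable free lists, the bits for those two types are set in that order's bitmap and show_migration_types() prints "(UM) " after the order's count in the buddy listing further below.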
3092 3094
3093 /* 3095 /*
3094 * Show free area list (used inside shift_scroll-lock stuff) 3096 * Show free area list (used inside shift_scroll-lock stuff)
3095 * We also calculate the percentage fragmentation. We do this by counting the 3097 * We also calculate the percentage fragmentation. We do this by counting the
3096 * memory on each free list with the exception of the first item on the list. 3098 * memory on each free list with the exception of the first item on the list.
3097 * Suppresses nodes that are not allowed by current's cpuset if 3099 * Suppresses nodes that are not allowed by current's cpuset if
3098 * SHOW_MEM_FILTER_NODES is passed. 3100 * SHOW_MEM_FILTER_NODES is passed.
3099 */ 3101 */
3100 void show_free_areas(unsigned int filter) 3102 void show_free_areas(unsigned int filter)
3101 { 3103 {
3102 int cpu; 3104 int cpu;
3103 struct zone *zone; 3105 struct zone *zone;
3104 3106
3105 for_each_populated_zone(zone) { 3107 for_each_populated_zone(zone) {
3106 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3108 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3107 continue; 3109 continue;
3108 show_node(zone); 3110 show_node(zone);
3109 printk("%s per-cpu:\n", zone->name); 3111 printk("%s per-cpu:\n", zone->name);
3110 3112
3111 for_each_online_cpu(cpu) { 3113 for_each_online_cpu(cpu) {
3112 struct per_cpu_pageset *pageset; 3114 struct per_cpu_pageset *pageset;
3113 3115
3114 pageset = per_cpu_ptr(zone->pageset, cpu); 3116 pageset = per_cpu_ptr(zone->pageset, cpu);
3115 3117
3116 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 3118 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
3117 cpu, pageset->pcp.high, 3119 cpu, pageset->pcp.high,
3118 pageset->pcp.batch, pageset->pcp.count); 3120 pageset->pcp.batch, pageset->pcp.count);
3119 } 3121 }
3120 } 3122 }
3121 3123
3122 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 3124 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
3123 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 3125 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
3124 " unevictable:%lu" 3126 " unevictable:%lu"
3125 " dirty:%lu writeback:%lu unstable:%lu\n" 3127 " dirty:%lu writeback:%lu unstable:%lu\n"
3126 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 3128 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
3127 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 3129 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
3128 " free_cma:%lu\n", 3130 " free_cma:%lu\n",
3129 global_page_state(NR_ACTIVE_ANON), 3131 global_page_state(NR_ACTIVE_ANON),
3130 global_page_state(NR_INACTIVE_ANON), 3132 global_page_state(NR_INACTIVE_ANON),
3131 global_page_state(NR_ISOLATED_ANON), 3133 global_page_state(NR_ISOLATED_ANON),
3132 global_page_state(NR_ACTIVE_FILE), 3134 global_page_state(NR_ACTIVE_FILE),
3133 global_page_state(NR_INACTIVE_FILE), 3135 global_page_state(NR_INACTIVE_FILE),
3134 global_page_state(NR_ISOLATED_FILE), 3136 global_page_state(NR_ISOLATED_FILE),
3135 global_page_state(NR_UNEVICTABLE), 3137 global_page_state(NR_UNEVICTABLE),
3136 global_page_state(NR_FILE_DIRTY), 3138 global_page_state(NR_FILE_DIRTY),
3137 global_page_state(NR_WRITEBACK), 3139 global_page_state(NR_WRITEBACK),
3138 global_page_state(NR_UNSTABLE_NFS), 3140 global_page_state(NR_UNSTABLE_NFS),
3139 global_page_state(NR_FREE_PAGES), 3141 global_page_state(NR_FREE_PAGES),
3140 global_page_state(NR_SLAB_RECLAIMABLE), 3142 global_page_state(NR_SLAB_RECLAIMABLE),
3141 global_page_state(NR_SLAB_UNRECLAIMABLE), 3143 global_page_state(NR_SLAB_UNRECLAIMABLE),
3142 global_page_state(NR_FILE_MAPPED), 3144 global_page_state(NR_FILE_MAPPED),
3143 global_page_state(NR_SHMEM), 3145 global_page_state(NR_SHMEM),
3144 global_page_state(NR_PAGETABLE), 3146 global_page_state(NR_PAGETABLE),
3145 global_page_state(NR_BOUNCE), 3147 global_page_state(NR_BOUNCE),
3146 global_page_state(NR_FREE_CMA_PAGES)); 3148 global_page_state(NR_FREE_CMA_PAGES));
3147 3149
3148 for_each_populated_zone(zone) { 3150 for_each_populated_zone(zone) {
3149 int i; 3151 int i;
3150 3152
3151 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3153 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3152 continue; 3154 continue;
3153 show_node(zone); 3155 show_node(zone);
3154 printk("%s" 3156 printk("%s"
3155 " free:%lukB" 3157 " free:%lukB"
3156 " min:%lukB" 3158 " min:%lukB"
3157 " low:%lukB" 3159 " low:%lukB"
3158 " high:%lukB" 3160 " high:%lukB"
3159 " active_anon:%lukB" 3161 " active_anon:%lukB"
3160 " inactive_anon:%lukB" 3162 " inactive_anon:%lukB"
3161 " active_file:%lukB" 3163 " active_file:%lukB"
3162 " inactive_file:%lukB" 3164 " inactive_file:%lukB"
3163 " unevictable:%lukB" 3165 " unevictable:%lukB"
3164 " isolated(anon):%lukB" 3166 " isolated(anon):%lukB"
3165 " isolated(file):%lukB" 3167 " isolated(file):%lukB"
3166 " present:%lukB" 3168 " present:%lukB"
3167 " managed:%lukB" 3169 " managed:%lukB"
3168 " mlocked:%lukB" 3170 " mlocked:%lukB"
3169 " dirty:%lukB" 3171 " dirty:%lukB"
3170 " writeback:%lukB" 3172 " writeback:%lukB"
3171 " mapped:%lukB" 3173 " mapped:%lukB"
3172 " shmem:%lukB" 3174 " shmem:%lukB"
3173 " slab_reclaimable:%lukB" 3175 " slab_reclaimable:%lukB"
3174 " slab_unreclaimable:%lukB" 3176 " slab_unreclaimable:%lukB"
3175 " kernel_stack:%lukB" 3177 " kernel_stack:%lukB"
3176 " pagetables:%lukB" 3178 " pagetables:%lukB"
3177 " unstable:%lukB" 3179 " unstable:%lukB"
3178 " bounce:%lukB" 3180 " bounce:%lukB"
3179 " free_cma:%lukB" 3181 " free_cma:%lukB"
3180 " writeback_tmp:%lukB" 3182 " writeback_tmp:%lukB"
3181 " pages_scanned:%lu" 3183 " pages_scanned:%lu"
3182 " all_unreclaimable? %s" 3184 " all_unreclaimable? %s"
3183 "\n", 3185 "\n",
3184 zone->name, 3186 zone->name,
3185 K(zone_page_state(zone, NR_FREE_PAGES)), 3187 K(zone_page_state(zone, NR_FREE_PAGES)),
3186 K(min_wmark_pages(zone)), 3188 K(min_wmark_pages(zone)),
3187 K(low_wmark_pages(zone)), 3189 K(low_wmark_pages(zone)),
3188 K(high_wmark_pages(zone)), 3190 K(high_wmark_pages(zone)),
3189 K(zone_page_state(zone, NR_ACTIVE_ANON)), 3191 K(zone_page_state(zone, NR_ACTIVE_ANON)),
3190 K(zone_page_state(zone, NR_INACTIVE_ANON)), 3192 K(zone_page_state(zone, NR_INACTIVE_ANON)),
3191 K(zone_page_state(zone, NR_ACTIVE_FILE)), 3193 K(zone_page_state(zone, NR_ACTIVE_FILE)),
3192 K(zone_page_state(zone, NR_INACTIVE_FILE)), 3194 K(zone_page_state(zone, NR_INACTIVE_FILE)),
3193 K(zone_page_state(zone, NR_UNEVICTABLE)), 3195 K(zone_page_state(zone, NR_UNEVICTABLE)),
3194 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3196 K(zone_page_state(zone, NR_ISOLATED_ANON)),
3195 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3197 K(zone_page_state(zone, NR_ISOLATED_FILE)),
3196 K(zone->present_pages), 3198 K(zone->present_pages),
3197 K(zone->managed_pages), 3199 K(zone->managed_pages),
3198 K(zone_page_state(zone, NR_MLOCK)), 3200 K(zone_page_state(zone, NR_MLOCK)),
3199 K(zone_page_state(zone, NR_FILE_DIRTY)), 3201 K(zone_page_state(zone, NR_FILE_DIRTY)),
3200 K(zone_page_state(zone, NR_WRITEBACK)), 3202 K(zone_page_state(zone, NR_WRITEBACK)),
3201 K(zone_page_state(zone, NR_FILE_MAPPED)), 3203 K(zone_page_state(zone, NR_FILE_MAPPED)),
3202 K(zone_page_state(zone, NR_SHMEM)), 3204 K(zone_page_state(zone, NR_SHMEM)),
3203 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 3205 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
3204 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 3206 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
3205 zone_page_state(zone, NR_KERNEL_STACK) * 3207 zone_page_state(zone, NR_KERNEL_STACK) *
3206 THREAD_SIZE / 1024, 3208 THREAD_SIZE / 1024,
3207 K(zone_page_state(zone, NR_PAGETABLE)), 3209 K(zone_page_state(zone, NR_PAGETABLE)),
3208 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 3210 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
3209 K(zone_page_state(zone, NR_BOUNCE)), 3211 K(zone_page_state(zone, NR_BOUNCE)),
3210 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3212 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3211 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3213 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3212 zone->pages_scanned, 3214 zone->pages_scanned,
3213 (!zone_reclaimable(zone) ? "yes" : "no") 3215 (!zone_reclaimable(zone) ? "yes" : "no")
3214 ); 3216 );
3215 printk("lowmem_reserve[]:"); 3217 printk("lowmem_reserve[]:");
3216 for (i = 0; i < MAX_NR_ZONES; i++) 3218 for (i = 0; i < MAX_NR_ZONES; i++)
3217 printk(" %lu", zone->lowmem_reserve[i]); 3219 printk(" %lu", zone->lowmem_reserve[i]);
3218 printk("\n"); 3220 printk("\n");
3219 } 3221 }
3220 3222
3221 for_each_populated_zone(zone) { 3223 for_each_populated_zone(zone) {
3222 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3224 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3223 unsigned char types[MAX_ORDER]; 3225 unsigned char types[MAX_ORDER];
3224 3226
3225 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3227 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3226 continue; 3228 continue;
3227 show_node(zone); 3229 show_node(zone);
3228 printk("%s: ", zone->name); 3230 printk("%s: ", zone->name);
3229 3231
3230 spin_lock_irqsave(&zone->lock, flags); 3232 spin_lock_irqsave(&zone->lock, flags);
3231 for (order = 0; order < MAX_ORDER; order++) { 3233 for (order = 0; order < MAX_ORDER; order++) {
3232 struct free_area *area = &zone->free_area[order]; 3234 struct free_area *area = &zone->free_area[order];
3233 int type; 3235 int type;
3234 3236
3235 nr[order] = area->nr_free; 3237 nr[order] = area->nr_free;
3236 total += nr[order] << order; 3238 total += nr[order] << order;
3237 3239
3238 types[order] = 0; 3240 types[order] = 0;
3239 for (type = 0; type < MIGRATE_TYPES; type++) { 3241 for (type = 0; type < MIGRATE_TYPES; type++) {
3240 if (!list_empty(&area->free_list[type])) 3242 if (!list_empty(&area->free_list[type]))
3241 types[order] |= 1 << type; 3243 types[order] |= 1 << type;
3242 } 3244 }
3243 } 3245 }
3244 spin_unlock_irqrestore(&zone->lock, flags); 3246 spin_unlock_irqrestore(&zone->lock, flags);
3245 for (order = 0; order < MAX_ORDER; order++) { 3247 for (order = 0; order < MAX_ORDER; order++) {
3246 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3248 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3247 if (nr[order]) 3249 if (nr[order])
3248 show_migration_types(types[order]); 3250 show_migration_types(types[order]);
3249 } 3251 }
3250 printk("= %lukB\n", K(total)); 3252 printk("= %lukB\n", K(total));
3251 } 3253 }
3252 3254
3253 hugetlb_show_meminfo(); 3255 hugetlb_show_meminfo();
3254 3256
3255 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 3257 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
3256 3258
3257 show_swap_cache_info(); 3259 show_swap_cache_info();
3258 } 3260 }
3259 3261
3260 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 3262 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3261 { 3263 {
3262 zoneref->zone = zone; 3264 zoneref->zone = zone;
3263 zoneref->zone_idx = zone_idx(zone); 3265 zoneref->zone_idx = zone_idx(zone);
3264 } 3266 }
3265 3267
3266 /* 3268 /*
3267 * Builds allocation fallback zone lists. 3269 * Builds allocation fallback zone lists.
3268 * 3270 *
3269 * Add all populated zones of a node to the zonelist. 3271 * Add all populated zones of a node to the zonelist.
3270 */ 3272 */
3271 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 3273 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3272 int nr_zones) 3274 int nr_zones)
3273 { 3275 {
3274 struct zone *zone; 3276 struct zone *zone;
3275 enum zone_type zone_type = MAX_NR_ZONES; 3277 enum zone_type zone_type = MAX_NR_ZONES;
3276 3278
3277 do { 3279 do {
3278 zone_type--; 3280 zone_type--;
3279 zone = pgdat->node_zones + zone_type; 3281 zone = pgdat->node_zones + zone_type;
3280 if (populated_zone(zone)) { 3282 if (populated_zone(zone)) {
3281 zoneref_set_zone(zone, 3283 zoneref_set_zone(zone,
3282 &zonelist->_zonerefs[nr_zones++]); 3284 &zonelist->_zonerefs[nr_zones++]);
3283 check_highest_zone(zone_type); 3285 check_highest_zone(zone_type);
3284 } 3286 }
3285 } while (zone_type); 3287 } while (zone_type);
3286 3288
3287 return nr_zones; 3289 return nr_zones;
3288 } 3290 }
3289 3291
3290 3292
3291 /* 3293 /*
3292 * zonelist_order: 3294 * zonelist_order:
3293 * 0 = automatic detection of better ordering. 3295 * 0 = automatic detection of better ordering.
3294 * 1 = order by ([node] distance, -zonetype) 3296 * 1 = order by ([node] distance, -zonetype)
3295 * 2 = order by (-zonetype, [node] distance) 3297 * 2 = order by (-zonetype, [node] distance)
3296 * 3298 *
3297 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 3299 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
3298 * the same zonelist. So only NUMA can configure this param. 3300 * the same zonelist. So only NUMA can configure this param.
3299 */ 3301 */
3300 #define ZONELIST_ORDER_DEFAULT 0 3302 #define ZONELIST_ORDER_DEFAULT 0
3301 #define ZONELIST_ORDER_NODE 1 3303 #define ZONELIST_ORDER_NODE 1
3302 #define ZONELIST_ORDER_ZONE 2 3304 #define ZONELIST_ORDER_ZONE 2
3303 3305
3304 /* zonelist order in the kernel. 3306 /* zonelist order in the kernel.
3305 * set_zonelist_order() will set this to NODE or ZONE. 3307 * set_zonelist_order() will set this to NODE or ZONE.
3306 */ 3308 */
3307 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 3309 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
3308 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 3310 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
3309 3311
3310 3312
3311 #ifdef CONFIG_NUMA 3313 #ifdef CONFIG_NUMA
3312 /* The value the user specified, possibly changed by config */ 3314 /* The value the user specified, possibly changed by config */
3313 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3315 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3314 /* string for sysctl */ 3316 /* string for sysctl */
3315 #define NUMA_ZONELIST_ORDER_LEN 16 3317 #define NUMA_ZONELIST_ORDER_LEN 16
3316 char numa_zonelist_order[16] = "default"; 3318 char numa_zonelist_order[16] = "default";
3317 3319
3318 /* 3320 /*
3319 * interface for configuring zonelist ordering. 3321 * interface for configuring zonelist ordering.
3320 * command line option "numa_zonelist_order" 3322 * command line option "numa_zonelist_order"
3321 * = "[dD]efault" - default, automatic configuration. 3323 * = "[dD]efault" - default, automatic configuration.
3322 * = "[nN]ode" - order by node locality, then by zone within node 3324 * = "[nN]ode" - order by node locality, then by zone within node
3323 * = "[zZ]one" - order by zone, then by locality within zone 3325 * = "[zZ]one" - order by zone, then by locality within zone
3324 */ 3326 */
3325 3327
3326 static int __parse_numa_zonelist_order(char *s) 3328 static int __parse_numa_zonelist_order(char *s)
3327 { 3329 {
3328 if (*s == 'd' || *s == 'D') { 3330 if (*s == 'd' || *s == 'D') {
3329 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3331 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3330 } else if (*s == 'n' || *s == 'N') { 3332 } else if (*s == 'n' || *s == 'N') {
3331 user_zonelist_order = ZONELIST_ORDER_NODE; 3333 user_zonelist_order = ZONELIST_ORDER_NODE;
3332 } else if (*s == 'z' || *s == 'Z') { 3334 } else if (*s == 'z' || *s == 'Z') {
3333 user_zonelist_order = ZONELIST_ORDER_ZONE; 3335 user_zonelist_order = ZONELIST_ORDER_ZONE;
3334 } else { 3336 } else {
3335 printk(KERN_WARNING 3337 printk(KERN_WARNING
3336 "Ignoring invalid numa_zonelist_order value: " 3338 "Ignoring invalid numa_zonelist_order value: "
3337 "%s\n", s); 3339 "%s\n", s);
3338 return -EINVAL; 3340 return -EINVAL;
3339 } 3341 }
3340 return 0; 3342 return 0;
3341 } 3343 }
3342 3344
3343 static __init int setup_numa_zonelist_order(char *s) 3345 static __init int setup_numa_zonelist_order(char *s)
3344 { 3346 {
3345 int ret; 3347 int ret;
3346 3348
3347 if (!s) 3349 if (!s)
3348 return 0; 3350 return 0;
3349 3351
3350 ret = __parse_numa_zonelist_order(s); 3352 ret = __parse_numa_zonelist_order(s);
3351 if (ret == 0) 3353 if (ret == 0)
3352 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 3354 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3353 3355
3354 return ret; 3356 return ret;
3355 } 3357 }
3356 early_param("numa_zonelist_order", setup_numa_zonelist_order); 3358 early_param("numa_zonelist_order", setup_numa_zonelist_order);
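In practice the ordering is selected either at boot time, e.g. by adding numa_zonelist_order=zone to the kernel command line, or at run time by writing "default", "node" or "zone" to /proc/sys/vm/numa_zonelist_order, which is serviced by the sysctl handler that follows.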
3357 3359
3358 /* 3360 /*
3359 * sysctl handler for numa_zonelist_order 3361 * sysctl handler for numa_zonelist_order
3360 */ 3362 */
3361 int numa_zonelist_order_handler(ctl_table *table, int write, 3363 int numa_zonelist_order_handler(ctl_table *table, int write,
3362 void __user *buffer, size_t *length, 3364 void __user *buffer, size_t *length,
3363 loff_t *ppos) 3365 loff_t *ppos)
3364 { 3366 {
3365 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 3367 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3366 int ret; 3368 int ret;
3367 static DEFINE_MUTEX(zl_order_mutex); 3369 static DEFINE_MUTEX(zl_order_mutex);
3368 3370
3369 mutex_lock(&zl_order_mutex); 3371 mutex_lock(&zl_order_mutex);
3370 if (write) { 3372 if (write) {
3371 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { 3373 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
3372 ret = -EINVAL; 3374 ret = -EINVAL;
3373 goto out; 3375 goto out;
3374 } 3376 }
3375 strcpy(saved_string, (char *)table->data); 3377 strcpy(saved_string, (char *)table->data);
3376 } 3378 }
3377 ret = proc_dostring(table, write, buffer, length, ppos); 3379 ret = proc_dostring(table, write, buffer, length, ppos);
3378 if (ret) 3380 if (ret)
3379 goto out; 3381 goto out;
3380 if (write) { 3382 if (write) {
3381 int oldval = user_zonelist_order; 3383 int oldval = user_zonelist_order;
3382 3384
3383 ret = __parse_numa_zonelist_order((char *)table->data); 3385 ret = __parse_numa_zonelist_order((char *)table->data);
3384 if (ret) { 3386 if (ret) {
3385 /* 3387 /*
3386 * bogus value. restore saved string 3388 * bogus value. restore saved string
3387 */ 3389 */
3388 strncpy((char *)table->data, saved_string, 3390 strncpy((char *)table->data, saved_string,
3389 NUMA_ZONELIST_ORDER_LEN); 3391 NUMA_ZONELIST_ORDER_LEN);
3390 user_zonelist_order = oldval; 3392 user_zonelist_order = oldval;
3391 } else if (oldval != user_zonelist_order) { 3393 } else if (oldval != user_zonelist_order) {
3392 mutex_lock(&zonelists_mutex); 3394 mutex_lock(&zonelists_mutex);
3393 build_all_zonelists(NULL, NULL); 3395 build_all_zonelists(NULL, NULL);
3394 mutex_unlock(&zonelists_mutex); 3396 mutex_unlock(&zonelists_mutex);
3395 } 3397 }
3396 } 3398 }
3397 out: 3399 out:
3398 mutex_unlock(&zl_order_mutex); 3400 mutex_unlock(&zl_order_mutex);
3399 return ret; 3401 return ret;
3400 } 3402 }
3401 3403
3402 3404
3403 #define MAX_NODE_LOAD (nr_online_nodes) 3405 #define MAX_NODE_LOAD (nr_online_nodes)
3404 static int node_load[MAX_NUMNODES]; 3406 static int node_load[MAX_NUMNODES];
3405 3407
3406 /** 3408 /**
3407 * find_next_best_node - find the next node that should appear in a given node's fallback list 3409 * find_next_best_node - find the next node that should appear in a given node's fallback list
3408 * @node: node whose fallback list we're appending 3410 * @node: node whose fallback list we're appending
3409 * @used_node_mask: nodemask_t of already used nodes 3411 * @used_node_mask: nodemask_t of already used nodes
3410 * 3412 *
3411 * We use a number of factors to determine which is the next node that should 3413 * We use a number of factors to determine which is the next node that should
3412 * appear on a given node's fallback list. The node should not have appeared 3414 * appear on a given node's fallback list. The node should not have appeared
3413 * already in @node's fallback list, and it should be the next closest node 3415 * already in @node's fallback list, and it should be the next closest node
3414 * according to the distance array (which contains arbitrary distance values 3416 * according to the distance array (which contains arbitrary distance values
3415 * from each node to each node in the system), and should also prefer nodes 3417 * from each node to each node in the system), and should also prefer nodes
3416 * with no CPUs, since presumably they'll have very little allocation pressure 3418 * with no CPUs, since presumably they'll have very little allocation pressure
3417 * on them otherwise. 3419 * on them otherwise.
3418 * It returns -1 if no node is found. 3420 * It returns -1 if no node is found.
3419 */ 3421 */
3420 static int find_next_best_node(int node, nodemask_t *used_node_mask) 3422 static int find_next_best_node(int node, nodemask_t *used_node_mask)
3421 { 3423 {
3422 int n, val; 3424 int n, val;
3423 int min_val = INT_MAX; 3425 int min_val = INT_MAX;
3424 int best_node = NUMA_NO_NODE; 3426 int best_node = NUMA_NO_NODE;
3425 const struct cpumask *tmp = cpumask_of_node(0); 3427 const struct cpumask *tmp = cpumask_of_node(0);
3426 3428
3427 /* Use the local node if we haven't already */ 3429 /* Use the local node if we haven't already */
3428 if (!node_isset(node, *used_node_mask)) { 3430 if (!node_isset(node, *used_node_mask)) {
3429 node_set(node, *used_node_mask); 3431 node_set(node, *used_node_mask);
3430 return node; 3432 return node;
3431 } 3433 }
3432 3434
3433 for_each_node_state(n, N_MEMORY) { 3435 for_each_node_state(n, N_MEMORY) {
3434 3436
3435 /* Don't want a node to appear more than once */ 3437 /* Don't want a node to appear more than once */
3436 if (node_isset(n, *used_node_mask)) 3438 if (node_isset(n, *used_node_mask))
3437 continue; 3439 continue;
3438 3440
3439 /* Use the distance array to find the distance */ 3441 /* Use the distance array to find the distance */
3440 val = node_distance(node, n); 3442 val = node_distance(node, n);
3441 3443
3442 /* Penalize nodes under us ("prefer the next node") */ 3444 /* Penalize nodes under us ("prefer the next node") */
3443 val += (n < node); 3445 val += (n < node);
3444 3446
3445 /* Give preference to headless and unused nodes */ 3447 /* Give preference to headless and unused nodes */
3446 tmp = cpumask_of_node(n); 3448 tmp = cpumask_of_node(n);
3447 if (!cpumask_empty(tmp)) 3449 if (!cpumask_empty(tmp))
3448 val += PENALTY_FOR_NODE_WITH_CPUS; 3450 val += PENALTY_FOR_NODE_WITH_CPUS;
3449 3451
3450 /* Slight preference for less loaded node */ 3452 /* Slight preference for less loaded node */
3451 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 3453 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3452 val += node_load[n]; 3454 val += node_load[n];
3453 3455
3454 if (val < min_val) { 3456 if (val < min_val) {
3455 min_val = val; 3457 min_val = val;
3456 best_node = n; 3458 best_node = n;
3457 } 3459 }
3458 } 3460 }
3459 3461
3460 if (best_node >= 0) 3462 if (best_node >= 0)
3461 node_set(best_node, *used_node_mask); 3463 node_set(best_node, *used_node_mask);
3462 3464
3463 return best_node; 3465 return best_node;
3464 } 3466 }
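A note on the scoring above: the distance, the prefer-next-node bias and the has-CPUs penalty are combined first and then scaled by MAX_NODE_LOAD * MAX_NUMNODES, while node_load is added only after that scaling; node_load can therefore only break ties between candidates whose penalised distances are equal, which is what gives less-loaded nodes their slight preference.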
3465 3467
3466 3468
3467 /* 3469 /*
3468 * Build zonelists ordered by node and zones within node. 3470 * Build zonelists ordered by node and zones within node.
3469 * This results in maximum locality--normal zone overflows into local 3471 * This results in maximum locality--normal zone overflows into local
3470 * DMA zone, if any--but risks exhausting DMA zone. 3472 * DMA zone, if any--but risks exhausting DMA zone.
3471 */ 3473 */
3472 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 3474 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3473 { 3475 {
3474 int j; 3476 int j;
3475 struct zonelist *zonelist; 3477 struct zonelist *zonelist;
3476 3478
3477 zonelist = &pgdat->node_zonelists[0]; 3479 zonelist = &pgdat->node_zonelists[0];
3478 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3480 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3479 ; 3481 ;
3480 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3482 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3481 zonelist->_zonerefs[j].zone = NULL; 3483 zonelist->_zonerefs[j].zone = NULL;
3482 zonelist->_zonerefs[j].zone_idx = 0; 3484 zonelist->_zonerefs[j].zone_idx = 0;
3483 } 3485 }
3484 3486
3485 /* 3487 /*
3486 * Build gfp_thisnode zonelists 3488 * Build gfp_thisnode zonelists
3487 */ 3489 */
3488 static void build_thisnode_zonelists(pg_data_t *pgdat) 3490 static void build_thisnode_zonelists(pg_data_t *pgdat)
3489 { 3491 {
3490 int j; 3492 int j;
3491 struct zonelist *zonelist; 3493 struct zonelist *zonelist;
3492 3494
3493 zonelist = &pgdat->node_zonelists[1]; 3495 zonelist = &pgdat->node_zonelists[1];
3494 j = build_zonelists_node(pgdat, zonelist, 0); 3496 j = build_zonelists_node(pgdat, zonelist, 0);
3495 zonelist->_zonerefs[j].zone = NULL; 3497 zonelist->_zonerefs[j].zone = NULL;
3496 zonelist->_zonerefs[j].zone_idx = 0; 3498 zonelist->_zonerefs[j].zone_idx = 0;
3497 } 3499 }
3498 3500
3499 /* 3501 /*
3500 * Build zonelists ordered by zone and nodes within zones. 3502 * Build zonelists ordered by zone and nodes within zones.
3501 * This results in conserving DMA zone[s] until all Normal memory is 3503 * This results in conserving DMA zone[s] until all Normal memory is
3502 * exhausted, but results in overflowing to a remote node while memory 3504 * exhausted, but results in overflowing to a remote node while memory
3503 * may still exist in local DMA zone. 3505 * may still exist in local DMA zone.
3504 */ 3506 */
3505 static int node_order[MAX_NUMNODES]; 3507 static int node_order[MAX_NUMNODES];
3506 3508
3507 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 3509 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3508 { 3510 {
3509 int pos, j, node; 3511 int pos, j, node;
3510 int zone_type; /* needs to be signed */ 3512 int zone_type; /* needs to be signed */
3511 struct zone *z; 3513 struct zone *z;
3512 struct zonelist *zonelist; 3514 struct zonelist *zonelist;
3513 3515
3514 zonelist = &pgdat->node_zonelists[0]; 3516 zonelist = &pgdat->node_zonelists[0];
3515 pos = 0; 3517 pos = 0;
3516 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 3518 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3517 for (j = 0; j < nr_nodes; j++) { 3519 for (j = 0; j < nr_nodes; j++) {
3518 node = node_order[j]; 3520 node = node_order[j];
3519 z = &NODE_DATA(node)->node_zones[zone_type]; 3521 z = &NODE_DATA(node)->node_zones[zone_type];
3520 if (populated_zone(z)) { 3522 if (populated_zone(z)) {
3521 zoneref_set_zone(z, 3523 zoneref_set_zone(z,
3522 &zonelist->_zonerefs[pos++]); 3524 &zonelist->_zonerefs[pos++]);
3523 check_highest_zone(zone_type); 3525 check_highest_zone(zone_type);
3524 } 3526 }
3525 } 3527 }
3526 } 3528 }
3527 zonelist->_zonerefs[pos].zone = NULL; 3529 zonelist->_zonerefs[pos].zone = NULL;
3528 zonelist->_zonerefs[pos].zone_idx = 0; 3530 zonelist->_zonerefs[pos].zone_idx = 0;
3529 } 3531 }
3530 3532
3531 static int default_zonelist_order(void) 3533 static int default_zonelist_order(void)
3532 { 3534 {
3533 int nid, zone_type; 3535 int nid, zone_type;
3534 unsigned long low_kmem_size, total_size; 3536 unsigned long low_kmem_size, total_size;
3535 struct zone *z; 3537 struct zone *z;
3536 int average_size; 3538 int average_size;
3537 /* 3539 /*
3538 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system. 3540 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system.
3539 * If they are really small and used heavily, the system can fall 3541 * If they are really small and used heavily, the system can fall
3540 * into OOM very easily. 3542 * into OOM very easily.
3541 * This function detects the ZONE_DMA/DMA32 size and configures the zone order. 3543 * This function detects the ZONE_DMA/DMA32 size and configures the zone order.
3542 */ 3544 */
3543 /* Is there ZONE_NORMAL? (e.g. ppc has only a DMA zone.) */ 3545 /* Is there ZONE_NORMAL? (e.g. ppc has only a DMA zone.) */
3544 low_kmem_size = 0; 3546 low_kmem_size = 0;
3545 total_size = 0; 3547 total_size = 0;
3546 for_each_online_node(nid) { 3548 for_each_online_node(nid) {
3547 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3549 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3548 z = &NODE_DATA(nid)->node_zones[zone_type]; 3550 z = &NODE_DATA(nid)->node_zones[zone_type];
3549 if (populated_zone(z)) { 3551 if (populated_zone(z)) {
3550 if (zone_type < ZONE_NORMAL) 3552 if (zone_type < ZONE_NORMAL)
3551 low_kmem_size += z->managed_pages; 3553 low_kmem_size += z->managed_pages;
3552 total_size += z->managed_pages; 3554 total_size += z->managed_pages;
3553 } else if (zone_type == ZONE_NORMAL) { 3555 } else if (zone_type == ZONE_NORMAL) {
3554 /* 3556 /*
3555 * If any node has only lowmem, then node order 3557 * If any node has only lowmem, then node order
3556 * is preferred to allow kernel allocations 3558 * is preferred to allow kernel allocations
3557 * locally; otherwise, they can easily infringe 3559 * locally; otherwise, they can easily infringe
3558 * on other nodes when there is an abundance of 3560 * on other nodes when there is an abundance of
3559 * lowmem available to allocate from. 3561 * lowmem available to allocate from.
3560 */ 3562 */
3561 return ZONELIST_ORDER_NODE; 3563 return ZONELIST_ORDER_NODE;
3562 } 3564 }
3563 } 3565 }
3564 } 3566 }
3565 if (!low_kmem_size || /* there is no DMA area. */ 3567 if (!low_kmem_size || /* there is no DMA area. */
3566 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 3568 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3567 return ZONELIST_ORDER_NODE; 3569 return ZONELIST_ORDER_NODE;
3568 /* 3570 /*
3569 * look into each node's config. 3571 * look into each node's config.
3570 * If there is a node whose DMA/DMA32 memory makes up a very large share of 3572 * If there is a node whose DMA/DMA32 memory makes up a very large share of
3571 * its local memory, NODE_ORDER may be suitable. 3573 * its local memory, NODE_ORDER may be suitable.
3572 */ 3574 */
3573 average_size = total_size / 3575 average_size = total_size /
3574 (nodes_weight(node_states[N_MEMORY]) + 1); 3576 (nodes_weight(node_states[N_MEMORY]) + 1);
3575 for_each_online_node(nid) { 3577 for_each_online_node(nid) {
3576 low_kmem_size = 0; 3578 low_kmem_size = 0;
3577 total_size = 0; 3579 total_size = 0;
3578 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3580 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3579 z = &NODE_DATA(nid)->node_zones[zone_type]; 3581 z = &NODE_DATA(nid)->node_zones[zone_type];
3580 if (populated_zone(z)) { 3582 if (populated_zone(z)) {
3581 if (zone_type < ZONE_NORMAL) 3583 if (zone_type < ZONE_NORMAL)
3582 low_kmem_size += z->present_pages; 3584 low_kmem_size += z->present_pages;
3583 total_size += z->present_pages; 3585 total_size += z->present_pages;
3584 } 3586 }
3585 } 3587 }
3586 if (low_kmem_size && 3588 if (low_kmem_size &&
3587 total_size > average_size && /* ignore small node */ 3589 total_size > average_size && /* ignore small node */
3588 low_kmem_size > total_size * 70/100) 3590 low_kmem_size > total_size * 70/100)
3589 return ZONELIST_ORDER_NODE; 3591 return ZONELIST_ORDER_NODE;
3590 } 3592 }
3591 return ZONELIST_ORDER_ZONE; 3593 return ZONELIST_ORDER_ZONE;
3592 } 3594 }
3593 3595
3594 static void set_zonelist_order(void) 3596 static void set_zonelist_order(void)
3595 { 3597 {
3596 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 3598 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3597 current_zonelist_order = default_zonelist_order(); 3599 current_zonelist_order = default_zonelist_order();
3598 else 3600 else
3599 current_zonelist_order = user_zonelist_order; 3601 current_zonelist_order = user_zonelist_order;
3600 } 3602 }
3601 3603
3602 static void build_zonelists(pg_data_t *pgdat) 3604 static void build_zonelists(pg_data_t *pgdat)
3603 { 3605 {
3604 int j, node, load; 3606 int j, node, load;
3605 enum zone_type i; 3607 enum zone_type i;
3606 nodemask_t used_mask; 3608 nodemask_t used_mask;
3607 int local_node, prev_node; 3609 int local_node, prev_node;
3608 struct zonelist *zonelist; 3610 struct zonelist *zonelist;
3609 int order = current_zonelist_order; 3611 int order = current_zonelist_order;
3610 3612
3611 /* initialize zonelists */ 3613 /* initialize zonelists */
3612 for (i = 0; i < MAX_ZONELISTS; i++) { 3614 for (i = 0; i < MAX_ZONELISTS; i++) {
3613 zonelist = pgdat->node_zonelists + i; 3615 zonelist = pgdat->node_zonelists + i;
3614 zonelist->_zonerefs[0].zone = NULL; 3616 zonelist->_zonerefs[0].zone = NULL;
3615 zonelist->_zonerefs[0].zone_idx = 0; 3617 zonelist->_zonerefs[0].zone_idx = 0;
3616 } 3618 }
3617 3619
3618 /* NUMA-aware ordering of nodes */ 3620 /* NUMA-aware ordering of nodes */
3619 local_node = pgdat->node_id; 3621 local_node = pgdat->node_id;
3620 load = nr_online_nodes; 3622 load = nr_online_nodes;
3621 prev_node = local_node; 3623 prev_node = local_node;
3622 nodes_clear(used_mask); 3624 nodes_clear(used_mask);
3623 3625
3624 memset(node_order, 0, sizeof(node_order)); 3626 memset(node_order, 0, sizeof(node_order));
3625 j = 0; 3627 j = 0;
3626 3628
3627 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3629 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3628 /* 3630 /*
3629 * We don't want to pressure a particular node. 3631 * We don't want to pressure a particular node.
3630 * So add a penalty to the first node in the same 3632 * So add a penalty to the first node in the same
3631 * distance group to make it round-robin. 3633 * distance group to make it round-robin.
3632 */ 3634 */
3633 if (node_distance(local_node, node) != 3635 if (node_distance(local_node, node) !=
3634 node_distance(local_node, prev_node)) 3636 node_distance(local_node, prev_node))
3635 node_load[node] = load; 3637 node_load[node] = load;
3636 3638
3637 prev_node = node; 3639 prev_node = node;
3638 load--; 3640 load--;
3639 if (order == ZONELIST_ORDER_NODE) 3641 if (order == ZONELIST_ORDER_NODE)
3640 build_zonelists_in_node_order(pgdat, node); 3642 build_zonelists_in_node_order(pgdat, node);
3641 else 3643 else
3642 node_order[j++] = node; /* remember order */ 3644 node_order[j++] = node; /* remember order */
3643 } 3645 }
3644 3646
3645 if (order == ZONELIST_ORDER_ZONE) { 3647 if (order == ZONELIST_ORDER_ZONE) {
3646 /* calculate node order -- i.e., DMA last! */ 3648 /* calculate node order -- i.e., DMA last! */
3647 build_zonelists_in_zone_order(pgdat, j); 3649 build_zonelists_in_zone_order(pgdat, j);
3648 } 3650 }
3649 3651
3650 build_thisnode_zonelists(pgdat); 3652 build_thisnode_zonelists(pgdat);
3651 } 3653 }
3652 3654
3653 /* Construct the zonelist performance cache - see further mmzone.h */ 3655 /* Construct the zonelist performance cache - see further mmzone.h */
3654 static void build_zonelist_cache(pg_data_t *pgdat) 3656 static void build_zonelist_cache(pg_data_t *pgdat)
3655 { 3657 {
3656 struct zonelist *zonelist; 3658 struct zonelist *zonelist;
3657 struct zonelist_cache *zlc; 3659 struct zonelist_cache *zlc;
3658 struct zoneref *z; 3660 struct zoneref *z;
3659 3661
3660 zonelist = &pgdat->node_zonelists[0]; 3662 zonelist = &pgdat->node_zonelists[0];
3661 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 3663 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3662 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 3664 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3663 for (z = zonelist->_zonerefs; z->zone; z++) 3665 for (z = zonelist->_zonerefs; z->zone; z++)
3664 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 3666 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3665 } 3667 }
3666 3668
3667 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3669 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3668 /* 3670 /*
3669 * Return node id of node used for "local" allocations. 3671 * Return node id of node used for "local" allocations.
3670 * I.e., first node id of first zone in arg node's generic zonelist. 3672 * I.e., first node id of first zone in arg node's generic zonelist.
3671 * Used for initializing percpu 'numa_mem', which is used primarily 3673 * Used for initializing percpu 'numa_mem', which is used primarily
3672 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3674 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3673 */ 3675 */
3674 int local_memory_node(int node) 3676 int local_memory_node(int node)
3675 { 3677 {
3676 struct zone *zone; 3678 struct zone *zone;
3677 3679
3678 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3680 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3679 gfp_zone(GFP_KERNEL), 3681 gfp_zone(GFP_KERNEL),
3680 NULL, 3682 NULL,
3681 &zone); 3683 &zone);
3682 return zone->node; 3684 return zone->node;
3683 } 3685 }
3684 #endif 3686 #endif
3685 3687
3686 #else /* CONFIG_NUMA */ 3688 #else /* CONFIG_NUMA */
3687 3689
3688 static void set_zonelist_order(void) 3690 static void set_zonelist_order(void)
3689 { 3691 {
3690 current_zonelist_order = ZONELIST_ORDER_ZONE; 3692 current_zonelist_order = ZONELIST_ORDER_ZONE;
3691 } 3693 }
3692 3694
3693 static void build_zonelists(pg_data_t *pgdat) 3695 static void build_zonelists(pg_data_t *pgdat)
3694 { 3696 {
3695 int node, local_node; 3697 int node, local_node;
3696 enum zone_type j; 3698 enum zone_type j;
3697 struct zonelist *zonelist; 3699 struct zonelist *zonelist;
3698 3700
3699 local_node = pgdat->node_id; 3701 local_node = pgdat->node_id;
3700 3702
3701 zonelist = &pgdat->node_zonelists[0]; 3703 zonelist = &pgdat->node_zonelists[0];
3702 j = build_zonelists_node(pgdat, zonelist, 0); 3704 j = build_zonelists_node(pgdat, zonelist, 0);
3703 3705
3704 /* 3706 /*
3705 * Now we build the zonelist so that it contains the zones 3707 * Now we build the zonelist so that it contains the zones
3706 * of all the other nodes. 3708 * of all the other nodes.
3707 * We don't want to pressure a particular node, so when 3709 * We don't want to pressure a particular node, so when
3708 * building the zones for node N, we make sure that the 3710 * building the zones for node N, we make sure that the
3709 * zones coming right after the local ones are those from 3711 * zones coming right after the local ones are those from
3710 * node N+1 (modulo the number of nodes) 3712 * node N+1 (modulo the number of nodes)
3711 */ 3713 */
3712 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3714 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3713 if (!node_online(node)) 3715 if (!node_online(node))
3714 continue; 3716 continue;
3715 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3717 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3716 } 3718 }
3717 for (node = 0; node < local_node; node++) { 3719 for (node = 0; node < local_node; node++) {
3718 if (!node_online(node)) 3720 if (!node_online(node))
3719 continue; 3721 continue;
3720 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3722 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3721 } 3723 }
3722 3724
3723 zonelist->_zonerefs[j].zone = NULL; 3725 zonelist->_zonerefs[j].zone = NULL;
3724 zonelist->_zonerefs[j].zone_idx = 0; 3726 zonelist->_zonerefs[j].zone_idx = 0;
3725 } 3727 }
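[Editorial note] The rotation described in the comment above is easy to see in a standalone sketch; the node count and local node below are hypothetical example values:

#include <stdio.h>

#define NR_NODES 4	/* hypothetical node count, for illustration only */

/*
 * Sketch of the ordering built above: the local node first, then the
 * remaining nodes rotated so node N is followed by N+1, N+2, ...
 * wrapping around to 0.
 */
int main(void)
{
	int local_node = 2, node;

	printf("%d", local_node);
	for (node = local_node + 1; node < NR_NODES; node++)
		printf(" %d", node);
	for (node = 0; node < local_node; node++)
		printf(" %d", node);
	printf("\n");	/* prints: 2 3 0 1 */
	return 0;
}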
3726 3728
3727 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3729 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3728 static void build_zonelist_cache(pg_data_t *pgdat) 3730 static void build_zonelist_cache(pg_data_t *pgdat)
3729 { 3731 {
3730 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3732 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3731 } 3733 }
3732 3734
3733 #endif /* CONFIG_NUMA */ 3735 #endif /* CONFIG_NUMA */
3734 3736
3735 /* 3737 /*
3736 * Boot pageset table. One per cpu which is going to be used for all 3738 * Boot pageset table. One per cpu which is going to be used for all
3737 * zones and all nodes. The parameters will be set in such a way 3739 * zones and all nodes. The parameters will be set in such a way
3738 * that an item put on a list will immediately be handed over to 3740 * that an item put on a list will immediately be handed over to
3739 * the buddy list. This is safe since pageset manipulation is done 3741 * the buddy list. This is safe since pageset manipulation is done
3740 * with interrupts disabled. 3742 * with interrupts disabled.
3741 * 3743 *
3742 * The boot_pagesets must be kept even after bootup is complete for 3744 * The boot_pagesets must be kept even after bootup is complete for
3743 * unused processors and/or zones. They do play a role for bootstrapping 3745 * unused processors and/or zones. They do play a role for bootstrapping
3744 * hotplugged processors. 3746 * hotplugged processors.
3745 * 3747 *
3746 * zoneinfo_show() and maybe other functions do 3748 * zoneinfo_show() and maybe other functions do
3747 * not check if the processor is online before following the pageset pointer. 3749 * not check if the processor is online before following the pageset pointer.
3748 * Other parts of the kernel may not check if the zone is available. 3750 * Other parts of the kernel may not check if the zone is available.
3749 */ 3751 */
3750 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3752 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3751 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3753 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3752 static void setup_zone_pageset(struct zone *zone); 3754 static void setup_zone_pageset(struct zone *zone);
3753 3755
3754 /* 3756 /*
3755 * Global mutex to protect against size modification of zonelists 3757 * Global mutex to protect against size modification of zonelists
3756 * as well as to serialize pageset setup for the new populated zone. 3758 * as well as to serialize pageset setup for the new populated zone.
3757 */ 3759 */
3758 DEFINE_MUTEX(zonelists_mutex); 3760 DEFINE_MUTEX(zonelists_mutex);
3759 3761
3760 /* the return value is int only because stop_machine() requires it */ 3762 /* the return value is int only because stop_machine() requires it */
3761 static int __build_all_zonelists(void *data) 3763 static int __build_all_zonelists(void *data)
3762 { 3764 {
3763 int nid; 3765 int nid;
3764 int cpu; 3766 int cpu;
3765 pg_data_t *self = data; 3767 pg_data_t *self = data;
3766 3768
3767 #ifdef CONFIG_NUMA 3769 #ifdef CONFIG_NUMA
3768 memset(node_load, 0, sizeof(node_load)); 3770 memset(node_load, 0, sizeof(node_load));
3769 #endif 3771 #endif
3770 3772
3771 if (self && !node_online(self->node_id)) { 3773 if (self && !node_online(self->node_id)) {
3772 build_zonelists(self); 3774 build_zonelists(self);
3773 build_zonelist_cache(self); 3775 build_zonelist_cache(self);
3774 } 3776 }
3775 3777
3776 for_each_online_node(nid) { 3778 for_each_online_node(nid) {
3777 pg_data_t *pgdat = NODE_DATA(nid); 3779 pg_data_t *pgdat = NODE_DATA(nid);
3778 3780
3779 build_zonelists(pgdat); 3781 build_zonelists(pgdat);
3780 build_zonelist_cache(pgdat); 3782 build_zonelist_cache(pgdat);
3781 } 3783 }
3782 3784
3783 /* 3785 /*
3784 * Initialize the boot_pagesets that are going to be used 3786 * Initialize the boot_pagesets that are going to be used
3785 * for bootstrapping processors. The real pagesets for 3787 * for bootstrapping processors. The real pagesets for
3786 * each zone will be allocated later when the per cpu 3788 * each zone will be allocated later when the per cpu
3787 * allocator is available. 3789 * allocator is available.
3788 * 3790 *
3789 * boot_pagesets are used also for bootstrapping offline 3791 * boot_pagesets are used also for bootstrapping offline
3790 * cpus if the system is already booted because the pagesets 3792 * cpus if the system is already booted because the pagesets
3791 * are needed to initialize allocators on a specific cpu too. 3793 * are needed to initialize allocators on a specific cpu too.
3792 * F.e. the percpu allocator needs the page allocator which 3794 * F.e. the percpu allocator needs the page allocator which
3793 * needs the percpu allocator in order to allocate its pagesets 3795 * needs the percpu allocator in order to allocate its pagesets
3794 * (a chicken-egg dilemma). 3796 * (a chicken-egg dilemma).
3795 */ 3797 */
3796 for_each_possible_cpu(cpu) { 3798 for_each_possible_cpu(cpu) {
3797 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3799 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3798 3800
3799 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3801 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3800 /* 3802 /*
3801 * We now know the "local memory node" for each node-- 3803 * We now know the "local memory node" for each node--
3802 * i.e., the node of the first zone in the generic zonelist. 3804 * i.e., the node of the first zone in the generic zonelist.
3803 * Set up numa_mem percpu variable for on-line cpus. During 3805 * Set up numa_mem percpu variable for on-line cpus. During
3804 * boot, only the boot cpu should be on-line; we'll init the 3806 * boot, only the boot cpu should be on-line; we'll init the
3805 * secondary cpus' numa_mem as they come on-line. During 3807 * secondary cpus' numa_mem as they come on-line. During
3806 * node/memory hotplug, we'll fixup all on-line cpus. 3808 * node/memory hotplug, we'll fixup all on-line cpus.
3807 */ 3809 */
3808 if (cpu_online(cpu)) 3810 if (cpu_online(cpu))
3809 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3811 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3810 #endif 3812 #endif
3811 } 3813 }
3812 3814
3813 return 0; 3815 return 0;
3814 } 3816 }
3815 3817
3816 /* 3818 /*
3817 * Called with zonelists_mutex held always 3819 * Called with zonelists_mutex held always
3818 * unless system_state == SYSTEM_BOOTING. 3820 * unless system_state == SYSTEM_BOOTING.
3819 */ 3821 */
3820 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3822 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3821 { 3823 {
3822 set_zonelist_order(); 3824 set_zonelist_order();
3823 3825
3824 if (system_state == SYSTEM_BOOTING) { 3826 if (system_state == SYSTEM_BOOTING) {
3825 __build_all_zonelists(NULL); 3827 __build_all_zonelists(NULL);
3826 mminit_verify_zonelist(); 3828 mminit_verify_zonelist();
3827 cpuset_init_current_mems_allowed(); 3829 cpuset_init_current_mems_allowed();
3828 } else { 3830 } else {
3829 #ifdef CONFIG_MEMORY_HOTPLUG 3831 #ifdef CONFIG_MEMORY_HOTPLUG
3830 if (zone) 3832 if (zone)
3831 setup_zone_pageset(zone); 3833 setup_zone_pageset(zone);
3832 #endif 3834 #endif
3833 /* we have to stop all cpus to guarantee there is no user 3835 /* we have to stop all cpus to guarantee there is no user
3834 of the zonelist */ 3836 of the zonelist */
3835 stop_machine(__build_all_zonelists, pgdat, NULL); 3837 stop_machine(__build_all_zonelists, pgdat, NULL);
3836 /* cpuset refresh routine should be here */ 3838 /* cpuset refresh routine should be here */
3837 } 3839 }
3838 vm_total_pages = nr_free_pagecache_pages(); 3840 vm_total_pages = nr_free_pagecache_pages();
3839 /* 3841 /*
3840 * Disable grouping by mobility if the number of pages in the 3842 * Disable grouping by mobility if the number of pages in the
3841 * system is too low to allow the mechanism to work. It would be 3843 * system is too low to allow the mechanism to work. It would be
3842 * more accurate, but expensive to check per-zone. This check is 3844 * more accurate, but expensive to check per-zone. This check is
3843 * made on memory-hotadd so a system can start with mobility 3845 * made on memory-hotadd so a system can start with mobility
3844 * disabled and enable it later 3846 * disabled and enable it later
3845 */ 3847 */
3846 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3848 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3847 page_group_by_mobility_disabled = 1; 3849 page_group_by_mobility_disabled = 1;
3848 else 3850 else
3849 page_group_by_mobility_disabled = 0; 3851 page_group_by_mobility_disabled = 0;
3850 3852
3851 printk("Built %i zonelists in %s order, mobility grouping %s. " 3853 printk("Built %i zonelists in %s order, mobility grouping %s. "
3852 "Total pages: %ld\n", 3854 "Total pages: %ld\n",
3853 nr_online_nodes, 3855 nr_online_nodes,
3854 zonelist_order_name[current_zonelist_order], 3856 zonelist_order_name[current_zonelist_order],
3855 page_group_by_mobility_disabled ? "off" : "on", 3857 page_group_by_mobility_disabled ? "off" : "on",
3856 vm_total_pages); 3858 vm_total_pages);
3857 #ifdef CONFIG_NUMA 3859 #ifdef CONFIG_NUMA
3858 printk("Policy zone: %s\n", zone_names[policy_zone]); 3860 printk("Policy zone: %s\n", zone_names[policy_zone]);
3859 #endif 3861 #endif
3860 } 3862 }
3861 3863
3862 /* 3864 /*
3863 * Helper functions to size the waitqueue hash table. 3865 * Helper functions to size the waitqueue hash table.
3864 * Essentially these want to choose hash table sizes sufficiently 3866 * Essentially these want to choose hash table sizes sufficiently
3865 * large so that collisions trying to wait on pages are rare. 3867 * large so that collisions trying to wait on pages are rare.
3866 * But in fact, the number of active page waitqueues on typical 3868 * But in fact, the number of active page waitqueues on typical
3867 * systems is ridiculously low, less than 200. So this is even 3869 * systems is ridiculously low, less than 200. So this is even
3868 * conservative, even though it seems large. 3870 * conservative, even though it seems large.
3869 * 3871 *
3870 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3872 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3871 * waitqueues, i.e. the size of the waitq table given the number of pages. 3873 * waitqueues, i.e. the size of the waitq table given the number of pages.
3872 */ 3874 */
3873 #define PAGES_PER_WAITQUEUE 256 3875 #define PAGES_PER_WAITQUEUE 256
3874 3876
3875 #ifndef CONFIG_MEMORY_HOTPLUG 3877 #ifndef CONFIG_MEMORY_HOTPLUG
3876 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3878 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3877 { 3879 {
3878 unsigned long size = 1; 3880 unsigned long size = 1;
3879 3881
3880 pages /= PAGES_PER_WAITQUEUE; 3882 pages /= PAGES_PER_WAITQUEUE;
3881 3883
3882 while (size < pages) 3884 while (size < pages)
3883 size <<= 1; 3885 size <<= 1;
3884 3886
3885 /* 3887 /*
3886 * Once we have dozens or even hundreds of threads sleeping 3888 * Once we have dozens or even hundreds of threads sleeping
3887 * on IO we've got bigger problems than wait queue collision. 3889 * on IO we've got bigger problems than wait queue collision.
3888 * Limit the size of the wait table to a reasonable size. 3890 * Limit the size of the wait table to a reasonable size.
3889 */ 3891 */
3890 size = min(size, 4096UL); 3892 size = min(size, 4096UL);
3891 3893
3892 return max(size, 4UL); 3894 return max(size, 4UL);
3893 } 3895 }
3894 #else 3896 #else
3895 /* 3897 /*
3896 * A zone's size might be changed by hot-add, so it is not possible to determine 3898 * A zone's size might be changed by hot-add, so it is not possible to determine
3897 * a suitable size for its wait_table. So we use the maximum size now. 3899 * a suitable size for its wait_table. So we use the maximum size now.
3898 * 3900 *
3899 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3901 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3900 * 3902 *
3901 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3903 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3902 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3904 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3903 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3905 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3904 * 3906 *
3905 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 3907 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
3906 * or more by the traditional way. (See above). It equals: 3908 * or more by the traditional way. (See above). It equals:
3907 * 3909 *
3908 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3910 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3909 * ia64(16K page size) : = ( 8G + 4M)byte. 3911 * ia64(16K page size) : = ( 8G + 4M)byte.
3910 * powerpc (64K page size) : = (32G +16M)byte. 3912 * powerpc (64K page size) : = (32G +16M)byte.
3911 */ 3913 */
3912 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3914 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3913 { 3915 {
3914 return 4096UL; 3916 return 4096UL;
3915 } 3917 }
3916 #endif 3918 #endif
3917 3919
3918 /* 3920 /*
3919 * This is an integer logarithm so that shifts can be used later 3921 * This is an integer logarithm so that shifts can be used later
3920 * to extract the more random high bits from the multiplicative 3922 * to extract the more random high bits from the multiplicative
3921 * hash function before the remainder is taken. 3923 * hash function before the remainder is taken.
3922 */ 3924 */
3923 static inline unsigned long wait_table_bits(unsigned long size) 3925 static inline unsigned long wait_table_bits(unsigned long size)
3924 { 3926 {
3925 return ffz(~size); 3927 return ffz(~size);
3926 } 3928 }
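[Editorial note] To make the sizing concrete, here is a userspace sketch of the two helpers above. ffz() is reimplemented with a simple loop that is assumed to match its behaviour for power-of-two sizes; a 1 GB zone of 4 KB pages (262144 pages) ends up with a 1024-entry table and 10 hash bits.

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

/* Sketch of wait_table_hash_nr_entries() for the !MEMORY_HOTPLUG case. */
static unsigned long table_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096UL)
		size = 4096UL;
	return size < 4UL ? 4UL : size;
}

/* Stand-in for ffz(~size): the base-2 log of a power-of-two size. */
static unsigned long table_bits(unsigned long size)
{
	unsigned long bits = 0;

	while (size > 1) {
		size >>= 1;
		bits++;
	}
	return bits;
}

int main(void)
{
	unsigned long entries = table_entries(262144);	/* 1GB of 4KB pages */

	printf("%lu entries, %lu bits\n", entries, table_bits(entries));	/* 1024, 10 */
	return 0;
}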
3927 3929
3928 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3930 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3929 3931
3930 /* 3932 /*
3931 * Check if a pageblock contains reserved pages 3933 * Check if a pageblock contains reserved pages
3932 */ 3934 */
3933 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3935 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3934 { 3936 {
3935 unsigned long pfn; 3937 unsigned long pfn;
3936 3938
3937 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3939 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3938 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3940 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3939 return 1; 3941 return 1;
3940 } 3942 }
3941 return 0; 3943 return 0;
3942 } 3944 }
3943 3945
3944 /* 3946 /*
3945 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3947 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3946 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3948 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3947 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3949 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3948 * higher will lead to a bigger reserve which will get freed as contiguous 3950 * higher will lead to a bigger reserve which will get freed as contiguous
3949 * blocks as reclaim kicks in 3951 * blocks as reclaim kicks in
3950 */ 3952 */
3951 static void setup_zone_migrate_reserve(struct zone *zone) 3953 static void setup_zone_migrate_reserve(struct zone *zone)
3952 { 3954 {
3953 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3955 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3954 struct page *page; 3956 struct page *page;
3955 unsigned long block_migratetype; 3957 unsigned long block_migratetype;
3956 int reserve; 3958 int reserve;
3957 int old_reserve; 3959 int old_reserve;
3958 3960
3959 /* 3961 /*
3960 * Get the start pfn, end pfn and the number of blocks to reserve 3962 * Get the start pfn, end pfn and the number of blocks to reserve
3961 * We have to be careful to be aligned to pageblock_nr_pages to 3963 * We have to be careful to be aligned to pageblock_nr_pages to
3962 * make sure that we always check pfn_valid for the first page in 3964 * make sure that we always check pfn_valid for the first page in
3963 * the block. 3965 * the block.
3964 */ 3966 */
3965 start_pfn = zone->zone_start_pfn; 3967 start_pfn = zone->zone_start_pfn;
3966 end_pfn = zone_end_pfn(zone); 3968 end_pfn = zone_end_pfn(zone);
3967 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3969 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3968 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3970 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3969 pageblock_order; 3971 pageblock_order;
3970 3972
3971 /* 3973 /*
3972 * Reserve blocks are generally in place to help high-order atomic 3974 * Reserve blocks are generally in place to help high-order atomic
3973 * allocations that are short-lived. A min_free_kbytes value that 3975 * allocations that are short-lived. A min_free_kbytes value that
3974 * would result in more than 2 reserve blocks for atomic allocations 3976 * would result in more than 2 reserve blocks for atomic allocations
3975 * is assumed to be in place to help anti-fragmentation for the 3977 * is assumed to be in place to help anti-fragmentation for the
3976 * future allocation of hugepages at runtime. 3978 * future allocation of hugepages at runtime.
3977 */ 3979 */
3978 reserve = min(2, reserve); 3980 reserve = min(2, reserve);
3979 old_reserve = zone->nr_migrate_reserve_block; 3981 old_reserve = zone->nr_migrate_reserve_block;
3980 3982
3981 /* When memory hot-add, we almost always need to do nothing */ 3983 /* When memory hot-add, we almost always need to do nothing */
3982 if (reserve == old_reserve) 3984 if (reserve == old_reserve)
3983 return; 3985 return;
3984 zone->nr_migrate_reserve_block = reserve; 3986 zone->nr_migrate_reserve_block = reserve;
3985 3987
3986 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3988 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3987 if (!pfn_valid(pfn)) 3989 if (!pfn_valid(pfn))
3988 continue; 3990 continue;
3989 page = pfn_to_page(pfn); 3991 page = pfn_to_page(pfn);
3990 3992
3991 /* Watch out for overlapping nodes */ 3993 /* Watch out for overlapping nodes */
3992 if (page_to_nid(page) != zone_to_nid(zone)) 3994 if (page_to_nid(page) != zone_to_nid(zone))
3993 continue; 3995 continue;
3994 3996
3995 block_migratetype = get_pageblock_migratetype(page); 3997 block_migratetype = get_pageblock_migratetype(page);
3996 3998
3997 /* Only test what is necessary when the reserves are not met */ 3999 /* Only test what is necessary when the reserves are not met */
3998 if (reserve > 0) { 4000 if (reserve > 0) {
3999 /* 4001 /*
4000 * Blocks with reserved pages will never be freed, skip 4002 * Blocks with reserved pages will never be freed, skip
4001 * them. 4003 * them.
4002 */ 4004 */
4003 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 4005 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
4004 if (pageblock_is_reserved(pfn, block_end_pfn)) 4006 if (pageblock_is_reserved(pfn, block_end_pfn))
4005 continue; 4007 continue;
4006 4008
4007 /* If this block is reserved, account for it */ 4009 /* If this block is reserved, account for it */
4008 if (block_migratetype == MIGRATE_RESERVE) { 4010 if (block_migratetype == MIGRATE_RESERVE) {
4009 reserve--; 4011 reserve--;
4010 continue; 4012 continue;
4011 } 4013 }
4012 4014
4013 /* Suitable for reserving if this block is movable */ 4015 /* Suitable for reserving if this block is movable */
4014 if (block_migratetype == MIGRATE_MOVABLE) { 4016 if (block_migratetype == MIGRATE_MOVABLE) {
4015 set_pageblock_migratetype(page, 4017 set_pageblock_migratetype(page,
4016 MIGRATE_RESERVE); 4018 MIGRATE_RESERVE);
4017 move_freepages_block(zone, page, 4019 move_freepages_block(zone, page,
4018 MIGRATE_RESERVE); 4020 MIGRATE_RESERVE);
4019 reserve--; 4021 reserve--;
4020 continue; 4022 continue;
4021 } 4023 }
4022 } else if (!old_reserve) { 4024 } else if (!old_reserve) {
4023 /* 4025 /*
4024 * At boot time we don't need to scan the whole zone 4026 * At boot time we don't need to scan the whole zone
4025 * for turning off MIGRATE_RESERVE. 4027 * for turning off MIGRATE_RESERVE.
4026 */ 4028 */
4027 break; 4029 break;
4028 } 4030 }
4029 4031
4030 /* 4032 /*
4031 * If the reserve is met and this is a previous reserved block, 4033 * If the reserve is met and this is a previous reserved block,
4032 * take it back 4034 * take it back
4033 */ 4035 */
4034 if (block_migratetype == MIGRATE_RESERVE) { 4036 if (block_migratetype == MIGRATE_RESERVE) {
4035 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4037 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4036 move_freepages_block(zone, page, MIGRATE_MOVABLE); 4038 move_freepages_block(zone, page, MIGRATE_MOVABLE);
4037 } 4039 }
4038 } 4040 }
4039 } 4041 }
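[Editorial note] For illustration, the number of MIGRATE_RESERVE blocks computed at the top of the function can be reproduced in a few lines. The pageblock order of 9 and the 1400-page min watermark below are assumed example values, not taken from this file:

#include <stdio.h>

#define PAGEBLOCK_ORDER		9	/* assumed: 512-page pageblocks */
#define PAGEBLOCK_NR_PAGES	(1UL << PAGEBLOCK_ORDER)

/* Round up to a multiple of 'to', mirroring the roundup() used above. */
static unsigned long roundup_pages(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

int main(void)
{
	unsigned long min_wmark = 1400;	/* hypothetical min watermark in pages */
	long reserve;

	reserve = roundup_pages(min_wmark, PAGEBLOCK_NR_PAGES) >> PAGEBLOCK_ORDER;
	if (reserve > 2)	/* capped at two blocks for atomic allocations */
		reserve = 2;
	printf("%ld reserve pageblocks\n", reserve);	/* prints 2 */
	return 0;
}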
4040 4042
4041 /* 4043 /*
4042 * Initially all pages are reserved - free ones are freed 4044 * Initially all pages are reserved - free ones are freed
4043 * up by free_all_bootmem() once the early boot process is 4045 * up by free_all_bootmem() once the early boot process is
4044 * done. Non-atomic initialization, single-pass. 4046 * done. Non-atomic initialization, single-pass.
4045 */ 4047 */
4046 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 4048 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4047 unsigned long start_pfn, enum memmap_context context) 4049 unsigned long start_pfn, enum memmap_context context)
4048 { 4050 {
4049 struct page *page; 4051 struct page *page;
4050 unsigned long end_pfn = start_pfn + size; 4052 unsigned long end_pfn = start_pfn + size;
4051 unsigned long pfn; 4053 unsigned long pfn;
4052 struct zone *z; 4054 struct zone *z;
4053 4055
4054 if (highest_memmap_pfn < end_pfn - 1) 4056 if (highest_memmap_pfn < end_pfn - 1)
4055 highest_memmap_pfn = end_pfn - 1; 4057 highest_memmap_pfn = end_pfn - 1;
4056 4058
4057 z = &NODE_DATA(nid)->node_zones[zone]; 4059 z = &NODE_DATA(nid)->node_zones[zone];
4058 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 4060 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
4059 /* 4061 /*
4060 * There can be holes in boot-time mem_map[]s 4062 * There can be holes in boot-time mem_map[]s
4061 * handed to this function. They do not 4063 * handed to this function. They do not
4062 * exist on hotplugged memory. 4064 * exist on hotplugged memory.
4063 */ 4065 */
4064 if (context == MEMMAP_EARLY) { 4066 if (context == MEMMAP_EARLY) {
4065 if (!early_pfn_valid(pfn)) 4067 if (!early_pfn_valid(pfn))
4066 continue; 4068 continue;
4067 if (!early_pfn_in_nid(pfn, nid)) 4069 if (!early_pfn_in_nid(pfn, nid))
4068 continue; 4070 continue;
4069 } 4071 }
4070 page = pfn_to_page(pfn); 4072 page = pfn_to_page(pfn);
4071 set_page_links(page, zone, nid, pfn); 4073 set_page_links(page, zone, nid, pfn);
4072 mminit_verify_page_links(page, zone, nid, pfn); 4074 mminit_verify_page_links(page, zone, nid, pfn);
4073 init_page_count(page); 4075 init_page_count(page);
4074 page_mapcount_reset(page); 4076 page_mapcount_reset(page);
4075 page_nid_reset_last(page); 4077 page_nid_reset_last(page);
4076 SetPageReserved(page); 4078 SetPageReserved(page);
4077 /* 4079 /*
4078 * Mark the block movable so that blocks are reserved for 4080 * Mark the block movable so that blocks are reserved for
4079 * movable at startup. This will force kernel allocations 4081 * movable at startup. This will force kernel allocations
4080 * to reserve their blocks rather than leaking throughout 4082 * to reserve their blocks rather than leaking throughout
4081 * the address space during boot when many long-lived 4083 * the address space during boot when many long-lived
4082 * kernel allocations are made. Later some blocks near 4084 * kernel allocations are made. Later some blocks near
4083 * the start are marked MIGRATE_RESERVE by 4085 * the start are marked MIGRATE_RESERVE by
4084 * setup_zone_migrate_reserve() 4086 * setup_zone_migrate_reserve()
4085 * 4087 *
4086 * The bitmap is created for the zone's valid pfn range, but the memmap 4088 * The bitmap is created for the zone's valid pfn range, but the memmap
4087 * can be created for invalid pages (for alignment). 4089 * can be created for invalid pages (for alignment).
4088 * Check here so that set_pageblock_migratetype() is not called on a 4090 * Check here so that set_pageblock_migratetype() is not called on a
4089 * pfn outside the zone. 4091 * pfn outside the zone.
4090 */ 4092 */
4091 if ((z->zone_start_pfn <= pfn) 4093 if ((z->zone_start_pfn <= pfn)
4092 && (pfn < zone_end_pfn(z)) 4094 && (pfn < zone_end_pfn(z))
4093 && !(pfn & (pageblock_nr_pages - 1))) 4095 && !(pfn & (pageblock_nr_pages - 1)))
4094 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4096 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4095 4097
4096 INIT_LIST_HEAD(&page->lru); 4098 INIT_LIST_HEAD(&page->lru);
4097 #ifdef WANT_PAGE_VIRTUAL 4099 #ifdef WANT_PAGE_VIRTUAL
4098 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 4100 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
4099 if (!is_highmem_idx(zone)) 4101 if (!is_highmem_idx(zone))
4100 set_page_address(page, __va(pfn << PAGE_SHIFT)); 4102 set_page_address(page, __va(pfn << PAGE_SHIFT));
4101 #endif 4103 #endif
4102 } 4104 }
4103 } 4105 }
4104 4106
4105 static void __meminit zone_init_free_lists(struct zone *zone) 4107 static void __meminit zone_init_free_lists(struct zone *zone)
4106 { 4108 {
4107 int order, t; 4109 int order, t;
4108 for_each_migratetype_order(order, t) { 4110 for_each_migratetype_order(order, t) {
4109 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 4111 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
4110 zone->free_area[order].nr_free = 0; 4112 zone->free_area[order].nr_free = 0;
4111 } 4113 }
4112 } 4114 }
4113 4115
4114 #ifndef __HAVE_ARCH_MEMMAP_INIT 4116 #ifndef __HAVE_ARCH_MEMMAP_INIT
4115 #define memmap_init(size, nid, zone, start_pfn) \ 4117 #define memmap_init(size, nid, zone, start_pfn) \
4116 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 4118 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
4117 #endif 4119 #endif
4118 4120
4119 static int zone_batchsize(struct zone *zone) 4121 static int zone_batchsize(struct zone *zone)
4120 { 4122 {
4121 #ifdef CONFIG_MMU 4123 #ifdef CONFIG_MMU
4122 int batch; 4124 int batch;
4123 4125
4124 /* 4126 /*
4125 * The per-cpu-pages pools are set to around 1000th of the 4127 * The per-cpu-pages pools are set to around 1000th of the
4126 * size of the zone. But no more than 1/2 of a meg. 4128 * size of the zone. But no more than 1/2 of a meg.
4127 * 4129 *
4128 * OK, so we don't know how big the cache is. So guess. 4130 * OK, so we don't know how big the cache is. So guess.
4129 */ 4131 */
4130 batch = zone->managed_pages / 1024; 4132 batch = zone->managed_pages / 1024;
4131 if (batch * PAGE_SIZE > 512 * 1024) 4133 if (batch * PAGE_SIZE > 512 * 1024)
4132 batch = (512 * 1024) / PAGE_SIZE; 4134 batch = (512 * 1024) / PAGE_SIZE;
4133 batch /= 4; /* We effectively *= 4 below */ 4135 batch /= 4; /* We effectively *= 4 below */
4134 if (batch < 1) 4136 if (batch < 1)
4135 batch = 1; 4137 batch = 1;
4136 4138
4137 /* 4139 /*
4138 * Clamp the batch to a 2^n - 1 value. Having a power 4140 * Clamp the batch to a 2^n - 1 value. Having a power
4139 * of 2 value was found to be more likely to have 4141 * of 2 value was found to be more likely to have
4140 * suboptimal cache aliasing properties in some cases. 4142 * suboptimal cache aliasing properties in some cases.
4141 * 4143 *
4142 * For example if 2 tasks are alternately allocating 4144 * For example if 2 tasks are alternately allocating
4143 * batches of pages, one task can end up with a lot 4145 * batches of pages, one task can end up with a lot
4144 * of pages of one half of the possible page colors 4146 * of pages of one half of the possible page colors
4145 * and the other with pages of the other colors. 4147 * and the other with pages of the other colors.
4146 */ 4148 */
4147 batch = rounddown_pow_of_two(batch + batch/2) - 1; 4149 batch = rounddown_pow_of_two(batch + batch/2) - 1;
4148 4150
4149 return batch; 4151 return batch;
4150 4152
4151 #else 4153 #else
4152 /* The deferral and batching of frees should be suppressed under NOMMU 4154 /* The deferral and batching of frees should be suppressed under NOMMU
4153 * conditions. 4155 * conditions.
4154 * 4156 *
4155 * The problem is that NOMMU needs to be able to allocate large chunks 4157 * The problem is that NOMMU needs to be able to allocate large chunks
4156 * of contiguous memory as there's no hardware page translation to 4158 * of contiguous memory as there's no hardware page translation to
4157 * assemble apparent contiguous memory from discontiguous pages. 4159 * assemble apparent contiguous memory from discontiguous pages.
4158 * 4160 *
4159 * Queueing large contiguous runs of pages for batching, however, 4161 * Queueing large contiguous runs of pages for batching, however,
4160 * causes the pages to actually be freed in smaller chunks. As there 4162 * causes the pages to actually be freed in smaller chunks. As there
4161 * can be a significant delay between the individual batches being 4163 * can be a significant delay between the individual batches being
4162 * recycled, this leads to the once large chunks of space being 4164 * recycled, this leads to the once large chunks of space being
4163 * fragmented and becoming unavailable for high-order allocations. 4165 * fragmented and becoming unavailable for high-order allocations.
4164 */ 4166 */
4165 return 0; 4167 return 0;
4166 #endif 4168 #endif
4167 } 4169 }
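[Editorial note] A worked example of the batch calculation above, as a standalone sketch assuming 4 KB pages: a zone with 1048576 managed pages (about 4 GB) ends up with a per-cpu batch of 31.

#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumed page size for this example */

/* Userspace stand-in for rounddown_pow_of_two(). */
static unsigned long rounddown_pow2(unsigned long x)
{
	unsigned long p = 1;

	while (p * 2 <= x)
		p *= 2;
	return p;
}

int main(void)
{
	unsigned long managed_pages = 1048576;	/* hypothetical zone size */
	unsigned long batch = managed_pages / 1024;

	if (batch * PAGE_SIZE > 512 * 1024)	/* cap at 1/2 MB */
		batch = (512 * 1024) / PAGE_SIZE;
	batch /= 4;				/* effectively *= 4 later */
	if (batch < 1)
		batch = 1;
	batch = rounddown_pow2(batch + batch / 2) - 1;	/* 2^n - 1 */
	printf("batch = %lu\n", batch);		/* prints batch = 31 */
	return 0;
}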
4168 4170
4169 /* 4171 /*
4170 * pcp->high and pcp->batch values are related and dependent on one another: 4172 * pcp->high and pcp->batch values are related and dependent on one another:
4171 * ->batch must never be higher than ->high. 4173 * ->batch must never be higher than ->high.
4172 * The following function updates them in a safe manner without read side 4174 * The following function updates them in a safe manner without read side
4173 * locking. 4175 * locking.
4174 * 4176 *
4175 * Any new users of pcp->batch and pcp->high should ensure they can cope with 4177 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4176 * those fields changing asynchronously (according to the above rule). 4178 * those fields changing asynchronously (according to the above rule).
4177 * 4179 *
4178 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function 4180 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
4179 * outside of boot time (or some other assurance that no concurrent updaters 4181 * outside of boot time (or some other assurance that no concurrent updaters
4180 * exist). 4182 * exist).
4181 */ 4183 */
4182 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 4184 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4183 unsigned long batch) 4185 unsigned long batch)
4184 { 4186 {
4185 /* start with a fail safe value for batch */ 4187 /* start with a fail safe value for batch */
4186 pcp->batch = 1; 4188 pcp->batch = 1;
4187 smp_wmb(); 4189 smp_wmb();
4188 4190
4189 /* Update high, then batch, in order */ 4191 /* Update high, then batch, in order */
4190 pcp->high = high; 4192 pcp->high = high;
4191 smp_wmb(); 4193 smp_wmb();
4192 4194
4193 pcp->batch = batch; 4195 pcp->batch = batch;
4194 } 4196 }
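[Editorial note] The write ordering above can be mimicked in userspace with C11 release stores standing in for smp_wmb(). This is only a sketch of the write-side idea, that a reader should never observe batch > high, and not kernel code; the pcp values are arbitrary examples.

#include <stdatomic.h>
#include <stdio.h>

struct pcp_like {
	_Atomic unsigned long high;
	_Atomic unsigned long batch;
};

/* Drop batch to the fail-safe value 1, publish the new high, then the
 * new batch, so batch never exceeds high from a reader's point of view. */
static void pageset_update_sketch(struct pcp_like *pcp,
				  unsigned long high, unsigned long batch)
{
	atomic_store_explicit(&pcp->batch, 1, memory_order_release);
	atomic_store_explicit(&pcp->high, high, memory_order_release);
	atomic_store_explicit(&pcp->batch, batch, memory_order_release);
}

int main(void)
{
	struct pcp_like pcp = { 186, 31 };	/* e.g. batch 31, high 6*31 */

	pageset_update_sketch(&pcp, 36, 6);
	printf("high=%lu batch=%lu\n",
	       atomic_load(&pcp.high), atomic_load(&pcp.batch));
	return 0;
}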
4195 4197
4196 /* a companion to pageset_set_high() */ 4198 /* a companion to pageset_set_high() */
4197 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) 4199 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4198 { 4200 {
4199 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); 4201 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4200 } 4202 }
4201 4203
4202 static void pageset_init(struct per_cpu_pageset *p) 4204 static void pageset_init(struct per_cpu_pageset *p)
4203 { 4205 {
4204 struct per_cpu_pages *pcp; 4206 struct per_cpu_pages *pcp;
4205 int migratetype; 4207 int migratetype;
4206 4208
4207 memset(p, 0, sizeof(*p)); 4209 memset(p, 0, sizeof(*p));
4208 4210
4209 pcp = &p->pcp; 4211 pcp = &p->pcp;
4210 pcp->count = 0; 4212 pcp->count = 0;
4211 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4213 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4212 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4214 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4213 } 4215 }
4214 4216
4215 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4217 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4216 { 4218 {
4217 pageset_init(p); 4219 pageset_init(p);
4218 pageset_set_batch(p, batch); 4220 pageset_set_batch(p, batch);
4219 } 4221 }
4220 4222
4221 /* 4223 /*
4222 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist 4224 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
4223 * to the value high for the pageset p. 4225 * to the value high for the pageset p.
4224 */ 4226 */
4225 static void pageset_set_high(struct per_cpu_pageset *p, 4227 static void pageset_set_high(struct per_cpu_pageset *p,
4226 unsigned long high) 4228 unsigned long high)
4227 { 4229 {
4228 unsigned long batch = max(1UL, high / 4); 4230 unsigned long batch = max(1UL, high / 4);
4229 if ((high / 4) > (PAGE_SHIFT * 8)) 4231 if ((high / 4) > (PAGE_SHIFT * 8))
4230 batch = PAGE_SHIFT * 8; 4232 batch = PAGE_SHIFT * 8;
4231 4233
4232 pageset_update(&p->pcp, high, batch); 4234 pageset_update(&p->pcp, high, batch);
4233 } 4235 }
4234 4236
4235 static void pageset_set_high_and_batch(struct zone *zone, 4237 static void pageset_set_high_and_batch(struct zone *zone,
4236 struct per_cpu_pageset *pcp) 4238 struct per_cpu_pageset *pcp)
4237 { 4239 {
4238 if (percpu_pagelist_fraction) 4240 if (percpu_pagelist_fraction)
4239 pageset_set_high(pcp, 4241 pageset_set_high(pcp,
4240 (zone->managed_pages / 4242 (zone->managed_pages /
4241 percpu_pagelist_fraction)); 4243 percpu_pagelist_fraction));
4242 else 4244 else
4243 pageset_set_batch(pcp, zone_batchsize(zone)); 4245 pageset_set_batch(pcp, zone_batchsize(zone));
4244 } 4246 }
4245 4247
4246 static void __meminit zone_pageset_init(struct zone *zone, int cpu) 4248 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4247 { 4249 {
4248 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4250 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4249 4251
4250 pageset_init(pcp); 4252 pageset_init(pcp);
4251 pageset_set_high_and_batch(zone, pcp); 4253 pageset_set_high_and_batch(zone, pcp);
4252 } 4254 }
4253 4255
4254 static void __meminit setup_zone_pageset(struct zone *zone) 4256 static void __meminit setup_zone_pageset(struct zone *zone)
4255 { 4257 {
4256 int cpu; 4258 int cpu;
4257 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4259 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4258 for_each_possible_cpu(cpu) 4260 for_each_possible_cpu(cpu)
4259 zone_pageset_init(zone, cpu); 4261 zone_pageset_init(zone, cpu);
4260 } 4262 }
4261 4263
4262 /* 4264 /*
4263 * Allocate per cpu pagesets and initialize them. 4265 * Allocate per cpu pagesets and initialize them.
4264 * Before this call only boot pagesets were available. 4266 * Before this call only boot pagesets were available.
4265 */ 4267 */
4266 void __init setup_per_cpu_pageset(void) 4268 void __init setup_per_cpu_pageset(void)
4267 { 4269 {
4268 struct zone *zone; 4270 struct zone *zone;
4269 4271
4270 for_each_populated_zone(zone) 4272 for_each_populated_zone(zone)
4271 setup_zone_pageset(zone); 4273 setup_zone_pageset(zone);
4272 } 4274 }
4273 4275
4274 static noinline __init_refok 4276 static noinline __init_refok
4275 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4277 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4276 { 4278 {
4277 int i; 4279 int i;
4278 struct pglist_data *pgdat = zone->zone_pgdat; 4280 struct pglist_data *pgdat = zone->zone_pgdat;
4279 size_t alloc_size; 4281 size_t alloc_size;
4280 4282
4281 /* 4283 /*
4282 * The per-page waitqueue mechanism uses hashed waitqueues 4284 * The per-page waitqueue mechanism uses hashed waitqueues
4283 * per zone. 4285 * per zone.
4284 */ 4286 */
4285 zone->wait_table_hash_nr_entries = 4287 zone->wait_table_hash_nr_entries =
4286 wait_table_hash_nr_entries(zone_size_pages); 4288 wait_table_hash_nr_entries(zone_size_pages);
4287 zone->wait_table_bits = 4289 zone->wait_table_bits =
4288 wait_table_bits(zone->wait_table_hash_nr_entries); 4290 wait_table_bits(zone->wait_table_hash_nr_entries);
4289 alloc_size = zone->wait_table_hash_nr_entries 4291 alloc_size = zone->wait_table_hash_nr_entries
4290 * sizeof(wait_queue_head_t); 4292 * sizeof(wait_queue_head_t);
4291 4293
4292 if (!slab_is_available()) { 4294 if (!slab_is_available()) {
4293 zone->wait_table = (wait_queue_head_t *) 4295 zone->wait_table = (wait_queue_head_t *)
4294 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4296 alloc_bootmem_node_nopanic(pgdat, alloc_size);
4295 } else { 4297 } else {
4296 /* 4298 /*
4297 * This case means that a zone whose size was 0 gets new memory 4299 * This case means that a zone whose size was 0 gets new memory
4298 * via memory hot-add. 4300 * via memory hot-add.
4299 * But it may be the case that a new node was hot-added. In 4301 * But it may be the case that a new node was hot-added. In
4300 * this case vmalloc() will not be able to use this new node's 4302 * this case vmalloc() will not be able to use this new node's
4301 * memory - the wait_table must be initialized to use the new 4303 * memory - the wait_table must be initialized to use the new
4302 * node itself as well. 4304 * node itself as well.
4303 * To use this new node's memory, further consideration will be 4305 * To use this new node's memory, further consideration will be
4304 * necessary. 4306 * necessary.
4305 */ 4307 */
4306 zone->wait_table = vmalloc(alloc_size); 4308 zone->wait_table = vmalloc(alloc_size);
4307 } 4309 }
4308 if (!zone->wait_table) 4310 if (!zone->wait_table)
4309 return -ENOMEM; 4311 return -ENOMEM;
4310 4312
4311 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4313 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4312 init_waitqueue_head(zone->wait_table + i); 4314 init_waitqueue_head(zone->wait_table + i);
4313 4315
4314 return 0; 4316 return 0;
4315 } 4317 }
4316 4318
4317 static __meminit void zone_pcp_init(struct zone *zone) 4319 static __meminit void zone_pcp_init(struct zone *zone)
4318 { 4320 {
4319 /* 4321 /*
4320 * per cpu subsystem is not up at this point. The following code 4322 * per cpu subsystem is not up at this point. The following code
4321 * relies on the ability of the linker to provide the 4323 * relies on the ability of the linker to provide the
4322 * offset of a (static) per cpu variable into the per cpu area. 4324 * offset of a (static) per cpu variable into the per cpu area.
4323 */ 4325 */
4324 zone->pageset = &boot_pageset; 4326 zone->pageset = &boot_pageset;
4325 4327
4326 if (zone->present_pages) 4328 if (zone->present_pages)
4327 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4329 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4328 zone->name, zone->present_pages, 4330 zone->name, zone->present_pages,
4329 zone_batchsize(zone)); 4331 zone_batchsize(zone));
4330 } 4332 }
4331 4333
4332 int __meminit init_currently_empty_zone(struct zone *zone, 4334 int __meminit init_currently_empty_zone(struct zone *zone,
4333 unsigned long zone_start_pfn, 4335 unsigned long zone_start_pfn,
4334 unsigned long size, 4336 unsigned long size,
4335 enum memmap_context context) 4337 enum memmap_context context)
4336 { 4338 {
4337 struct pglist_data *pgdat = zone->zone_pgdat; 4339 struct pglist_data *pgdat = zone->zone_pgdat;
4338 int ret; 4340 int ret;
4339 ret = zone_wait_table_init(zone, size); 4341 ret = zone_wait_table_init(zone, size);
4340 if (ret) 4342 if (ret)
4341 return ret; 4343 return ret;
4342 pgdat->nr_zones = zone_idx(zone) + 1; 4344 pgdat->nr_zones = zone_idx(zone) + 1;
4343 4345
4344 zone->zone_start_pfn = zone_start_pfn; 4346 zone->zone_start_pfn = zone_start_pfn;
4345 4347
4346 mminit_dprintk(MMINIT_TRACE, "memmap_init", 4348 mminit_dprintk(MMINIT_TRACE, "memmap_init",
4347 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 4349 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
4348 pgdat->node_id, 4350 pgdat->node_id,
4349 (unsigned long)zone_idx(zone), 4351 (unsigned long)zone_idx(zone),
4350 zone_start_pfn, (zone_start_pfn + size)); 4352 zone_start_pfn, (zone_start_pfn + size));
4351 4353
4352 zone_init_free_lists(zone); 4354 zone_init_free_lists(zone);
4353 4355
4354 return 0; 4356 return 0;
4355 } 4357 }
4356 4358
4357 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4359 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4358 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4360 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4359 /* 4361 /*
4360 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4362 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
4361 * Architectures may implement their own version but if add_active_range() 4363 * Architectures may implement their own version but if add_active_range()
4362 * was used and there are no special requirements, this is a convenient 4364 * was used and there are no special requirements, this is a convenient
4363 * alternative 4365 * alternative
4364 */ 4366 */
4365 int __meminit __early_pfn_to_nid(unsigned long pfn) 4367 int __meminit __early_pfn_to_nid(unsigned long pfn)
4366 { 4368 {
4367 unsigned long start_pfn, end_pfn; 4369 unsigned long start_pfn, end_pfn;
4368 int nid; 4370 int nid;
4369 /* 4371 /*
4370 * NOTE: The following SMP-unsafe globals are only used early in boot 4372 * NOTE: The following SMP-unsafe globals are only used early in boot
4371 * when the kernel is running single-threaded. 4373 * when the kernel is running single-threaded.
4372 */ 4374 */
4373 static unsigned long __meminitdata last_start_pfn, last_end_pfn; 4375 static unsigned long __meminitdata last_start_pfn, last_end_pfn;
4374 static int __meminitdata last_nid; 4376 static int __meminitdata last_nid;
4375 4377
4376 if (last_start_pfn <= pfn && pfn < last_end_pfn) 4378 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4377 return last_nid; 4379 return last_nid;
4378 4380
4379 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 4381 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
4380 if (nid != -1) { 4382 if (nid != -1) {
4381 last_start_pfn = start_pfn; 4383 last_start_pfn = start_pfn;
4382 last_end_pfn = end_pfn; 4384 last_end_pfn = end_pfn;
4383 last_nid = nid; 4385 last_nid = nid;
4384 } 4386 }
4385 4387
4386 return nid; 4388 return nid;
4387 } 4389 }
4388 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4390 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
4389 4391
4390 int __meminit early_pfn_to_nid(unsigned long pfn) 4392 int __meminit early_pfn_to_nid(unsigned long pfn)
4391 { 4393 {
4392 int nid; 4394 int nid;
4393 4395
4394 nid = __early_pfn_to_nid(pfn); 4396 nid = __early_pfn_to_nid(pfn);
4395 if (nid >= 0) 4397 if (nid >= 0)
4396 return nid; 4398 return nid;
4397 /* just returns 0 */ 4399 /* just returns 0 */
4398 return 0; 4400 return 0;
4399 } 4401 }
4400 4402
4401 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4403 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4402 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4404 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4403 { 4405 {
4404 int nid; 4406 int nid;
4405 4407
4406 nid = __early_pfn_to_nid(pfn); 4408 nid = __early_pfn_to_nid(pfn);
4407 if (nid >= 0 && nid != node) 4409 if (nid >= 0 && nid != node)
4408 return false; 4410 return false;
4409 return true; 4411 return true;
4410 } 4412 }
4411 #endif 4413 #endif
4412 4414
4413 /** 4415 /**
4414 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4416 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4415 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4417 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4416 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4418 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4417 * 4419 *
4418 * If an architecture guarantees that all ranges registered with 4420 * If an architecture guarantees that all ranges registered with
4419 * add_active_ranges() contain no holes and may be freed, 4421 * add_active_ranges() contain no holes and may be freed,
4420 * this function may be used instead of calling free_bootmem() manually. 4422 * this function may be used instead of calling free_bootmem() manually.
4421 */ 4423 */
4422 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4424 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4423 { 4425 {
4424 unsigned long start_pfn, end_pfn; 4426 unsigned long start_pfn, end_pfn;
4425 int i, this_nid; 4427 int i, this_nid;
4426 4428
4427 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4429 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4428 start_pfn = min(start_pfn, max_low_pfn); 4430 start_pfn = min(start_pfn, max_low_pfn);
4429 end_pfn = min(end_pfn, max_low_pfn); 4431 end_pfn = min(end_pfn, max_low_pfn);
4430 4432
4431 if (start_pfn < end_pfn) 4433 if (start_pfn < end_pfn)
4432 free_bootmem_node(NODE_DATA(this_nid), 4434 free_bootmem_node(NODE_DATA(this_nid),
4433 PFN_PHYS(start_pfn), 4435 PFN_PHYS(start_pfn),
4434 (end_pfn - start_pfn) << PAGE_SHIFT); 4436 (end_pfn - start_pfn) << PAGE_SHIFT);
4435 } 4437 }
4436 } 4438 }
4437 4439
4438 /** 4440 /**
4439 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4441 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4440 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4442 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4441 * 4443 *
4442 * If an architecture guarantees that all ranges registered with 4444 * If an architecture guarantees that all ranges registered with
4443 * add_active_ranges() contain no holes and may be freed, this 4445 * add_active_ranges() contain no holes and may be freed, this
4444 * function may be used instead of calling memory_present() manually. 4446 * function may be used instead of calling memory_present() manually.
4445 */ 4447 */
4446 void __init sparse_memory_present_with_active_regions(int nid) 4448 void __init sparse_memory_present_with_active_regions(int nid)
4447 { 4449 {
4448 unsigned long start_pfn, end_pfn; 4450 unsigned long start_pfn, end_pfn;
4449 int i, this_nid; 4451 int i, this_nid;
4450 4452
4451 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4453 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4452 memory_present(this_nid, start_pfn, end_pfn); 4454 memory_present(this_nid, start_pfn, end_pfn);
4453 } 4455 }
4454 4456
4455 /** 4457 /**
4456 * get_pfn_range_for_nid - Return the start and end page frames for a node 4458 * get_pfn_range_for_nid - Return the start and end page frames for a node
4457 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4459 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4458 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4460 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4459 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4461 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4460 * 4462 *
4461 * It returns the start and end page frame of a node based on information 4463 * It returns the start and end page frame of a node based on information
4462 * provided by an arch calling add_active_range(). If called for a node 4464 * provided by an arch calling add_active_range(). If called for a node
4463 * with no available memory, a warning is printed and the start and end 4465 * with no available memory, a warning is printed and the start and end
4464 * PFNs will be 0. 4466 * PFNs will be 0.
4465 */ 4467 */
4466 void __meminit get_pfn_range_for_nid(unsigned int nid, 4468 void __meminit get_pfn_range_for_nid(unsigned int nid,
4467 unsigned long *start_pfn, unsigned long *end_pfn) 4469 unsigned long *start_pfn, unsigned long *end_pfn)
4468 { 4470 {
4469 unsigned long this_start_pfn, this_end_pfn; 4471 unsigned long this_start_pfn, this_end_pfn;
4470 int i; 4472 int i;
4471 4473
4472 *start_pfn = -1UL; 4474 *start_pfn = -1UL;
4473 *end_pfn = 0; 4475 *end_pfn = 0;
4474 4476
4475 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4477 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4476 *start_pfn = min(*start_pfn, this_start_pfn); 4478 *start_pfn = min(*start_pfn, this_start_pfn);
4477 *end_pfn = max(*end_pfn, this_end_pfn); 4479 *end_pfn = max(*end_pfn, this_end_pfn);
4478 } 4480 }
4479 4481
4480 if (*start_pfn == -1UL) 4482 if (*start_pfn == -1UL)
4481 *start_pfn = 0; 4483 *start_pfn = 0;
4482 } 4484 }
4483 4485
4484 /* 4486 /*
4485 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4487 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4486 * assumption is made that zones within a node are ordered in monotonically 4488 * assumption is made that zones within a node are ordered in monotonically
4487 * increasing memory addresses so that the "highest" populated zone is used 4489 * increasing memory addresses so that the "highest" populated zone is used
4488 */ 4490 */
4489 static void __init find_usable_zone_for_movable(void) 4491 static void __init find_usable_zone_for_movable(void)
4490 { 4492 {
4491 int zone_index; 4493 int zone_index;
4492 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4494 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4493 if (zone_index == ZONE_MOVABLE) 4495 if (zone_index == ZONE_MOVABLE)
4494 continue; 4496 continue;
4495 4497
4496 if (arch_zone_highest_possible_pfn[zone_index] > 4498 if (arch_zone_highest_possible_pfn[zone_index] >
4497 arch_zone_lowest_possible_pfn[zone_index]) 4499 arch_zone_lowest_possible_pfn[zone_index])
4498 break; 4500 break;
4499 } 4501 }
4500 4502
4501 VM_BUG_ON(zone_index == -1); 4503 VM_BUG_ON(zone_index == -1);
4502 movable_zone = zone_index; 4504 movable_zone = zone_index;
4503 } 4505 }
4504 4506
4505 /* 4507 /*
4506 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4508 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4507 * because it is sized independent of architecture. Unlike the other zones, 4509 * because it is sized independent of architecture. Unlike the other zones,
4508 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4510 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4509 * in each node depending on the size of each node and how evenly kernelcore 4511 * in each node depending on the size of each node and how evenly kernelcore
4510 * is distributed. This helper function adjusts the zone ranges 4512 * is distributed. This helper function adjusts the zone ranges
4511 * provided by the architecture for a given node by using the end of the 4513 * provided by the architecture for a given node by using the end of the
4512 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4514 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4513 * zones within a node are in order of monotonically increasing memory addresses 4515 * zones within a node are in order of monotonically increasing memory addresses
4514 */ 4516 */
4515 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4517 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4516 unsigned long zone_type, 4518 unsigned long zone_type,
4517 unsigned long node_start_pfn, 4519 unsigned long node_start_pfn,
4518 unsigned long node_end_pfn, 4520 unsigned long node_end_pfn,
4519 unsigned long *zone_start_pfn, 4521 unsigned long *zone_start_pfn,
4520 unsigned long *zone_end_pfn) 4522 unsigned long *zone_end_pfn)
4521 { 4523 {
4522 /* Only adjust if ZONE_MOVABLE is on this node */ 4524 /* Only adjust if ZONE_MOVABLE is on this node */
4523 if (zone_movable_pfn[nid]) { 4525 if (zone_movable_pfn[nid]) {
4524 /* Size ZONE_MOVABLE */ 4526 /* Size ZONE_MOVABLE */
4525 if (zone_type == ZONE_MOVABLE) { 4527 if (zone_type == ZONE_MOVABLE) {
4526 *zone_start_pfn = zone_movable_pfn[nid]; 4528 *zone_start_pfn = zone_movable_pfn[nid];
4527 *zone_end_pfn = min(node_end_pfn, 4529 *zone_end_pfn = min(node_end_pfn,
4528 arch_zone_highest_possible_pfn[movable_zone]); 4530 arch_zone_highest_possible_pfn[movable_zone]);
4529 4531
4530 /* Adjust for ZONE_MOVABLE starting within this range */ 4532 /* Adjust for ZONE_MOVABLE starting within this range */
4531 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4533 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4532 *zone_end_pfn > zone_movable_pfn[nid]) { 4534 *zone_end_pfn > zone_movable_pfn[nid]) {
4533 *zone_end_pfn = zone_movable_pfn[nid]; 4535 *zone_end_pfn = zone_movable_pfn[nid];
4534 4536
4535 /* Check if this whole range is within ZONE_MOVABLE */ 4537 /* Check if this whole range is within ZONE_MOVABLE */
4536 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4538 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4537 *zone_start_pfn = *zone_end_pfn; 4539 *zone_start_pfn = *zone_end_pfn;
4538 } 4540 }
4539 } 4541 }
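/*
 * A worked example of the three cases above, using hypothetical PFNs:
 * suppose zone_movable_pfn[nid] == 0x40000 and the node (and the highest
 * usable zone) ends at 0x50000.
 *  - ZONE_MOVABLE itself becomes [0x40000, 0x50000)
 *  - a zone given as [0x20000, 0x50000) straddles the boundary, so its
 *    end is clipped to 0x40000
 *  - a zone given as [0x45000, 0x50000) lies entirely above the boundary,
 *    so it collapses to an empty range (start == end)
 */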
4540 4542
4541 /* 4543 /*
4542 * Return the number of pages a zone spans in a node, including holes 4544 * Return the number of pages a zone spans in a node, including holes
4543 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4545 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4544 */ 4546 */
4545 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4547 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4546 unsigned long zone_type, 4548 unsigned long zone_type,
4547 unsigned long node_start_pfn, 4549 unsigned long node_start_pfn,
4548 unsigned long node_end_pfn, 4550 unsigned long node_end_pfn,
4549 unsigned long *ignored) 4551 unsigned long *ignored)
4550 { 4552 {
4551 unsigned long zone_start_pfn, zone_end_pfn; 4553 unsigned long zone_start_pfn, zone_end_pfn;
4552 4554
4553 /* Get the start and end of the zone */ 4555 /* Get the start and end of the zone */
4554 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4556 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4555 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4557 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4556 adjust_zone_range_for_zone_movable(nid, zone_type, 4558 adjust_zone_range_for_zone_movable(nid, zone_type,
4557 node_start_pfn, node_end_pfn, 4559 node_start_pfn, node_end_pfn,
4558 &zone_start_pfn, &zone_end_pfn); 4560 &zone_start_pfn, &zone_end_pfn);
4559 4561
4560 /* Check that this node has pages within the zone's required range */ 4562 /* Check that this node has pages within the zone's required range */
4561 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4563 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4562 return 0; 4564 return 0;
4563 4565
4564 /* Move the zone boundaries inside the node if necessary */ 4566 /* Move the zone boundaries inside the node if necessary */
4565 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4567 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4566 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4568 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4567 4569
4568 /* Return the spanned pages */ 4570 /* Return the spanned pages */
4569 return zone_end_pfn - zone_start_pfn; 4571 return zone_end_pfn - zone_start_pfn;
4570 } 4572 }
4571 4573
4572 /* 4574 /*
4573 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4575 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4574 * then all holes in the requested range will be accounted for. 4576 * then all holes in the requested range will be accounted for.
4575 */ 4577 */
4576 unsigned long __meminit __absent_pages_in_range(int nid, 4578 unsigned long __meminit __absent_pages_in_range(int nid,
4577 unsigned long range_start_pfn, 4579 unsigned long range_start_pfn,
4578 unsigned long range_end_pfn) 4580 unsigned long range_end_pfn)
4579 { 4581 {
4580 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4582 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4581 unsigned long start_pfn, end_pfn; 4583 unsigned long start_pfn, end_pfn;
4582 int i; 4584 int i;
4583 4585
4584 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4586 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4585 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4587 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4586 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4588 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4587 nr_absent -= end_pfn - start_pfn; 4589 nr_absent -= end_pfn - start_pfn;
4588 } 4590 }
4589 return nr_absent; 4591 return nr_absent;
4590 } 4592 }
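/*
 * A worked example with hypothetical ranges: for the request
 * [0x1000, 0x5000), nr_absent starts at 0x4000. If memblock reports
 * [0x0000, 0x2000) and [0x3000, 0x6000), the clamped intersections
 * [0x1000, 0x2000) and [0x3000, 0x5000) remove 0x1000 + 0x2000 pages,
 * leaving nr_absent == 0x1000, exactly the hole [0x2000, 0x3000).
 */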
4591 4593
4592 /** 4594 /**
4593 * absent_pages_in_range - Return number of page frames in holes within a range 4595 * absent_pages_in_range - Return number of page frames in holes within a range
4594 * @start_pfn: The start PFN to start searching for holes 4596 * @start_pfn: The start PFN to start searching for holes
4595 * @end_pfn: The end PFN to stop searching for holes 4597 * @end_pfn: The end PFN to stop searching for holes
4596 * 4598 *
4597 * It returns the number of page frames in memory holes within a range. 4599 * It returns the number of page frames in memory holes within a range.
4598 */ 4600 */
4599 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4601 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4600 unsigned long end_pfn) 4602 unsigned long end_pfn)
4601 { 4603 {
4602 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4604 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4603 } 4605 }
4604 4606
4605 /* Return the number of page frames in holes in a zone on a node */ 4607 /* Return the number of page frames in holes in a zone on a node */
4606 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4608 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4607 unsigned long zone_type, 4609 unsigned long zone_type,
4608 unsigned long node_start_pfn, 4610 unsigned long node_start_pfn,
4609 unsigned long node_end_pfn, 4611 unsigned long node_end_pfn,
4610 unsigned long *ignored) 4612 unsigned long *ignored)
4611 { 4613 {
4612 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4614 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4613 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4615 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4614 unsigned long zone_start_pfn, zone_end_pfn; 4616 unsigned long zone_start_pfn, zone_end_pfn;
4615 4617
4616 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4618 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4617 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4619 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4618 4620
4619 adjust_zone_range_for_zone_movable(nid, zone_type, 4621 adjust_zone_range_for_zone_movable(nid, zone_type,
4620 node_start_pfn, node_end_pfn, 4622 node_start_pfn, node_end_pfn,
4621 &zone_start_pfn, &zone_end_pfn); 4623 &zone_start_pfn, &zone_end_pfn);
4622 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4624 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4623 } 4625 }
4624 4626
4625 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4627 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4626 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4628 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4627 unsigned long zone_type, 4629 unsigned long zone_type,
4628 unsigned long node_start_pfn, 4630 unsigned long node_start_pfn,
4629 unsigned long node_end_pfn, 4631 unsigned long node_end_pfn,
4630 unsigned long *zones_size) 4632 unsigned long *zones_size)
4631 { 4633 {
4632 return zones_size[zone_type]; 4634 return zones_size[zone_type];
4633 } 4635 }
4634 4636
4635 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4637 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4636 unsigned long zone_type, 4638 unsigned long zone_type,
4637 unsigned long node_start_pfn, 4639 unsigned long node_start_pfn,
4638 unsigned long node_end_pfn, 4640 unsigned long node_end_pfn,
4639 unsigned long *zholes_size) 4641 unsigned long *zholes_size)
4640 { 4642 {
4641 if (!zholes_size) 4643 if (!zholes_size)
4642 return 0; 4644 return 0;
4643 4645
4644 return zholes_size[zone_type]; 4646 return zholes_size[zone_type];
4645 } 4647 }
4646 4648
4647 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4649 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4648 4650
4649 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4651 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4650 unsigned long node_start_pfn, 4652 unsigned long node_start_pfn,
4651 unsigned long node_end_pfn, 4653 unsigned long node_end_pfn,
4652 unsigned long *zones_size, 4654 unsigned long *zones_size,
4653 unsigned long *zholes_size) 4655 unsigned long *zholes_size)
4654 { 4656 {
4655 unsigned long realtotalpages, totalpages = 0; 4657 unsigned long realtotalpages, totalpages = 0;
4656 enum zone_type i; 4658 enum zone_type i;
4657 4659
4658 for (i = 0; i < MAX_NR_ZONES; i++) 4660 for (i = 0; i < MAX_NR_ZONES; i++)
4659 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4661 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4660 node_start_pfn, 4662 node_start_pfn,
4661 node_end_pfn, 4663 node_end_pfn,
4662 zones_size); 4664 zones_size);
4663 pgdat->node_spanned_pages = totalpages; 4665 pgdat->node_spanned_pages = totalpages;
4664 4666
4665 realtotalpages = totalpages; 4667 realtotalpages = totalpages;
4666 for (i = 0; i < MAX_NR_ZONES; i++) 4668 for (i = 0; i < MAX_NR_ZONES; i++)
4667 realtotalpages -= 4669 realtotalpages -=
4668 zone_absent_pages_in_node(pgdat->node_id, i, 4670 zone_absent_pages_in_node(pgdat->node_id, i,
4669 node_start_pfn, node_end_pfn, 4671 node_start_pfn, node_end_pfn,
4670 zholes_size); 4672 zholes_size);
4671 pgdat->node_present_pages = realtotalpages; 4673 pgdat->node_present_pages = realtotalpages;
4672 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4674 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4673 realtotalpages); 4675 realtotalpages);
4674 } 4676 }
4675 4677
4676 #ifndef CONFIG_SPARSEMEM 4678 #ifndef CONFIG_SPARSEMEM
4677 /* 4679 /*
4678 * Calculate the size of the zone->blockflags rounded to an unsigned long 4680 * Calculate the size of the zone->blockflags rounded to an unsigned long
4679 * Start by making sure zonesize is a multiple of pageblock_order by rounding 4681 * Start by making sure zonesize is a multiple of pageblock_order by rounding
4680 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 4682 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
4681 * round what is now in bits up to the nearest long in bits, then return it in 4683 * round what is now in bits up to the nearest long in bits, then return it in
4682 * bytes. 4684 * bytes.
4683 */ 4685 */
4684 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 4686 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4685 { 4687 {
4686 unsigned long usemapsize; 4688 unsigned long usemapsize;
4687 4689
4688 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 4690 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4689 usemapsize = roundup(zonesize, pageblock_nr_pages); 4691 usemapsize = roundup(zonesize, pageblock_nr_pages);
4690 usemapsize = usemapsize >> pageblock_order; 4692 usemapsize = usemapsize >> pageblock_order;
4691 usemapsize *= NR_PAGEBLOCK_BITS; 4693 usemapsize *= NR_PAGEBLOCK_BITS;
4692 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4694 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4693 4695
4694 return usemapsize / 8; 4696 return usemapsize / 8;
4695 } 4697 }
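/*
 * A worked example of the arithmetic above, assuming pageblock_order == 9
 * (pageblock_nr_pages == 512), NR_PAGEBLOCK_BITS == 4 and 64-bit longs,
 * with hypothetical inputs zone_start_pfn == 100 and zonesize == 262144:
 *   zonesize += 100 & 511      -> 262244
 *   roundup(262244, 512)       -> 262656
 *   262656 >> 9                -> 513 pageblocks
 *   513 * 4                    -> 2052 bits
 *   roundup(2052, 64)          -> 2112 bits
 *   2112 / 8                   -> 264 bytes of pageblock_flags
 */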
4696 4698
4697 static void __init setup_usemap(struct pglist_data *pgdat, 4699 static void __init setup_usemap(struct pglist_data *pgdat,
4698 struct zone *zone, 4700 struct zone *zone,
4699 unsigned long zone_start_pfn, 4701 unsigned long zone_start_pfn,
4700 unsigned long zonesize) 4702 unsigned long zonesize)
4701 { 4703 {
4702 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4704 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4703 zone->pageblock_flags = NULL; 4705 zone->pageblock_flags = NULL;
4704 if (usemapsize) 4706 if (usemapsize)
4705 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4707 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4706 usemapsize); 4708 usemapsize);
4707 } 4709 }
4708 #else 4710 #else
4709 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4711 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4710 unsigned long zone_start_pfn, unsigned long zonesize) {} 4712 unsigned long zone_start_pfn, unsigned long zonesize) {}
4711 #endif /* CONFIG_SPARSEMEM */ 4713 #endif /* CONFIG_SPARSEMEM */
4712 4714
4713 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4715 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4714 4716
4715 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4717 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4716 void __paginginit set_pageblock_order(void) 4718 void __paginginit set_pageblock_order(void)
4717 { 4719 {
4718 unsigned int order; 4720 unsigned int order;
4719 4721
4720 /* Check that pageblock_nr_pages has not already been set up */ 4722 /* Check that pageblock_nr_pages has not already been set up */
4721 if (pageblock_order) 4723 if (pageblock_order)
4722 return; 4724 return;
4723 4725
4724 if (HPAGE_SHIFT > PAGE_SHIFT) 4726 if (HPAGE_SHIFT > PAGE_SHIFT)
4725 order = HUGETLB_PAGE_ORDER; 4727 order = HUGETLB_PAGE_ORDER;
4726 else 4728 else
4727 order = MAX_ORDER - 1; 4729 order = MAX_ORDER - 1;
4728 4730
4729 /* 4731 /*
4730 * Assume the largest contiguous order of interest is a huge page. 4732 * Assume the largest contiguous order of interest is a huge page.
4731 * This value may be variable depending on boot parameters on IA64 and 4733 * This value may be variable depending on boot parameters on IA64 and
4732 * powerpc. 4734 * powerpc.
4733 */ 4735 */
4734 pageblock_order = order; 4736 pageblock_order = order;
4735 } 4737 }
4736 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4738 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4737 4739
4738 /* 4740 /*
4739 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4741 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4740 * is unused as pageblock_order is set at compile-time. See 4742 * is unused as pageblock_order is set at compile-time. See
4741 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4743 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4742 * the kernel config 4744 * the kernel config
4743 */ 4745 */
4744 void __paginginit set_pageblock_order(void) 4746 void __paginginit set_pageblock_order(void)
4745 { 4747 {
4746 } 4748 }
4747 4749
4748 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4750 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4749 4751
4750 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 4752 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4751 unsigned long present_pages) 4753 unsigned long present_pages)
4752 { 4754 {
4753 unsigned long pages = spanned_pages; 4755 unsigned long pages = spanned_pages;
4754 4756
4755 /* 4757 /*
4756 * Provide a more accurate estimation if there are holes within 4758 * Provide a more accurate estimation if there are holes within
4757 * the zone and SPARSEMEM is in use. If there are holes within the 4759 * the zone and SPARSEMEM is in use. If there are holes within the
4758 * zone, each populated memory region may cost us one or two extra 4760 * zone, each populated memory region may cost us one or two extra
4759 * memmap pages due to alignment because memmap pages for each 4761 * memmap pages due to alignment because memmap pages for each
4760 * populated region may not be naturally aligned on a page boundary. 4762 * populated region may not be naturally aligned on a page boundary.
4761 * So the (present_pages >> 4) heuristic is a tradeoff for that. 4763 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4762 */ 4764 */
4763 if (spanned_pages > present_pages + (present_pages >> 4) && 4765 if (spanned_pages > present_pages + (present_pages >> 4) &&
4764 IS_ENABLED(CONFIG_SPARSEMEM)) 4766 IS_ENABLED(CONFIG_SPARSEMEM))
4765 pages = present_pages; 4767 pages = present_pages;
4766 4768
4767 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 4769 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4768 } 4770 }
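/*
 * A worked example with hypothetical numbers, assuming 4KiB pages and
 * sizeof(struct page) == 64: a zone spanning 1,048,576 pages with
 * 1,000,000 present pages keeps pages == spanned_pages, because
 * 1,048,576 <= 1,000,000 + (1,000,000 >> 4) == 1,062,500. The memmap
 * then costs PAGE_ALIGN(1,048,576 * 64) >> PAGE_SHIFT == 16,384 pages.
 * Only when the holes exceed roughly 1/16 of present_pages (and
 * SPARSEMEM is enabled) does the estimate fall back to present_pages.
 */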
4769 4771
4770 /* 4772 /*
4771 * Set up the zone data structures: 4773 * Set up the zone data structures:
4772 * - mark all pages reserved 4774 * - mark all pages reserved
4773 * - mark all memory queues empty 4775 * - mark all memory queues empty
4774 * - clear the memory bitmaps 4776 * - clear the memory bitmaps
4775 * 4777 *
4776 * NOTE: pgdat should get zeroed by caller. 4778 * NOTE: pgdat should get zeroed by caller.
4777 */ 4779 */
4778 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4780 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4779 unsigned long node_start_pfn, unsigned long node_end_pfn, 4781 unsigned long node_start_pfn, unsigned long node_end_pfn,
4780 unsigned long *zones_size, unsigned long *zholes_size) 4782 unsigned long *zones_size, unsigned long *zholes_size)
4781 { 4783 {
4782 enum zone_type j; 4784 enum zone_type j;
4783 int nid = pgdat->node_id; 4785 int nid = pgdat->node_id;
4784 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4786 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4785 int ret; 4787 int ret;
4786 4788
4787 pgdat_resize_init(pgdat); 4789 pgdat_resize_init(pgdat);
4788 #ifdef CONFIG_NUMA_BALANCING 4790 #ifdef CONFIG_NUMA_BALANCING
4789 spin_lock_init(&pgdat->numabalancing_migrate_lock); 4791 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4790 pgdat->numabalancing_migrate_nr_pages = 0; 4792 pgdat->numabalancing_migrate_nr_pages = 0;
4791 pgdat->numabalancing_migrate_next_window = jiffies; 4793 pgdat->numabalancing_migrate_next_window = jiffies;
4792 #endif 4794 #endif
4793 init_waitqueue_head(&pgdat->kswapd_wait); 4795 init_waitqueue_head(&pgdat->kswapd_wait);
4794 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4796 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4795 pgdat_page_cgroup_init(pgdat); 4797 pgdat_page_cgroup_init(pgdat);
4796 4798
4797 for (j = 0; j < MAX_NR_ZONES; j++) { 4799 for (j = 0; j < MAX_NR_ZONES; j++) {
4798 struct zone *zone = pgdat->node_zones + j; 4800 struct zone *zone = pgdat->node_zones + j;
4799 unsigned long size, realsize, freesize, memmap_pages; 4801 unsigned long size, realsize, freesize, memmap_pages;
4800 4802
4801 size = zone_spanned_pages_in_node(nid, j, node_start_pfn, 4803 size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
4802 node_end_pfn, zones_size); 4804 node_end_pfn, zones_size);
4803 realsize = freesize = size - zone_absent_pages_in_node(nid, j, 4805 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4804 node_start_pfn, 4806 node_start_pfn,
4805 node_end_pfn, 4807 node_end_pfn,
4806 zholes_size); 4808 zholes_size);
4807 4809
4808 /* 4810 /*
4809 * Adjust freesize so that it accounts for how much memory 4811 * Adjust freesize so that it accounts for how much memory
4810 * is used by this zone for memmap. This affects the watermark 4812 * is used by this zone for memmap. This affects the watermark
4811 * and per-cpu initialisations 4813 * and per-cpu initialisations
4812 */ 4814 */
4813 memmap_pages = calc_memmap_size(size, realsize); 4815 memmap_pages = calc_memmap_size(size, realsize);
4814 if (freesize >= memmap_pages) { 4816 if (freesize >= memmap_pages) {
4815 freesize -= memmap_pages; 4817 freesize -= memmap_pages;
4816 if (memmap_pages) 4818 if (memmap_pages)
4817 printk(KERN_DEBUG 4819 printk(KERN_DEBUG
4818 " %s zone: %lu pages used for memmap\n", 4820 " %s zone: %lu pages used for memmap\n",
4819 zone_names[j], memmap_pages); 4821 zone_names[j], memmap_pages);
4820 } else 4822 } else
4821 printk(KERN_WARNING 4823 printk(KERN_WARNING
4822 " %s zone: %lu pages exceeds freesize %lu\n", 4824 " %s zone: %lu pages exceeds freesize %lu\n",
4823 zone_names[j], memmap_pages, freesize); 4825 zone_names[j], memmap_pages, freesize);
4824 4826
4825 /* Account for reserved pages */ 4827 /* Account for reserved pages */
4826 if (j == 0 && freesize > dma_reserve) { 4828 if (j == 0 && freesize > dma_reserve) {
4827 freesize -= dma_reserve; 4829 freesize -= dma_reserve;
4828 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4830 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4829 zone_names[0], dma_reserve); 4831 zone_names[0], dma_reserve);
4830 } 4832 }
4831 4833
4832 if (!is_highmem_idx(j)) 4834 if (!is_highmem_idx(j))
4833 nr_kernel_pages += freesize; 4835 nr_kernel_pages += freesize;
4834 /* Charge for highmem memmap if there are enough kernel pages */ 4836 /* Charge for highmem memmap if there are enough kernel pages */
4835 else if (nr_kernel_pages > memmap_pages * 2) 4837 else if (nr_kernel_pages > memmap_pages * 2)
4836 nr_kernel_pages -= memmap_pages; 4838 nr_kernel_pages -= memmap_pages;
4837 nr_all_pages += freesize; 4839 nr_all_pages += freesize;
4838 4840
4839 zone->spanned_pages = size; 4841 zone->spanned_pages = size;
4840 zone->present_pages = realsize; 4842 zone->present_pages = realsize;
4841 /* 4843 /*
4842 * Set an approximate value for lowmem here, it will be adjusted 4844 * Set an approximate value for lowmem here, it will be adjusted
4843 * when the bootmem allocator frees pages into the buddy system. 4845 * when the bootmem allocator frees pages into the buddy system.
4844 * And all highmem pages will be managed by the buddy system. 4846 * And all highmem pages will be managed by the buddy system.
4845 */ 4847 */
4846 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; 4848 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4847 #ifdef CONFIG_NUMA 4849 #ifdef CONFIG_NUMA
4848 zone->node = nid; 4850 zone->node = nid;
4849 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) 4851 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4850 / 100; 4852 / 100;
4851 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; 4853 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4852 #endif 4854 #endif
4853 zone->name = zone_names[j]; 4855 zone->name = zone_names[j];
4854 spin_lock_init(&zone->lock); 4856 spin_lock_init(&zone->lock);
4855 spin_lock_init(&zone->lru_lock); 4857 spin_lock_init(&zone->lru_lock);
4856 zone_seqlock_init(zone); 4858 zone_seqlock_init(zone);
4857 zone->zone_pgdat = pgdat; 4859 zone->zone_pgdat = pgdat;
4858 zone_pcp_init(zone); 4860 zone_pcp_init(zone);
4859 4861
4860 /* For bootup, initialized properly in watermark setup */ 4862 /* For bootup, initialized properly in watermark setup */
4861 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); 4863 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
4862 4864
4863 lruvec_init(&zone->lruvec); 4865 lruvec_init(&zone->lruvec);
4864 if (!size) 4866 if (!size)
4865 continue; 4867 continue;
4866 4868
4867 set_pageblock_order(); 4869 set_pageblock_order();
4868 setup_usemap(pgdat, zone, zone_start_pfn, size); 4870 setup_usemap(pgdat, zone, zone_start_pfn, size);
4869 ret = init_currently_empty_zone(zone, zone_start_pfn, 4871 ret = init_currently_empty_zone(zone, zone_start_pfn,
4870 size, MEMMAP_EARLY); 4872 size, MEMMAP_EARLY);
4871 BUG_ON(ret); 4873 BUG_ON(ret);
4872 memmap_init(size, nid, j, zone_start_pfn); 4874 memmap_init(size, nid, j, zone_start_pfn);
4873 zone_start_pfn += size; 4875 zone_start_pfn += size;
4874 } 4876 }
4875 } 4877 }
4876 4878
4877 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4879 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4878 { 4880 {
4879 /* Skip empty nodes */ 4881 /* Skip empty nodes */
4880 if (!pgdat->node_spanned_pages) 4882 if (!pgdat->node_spanned_pages)
4881 return; 4883 return;
4882 4884
4883 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4885 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4884 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4886 /* ia64 gets its own node_mem_map, before this, without bootmem */
4885 if (!pgdat->node_mem_map) { 4887 if (!pgdat->node_mem_map) {
4886 unsigned long size, start, end; 4888 unsigned long size, start, end;
4887 struct page *map; 4889 struct page *map;
4888 4890
4889 /* 4891 /*
4890 * The zone's endpoints aren't required to be MAX_ORDER 4892 * The zone's endpoints aren't required to be MAX_ORDER
4891 * aligned but the node_mem_map endpoints must be in order 4893 * aligned but the node_mem_map endpoints must be in order
4892 * for the buddy allocator to function correctly. 4894 * for the buddy allocator to function correctly.
4893 */ 4895 */
4894 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4896 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4895 end = pgdat_end_pfn(pgdat); 4897 end = pgdat_end_pfn(pgdat);
4896 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4898 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4897 size = (end - start) * sizeof(struct page); 4899 size = (end - start) * sizeof(struct page);
4898 map = alloc_remap(pgdat->node_id, size); 4900 map = alloc_remap(pgdat->node_id, size);
4899 if (!map) 4901 if (!map)
4900 map = alloc_bootmem_node_nopanic(pgdat, size); 4902 map = alloc_bootmem_node_nopanic(pgdat, size);
4901 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4903 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4902 } 4904 }
4903 #ifndef CONFIG_NEED_MULTIPLE_NODES 4905 #ifndef CONFIG_NEED_MULTIPLE_NODES
4904 /* 4906 /*
4905 * With no DISCONTIG, the global mem_map is just set as node 0's 4907 * With no DISCONTIG, the global mem_map is just set as node 0's
4906 */ 4908 */
4907 if (pgdat == NODE_DATA(0)) { 4909 if (pgdat == NODE_DATA(0)) {
4908 mem_map = NODE_DATA(0)->node_mem_map; 4910 mem_map = NODE_DATA(0)->node_mem_map;
4909 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4911 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4910 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4912 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4911 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4913 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4912 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4914 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4913 } 4915 }
4914 #endif 4916 #endif
4915 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4917 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4916 } 4918 }
4917 4919
4918 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4920 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4919 unsigned long node_start_pfn, unsigned long *zholes_size) 4921 unsigned long node_start_pfn, unsigned long *zholes_size)
4920 { 4922 {
4921 pg_data_t *pgdat = NODE_DATA(nid); 4923 pg_data_t *pgdat = NODE_DATA(nid);
4922 unsigned long start_pfn = 0; 4924 unsigned long start_pfn = 0;
4923 unsigned long end_pfn = 0; 4925 unsigned long end_pfn = 0;
4924 4926
4925 /* pg_data_t should be reset to zero when it's allocated */ 4927 /* pg_data_t should be reset to zero when it's allocated */
4926 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 4928 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4927 4929
4928 pgdat->node_id = nid; 4930 pgdat->node_id = nid;
4929 pgdat->node_start_pfn = node_start_pfn; 4931 pgdat->node_start_pfn = node_start_pfn;
4930 if (node_state(nid, N_MEMORY)) 4932 if (node_state(nid, N_MEMORY))
4931 init_zone_allows_reclaim(nid); 4933 init_zone_allows_reclaim(nid);
4932 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4934 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4933 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 4935 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4934 #endif 4936 #endif
4935 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 4937 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4936 zones_size, zholes_size); 4938 zones_size, zholes_size);
4937 4939
4938 alloc_node_mem_map(pgdat); 4940 alloc_node_mem_map(pgdat);
4939 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4941 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4940 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4942 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4941 nid, (unsigned long)pgdat, 4943 nid, (unsigned long)pgdat,
4942 (unsigned long)pgdat->node_mem_map); 4944 (unsigned long)pgdat->node_mem_map);
4943 #endif 4945 #endif
4944 4946
4945 free_area_init_core(pgdat, start_pfn, end_pfn, 4947 free_area_init_core(pgdat, start_pfn, end_pfn,
4946 zones_size, zholes_size); 4948 zones_size, zholes_size);
4947 } 4949 }
4948 4950
4949 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4951 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4950 4952
4951 #if MAX_NUMNODES > 1 4953 #if MAX_NUMNODES > 1
4952 /* 4954 /*
4953 * Figure out the number of possible node ids. 4955 * Figure out the number of possible node ids.
4954 */ 4956 */
4955 void __init setup_nr_node_ids(void) 4957 void __init setup_nr_node_ids(void)
4956 { 4958 {
4957 unsigned int node; 4959 unsigned int node;
4958 unsigned int highest = 0; 4960 unsigned int highest = 0;
4959 4961
4960 for_each_node_mask(node, node_possible_map) 4962 for_each_node_mask(node, node_possible_map)
4961 highest = node; 4963 highest = node;
4962 nr_node_ids = highest + 1; 4964 nr_node_ids = highest + 1;
4963 } 4965 }
4964 #endif 4966 #endif
4965 4967
4966 /** 4968 /**
4967 * node_map_pfn_alignment - determine the maximum internode alignment 4969 * node_map_pfn_alignment - determine the maximum internode alignment
4968 * 4970 *
4969 * This function should be called after node map is populated and sorted. 4971 * This function should be called after node map is populated and sorted.
4970 * It calculates the maximum power of two alignment which can distinguish 4972 * It calculates the maximum power of two alignment which can distinguish
4971 * all the nodes. 4973 * all the nodes.
4972 * 4974 *
4973 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4975 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4974 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4976 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4975 * nodes are shifted by 256MiB, the result is 256MiB. Note that if only the last node is 4977 * nodes are shifted by 256MiB, the result is 256MiB. Note that if only the last node is
4976 * shifted, 1GiB is enough and this function will indicate so. 4978 * shifted, 1GiB is enough and this function will indicate so.
4977 * 4979 *
4978 * This is used to test whether pfn -> nid mapping of the chosen memory 4980 * This is used to test whether pfn -> nid mapping of the chosen memory
4979 * model has fine enough granularity to avoid incorrect mapping for the 4981 * model has fine enough granularity to avoid incorrect mapping for the
4980 * populated node map. 4982 * populated node map.
4981 * 4983 *
4982 * Returns the determined alignment in PFNs. 0 if there is no alignment 4984 * Returns the determined alignment in PFNs. 0 if there is no alignment
4983 * requirement (single node). 4985 * requirement (single node).
4984 */ 4986 */
4985 unsigned long __init node_map_pfn_alignment(void) 4987 unsigned long __init node_map_pfn_alignment(void)
4986 { 4988 {
4987 unsigned long accl_mask = 0, last_end = 0; 4989 unsigned long accl_mask = 0, last_end = 0;
4988 unsigned long start, end, mask; 4990 unsigned long start, end, mask;
4989 int last_nid = -1; 4991 int last_nid = -1;
4990 int i, nid; 4992 int i, nid;
4991 4993
4992 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 4994 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
4993 if (!start || last_nid < 0 || last_nid == nid) { 4995 if (!start || last_nid < 0 || last_nid == nid) {
4994 last_nid = nid; 4996 last_nid = nid;
4995 last_end = end; 4997 last_end = end;
4996 continue; 4998 continue;
4997 } 4999 }
4998 5000
4999 /* 5001 /*
5000 * Start with a mask granular enough to pin-point to the 5002 * Start with a mask granular enough to pin-point to the
5001 * start pfn and tick off bits one-by-one until it becomes 5003 * start pfn and tick off bits one-by-one until it becomes
5002 * too coarse to separate the current node from the last. 5004 * too coarse to separate the current node from the last.
5003 */ 5005 */
5004 mask = ~((1 << __ffs(start)) - 1); 5006 mask = ~((1 << __ffs(start)) - 1);
5005 while (mask && last_end <= (start & (mask << 1))) 5007 while (mask && last_end <= (start & (mask << 1)))
5006 mask <<= 1; 5008 mask <<= 1;
5007 5009
5008 /* accumulate all internode masks */ 5010 /* accumulate all internode masks */
5009 accl_mask |= mask; 5011 accl_mask |= mask;
5010 } 5012 }
5011 5013
5012 /* convert mask to number of pages */ 5014 /* convert mask to number of pages */
5013 return ~accl_mask + 1; 5015 return ~accl_mask + 1;
5014 } 5016 }
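/*
 * A worked example of the mask arithmetic above, with two hypothetical
 * 1GiB nodes on 4KiB pages: node 0 covers [0, 0x40000) and node 1 covers
 * [0x40000, 0x80000). At node 1's range, __ffs(0x40000) == 18, so
 * mask == ~0x3ffff; the while loop stops immediately because
 * last_end (0x40000) is greater than (0x40000 & (mask << 1)) == 0.
 * accl_mask becomes ~0x3ffff and the function returns 0x40000 pfns,
 * i.e. 1GiB alignment, matching the example in the comment block above.
 */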
5015 5017
5016 /* Find the lowest pfn for a node */ 5018 /* Find the lowest pfn for a node */
5017 static unsigned long __init find_min_pfn_for_node(int nid) 5019 static unsigned long __init find_min_pfn_for_node(int nid)
5018 { 5020 {
5019 unsigned long min_pfn = ULONG_MAX; 5021 unsigned long min_pfn = ULONG_MAX;
5020 unsigned long start_pfn; 5022 unsigned long start_pfn;
5021 int i; 5023 int i;
5022 5024
5023 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 5025 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
5024 min_pfn = min(min_pfn, start_pfn); 5026 min_pfn = min(min_pfn, start_pfn);
5025 5027
5026 if (min_pfn == ULONG_MAX) { 5028 if (min_pfn == ULONG_MAX) {
5027 printk(KERN_WARNING 5029 printk(KERN_WARNING
5028 "Could not find start_pfn for node %d\n", nid); 5030 "Could not find start_pfn for node %d\n", nid);
5029 return 0; 5031 return 0;
5030 } 5032 }
5031 5033
5032 return min_pfn; 5034 return min_pfn;
5033 } 5035 }
5034 5036
5035 /** 5037 /**
5036 * find_min_pfn_with_active_regions - Find the minimum PFN registered 5038 * find_min_pfn_with_active_regions - Find the minimum PFN registered
5037 * 5039 *
5038 * It returns the minimum PFN based on information provided via 5040 * It returns the minimum PFN based on information provided via
5039 * add_active_range(). 5041 * add_active_range().
5040 */ 5042 */
5041 unsigned long __init find_min_pfn_with_active_regions(void) 5043 unsigned long __init find_min_pfn_with_active_regions(void)
5042 { 5044 {
5043 return find_min_pfn_for_node(MAX_NUMNODES); 5045 return find_min_pfn_for_node(MAX_NUMNODES);
5044 } 5046 }
5045 5047
5046 /* 5048 /*
5047 * early_calculate_totalpages() 5049 * early_calculate_totalpages()
5048 * Sum pages in active regions for movable zone. 5050 * Sum pages in active regions for movable zone.
5049 * Populate N_MEMORY for calculating usable_nodes. 5051 * Populate N_MEMORY for calculating usable_nodes.
5050 */ 5052 */
5051 static unsigned long __init early_calculate_totalpages(void) 5053 static unsigned long __init early_calculate_totalpages(void)
5052 { 5054 {
5053 unsigned long totalpages = 0; 5055 unsigned long totalpages = 0;
5054 unsigned long start_pfn, end_pfn; 5056 unsigned long start_pfn, end_pfn;
5055 int i, nid; 5057 int i, nid;
5056 5058
5057 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 5059 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
5058 unsigned long pages = end_pfn - start_pfn; 5060 unsigned long pages = end_pfn - start_pfn;
5059 5061
5060 totalpages += pages; 5062 totalpages += pages;
5061 if (pages) 5063 if (pages)
5062 node_set_state(nid, N_MEMORY); 5064 node_set_state(nid, N_MEMORY);
5063 } 5065 }
5064 return totalpages; 5066 return totalpages;
5065 } 5067 }
5066 5068
5067 /* 5069 /*
5068 * Find the PFN the Movable zone begins in each node. Kernel memory 5070 * Find the PFN the Movable zone begins in each node. Kernel memory
5069 * is spread evenly between nodes as long as the nodes have enough 5071 * is spread evenly between nodes as long as the nodes have enough
5070 * memory. When they don't, some nodes will have more kernelcore than 5072 * memory. When they don't, some nodes will have more kernelcore than
5071 * others 5073 * others
5072 */ 5074 */
5073 static void __init find_zone_movable_pfns_for_nodes(void) 5075 static void __init find_zone_movable_pfns_for_nodes(void)
5074 { 5076 {
5075 int i, nid; 5077 int i, nid;
5076 unsigned long usable_startpfn; 5078 unsigned long usable_startpfn;
5077 unsigned long kernelcore_node, kernelcore_remaining; 5079 unsigned long kernelcore_node, kernelcore_remaining;
5078 /* save the state before borrowing the nodemask */ 5080 /* save the state before borrowing the nodemask */
5079 nodemask_t saved_node_state = node_states[N_MEMORY]; 5081 nodemask_t saved_node_state = node_states[N_MEMORY];
5080 unsigned long totalpages = early_calculate_totalpages(); 5082 unsigned long totalpages = early_calculate_totalpages();
5081 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5083 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
5082 5084
5083 /* 5085 /*
5084 * If movablecore was specified, calculate what size of 5086 * If movablecore was specified, calculate what size of
5085 * kernelcore that corresponds so that memory usable for 5087 * kernelcore that corresponds so that memory usable for
5086 * any allocation type is evenly spread. If both kernelcore 5088 * any allocation type is evenly spread. If both kernelcore
5087 * and movablecore are specified, then the value of kernelcore 5089 * and movablecore are specified, then the value of kernelcore
5088 * will be used for required_kernelcore if it's greater than 5090 * will be used for required_kernelcore if it's greater than
5089 * what movablecore would have allowed. 5091 * what movablecore would have allowed.
5090 */ 5092 */
5091 if (required_movablecore) { 5093 if (required_movablecore) {
5092 unsigned long corepages; 5094 unsigned long corepages;
5093 5095
5094 /* 5096 /*
5095 * Round-up so that ZONE_MOVABLE is at least as large as what 5097 * Round-up so that ZONE_MOVABLE is at least as large as what
5096 * was requested by the user 5098 * was requested by the user
5097 */ 5099 */
5098 required_movablecore = 5100 required_movablecore =
5099 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 5101 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
5100 corepages = totalpages - required_movablecore; 5102 corepages = totalpages - required_movablecore;
5101 5103
5102 required_kernelcore = max(required_kernelcore, corepages); 5104 required_kernelcore = max(required_kernelcore, corepages);
5103 } 5105 }
5104 5106
5105 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 5107 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
5106 if (!required_kernelcore) 5108 if (!required_kernelcore)
5107 goto out; 5109 goto out;
5108 5110
5109 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5111 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
5110 find_usable_zone_for_movable(); 5112 find_usable_zone_for_movable();
5111 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5113 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
5112 5114
5113 restart: 5115 restart:
5114 /* Spread kernelcore memory as evenly as possible throughout nodes */ 5116 /* Spread kernelcore memory as evenly as possible throughout nodes */
5115 kernelcore_node = required_kernelcore / usable_nodes; 5117 kernelcore_node = required_kernelcore / usable_nodes;
5116 for_each_node_state(nid, N_MEMORY) { 5118 for_each_node_state(nid, N_MEMORY) {
5117 unsigned long start_pfn, end_pfn; 5119 unsigned long start_pfn, end_pfn;
5118 5120
5119 /* 5121 /*
5120 * Recalculate kernelcore_node if the division per node 5122 * Recalculate kernelcore_node if the division per node
5121 * now exceeds what is necessary to satisfy the requested 5123 * now exceeds what is necessary to satisfy the requested
5122 * amount of memory for the kernel 5124 * amount of memory for the kernel
5123 */ 5125 */
5124 if (required_kernelcore < kernelcore_node) 5126 if (required_kernelcore < kernelcore_node)
5125 kernelcore_node = required_kernelcore / usable_nodes; 5127 kernelcore_node = required_kernelcore / usable_nodes;
5126 5128
5127 /* 5129 /*
5128 * As the map is walked, we track how much memory is usable 5130 * As the map is walked, we track how much memory is usable
5129 * by the kernel using kernelcore_remaining. When it is 5131 * by the kernel using kernelcore_remaining. When it is
5130 * 0, the rest of the node is usable by ZONE_MOVABLE 5132 * 0, the rest of the node is usable by ZONE_MOVABLE
5131 */ 5133 */
5132 kernelcore_remaining = kernelcore_node; 5134 kernelcore_remaining = kernelcore_node;
5133 5135
5134 /* Go through each range of PFNs within this node */ 5136 /* Go through each range of PFNs within this node */
5135 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 5137 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
5136 unsigned long size_pages; 5138 unsigned long size_pages;
5137 5139
5138 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 5140 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
5139 if (start_pfn >= end_pfn) 5141 if (start_pfn >= end_pfn)
5140 continue; 5142 continue;
5141 5143
5142 /* Account for what is only usable for kernelcore */ 5144 /* Account for what is only usable for kernelcore */
5143 if (start_pfn < usable_startpfn) { 5145 if (start_pfn < usable_startpfn) {
5144 unsigned long kernel_pages; 5146 unsigned long kernel_pages;
5145 kernel_pages = min(end_pfn, usable_startpfn) 5147 kernel_pages = min(end_pfn, usable_startpfn)
5146 - start_pfn; 5148 - start_pfn;
5147 5149
5148 kernelcore_remaining -= min(kernel_pages, 5150 kernelcore_remaining -= min(kernel_pages,
5149 kernelcore_remaining); 5151 kernelcore_remaining);
5150 required_kernelcore -= min(kernel_pages, 5152 required_kernelcore -= min(kernel_pages,
5151 required_kernelcore); 5153 required_kernelcore);
5152 5154
5153 /* Continue if range is now fully accounted */ 5155 /* Continue if range is now fully accounted */
5154 if (end_pfn <= usable_startpfn) { 5156 if (end_pfn <= usable_startpfn) {
5155 5157
5156 /* 5158 /*
5157 * Push zone_movable_pfn to the end so 5159 * Push zone_movable_pfn to the end so
5158 * that if we have to rebalance 5160 * that if we have to rebalance
5159 * kernelcore across nodes, we will 5161 * kernelcore across nodes, we will
5160 * not double account here 5162 * not double account here
5161 */ 5163 */
5162 zone_movable_pfn[nid] = end_pfn; 5164 zone_movable_pfn[nid] = end_pfn;
5163 continue; 5165 continue;
5164 } 5166 }
5165 start_pfn = usable_startpfn; 5167 start_pfn = usable_startpfn;
5166 } 5168 }
5167 5169
5168 /* 5170 /*
5169 * The usable PFN range for ZONE_MOVABLE is from 5171 * The usable PFN range for ZONE_MOVABLE is from
5170 * start_pfn->end_pfn. Calculate size_pages as the 5172 * start_pfn->end_pfn. Calculate size_pages as the
5171 * number of pages used as kernelcore 5173 * number of pages used as kernelcore
5172 */ 5174 */
5173 size_pages = end_pfn - start_pfn; 5175 size_pages = end_pfn - start_pfn;
5174 if (size_pages > kernelcore_remaining) 5176 if (size_pages > kernelcore_remaining)
5175 size_pages = kernelcore_remaining; 5177 size_pages = kernelcore_remaining;
5176 zone_movable_pfn[nid] = start_pfn + size_pages; 5178 zone_movable_pfn[nid] = start_pfn + size_pages;
5177 5179
5178 /* 5180 /*
5179 * Some kernelcore has been met, update counts and 5181 * Some kernelcore has been met, update counts and
5180 * break if the kernelcore for this node has been 5182 * break if the kernelcore for this node has been
5181 * satisfied 5183 * satisfied
5182 */ 5184 */
5183 required_kernelcore -= min(required_kernelcore, 5185 required_kernelcore -= min(required_kernelcore,
5184 size_pages); 5186 size_pages);
5185 kernelcore_remaining -= size_pages; 5187 kernelcore_remaining -= size_pages;
5186 if (!kernelcore_remaining) 5188 if (!kernelcore_remaining)
5187 break; 5189 break;
5188 } 5190 }
5189 } 5191 }
5190 5192
5191 /* 5193 /*
5192 * If there is still required_kernelcore, we do another pass with one 5194 * If there is still required_kernelcore, we do another pass with one
5193 * less node in the count. This will push zone_movable_pfn[nid] further 5195 * less node in the count. This will push zone_movable_pfn[nid] further
5194 * along on the nodes that still have memory until kernelcore is 5196 * along on the nodes that still have memory until kernelcore is
5195 * satisfied 5197 * satisfied
5196 */ 5198 */
5197 usable_nodes--; 5199 usable_nodes--;
5198 if (usable_nodes && required_kernelcore > usable_nodes) 5200 if (usable_nodes && required_kernelcore > usable_nodes)
5199 goto restart; 5201 goto restart;
5200 5202
5201 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5203 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5202 for (nid = 0; nid < MAX_NUMNODES; nid++) 5204 for (nid = 0; nid < MAX_NUMNODES; nid++)
5203 zone_movable_pfn[nid] = 5205 zone_movable_pfn[nid] =
5204 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 5206 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
5205 5207
5206 out: 5208 out:
5207 /* restore the node_state */ 5209 /* restore the node_state */
5208 node_states[N_MEMORY] = saved_node_state; 5210 node_states[N_MEMORY] = saved_node_state;
5209 } 5211 }
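/*
 * A simplified worked example with hypothetical values: two nodes with
 * memory, each a single 1,000,000-page range lying entirely above
 * usable_startpfn, and kernelcore= asking for 800,000 pages. The first
 * pass gives kernelcore_node == 400,000, so zone_movable_pfn[] for each
 * node ends up at its start_pfn + 400,000 (then rounded up to
 * MAX_ORDER_NR_PAGES), leaving roughly 600,000 pages per node for
 * ZONE_MOVABLE, and required_kernelcore reaches 0 without a restart.
 */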
5210 5212
5211 /* Any regular or high memory on that node? */ 5213 /* Any regular or high memory on that node? */
5212 static void check_for_memory(pg_data_t *pgdat, int nid) 5214 static void check_for_memory(pg_data_t *pgdat, int nid)
5213 { 5215 {
5214 enum zone_type zone_type; 5216 enum zone_type zone_type;
5215 5217
5216 if (N_MEMORY == N_NORMAL_MEMORY) 5218 if (N_MEMORY == N_NORMAL_MEMORY)
5217 return; 5219 return;
5218 5220
5219 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 5221 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
5220 struct zone *zone = &pgdat->node_zones[zone_type]; 5222 struct zone *zone = &pgdat->node_zones[zone_type];
5221 if (zone->present_pages) { 5223 if (zone->present_pages) {
5222 node_set_state(nid, N_HIGH_MEMORY); 5224 node_set_state(nid, N_HIGH_MEMORY);
5223 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 5225 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
5224 zone_type <= ZONE_NORMAL) 5226 zone_type <= ZONE_NORMAL)
5225 node_set_state(nid, N_NORMAL_MEMORY); 5227 node_set_state(nid, N_NORMAL_MEMORY);
5226 break; 5228 break;
5227 } 5229 }
5228 } 5230 }
5229 } 5231 }
5230 5232
5231 /** 5233 /**
5232 * free_area_init_nodes - Initialise all pg_data_t and zone data 5234 * free_area_init_nodes - Initialise all pg_data_t and zone data
5233 * @max_zone_pfn: an array of max PFNs for each zone 5235 * @max_zone_pfn: an array of max PFNs for each zone
5234 * 5236 *
5235 * This will call free_area_init_node() for each active node in the system. 5237 * This will call free_area_init_node() for each active node in the system.
5236 * Using the page ranges provided by add_active_range(), the size of each 5238 * Using the page ranges provided by add_active_range(), the size of each
5237 * zone in each node and of its holes is calculated. If the maximum PFNs of 5239 * zone in each node and of its holes is calculated. If the maximum PFNs of
5238 * two adjacent zones match, it is assumed that the higher zone is empty. 5240 * two adjacent zones match, it is assumed that the higher zone is empty.
5239 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 5241 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
5240 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 5242 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
5241 * starts where the previous one ended. For example, ZONE_DMA32 starts 5243 * starts where the previous one ended. For example, ZONE_DMA32 starts
5242 * at arch_max_dma_pfn. 5244 * at arch_max_dma_pfn.
5243 */ 5245 */
5244 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 5246 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5245 { 5247 {
5246 unsigned long start_pfn, end_pfn; 5248 unsigned long start_pfn, end_pfn;
5247 int i, nid; 5249 int i, nid;
5248 5250
5249 /* Record where the zone boundaries are */ 5251 /* Record where the zone boundaries are */
5250 memset(arch_zone_lowest_possible_pfn, 0, 5252 memset(arch_zone_lowest_possible_pfn, 0,
5251 sizeof(arch_zone_lowest_possible_pfn)); 5253 sizeof(arch_zone_lowest_possible_pfn));
5252 memset(arch_zone_highest_possible_pfn, 0, 5254 memset(arch_zone_highest_possible_pfn, 0,
5253 sizeof(arch_zone_highest_possible_pfn)); 5255 sizeof(arch_zone_highest_possible_pfn));
5254 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 5256 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
5255 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 5257 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
5256 for (i = 1; i < MAX_NR_ZONES; i++) { 5258 for (i = 1; i < MAX_NR_ZONES; i++) {
5257 if (i == ZONE_MOVABLE) 5259 if (i == ZONE_MOVABLE)
5258 continue; 5260 continue;
5259 arch_zone_lowest_possible_pfn[i] = 5261 arch_zone_lowest_possible_pfn[i] =
5260 arch_zone_highest_possible_pfn[i-1]; 5262 arch_zone_highest_possible_pfn[i-1];
5261 arch_zone_highest_possible_pfn[i] = 5263 arch_zone_highest_possible_pfn[i] =
5262 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 5264 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
5263 } 5265 }
5264 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 5266 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
5265 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 5267 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
5266 5268
5267 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5269 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
5268 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5270 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
5269 find_zone_movable_pfns_for_nodes(); 5271 find_zone_movable_pfns_for_nodes();
5270 5272
5271 /* Print out the zone ranges */ 5273 /* Print out the zone ranges */
5272 printk("Zone ranges:\n"); 5274 printk("Zone ranges:\n");
5273 for (i = 0; i < MAX_NR_ZONES; i++) { 5275 for (i = 0; i < MAX_NR_ZONES; i++) {
5274 if (i == ZONE_MOVABLE) 5276 if (i == ZONE_MOVABLE)
5275 continue; 5277 continue;
5276 printk(KERN_CONT " %-8s ", zone_names[i]); 5278 printk(KERN_CONT " %-8s ", zone_names[i]);
5277 if (arch_zone_lowest_possible_pfn[i] == 5279 if (arch_zone_lowest_possible_pfn[i] ==
5278 arch_zone_highest_possible_pfn[i]) 5280 arch_zone_highest_possible_pfn[i])
5279 printk(KERN_CONT "empty\n"); 5281 printk(KERN_CONT "empty\n");
5280 else 5282 else
5281 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5283 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
5282 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5284 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5283 (arch_zone_highest_possible_pfn[i] 5285 (arch_zone_highest_possible_pfn[i]
5284 << PAGE_SHIFT) - 1); 5286 << PAGE_SHIFT) - 1);
5285 } 5287 }
5286 5288
5287 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5289 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5288 printk("Movable zone start for each node\n"); 5290 printk("Movable zone start for each node\n");
5289 for (i = 0; i < MAX_NUMNODES; i++) { 5291 for (i = 0; i < MAX_NUMNODES; i++) {
5290 if (zone_movable_pfn[i]) 5292 if (zone_movable_pfn[i])
5291 printk(" Node %d: %#010lx\n", i, 5293 printk(" Node %d: %#010lx\n", i,
5292 zone_movable_pfn[i] << PAGE_SHIFT); 5294 zone_movable_pfn[i] << PAGE_SHIFT);
5293 } 5295 }
5294 5296
5295 /* Print out the early node map */ 5297 /* Print out the early node map */
5296 printk("Early memory node ranges\n"); 5298 printk("Early memory node ranges\n");
5297 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5299 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5298 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5300 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5299 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5301 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5300 5302
5301 /* Initialise every node */ 5303 /* Initialise every node */
5302 mminit_verify_pageflags_layout(); 5304 mminit_verify_pageflags_layout();
5303 setup_nr_node_ids(); 5305 setup_nr_node_ids();
5304 for_each_online_node(nid) { 5306 for_each_online_node(nid) {
5305 pg_data_t *pgdat = NODE_DATA(nid); 5307 pg_data_t *pgdat = NODE_DATA(nid);
5306 free_area_init_node(nid, NULL, 5308 free_area_init_node(nid, NULL,
5307 find_min_pfn_for_node(nid), NULL); 5309 find_min_pfn_for_node(nid), NULL);
5308 5310
5309 /* Any memory on that node */ 5311 /* Any memory on that node */
5310 if (pgdat->node_present_pages) 5312 if (pgdat->node_present_pages)
5311 node_set_state(nid, N_MEMORY); 5313 node_set_state(nid, N_MEMORY);
5312 check_for_memory(pgdat, nid); 5314 check_for_memory(pgdat, nid);
5313 } 5315 }
5314 } 5316 }
5315 5317
5316 static int __init cmdline_parse_core(char *p, unsigned long *core) 5318 static int __init cmdline_parse_core(char *p, unsigned long *core)
5317 { 5319 {
5318 unsigned long long coremem; 5320 unsigned long long coremem;
5319 if (!p) 5321 if (!p)
5320 return -EINVAL; 5322 return -EINVAL;
5321 5323
5322 coremem = memparse(p, &p); 5324 coremem = memparse(p, &p);
5323 *core = coremem >> PAGE_SHIFT; 5325 *core = coremem >> PAGE_SHIFT;
5324 5326
5325 /* Paranoid check that UL is enough for the coremem value */ 5327 /* Paranoid check that UL is enough for the coremem value */
5326 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 5328 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
5327 5329
5328 return 0; 5330 return 0;
5329 } 5331 }
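/*
 * A worked example, assuming 4KiB pages: booting with "kernelcore=512M"
 * makes memparse() return 536870912, so *core (required_kernelcore) is
 * set to 536870912 >> 12 == 131072 pages. The same parsing serves
 * "movablecore=" below.
 */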
5330 5332
5331 /* 5333 /*
5332 * kernelcore=size sets the amount of memory for use for allocations that 5334 * kernelcore=size sets the amount of memory for use for allocations that
5333 * cannot be reclaimed or migrated. 5335 * cannot be reclaimed or migrated.
5334 */ 5336 */
5335 static int __init cmdline_parse_kernelcore(char *p) 5337 static int __init cmdline_parse_kernelcore(char *p)
5336 { 5338 {
5337 return cmdline_parse_core(p, &required_kernelcore); 5339 return cmdline_parse_core(p, &required_kernelcore);
5338 } 5340 }
5339 5341
5340 /* 5342 /*
5341 * movablecore=size sets the amount of memory for use for allocations that 5343 * movablecore=size sets the amount of memory for use for allocations that
5342 * can be reclaimed or migrated. 5344 * can be reclaimed or migrated.
5343 */ 5345 */
5344 static int __init cmdline_parse_movablecore(char *p) 5346 static int __init cmdline_parse_movablecore(char *p)
5345 { 5347 {
5346 return cmdline_parse_core(p, &required_movablecore); 5348 return cmdline_parse_core(p, &required_movablecore);
5347 } 5349 }
5348 5350
5349 early_param("kernelcore", cmdline_parse_kernelcore); 5351 early_param("kernelcore", cmdline_parse_kernelcore);
5350 early_param("movablecore", cmdline_parse_movablecore); 5352 early_param("movablecore", cmdline_parse_movablecore);
5351 5353
5352 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5354 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
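For a concrete sense of what the kernelcore=/movablecore= hooks above consume: memparse() accepts a size string with an optional K/M/G suffix, and the result is shifted down to a page count. Below is a standalone userspace sketch of that conversion; parse_size() is a hypothetical stand-in for memparse(), SKETCH_PAGE_SHIFT assumes 4 KiB pages, and none of it is kernel code.

#include <stdio.h>
#include <stdlib.h>

#define SKETCH_PAGE_SHIFT 12	/* assumption: 4 KiB pages */

/* crude stand-in for memparse(): a number plus an optional K/M/G suffix */
static unsigned long long parse_size(const char *s)
{
	char *end;
	unsigned long long v = strtoull(s, &end, 0);

	switch (*end) {
	case 'G': case 'g': v <<= 10;	/* fall through */
	case 'M': case 'm': v <<= 10;	/* fall through */
	case 'K': case 'k': v <<= 10;
	}
	return v;
}

int main(void)
{
	unsigned long long bytes = parse_size("512M");

	/* 512 MiB / 4 KiB = 131072 pages requested as non-movable core */
	printf("required_kernelcore = %llu pages\n",
	       bytes >> SKETCH_PAGE_SHIFT);
	return 0;
}

So booting with kernelcore=512M asks for 131072 pages to be kept for allocations that cannot be reclaimed or migrated.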
5353 5355
5354 void adjust_managed_page_count(struct page *page, long count) 5356 void adjust_managed_page_count(struct page *page, long count)
5355 { 5357 {
5356 spin_lock(&managed_page_count_lock); 5358 spin_lock(&managed_page_count_lock);
5357 page_zone(page)->managed_pages += count; 5359 page_zone(page)->managed_pages += count;
5358 totalram_pages += count; 5360 totalram_pages += count;
5359 #ifdef CONFIG_HIGHMEM 5361 #ifdef CONFIG_HIGHMEM
5360 if (PageHighMem(page)) 5362 if (PageHighMem(page))
5361 totalhigh_pages += count; 5363 totalhigh_pages += count;
5362 #endif 5364 #endif
5363 spin_unlock(&managed_page_count_lock); 5365 spin_unlock(&managed_page_count_lock);
5364 } 5366 }
5365 EXPORT_SYMBOL(adjust_managed_page_count); 5367 EXPORT_SYMBOL(adjust_managed_page_count);
5366 5368
5367 unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 5369 unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
5368 { 5370 {
5369 void *pos; 5371 void *pos;
5370 unsigned long pages = 0; 5372 unsigned long pages = 0;
5371 5373
5372 start = (void *)PAGE_ALIGN((unsigned long)start); 5374 start = (void *)PAGE_ALIGN((unsigned long)start);
5373 end = (void *)((unsigned long)end & PAGE_MASK); 5375 end = (void *)((unsigned long)end & PAGE_MASK);
5374 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 5376 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5375 if ((unsigned int)poison <= 0xFF) 5377 if ((unsigned int)poison <= 0xFF)
5376 memset(pos, poison, PAGE_SIZE); 5378 memset(pos, poison, PAGE_SIZE);
5377 free_reserved_page(virt_to_page(pos)); 5379 free_reserved_page(virt_to_page(pos));
5378 } 5380 }
5379 5381
5380 if (pages && s) 5382 if (pages && s)
5381 pr_info("Freeing %s memory: %ldK (%p - %p)\n", 5383 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
5382 s, pages << (PAGE_SHIFT - 10), start, end); 5384 s, pages << (PAGE_SHIFT - 10), start, end);
5383 5385
5384 return pages; 5386 return pages;
5385 } 5387 }
5386 EXPORT_SYMBOL(free_reserved_area); 5388 EXPORT_SYMBOL(free_reserved_area);
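The alignment arithmetic in free_reserved_area() can be tried in isolation. The sketch below uses invented addresses and assumes 4 KiB pages (SK_PAGE_SIZE and the byte range are made up; free_reserved_page() itself is of course kernel-only); it shows how a byte range shrinks to whole pages and where the reported "K" figure comes from.

#include <stdio.h>

#define SK_PAGE_SIZE	4096UL			/* assumption: 4 KiB pages */
#define SK_PAGE_MASK	(~(SK_PAGE_SIZE - 1))
#define SK_PAGE_ALIGN(x) (((x) + SK_PAGE_SIZE - 1) & SK_PAGE_MASK)

int main(void)
{
	/* hypothetical byte range of a reserved region to be released */
	unsigned long start = 0x100123, end = 0x108f00;

	unsigned long first = SK_PAGE_ALIGN(start);	/* round start up */
	unsigned long last  = end & SK_PAGE_MASK;	/* round end down */
	unsigned long pages = (last - first) / SK_PAGE_SIZE;

	/* matches the "Freeing ... memory: %ldK" report: pages << (12 - 10) */
	printf("%lu pages, %luK freed\n", pages, pages << 2);
	return 0;
}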
5387 5389
5388 #ifdef CONFIG_HIGHMEM 5390 #ifdef CONFIG_HIGHMEM
5389 void free_highmem_page(struct page *page) 5391 void free_highmem_page(struct page *page)
5390 { 5392 {
5391 __free_reserved_page(page); 5393 __free_reserved_page(page);
5392 totalram_pages++; 5394 totalram_pages++;
5393 page_zone(page)->managed_pages++; 5395 page_zone(page)->managed_pages++;
5394 totalhigh_pages++; 5396 totalhigh_pages++;
5395 } 5397 }
5396 #endif 5398 #endif
5397 5399
5398 5400
5399 void __init mem_init_print_info(const char *str) 5401 void __init mem_init_print_info(const char *str)
5400 { 5402 {
5401 unsigned long physpages, codesize, datasize, rosize, bss_size; 5403 unsigned long physpages, codesize, datasize, rosize, bss_size;
5402 unsigned long init_code_size, init_data_size; 5404 unsigned long init_code_size, init_data_size;
5403 5405
5404 physpages = get_num_physpages(); 5406 physpages = get_num_physpages();
5405 codesize = _etext - _stext; 5407 codesize = _etext - _stext;
5406 datasize = _edata - _sdata; 5408 datasize = _edata - _sdata;
5407 rosize = __end_rodata - __start_rodata; 5409 rosize = __end_rodata - __start_rodata;
5408 bss_size = __bss_stop - __bss_start; 5410 bss_size = __bss_stop - __bss_start;
5409 init_data_size = __init_end - __init_begin; 5411 init_data_size = __init_end - __init_begin;
5410 init_code_size = _einittext - _sinittext; 5412 init_code_size = _einittext - _sinittext;
5411 5413
5412 /* 5414 /*
5413 * Detect special cases and adjust section sizes accordingly: 5415 * Detect special cases and adjust section sizes accordingly:
5414 * 1) .init.* may be embedded into .data sections 5416 * 1) .init.* may be embedded into .data sections
5415 * 2) .init.text.* may be out of [__init_begin, __init_end], 5417 * 2) .init.text.* may be out of [__init_begin, __init_end],
5416 * please refer to arch/tile/kernel/vmlinux.lds.S. 5418 * please refer to arch/tile/kernel/vmlinux.lds.S.
5417 * 3) .rodata.* may be embedded into .text or .data sections. 5419 * 3) .rodata.* may be embedded into .text or .data sections.
5418 */ 5420 */
5419 #define adj_init_size(start, end, size, pos, adj) \ 5421 #define adj_init_size(start, end, size, pos, adj) \
5420 do { \ 5422 do { \
5421 if (start <= pos && pos < end && size > adj) \ 5423 if (start <= pos && pos < end && size > adj) \
5422 size -= adj; \ 5424 size -= adj; \
5423 } while (0) 5425 } while (0)
5424 5426
5425 adj_init_size(__init_begin, __init_end, init_data_size, 5427 adj_init_size(__init_begin, __init_end, init_data_size,
5426 _sinittext, init_code_size); 5428 _sinittext, init_code_size);
5427 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 5429 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5428 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 5430 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5429 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 5431 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5430 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 5432 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5431 5433
5432 #undef adj_init_size 5434 #undef adj_init_size
5433 5435
5434 printk("Memory: %luK/%luK available " 5436 printk("Memory: %luK/%luK available "
5435 "(%luK kernel code, %luK rwdata, %luK rodata, " 5437 "(%luK kernel code, %luK rwdata, %luK rodata, "
5436 "%luK init, %luK bss, %luK reserved" 5438 "%luK init, %luK bss, %luK reserved"
5437 #ifdef CONFIG_HIGHMEM 5439 #ifdef CONFIG_HIGHMEM
5438 ", %luK highmem" 5440 ", %luK highmem"
5439 #endif 5441 #endif
5440 "%s%s)\n", 5442 "%s%s)\n",
5441 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), 5443 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5442 codesize >> 10, datasize >> 10, rosize >> 10, 5444 codesize >> 10, datasize >> 10, rosize >> 10,
5443 (init_data_size + init_code_size) >> 10, bss_size >> 10, 5445 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5444 (physpages - totalram_pages) << (PAGE_SHIFT-10), 5446 (physpages - totalram_pages) << (PAGE_SHIFT-10),
5445 #ifdef CONFIG_HIGHMEM 5447 #ifdef CONFIG_HIGHMEM
5446 totalhigh_pages << (PAGE_SHIFT-10), 5448 totalhigh_pages << (PAGE_SHIFT-10),
5447 #endif 5449 #endif
5448 str ? ", " : "", str ? str : ""); 5450 str ? ", " : "", str ? str : "");
5449 } 5451 }
5450 5452
5451 /** 5453 /**
5452 * set_dma_reserve - set the specified number of pages reserved in the first zone 5454 * set_dma_reserve - set the specified number of pages reserved in the first zone
5453 * @new_dma_reserve: The number of pages to mark reserved 5455 * @new_dma_reserve: The number of pages to mark reserved
5454 * 5456 *
5455 * The per-cpu batchsize and zone watermarks are determined by present_pages. 5457 * The per-cpu batchsize and zone watermarks are determined by present_pages.
5456 * In the DMA zone, a significant percentage may be consumed by kernel image 5458 * In the DMA zone, a significant percentage may be consumed by kernel image
5457 * and other unfreeable allocations which can skew the watermarks badly. This 5459 * and other unfreeable allocations which can skew the watermarks badly. This
5458 * function may optionally be used to account for unfreeable pages in the 5460 * function may optionally be used to account for unfreeable pages in the
5459 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 5461 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
5460 * smaller per-cpu batchsize. 5462 * smaller per-cpu batchsize.
5461 */ 5463 */
5462 void __init set_dma_reserve(unsigned long new_dma_reserve) 5464 void __init set_dma_reserve(unsigned long new_dma_reserve)
5463 { 5465 {
5464 dma_reserve = new_dma_reserve; 5466 dma_reserve = new_dma_reserve;
5465 } 5467 }
5466 5468
5467 void __init free_area_init(unsigned long *zones_size) 5469 void __init free_area_init(unsigned long *zones_size)
5468 { 5470 {
5469 free_area_init_node(0, zones_size, 5471 free_area_init_node(0, zones_size,
5470 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 5472 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
5471 } 5473 }
5472 5474
5473 static int page_alloc_cpu_notify(struct notifier_block *self, 5475 static int page_alloc_cpu_notify(struct notifier_block *self,
5474 unsigned long action, void *hcpu) 5476 unsigned long action, void *hcpu)
5475 { 5477 {
5476 int cpu = (unsigned long)hcpu; 5478 int cpu = (unsigned long)hcpu;
5477 5479
5478 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 5480 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
5479 lru_add_drain_cpu(cpu); 5481 lru_add_drain_cpu(cpu);
5480 drain_pages(cpu); 5482 drain_pages(cpu);
5481 5483
5482 /* 5484 /*
5483 * Spill the event counters of the dead processor 5485 * Spill the event counters of the dead processor
5484 * into the current processor's event counters. 5486 * into the current processor's event counters.

5485 * This artificially elevates the count of the current 5487 * This artificially elevates the count of the current
5486 * processor. 5488 * processor.
5487 */ 5489 */
5488 vm_events_fold_cpu(cpu); 5490 vm_events_fold_cpu(cpu);
5489 5491
5490 /* 5492 /*
5491 * Zero the differential counters of the dead processor 5493 * Zero the differential counters of the dead processor
5492 * so that the vm statistics are consistent. 5494 * so that the vm statistics are consistent.
5493 * 5495 *
5494 * This is only okay since the processor is dead and cannot 5496 * This is only okay since the processor is dead and cannot
5495 * race with what we are doing. 5497 * race with what we are doing.
5496 */ 5498 */
5497 cpu_vm_stats_fold(cpu); 5499 cpu_vm_stats_fold(cpu);
5498 } 5500 }
5499 return NOTIFY_OK; 5501 return NOTIFY_OK;
5500 } 5502 }
5501 5503
5502 void __init page_alloc_init(void) 5504 void __init page_alloc_init(void)
5503 { 5505 {
5504 hotcpu_notifier(page_alloc_cpu_notify, 0); 5506 hotcpu_notifier(page_alloc_cpu_notify, 0);
5505 } 5507 }
5506 5508
5507 /* 5509 /*
5508 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 5510 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
5509 * or min_free_kbytes changes. 5511 * or min_free_kbytes changes.
5510 */ 5512 */
5511 static void calculate_totalreserve_pages(void) 5513 static void calculate_totalreserve_pages(void)
5512 { 5514 {
5513 struct pglist_data *pgdat; 5515 struct pglist_data *pgdat;
5514 unsigned long reserve_pages = 0; 5516 unsigned long reserve_pages = 0;
5515 enum zone_type i, j; 5517 enum zone_type i, j;
5516 5518
5517 for_each_online_pgdat(pgdat) { 5519 for_each_online_pgdat(pgdat) {
5518 for (i = 0; i < MAX_NR_ZONES; i++) { 5520 for (i = 0; i < MAX_NR_ZONES; i++) {
5519 struct zone *zone = pgdat->node_zones + i; 5521 struct zone *zone = pgdat->node_zones + i;
5520 unsigned long max = 0; 5522 unsigned long max = 0;
5521 5523
5522 /* Find valid and maximum lowmem_reserve in the zone */ 5524 /* Find valid and maximum lowmem_reserve in the zone */
5523 for (j = i; j < MAX_NR_ZONES; j++) { 5525 for (j = i; j < MAX_NR_ZONES; j++) {
5524 if (zone->lowmem_reserve[j] > max) 5526 if (zone->lowmem_reserve[j] > max)
5525 max = zone->lowmem_reserve[j]; 5527 max = zone->lowmem_reserve[j];
5526 } 5528 }
5527 5529
5528 /* we treat the high watermark as reserved pages. */ 5530 /* we treat the high watermark as reserved pages. */
5529 max += high_wmark_pages(zone); 5531 max += high_wmark_pages(zone);
5530 5532
5531 if (max > zone->managed_pages) 5533 if (max > zone->managed_pages)
5532 max = zone->managed_pages; 5534 max = zone->managed_pages;
5533 reserve_pages += max; 5535 reserve_pages += max;
5534 /* 5536 /*
5535 * Lowmem reserves are not available to 5537 * Lowmem reserves are not available to
5536 * GFP_HIGHUSER page cache allocations and 5538 * GFP_HIGHUSER page cache allocations and
5537 * kswapd tries to balance zones to their high 5539 * kswapd tries to balance zones to their high
5538 * watermark. As a result, neither should be 5540 * watermark. As a result, neither should be
5539 * regarded as dirtyable memory, to prevent a 5541 * regarded as dirtyable memory, to prevent a
5540 * situation where reclaim has to clean pages 5542 * situation where reclaim has to clean pages
5541 * in order to balance the zones. 5543 * in order to balance the zones.
5542 */ 5544 */
5543 zone->dirty_balance_reserve = max; 5545 zone->dirty_balance_reserve = max;
5544 } 5546 }
5545 } 5547 }
5546 dirty_balance_reserve = reserve_pages; 5548 dirty_balance_reserve = reserve_pages;
5547 totalreserve_pages = reserve_pages; 5549 totalreserve_pages = reserve_pages;
5548 } 5550 }
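As a worked example with purely illustrative numbers: a zone with managed_pages = 200000, a largest lowmem_reserve[] entry of 56250 and a high watermark of 3258 would contribute min(56250 + 3258, 200000) = 59508 pages to reserve_pages, and that same capped value becomes the zone's dirty_balance_reserve.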
5549 5551
5550 /* 5552 /*
5551 * setup_per_zone_lowmem_reserve - called whenever 5553 * setup_per_zone_lowmem_reserve - called whenever
5552 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 5554 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
5553 * has a correct pages reserved value, so an adequate number of 5555 * has a correct pages reserved value, so an adequate number of
5554 * pages are left in the zone after a successful __alloc_pages(). 5556 * pages are left in the zone after a successful __alloc_pages().
5555 */ 5557 */
5556 static void setup_per_zone_lowmem_reserve(void) 5558 static void setup_per_zone_lowmem_reserve(void)
5557 { 5559 {
5558 struct pglist_data *pgdat; 5560 struct pglist_data *pgdat;
5559 enum zone_type j, idx; 5561 enum zone_type j, idx;
5560 5562
5561 for_each_online_pgdat(pgdat) { 5563 for_each_online_pgdat(pgdat) {
5562 for (j = 0; j < MAX_NR_ZONES; j++) { 5564 for (j = 0; j < MAX_NR_ZONES; j++) {
5563 struct zone *zone = pgdat->node_zones + j; 5565 struct zone *zone = pgdat->node_zones + j;
5564 unsigned long managed_pages = zone->managed_pages; 5566 unsigned long managed_pages = zone->managed_pages;
5565 5567
5566 zone->lowmem_reserve[j] = 0; 5568 zone->lowmem_reserve[j] = 0;
5567 5569
5568 idx = j; 5570 idx = j;
5569 while (idx) { 5571 while (idx) {
5570 struct zone *lower_zone; 5572 struct zone *lower_zone;
5571 5573
5572 idx--; 5574 idx--;
5573 5575
5574 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5576 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5575 sysctl_lowmem_reserve_ratio[idx] = 1; 5577 sysctl_lowmem_reserve_ratio[idx] = 1;
5576 5578
5577 lower_zone = pgdat->node_zones + idx; 5579 lower_zone = pgdat->node_zones + idx;
5578 lower_zone->lowmem_reserve[j] = managed_pages / 5580 lower_zone->lowmem_reserve[j] = managed_pages /
5579 sysctl_lowmem_reserve_ratio[idx]; 5581 sysctl_lowmem_reserve_ratio[idx];
5580 managed_pages += lower_zone->managed_pages; 5582 managed_pages += lower_zone->managed_pages;
5581 } 5583 }
5582 } 5584 }
5583 } 5585 }
5584 5586
5585 /* update totalreserve_pages */ 5587 /* update totalreserve_pages */
5586 calculate_totalreserve_pages(); 5588 calculate_totalreserve_pages();
5587 } 5589 }
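To see how the accumulation above plays out, here is a small userspace sketch with made-up zone sizes and ratios loosely modelled on common defaults (256 for DMA, 32 for Normal); none of the numbers are authoritative and the array names are invented for the example.

#include <stdio.h>

/* illustrative zone layout: DMA, Normal, HighMem (sizes in pages) */
static const char *name[] = { "DMA", "Normal", "HighMem" };
static unsigned long managed[] = { 4000, 200000, 1800000 };
static unsigned long ratio[]   = { 256, 32, 0 };	/* 0: highmem unused */
static unsigned long reserve[3][3];

int main(void)
{
	int j, idx;

	for (j = 0; j < 3; j++) {
		unsigned long pages = managed[j];

		for (idx = j - 1; idx >= 0; idx--) {
			/* lower zone idx holds back pages/ratio[idx] against
			 * allocations that could have gone to zone j */
			reserve[idx][j] = pages / ratio[idx];
			pages += managed[idx];
		}
	}

	for (idx = 0; idx < 3; idx++)
		printf("%-8s reserve for HighMem requests: %6lu pages\n",
		       name[idx], reserve[idx][2]);
	return 0;
}

Because the walk keeps adding each zone's pages on the way down, a lower zone reserves against the combined size of everything above it, not just its immediate neighbour.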
5588 5590
5589 static void __setup_per_zone_wmarks(void) 5591 static void __setup_per_zone_wmarks(void)
5590 { 5592 {
5591 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5593 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5592 unsigned long lowmem_pages = 0; 5594 unsigned long lowmem_pages = 0;
5593 struct zone *zone; 5595 struct zone *zone;
5594 unsigned long flags; 5596 unsigned long flags;
5595 5597
5596 /* Calculate total number of !ZONE_HIGHMEM pages */ 5598 /* Calculate total number of !ZONE_HIGHMEM pages */
5597 for_each_zone(zone) { 5599 for_each_zone(zone) {
5598 if (!is_highmem(zone)) 5600 if (!is_highmem(zone))
5599 lowmem_pages += zone->managed_pages; 5601 lowmem_pages += zone->managed_pages;
5600 } 5602 }
5601 5603
5602 for_each_zone(zone) { 5604 for_each_zone(zone) {
5603 u64 tmp; 5605 u64 tmp;
5604 5606
5605 spin_lock_irqsave(&zone->lock, flags); 5607 spin_lock_irqsave(&zone->lock, flags);
5606 tmp = (u64)pages_min * zone->managed_pages; 5608 tmp = (u64)pages_min * zone->managed_pages;
5607 do_div(tmp, lowmem_pages); 5609 do_div(tmp, lowmem_pages);
5608 if (is_highmem(zone)) { 5610 if (is_highmem(zone)) {
5609 /* 5611 /*
5610 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5612 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5611 * need highmem pages, so cap pages_min to a small 5613 * need highmem pages, so cap pages_min to a small
5612 * value here. 5614 * value here.
5613 * 5615 *
5614 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5616 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5615 * deltas control async page reclaim, and so should 5617 * deltas control async page reclaim, and so should
5616 * not be capped for highmem. 5618 * not be capped for highmem.
5617 */ 5619 */
5618 unsigned long min_pages; 5620 unsigned long min_pages;
5619 5621
5620 min_pages = zone->managed_pages / 1024; 5622 min_pages = zone->managed_pages / 1024;
5621 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 5623 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
5622 zone->watermark[WMARK_MIN] = min_pages; 5624 zone->watermark[WMARK_MIN] = min_pages;
5623 } else { 5625 } else {
5624 /* 5626 /*
5625 * If it's a lowmem zone, reserve a number of pages 5627 * If it's a lowmem zone, reserve a number of pages
5626 * proportionate to the zone's size. 5628 * proportionate to the zone's size.
5627 */ 5629 */
5628 zone->watermark[WMARK_MIN] = tmp; 5630 zone->watermark[WMARK_MIN] = tmp;
5629 } 5631 }
5630 5632
5631 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5633 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5632 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5634 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5633 5635
5634 __mod_zone_page_state(zone, NR_ALLOC_BATCH, 5636 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
5635 high_wmark_pages(zone) - 5637 high_wmark_pages(zone) -
5636 low_wmark_pages(zone) - 5638 low_wmark_pages(zone) -
5637 zone_page_state(zone, NR_ALLOC_BATCH)); 5639 zone_page_state(zone, NR_ALLOC_BATCH));
5638 5640
5639 setup_zone_migrate_reserve(zone); 5641 setup_zone_migrate_reserve(zone);
5640 spin_unlock_irqrestore(&zone->lock, flags); 5642 spin_unlock_irqrestore(&zone->lock, flags);
5641 } 5643 }
5642 5644
5643 /* update totalreserve_pages */ 5645 /* update totalreserve_pages */
5644 calculate_totalreserve_pages(); 5646 calculate_totalreserve_pages();
5645 } 5647 }
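A quick sketch of the resulting min/low/high spacing for a single lowmem zone, using illustrative numbers only (min_free_kbytes = 11584, roughly what the sizing rule further down produces for 8 GB of lowmem, and 4 KiB pages assumed):

#include <stdio.h>

int main(void)
{
	unsigned long min_free_kbytes = 11584;		/* illustrative     */
	unsigned long pages_min = min_free_kbytes >> (12 - 10);
	unsigned long lowmem_pages = 2000000;		/* all !highmem     */
	unsigned long zone_managed = 1500000;		/* this zone's part */

	/* the zone gets pages_min scaled by its share of lowmem */
	unsigned long long tmp = (unsigned long long)pages_min * zone_managed;
	tmp /= lowmem_pages;

	printf("WMARK_MIN  = %llu pages\n", tmp);
	printf("WMARK_LOW  = %llu pages\n", tmp + (tmp >> 2));	/* min + 25% */
	printf("WMARK_HIGH = %llu pages\n", tmp + (tmp >> 1));	/* min + 50% */
	return 0;
}

The low and high marks sit 25% and 50% of the zone's scaled minimum above WMARK_MIN, which is the band kswapd works within.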
5646 5648
5647 /** 5649 /**
5648 * setup_per_zone_wmarks - called when min_free_kbytes changes 5650 * setup_per_zone_wmarks - called when min_free_kbytes changes
5649 * or when memory is hot-{added|removed} 5651 * or when memory is hot-{added|removed}
5650 * 5652 *
5651 * Ensures that the watermark[min,low,high] values for each zone are set 5653 * Ensures that the watermark[min,low,high] values for each zone are set
5652 * correctly with respect to min_free_kbytes. 5654 * correctly with respect to min_free_kbytes.
5653 */ 5655 */
5654 void setup_per_zone_wmarks(void) 5656 void setup_per_zone_wmarks(void)
5655 { 5657 {
5656 mutex_lock(&zonelists_mutex); 5658 mutex_lock(&zonelists_mutex);
5657 __setup_per_zone_wmarks(); 5659 __setup_per_zone_wmarks();
5658 mutex_unlock(&zonelists_mutex); 5660 mutex_unlock(&zonelists_mutex);
5659 } 5661 }
5660 5662
5661 /* 5663 /*
5662 * The inactive anon list should be small enough that the VM never has to 5664 * The inactive anon list should be small enough that the VM never has to
5663 * do too much work, but large enough that each inactive page has a chance 5665 * do too much work, but large enough that each inactive page has a chance
5664 * to be referenced again before it is swapped out. 5666 * to be referenced again before it is swapped out.
5665 * 5667 *
5666 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5668 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5667 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5669 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5668 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5670 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5669 * the anonymous pages are kept on the inactive list. 5671 * the anonymous pages are kept on the inactive list.
5670 * 5672 *
5671 * total target max 5673 * total target max
5672 * memory ratio inactive anon 5674 * memory ratio inactive anon
5673 * ------------------------------------- 5675 * -------------------------------------
5674 * 10MB 1 5MB 5676 * 10MB 1 5MB
5675 * 100MB 1 50MB 5677 * 100MB 1 50MB
5676 * 1GB 3 250MB 5678 * 1GB 3 250MB
5677 * 10GB 10 0.9GB 5679 * 10GB 10 0.9GB
5678 * 100GB 31 3GB 5680 * 100GB 31 3GB
5679 * 1TB 101 10GB 5681 * 1TB 101 10GB
5680 * 10TB 320 32GB 5682 * 10TB 320 32GB
5681 */ 5683 */
5682 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5684 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5683 { 5685 {
5684 unsigned int gb, ratio; 5686 unsigned int gb, ratio;
5685 5687
5686 /* Zone size in gigabytes */ 5688 /* Zone size in gigabytes */
5687 gb = zone->managed_pages >> (30 - PAGE_SHIFT); 5689 gb = zone->managed_pages >> (30 - PAGE_SHIFT);
5688 if (gb) 5690 if (gb)
5689 ratio = int_sqrt(10 * gb); 5691 ratio = int_sqrt(10 * gb);
5690 else 5692 else
5691 ratio = 1; 5693 ratio = 1;
5692 5694
5693 zone->inactive_ratio = ratio; 5695 zone->inactive_ratio = ratio;
5694 } 5696 }
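The table in the comment above can be reproduced directly from the int_sqrt(10 * gb) formula. A small userspace sketch, where isqrt() is a naive stand-in for the kernel's int_sqrt():

#include <stdio.h>

/* naive integer square root, standing in for int_sqrt() */
static unsigned long isqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	unsigned long gb;

	/* zones below 1 GB would simply use ratio 1, as in the code above */
	for (gb = 1; gb <= 1024; gb <<= 1)
		printf("%5lu GB zone -> inactive_ratio %lu\n",
		       gb, isqrt(10 * gb));
	return 0;
}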
5695 5697
5696 static void __meminit setup_per_zone_inactive_ratio(void) 5698 static void __meminit setup_per_zone_inactive_ratio(void)
5697 { 5699 {
5698 struct zone *zone; 5700 struct zone *zone;
5699 5701
5700 for_each_zone(zone) 5702 for_each_zone(zone)
5701 calculate_zone_inactive_ratio(zone); 5703 calculate_zone_inactive_ratio(zone);
5702 } 5704 }
5703 5705
5704 /* 5706 /*
5705 * Initialise min_free_kbytes. 5707 * Initialise min_free_kbytes.
5706 * 5708 *
5707 * For small machines we want it small (128k min). For large machines 5709 * For small machines we want it small (128k min). For large machines
5708 * we want it large (64MB max). But it is not linear, because network 5710 * we want it large (64MB max). But it is not linear, because network
5709 * bandwidth does not increase linearly with machine size. We use 5711 * bandwidth does not increase linearly with machine size. We use
5710 * 5712 *
5711 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5713 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5712 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5714 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5713 * 5715 *
5714 * which yields 5716 * which yields
5715 * 5717 *
5716 * 16MB: 512k 5718 * 16MB: 512k
5717 * 32MB: 724k 5719 * 32MB: 724k
5718 * 64MB: 1024k 5720 * 64MB: 1024k
5719 * 128MB: 1448k 5721 * 128MB: 1448k
5720 * 256MB: 2048k 5722 * 256MB: 2048k
5721 * 512MB: 2896k 5723 * 512MB: 2896k
5722 * 1024MB: 4096k 5724 * 1024MB: 4096k
5723 * 2048MB: 5792k 5725 * 2048MB: 5792k
5724 * 4096MB: 8192k 5726 * 4096MB: 8192k
5725 * 8192MB: 11584k 5727 * 8192MB: 11584k
5726 * 16384MB: 16384k 5728 * 16384MB: 16384k
5727 */ 5729 */
5728 int __meminit init_per_zone_wmark_min(void) 5730 int __meminit init_per_zone_wmark_min(void)
5729 { 5731 {
5730 unsigned long lowmem_kbytes; 5732 unsigned long lowmem_kbytes;
5731 int new_min_free_kbytes; 5733 int new_min_free_kbytes;
5732 5734
5733 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5735 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5734 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5736 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5735 5737
5736 if (new_min_free_kbytes > user_min_free_kbytes) { 5738 if (new_min_free_kbytes > user_min_free_kbytes) {
5737 min_free_kbytes = new_min_free_kbytes; 5739 min_free_kbytes = new_min_free_kbytes;
5738 if (min_free_kbytes < 128) 5740 if (min_free_kbytes < 128)
5739 min_free_kbytes = 128; 5741 min_free_kbytes = 128;
5740 if (min_free_kbytes > 65536) 5742 if (min_free_kbytes > 65536)
5741 min_free_kbytes = 65536; 5743 min_free_kbytes = 65536;
5742 } else { 5744 } else {
5743 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 5745 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
5744 new_min_free_kbytes, user_min_free_kbytes); 5746 new_min_free_kbytes, user_min_free_kbytes);
5745 } 5747 }
5746 setup_per_zone_wmarks(); 5748 setup_per_zone_wmarks();
5747 refresh_zone_stat_thresholds(); 5749 refresh_zone_stat_thresholds();
5748 setup_per_zone_lowmem_reserve(); 5750 setup_per_zone_lowmem_reserve();
5749 setup_per_zone_inactive_ratio(); 5751 setup_per_zone_inactive_ratio();
5750 return 0; 5752 return 0;
5751 } 5753 }
5752 module_init(init_per_zone_wmark_min) 5754 module_init(init_per_zone_wmark_min)
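The same kind of sketch for the min_free_kbytes sizing rule documented above, including the 128k/64MB clamping; the 4 GB lowmem figure is only an example and isqrt() again stands in for int_sqrt():

#include <stdio.h>

static unsigned long isqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	/* assumption: ~4 GB of freeable lowmem reported in kilobytes */
	unsigned long lowmem_kbytes = 4UL * 1024 * 1024;
	unsigned long min_free_kbytes = isqrt(lowmem_kbytes * 16);

	/* same clamping as above: never below 128k, never above 64 MB */
	if (min_free_kbytes < 128)
		min_free_kbytes = 128;
	if (min_free_kbytes > 65536)
		min_free_kbytes = 65536;

	printf("min_free_kbytes = %lu\n", min_free_kbytes);	/* 8192 */
	return 0;
}

This reproduces the "4096MB: 8192k" row of the table.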
5753 5755
5754 /* 5756 /*
5755 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5757 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5756 * that we can call two helper functions whenever min_free_kbytes 5758 * that we can call two helper functions whenever min_free_kbytes
5757 * changes. 5759 * changes.
5758 */ 5760 */
5759 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5761 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5760 void __user *buffer, size_t *length, loff_t *ppos) 5762 void __user *buffer, size_t *length, loff_t *ppos)
5761 { 5763 {
5762 int rc; 5764 int rc;
5763 5765
5764 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5766 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5765 if (rc) 5767 if (rc)
5766 return rc; 5768 return rc;
5767 5769
5768 if (write) { 5770 if (write) {
5769 user_min_free_kbytes = min_free_kbytes; 5771 user_min_free_kbytes = min_free_kbytes;
5770 setup_per_zone_wmarks(); 5772 setup_per_zone_wmarks();
5771 } 5773 }
5772 return 0; 5774 return 0;
5773 } 5775 }
5774 5776
5775 #ifdef CONFIG_NUMA 5777 #ifdef CONFIG_NUMA
5776 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5778 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5777 void __user *buffer, size_t *length, loff_t *ppos) 5779 void __user *buffer, size_t *length, loff_t *ppos)
5778 { 5780 {
5779 struct zone *zone; 5781 struct zone *zone;
5780 int rc; 5782 int rc;
5781 5783
5782 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5784 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5783 if (rc) 5785 if (rc)
5784 return rc; 5786 return rc;
5785 5787
5786 for_each_zone(zone) 5788 for_each_zone(zone)
5787 zone->min_unmapped_pages = (zone->managed_pages * 5789 zone->min_unmapped_pages = (zone->managed_pages *
5788 sysctl_min_unmapped_ratio) / 100; 5790 sysctl_min_unmapped_ratio) / 100;
5789 return 0; 5791 return 0;
5790 } 5792 }
5791 5793
5792 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5794 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5793 void __user *buffer, size_t *length, loff_t *ppos) 5795 void __user *buffer, size_t *length, loff_t *ppos)
5794 { 5796 {
5795 struct zone *zone; 5797 struct zone *zone;
5796 int rc; 5798 int rc;
5797 5799
5798 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5800 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5799 if (rc) 5801 if (rc)
5800 return rc; 5802 return rc;
5801 5803
5802 for_each_zone(zone) 5804 for_each_zone(zone)
5803 zone->min_slab_pages = (zone->managed_pages * 5805 zone->min_slab_pages = (zone->managed_pages *
5804 sysctl_min_slab_ratio) / 100; 5806 sysctl_min_slab_ratio) / 100;
5805 return 0; 5807 return 0;
5806 } 5808 }
5807 #endif 5809 #endif
5808 5810
5809 /* 5811 /*
5810 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5812 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5811 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5813 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5812 * whenever sysctl_lowmem_reserve_ratio changes. 5814 * whenever sysctl_lowmem_reserve_ratio changes.
5813 * 5815 *
5814 * The reserve ratio has no relation to the minimum 5816 * The reserve ratio has no relation to the minimum
5815 * watermarks. The lowmem reserve ratio only makes sense 5817 * watermarks. The lowmem reserve ratio only makes sense
5816 * as a function of the boot-time zone sizes. 5818 * as a function of the boot-time zone sizes.
5817 */ 5819 */
5818 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5820 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5819 void __user *buffer, size_t *length, loff_t *ppos) 5821 void __user *buffer, size_t *length, loff_t *ppos)
5820 { 5822 {
5821 proc_dointvec_minmax(table, write, buffer, length, ppos); 5823 proc_dointvec_minmax(table, write, buffer, length, ppos);
5822 setup_per_zone_lowmem_reserve(); 5824 setup_per_zone_lowmem_reserve();
5823 return 0; 5825 return 0;
5824 } 5826 }
5825 5827
5826 /* 5828 /*
5827 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5829 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5828 * cpu. It is the fraction of total pages in each zone that a hot per cpu 5830 * cpu. It is the fraction of total pages in each zone that a hot per cpu
5829 * pagelist can have before it gets flushed back to the buddy allocator. 5831 * pagelist can have before it gets flushed back to the buddy allocator.
5830 */ 5832 */
5831 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5833 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5832 void __user *buffer, size_t *length, loff_t *ppos) 5834 void __user *buffer, size_t *length, loff_t *ppos)
5833 { 5835 {
5834 struct zone *zone; 5836 struct zone *zone;
5835 int old_percpu_pagelist_fraction; 5837 int old_percpu_pagelist_fraction;
5836 int ret; 5838 int ret;
5837 5839
5838 mutex_lock(&pcp_batch_high_lock); 5840 mutex_lock(&pcp_batch_high_lock);
5839 old_percpu_pagelist_fraction = percpu_pagelist_fraction; 5841 old_percpu_pagelist_fraction = percpu_pagelist_fraction;
5840 5842
5841 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5843 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5842 if (!write || ret < 0) 5844 if (!write || ret < 0)
5843 goto out; 5845 goto out;
5844 5846
5845 /* Sanity checking to avoid pcp imbalance */ 5847 /* Sanity checking to avoid pcp imbalance */
5846 if (percpu_pagelist_fraction && 5848 if (percpu_pagelist_fraction &&
5847 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { 5849 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
5848 percpu_pagelist_fraction = old_percpu_pagelist_fraction; 5850 percpu_pagelist_fraction = old_percpu_pagelist_fraction;
5849 ret = -EINVAL; 5851 ret = -EINVAL;
5850 goto out; 5852 goto out;
5851 } 5853 }
5852 5854
5853 /* No change? */ 5855 /* No change? */
5854 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) 5856 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
5855 goto out; 5857 goto out;
5856 5858
5857 for_each_populated_zone(zone) { 5859 for_each_populated_zone(zone) {
5858 unsigned int cpu; 5860 unsigned int cpu;
5859 5861
5860 for_each_possible_cpu(cpu) 5862 for_each_possible_cpu(cpu)
5861 pageset_set_high_and_batch(zone, 5863 pageset_set_high_and_batch(zone,
5862 per_cpu_ptr(zone->pageset, cpu)); 5864 per_cpu_ptr(zone->pageset, cpu));
5863 } 5865 }
5864 out: 5866 out:
5865 mutex_unlock(&pcp_batch_high_lock); 5867 mutex_unlock(&pcp_batch_high_lock);
5866 return ret; 5868 return ret;
5867 } 5869 }
5868 5870
5869 int hashdist = HASHDIST_DEFAULT; 5871 int hashdist = HASHDIST_DEFAULT;
5870 5872
5871 #ifdef CONFIG_NUMA 5873 #ifdef CONFIG_NUMA
5872 static int __init set_hashdist(char *str) 5874 static int __init set_hashdist(char *str)
5873 { 5875 {
5874 if (!str) 5876 if (!str)
5875 return 0; 5877 return 0;
5876 hashdist = simple_strtoul(str, &str, 0); 5878 hashdist = simple_strtoul(str, &str, 0);
5877 return 1; 5879 return 1;
5878 } 5880 }
5879 __setup("hashdist=", set_hashdist); 5881 __setup("hashdist=", set_hashdist);
5880 #endif 5882 #endif
5881 5883
5882 /* 5884 /*
5883 * allocate a large system hash table from bootmem 5885 * allocate a large system hash table from bootmem
5884 * - it is assumed that the hash table must contain an exact power-of-2 5886 * - it is assumed that the hash table must contain an exact power-of-2
5885 * quantity of entries 5887 * quantity of entries
5886 * - limit is the number of hash buckets, not the total allocation size 5888 * - limit is the number of hash buckets, not the total allocation size
5887 */ 5889 */
5888 void *__init alloc_large_system_hash(const char *tablename, 5890 void *__init alloc_large_system_hash(const char *tablename,
5889 unsigned long bucketsize, 5891 unsigned long bucketsize,
5890 unsigned long numentries, 5892 unsigned long numentries,
5891 int scale, 5893 int scale,
5892 int flags, 5894 int flags,
5893 unsigned int *_hash_shift, 5895 unsigned int *_hash_shift,
5894 unsigned int *_hash_mask, 5896 unsigned int *_hash_mask,
5895 unsigned long low_limit, 5897 unsigned long low_limit,
5896 unsigned long high_limit) 5898 unsigned long high_limit)
5897 { 5899 {
5898 unsigned long long max = high_limit; 5900 unsigned long long max = high_limit;
5899 unsigned long log2qty, size; 5901 unsigned long log2qty, size;
5900 void *table = NULL; 5902 void *table = NULL;
5901 5903
5902 /* allow the kernel cmdline to have a say */ 5904 /* allow the kernel cmdline to have a say */
5903 if (!numentries) { 5905 if (!numentries) {
5904 /* round applicable memory size up to nearest megabyte */ 5906 /* round applicable memory size up to nearest megabyte */
5905 numentries = nr_kernel_pages; 5907 numentries = nr_kernel_pages;
5906 5908
5907 /* It isn't necessary when PAGE_SIZE >= 1MB */ 5909 /* It isn't necessary when PAGE_SIZE >= 1MB */
5908 if (PAGE_SHIFT < 20) 5910 if (PAGE_SHIFT < 20)
5909 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 5911 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
5910 5912
5911 /* limit to 1 bucket per 2^scale bytes of low memory */ 5913 /* limit to 1 bucket per 2^scale bytes of low memory */
5912 if (scale > PAGE_SHIFT) 5914 if (scale > PAGE_SHIFT)
5913 numentries >>= (scale - PAGE_SHIFT); 5915 numentries >>= (scale - PAGE_SHIFT);
5914 else 5916 else
5915 numentries <<= (PAGE_SHIFT - scale); 5917 numentries <<= (PAGE_SHIFT - scale);
5916 5918
5917 /* Make sure we've got at least a 0-order allocation.. */ 5919 /* Make sure we've got at least a 0-order allocation.. */
5918 if (unlikely(flags & HASH_SMALL)) { 5920 if (unlikely(flags & HASH_SMALL)) {
5919 /* Makes no sense without HASH_EARLY */ 5921 /* Makes no sense without HASH_EARLY */
5920 WARN_ON(!(flags & HASH_EARLY)); 5922 WARN_ON(!(flags & HASH_EARLY));
5921 if (!(numentries >> *_hash_shift)) { 5923 if (!(numentries >> *_hash_shift)) {
5922 numentries = 1UL << *_hash_shift; 5924 numentries = 1UL << *_hash_shift;
5923 BUG_ON(!numentries); 5925 BUG_ON(!numentries);
5924 } 5926 }
5925 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5927 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5926 numentries = PAGE_SIZE / bucketsize; 5928 numentries = PAGE_SIZE / bucketsize;
5927 } 5929 }
5928 numentries = roundup_pow_of_two(numentries); 5930 numentries = roundup_pow_of_two(numentries);
5929 5931
5930 /* limit allocation size to 1/16 total memory by default */ 5932 /* limit allocation size to 1/16 total memory by default */
5931 if (max == 0) { 5933 if (max == 0) {
5932 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5934 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5933 do_div(max, bucketsize); 5935 do_div(max, bucketsize);
5934 } 5936 }
5935 max = min(max, 0x80000000ULL); 5937 max = min(max, 0x80000000ULL);
5936 5938
5937 if (numentries < low_limit) 5939 if (numentries < low_limit)
5938 numentries = low_limit; 5940 numentries = low_limit;
5939 if (numentries > max) 5941 if (numentries > max)
5940 numentries = max; 5942 numentries = max;
5941 5943
5942 log2qty = ilog2(numentries); 5944 log2qty = ilog2(numentries);
5943 5945
5944 do { 5946 do {
5945 size = bucketsize << log2qty; 5947 size = bucketsize << log2qty;
5946 if (flags & HASH_EARLY) 5948 if (flags & HASH_EARLY)
5947 table = alloc_bootmem_nopanic(size); 5949 table = alloc_bootmem_nopanic(size);
5948 else if (hashdist) 5950 else if (hashdist)
5949 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5951 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5950 else { 5952 else {
5951 /* 5953 /*
5952 * If bucketsize is not a power of two, we may need to 5954 * If bucketsize is not a power of two, we may need to
5953 * free some pages at the end of the hash table, which 5955 * free some pages at the end of the hash table, which
5954 * alloc_pages_exact() does automatically. 5956 * alloc_pages_exact() does automatically.
5955 */ 5957 */
5956 if (get_order(size) < MAX_ORDER) { 5958 if (get_order(size) < MAX_ORDER) {
5957 table = alloc_pages_exact(size, GFP_ATOMIC); 5959 table = alloc_pages_exact(size, GFP_ATOMIC);
5958 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5960 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5959 } 5961 }
5960 } 5962 }
5961 } while (!table && size > PAGE_SIZE && --log2qty); 5963 } while (!table && size > PAGE_SIZE && --log2qty);
5962 5964
5963 if (!table) 5965 if (!table)
5964 panic("Failed to allocate %s hash table\n", tablename); 5966 panic("Failed to allocate %s hash table\n", tablename);
5965 5967
5966 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5968 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5967 tablename, 5969 tablename,
5968 (1UL << log2qty), 5970 (1UL << log2qty),
5969 ilog2(size) - PAGE_SHIFT, 5971 ilog2(size) - PAGE_SHIFT,
5970 size); 5972 size);
5971 5973
5972 if (_hash_shift) 5974 if (_hash_shift)
5973 *_hash_shift = log2qty; 5975 *_hash_shift = log2qty;
5974 if (_hash_mask) 5976 if (_hash_mask)
5975 *_hash_mask = (1 << log2qty) - 1; 5977 *_hash_mask = (1 << log2qty) - 1;
5976 5978
5977 return table; 5979 return table;
5978 } 5980 }
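To make the sizing steps concrete, here is a userspace sketch with made-up caller parameters (4 GB of RAM with 4 KiB pages, scale = 17, i.e. one bucket per 128 KiB of low memory, 8-byte buckets). It only mirrors the scale/round-up/log2 arithmetic, not the memory rounding, limits or the bootmem/vmalloc fallbacks above.

#include <stdio.h>

int main(void)
{
	unsigned long nr_kernel_pages = 1UL << 20;	/* 4 GB / 4 KiB */
	unsigned int page_shift = 12, scale = 17;	/* assumptions  */
	unsigned long bucketsize = 8;

	/* one bucket per 2^scale bytes of low memory */
	unsigned long numentries = nr_kernel_pages >> (scale - page_shift);
	unsigned long log2qty = 0;

	/* round up to a power of two, as the table requires */
	while ((1UL << log2qty) < numentries)
		log2qty++;

	printf("entries: %lu, mask: %#lx, table size: %lu bytes\n",
	       1UL << log2qty, (1UL << log2qty) - 1, bucketsize << log2qty);
	return 0;
}

The returned *_hash_mask is simply (1 << log2qty) - 1, so callers index the table with hash & mask.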
5979 5981
5980 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5982 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5981 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5983 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5982 unsigned long pfn) 5984 unsigned long pfn)
5983 { 5985 {
5984 #ifdef CONFIG_SPARSEMEM 5986 #ifdef CONFIG_SPARSEMEM
5985 return __pfn_to_section(pfn)->pageblock_flags; 5987 return __pfn_to_section(pfn)->pageblock_flags;
5986 #else 5988 #else
5987 return zone->pageblock_flags; 5989 return zone->pageblock_flags;
5988 #endif /* CONFIG_SPARSEMEM */ 5990 #endif /* CONFIG_SPARSEMEM */
5989 } 5991 }
5990 5992
5991 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 5993 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5992 { 5994 {
5993 #ifdef CONFIG_SPARSEMEM 5995 #ifdef CONFIG_SPARSEMEM
5994 pfn &= (PAGES_PER_SECTION-1); 5996 pfn &= (PAGES_PER_SECTION-1);
5995 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5997 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5996 #else 5998 #else
5997 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); 5999 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
5998 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 6000 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5999 #endif /* CONFIG_SPARSEMEM */ 6001 #endif /* CONFIG_SPARSEMEM */
6000 } 6002 }
6001 6003
6002 /** 6004 /**
6003 * get_pageblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages 6005 * get_pageblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
6004 * @page: The page within the block of interest 6006 * @page: The page within the block of interest
6005 * @end_bitidx: The last bit of interest 6007 * @end_bitidx: The last bit of interest
6006 * @mask: mask of bits that the caller is interested in 6008 * @mask: mask of bits that the caller is interested in
6007 * returns pageblock_bits flags 6009 * returns pageblock_bits flags
6008 */ 6010 */
6009 unsigned long get_pageblock_flags_mask(struct page *page, 6011 unsigned long get_pageblock_flags_mask(struct page *page,
6010 unsigned long end_bitidx, 6012 unsigned long end_bitidx,
6011 unsigned long mask) 6013 unsigned long mask)
6012 { 6014 {
6013 struct zone *zone; 6015 struct zone *zone;
6014 unsigned long *bitmap; 6016 unsigned long *bitmap;
6015 unsigned long pfn, bitidx, word_bitidx; 6017 unsigned long pfn, bitidx, word_bitidx;
6016 unsigned long word; 6018 unsigned long word;
6017 6019
6018 zone = page_zone(page); 6020 zone = page_zone(page);
6019 pfn = page_to_pfn(page); 6021 pfn = page_to_pfn(page);
6020 bitmap = get_pageblock_bitmap(zone, pfn); 6022 bitmap = get_pageblock_bitmap(zone, pfn);
6021 bitidx = pfn_to_bitidx(zone, pfn); 6023 bitidx = pfn_to_bitidx(zone, pfn);
6022 word_bitidx = bitidx / BITS_PER_LONG; 6024 word_bitidx = bitidx / BITS_PER_LONG;
6023 bitidx &= (BITS_PER_LONG-1); 6025 bitidx &= (BITS_PER_LONG-1);
6024 6026
6025 word = bitmap[word_bitidx]; 6027 word = bitmap[word_bitidx];
6026 bitidx += end_bitidx; 6028 bitidx += end_bitidx;
6027 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; 6029 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
6028 } 6030 }
6029 6031
6030 /** 6032 /**
6031 * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages 6033 * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
6032 * @page: The page within the block of interest 6034 * @page: The page within the block of interest
6033 * @mask: mask of bits that the caller is interested in 6035 * @mask: mask of bits that the caller is interested in
6034 * @end_bitidx: The last bit of interest 6036 * @end_bitidx: The last bit of interest
6035 * @flags: The flags to set 6037 * @flags: The flags to set
6036 */ 6038 */
6037 void set_pageblock_flags_mask(struct page *page, unsigned long flags, 6039 void set_pageblock_flags_mask(struct page *page, unsigned long flags,
6038 unsigned long end_bitidx, 6040 unsigned long end_bitidx,
6039 unsigned long mask) 6041 unsigned long mask)
6040 { 6042 {
6041 struct zone *zone; 6043 struct zone *zone;
6042 unsigned long *bitmap; 6044 unsigned long *bitmap;
6043 unsigned long pfn, bitidx, word_bitidx; 6045 unsigned long pfn, bitidx, word_bitidx;
6044 unsigned long old_word, word; 6046 unsigned long old_word, word;
6045 6047
6046 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 6048 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
6047 6049
6048 zone = page_zone(page); 6050 zone = page_zone(page);
6049 pfn = page_to_pfn(page); 6051 pfn = page_to_pfn(page);
6050 bitmap = get_pageblock_bitmap(zone, pfn); 6052 bitmap = get_pageblock_bitmap(zone, pfn);
6051 bitidx = pfn_to_bitidx(zone, pfn); 6053 bitidx = pfn_to_bitidx(zone, pfn);
6052 word_bitidx = bitidx / BITS_PER_LONG; 6054 word_bitidx = bitidx / BITS_PER_LONG;
6053 bitidx &= (BITS_PER_LONG-1); 6055 bitidx &= (BITS_PER_LONG-1);
6054 6056
6055 VM_BUG_ON(!zone_spans_pfn(zone, pfn)); 6057 VM_BUG_ON(!zone_spans_pfn(zone, pfn));
6056 6058
6057 bitidx += end_bitidx; 6059 bitidx += end_bitidx;
6058 mask <<= (BITS_PER_LONG - bitidx - 1); 6060 mask <<= (BITS_PER_LONG - bitidx - 1);
6059 flags <<= (BITS_PER_LONG - bitidx - 1); 6061 flags <<= (BITS_PER_LONG - bitidx - 1);
6060 6062
6061 word = ACCESS_ONCE(bitmap[word_bitidx]); 6063 word = ACCESS_ONCE(bitmap[word_bitidx]);
6062 for (;;) { 6064 for (;;) {
6063 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 6065 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6064 if (word == old_word) 6066 if (word == old_word)
6065 break; 6067 break;
6066 word = old_word; 6068 word = old_word;
6067 } 6069 }
6068 } 6070 }
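The update loop above is a plain compare-and-swap retry pattern. A userspace sketch of the same idea, using the GCC/Clang __sync builtin as a stand-in for the kernel's cmpxchg(); the word, field width and shift are arbitrary and set_field() is invented for the example.

#include <stdio.h>

static unsigned long bitmap_word = 0xf0f0f0f0UL;	/* shared word */

/* replace the masked field inside bitmap_word without taking a lock */
static void set_field(unsigned long mask, unsigned long flags, int shift)
{
	unsigned long old_word, word;

	mask  <<= shift;
	flags <<= shift;

	word = bitmap_word;
	for (;;) {
		old_word = __sync_val_compare_and_swap(&bitmap_word, word,
						       (word & ~mask) | flags);
		if (old_word == word)	/* nobody raced with us, done */
			break;
		word = old_word;	/* retry against the fresh value */
	}
}

int main(void)
{
	set_field(0x7, 0x5, 8);		/* write 0b101 into bits 8..10 */
	printf("word is now %#lx\n", bitmap_word);
	return 0;
}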
6069 6071
6070 /* 6072 /*
6071 * This function checks whether the pageblock includes unmovable pages or not. 6073 * This function checks whether the pageblock includes unmovable pages or not.
6072 * If @count is not zero, it is okay to include up to @count unmovable pages. 6074 * If @count is not zero, it is okay to include up to @count unmovable pages.
6073 * 6075 *
6074 * A PageLRU check without isolation or lru_lock could race, so a 6076 * A PageLRU check without isolation or lru_lock could race, so a
6075 * MIGRATE_MOVABLE block might include unmovable pages. This means you 6077 * MIGRATE_MOVABLE block might include unmovable pages. This means you
6076 * can't expect this function to be exact. 6078 * can't expect this function to be exact.
6077 */ 6079 */
6078 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 6080 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6079 bool skip_hwpoisoned_pages) 6081 bool skip_hwpoisoned_pages)
6080 { 6082 {
6081 unsigned long pfn, iter, found; 6083 unsigned long pfn, iter, found;
6082 int mt; 6084 int mt;
6083 6085
6084 /* 6086 /*
6085 * To avoid noisy data, lru_add_drain_all() should be called first. 6087 * To avoid noisy data, lru_add_drain_all() should be called first.
6086 * If the zone is ZONE_MOVABLE, it never contains unmovable pages. 6088 * If the zone is ZONE_MOVABLE, it never contains unmovable pages.
6087 */ 6089 */
6088 if (zone_idx(zone) == ZONE_MOVABLE) 6090 if (zone_idx(zone) == ZONE_MOVABLE)
6089 return false; 6091 return false;
6090 mt = get_pageblock_migratetype(page); 6092 mt = get_pageblock_migratetype(page);
6091 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 6093 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
6092 return false; 6094 return false;
6093 6095
6094 pfn = page_to_pfn(page); 6096 pfn = page_to_pfn(page);
6095 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 6097 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
6096 unsigned long check = pfn + iter; 6098 unsigned long check = pfn + iter;
6097 6099
6098 if (!pfn_valid_within(check)) 6100 if (!pfn_valid_within(check))
6099 continue; 6101 continue;
6100 6102
6101 page = pfn_to_page(check); 6103 page = pfn_to_page(check);
6102 6104
6103 /* 6105 /*
6104 * Hugepages are not in LRU lists, but they're movable. 6106 * Hugepages are not in LRU lists, but they're movable.
6105 * We need not scan over tail pages because we don't 6107 * We need not scan over tail pages because we don't
6106 * handle each tail page individually in migration. 6108 * handle each tail page individually in migration.
6107 */ 6109 */
6108 if (PageHuge(page)) { 6110 if (PageHuge(page)) {
6109 iter = round_up(iter + 1, 1<<compound_order(page)) - 1; 6111 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
6110 continue; 6112 continue;
6111 } 6113 }
6112 6114
6113 /* 6115 /*
6114 * We can't use page_count() without pinning the page 6116 * We can't use page_count() without pinning the page
6115 * because another CPU can free the compound page. 6117 * because another CPU can free the compound page.
6116 * This check already skips compound tails of THP 6118 * This check already skips compound tails of THP
6117 * because their page->_count is zero at all times. 6119 * because their page->_count is zero at all times.
6118 */ 6120 */
6119 if (!atomic_read(&page->_count)) { 6121 if (!atomic_read(&page->_count)) {
6120 if (PageBuddy(page)) 6122 if (PageBuddy(page))
6121 iter += (1 << page_order(page)) - 1; 6123 iter += (1 << page_order(page)) - 1;
6122 continue; 6124 continue;
6123 } 6125 }
6124 6126
6125 /* 6127 /*
6126 * The HWPoisoned page may not be in the buddy system, and 6128 * The HWPoisoned page may not be in the buddy system, and
6127 * its page_count() is not 0. 6129 * its page_count() is not 0.
6128 */ 6130 */
6129 if (skip_hwpoisoned_pages && PageHWPoison(page)) 6131 if (skip_hwpoisoned_pages && PageHWPoison(page))
6130 continue; 6132 continue;
6131 6133
6132 if (!PageLRU(page)) 6134 if (!PageLRU(page))
6133 found++; 6135 found++;
6134 /* 6136 /*
6135 * If there are RECLAIMABLE pages, we need to check them. 6137 * If there are RECLAIMABLE pages, we need to check them.
6136 * But for now, memory offlining itself doesn't call shrink_slab(), 6138 * But for now, memory offlining itself doesn't call shrink_slab(),
6137 * and this still needs to be fixed. 6139 * and this still needs to be fixed.
6138 */ 6140 */
6139 /* 6141 /*
6140 * If the page is not RAM, page_count() should be 0 and we 6142 * If the page is not RAM, page_count() should be 0 and we
6141 * don't need further checks. This is a _used_, non-movable page. 6143 * don't need further checks. This is a _used_, non-movable page.
6142 * 6144 *
6143 * The problematic thing here is PG_reserved pages. PG_reserved 6145 * The problematic thing here is PG_reserved pages. PG_reserved
6144 * is set on both memory hole pages and _used_ kernel 6146 * is set on both memory hole pages and _used_ kernel
6145 * pages at boot. 6147 * pages at boot.
6146 */ 6148 */
6147 if (found > count) 6149 if (found > count)
6148 return true; 6150 return true;
6149 } 6151 }
6150 return false; 6152 return false;
6151 } 6153 }
6152 6154
6153 bool is_pageblock_removable_nolock(struct page *page) 6155 bool is_pageblock_removable_nolock(struct page *page)
6154 { 6156 {
6155 struct zone *zone; 6157 struct zone *zone;
6156 unsigned long pfn; 6158 unsigned long pfn;
6157 6159
6158 /* 6160 /*
6159 * We have to be careful here because we are iterating over memory 6161 * We have to be careful here because we are iterating over memory
6160 * sections which are not zone aware so we might end up outside of 6162 * sections which are not zone aware so we might end up outside of
6161 * the zone but still within the section. 6163 * the zone but still within the section.
6162 * We also have to be careful about the node. If the node is offline, 6164 * We also have to be careful about the node. If the node is offline,
6163 * its NODE_DATA will be NULL - see page_zone. 6165 * its NODE_DATA will be NULL - see page_zone.
6164 */ 6166 */
6165 if (!node_online(page_to_nid(page))) 6167 if (!node_online(page_to_nid(page)))
6166 return false; 6168 return false;
6167 6169
6168 zone = page_zone(page); 6170 zone = page_zone(page);
6169 pfn = page_to_pfn(page); 6171 pfn = page_to_pfn(page);
6170 if (!zone_spans_pfn(zone, pfn)) 6172 if (!zone_spans_pfn(zone, pfn))
6171 return false; 6173 return false;
6172 6174
6173 return !has_unmovable_pages(zone, page, 0, true); 6175 return !has_unmovable_pages(zone, page, 0, true);
6174 } 6176 }
6175 6177
6176 #ifdef CONFIG_CMA 6178 #ifdef CONFIG_CMA
6177 6179
6178 static unsigned long pfn_max_align_down(unsigned long pfn) 6180 static unsigned long pfn_max_align_down(unsigned long pfn)
6179 { 6181 {
6180 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 6182 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
6181 pageblock_nr_pages) - 1); 6183 pageblock_nr_pages) - 1);
6182 } 6184 }
6183 6185
6184 static unsigned long pfn_max_align_up(unsigned long pfn) 6186 static unsigned long pfn_max_align_up(unsigned long pfn)
6185 { 6187 {
6186 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 6188 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
6187 pageblock_nr_pages)); 6189 pageblock_nr_pages));
6188 } 6190 }
6189 6191
6190 /* [start, end) must belong to a single zone. */ 6192 /* [start, end) must belong to a single zone. */
6191 static int __alloc_contig_migrate_range(struct compact_control *cc, 6193 static int __alloc_contig_migrate_range(struct compact_control *cc,
6192 unsigned long start, unsigned long end) 6194 unsigned long start, unsigned long end)
6193 { 6195 {
6194 /* This function is based on compact_zone() from compaction.c. */ 6196 /* This function is based on compact_zone() from compaction.c. */
6195 unsigned long nr_reclaimed; 6197 unsigned long nr_reclaimed;
6196 unsigned long pfn = start; 6198 unsigned long pfn = start;
6197 unsigned int tries = 0; 6199 unsigned int tries = 0;
6198 int ret = 0; 6200 int ret = 0;
6199 6201
6200 migrate_prep(); 6202 migrate_prep();
6201 6203
6202 while (pfn < end || !list_empty(&cc->migratepages)) { 6204 while (pfn < end || !list_empty(&cc->migratepages)) {
6203 if (fatal_signal_pending(current)) { 6205 if (fatal_signal_pending(current)) {
6204 ret = -EINTR; 6206 ret = -EINTR;
6205 break; 6207 break;
6206 } 6208 }
6207 6209
6208 if (list_empty(&cc->migratepages)) { 6210 if (list_empty(&cc->migratepages)) {
6209 cc->nr_migratepages = 0; 6211 cc->nr_migratepages = 0;
6210 pfn = isolate_migratepages_range(cc->zone, cc, 6212 pfn = isolate_migratepages_range(cc->zone, cc,
6211 pfn, end, true); 6213 pfn, end, true);
6212 if (!pfn) { 6214 if (!pfn) {
6213 ret = -EINTR; 6215 ret = -EINTR;
6214 break; 6216 break;
6215 } 6217 }
6216 tries = 0; 6218 tries = 0;
6217 } else if (++tries == 5) { 6219 } else if (++tries == 5) {
6218 ret = ret < 0 ? ret : -EBUSY; 6220 ret = ret < 0 ? ret : -EBUSY;
6219 break; 6221 break;
6220 } 6222 }
6221 6223
6222 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 6224 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
6223 &cc->migratepages); 6225 &cc->migratepages);
6224 cc->nr_migratepages -= nr_reclaimed; 6226 cc->nr_migratepages -= nr_reclaimed;
6225 6227
6226 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 6228 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
6227 NULL, 0, cc->mode, MR_CMA); 6229 NULL, 0, cc->mode, MR_CMA);
6228 } 6230 }
6229 if (ret < 0) { 6231 if (ret < 0) {
6230 putback_movable_pages(&cc->migratepages); 6232 putback_movable_pages(&cc->migratepages);
6231 return ret; 6233 return ret;
6232 } 6234 }
6233 return 0; 6235 return 0;
6234 } 6236 }
6235 6237
6236 /** 6238 /**
6237 * alloc_contig_range() -- tries to allocate given range of pages 6239 * alloc_contig_range() -- tries to allocate given range of pages
6238 * @start: start PFN to allocate 6240 * @start: start PFN to allocate
6239 * @end: one-past-the-last PFN to allocate 6241 * @end: one-past-the-last PFN to allocate
6240 * @migratetype: migratetype of the underlying pageblocks (either 6242 * @migratetype: migratetype of the underlying pageblocks (either
6241 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 6243 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
6242 * in range must have the same migratetype and it must 6244 * in range must have the same migratetype and it must
6243 * be either of the two. 6245 * be either of the two.
6244 * 6246 *
6245 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 6247 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
6246 * aligned, however it's the caller's responsibility to guarantee that 6248 * aligned, however it's the caller's responsibility to guarantee that
6247 * we are the only thread that changes migrate type of pageblocks the 6249 * we are the only thread that changes migrate type of pageblocks the
6248 * pages fall in. 6250 * pages fall in.
6249 * 6251 *
6250 * The PFN range must belong to a single zone. 6252 * The PFN range must belong to a single zone.
6251 * 6253 *
6252 * Returns zero on success or negative error code. On success all 6254 * Returns zero on success or negative error code. On success all
6253 * pages whose PFN is in [start, end) are allocated for the caller and 6255 * pages whose PFN is in [start, end) are allocated for the caller and
6254 * need to be freed with free_contig_range(). 6256 * need to be freed with free_contig_range().
6255 */ 6257 */
6256 int alloc_contig_range(unsigned long start, unsigned long end, 6258 int alloc_contig_range(unsigned long start, unsigned long end,
6257 unsigned migratetype) 6259 unsigned migratetype)
6258 { 6260 {
6259 unsigned long outer_start, outer_end; 6261 unsigned long outer_start, outer_end;
6260 int ret = 0, order; 6262 int ret = 0, order;
6261 6263
6262 struct compact_control cc = { 6264 struct compact_control cc = {
6263 .nr_migratepages = 0, 6265 .nr_migratepages = 0,
6264 .order = -1, 6266 .order = -1,
6265 .zone = page_zone(pfn_to_page(start)), 6267 .zone = page_zone(pfn_to_page(start)),
6266 .mode = MIGRATE_SYNC, 6268 .mode = MIGRATE_SYNC,
6267 .ignore_skip_hint = true, 6269 .ignore_skip_hint = true,
6268 }; 6270 };
6269 INIT_LIST_HEAD(&cc.migratepages); 6271 INIT_LIST_HEAD(&cc.migratepages);
6270 6272
6271 /* 6273 /*
6272 * What we do here is we mark all pageblocks in range as 6274 * What we do here is we mark all pageblocks in range as
6273 * MIGRATE_ISOLATE. Because pageblock and max order pages may 6275 * MIGRATE_ISOLATE. Because pageblock and max order pages may
6274 * have different sizes, and due to the way the page allocator 6276 * have different sizes, and due to the way the page allocator
6275 * works, we align the range to the biggest of the two pages so 6277 * works, we align the range to the biggest of the two pages so
6276 * that page allocator won't try to merge buddies from 6278 * that page allocator won't try to merge buddies from
6277 * different pageblocks and change MIGRATE_ISOLATE to some 6279 * different pageblocks and change MIGRATE_ISOLATE to some
6278 * other migration type. 6280 * other migration type.
6279 * 6281 *
6280 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 6282 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
6281 * migrate the pages from an unaligned range (i.e. pages that 6283 * migrate the pages from an unaligned range (i.e. pages that
6282 * we are interested in). This will put all the pages in 6284 * we are interested in). This will put all the pages in
6283 * range back to page allocator as MIGRATE_ISOLATE. 6285 * range back to page allocator as MIGRATE_ISOLATE.
6284 * 6286 *
6285 * When this is done, we take the pages in range from page 6287 * When this is done, we take the pages in range from page
6286 * allocator removing them from the buddy system. This way 6288 * allocator removing them from the buddy system. This way
6287 * page allocator will never consider using them. 6289 * page allocator will never consider using them.
6288 * 6290 *
6289 * This lets us mark the pageblocks back as 6291 * This lets us mark the pageblocks back as
6290 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 6292 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
6291 * aligned range but not in the unaligned, original range are 6293 * aligned range but not in the unaligned, original range are
6292 * put back to page allocator so that buddy can use them. 6294 * put back to page allocator so that buddy can use them.
6293 */ 6295 */
6294 6296
6295 ret = start_isolate_page_range(pfn_max_align_down(start), 6297 ret = start_isolate_page_range(pfn_max_align_down(start),
6296 pfn_max_align_up(end), migratetype, 6298 pfn_max_align_up(end), migratetype,
6297 false); 6299 false);
6298 if (ret) 6300 if (ret)
6299 return ret; 6301 return ret;
6300 6302
6301 ret = __alloc_contig_migrate_range(&cc, start, end); 6303 ret = __alloc_contig_migrate_range(&cc, start, end);
6302 if (ret) 6304 if (ret)
6303 goto done; 6305 goto done;
6304 6306
6305 /* 6307 /*
6306 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 6308 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
6307 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 6309 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
6308 * more, all pages in [start, end) are free in page allocator. 6310 * more, all pages in [start, end) are free in page allocator.
6309 * What we are going to do is to allocate all pages from 6311 * What we are going to do is to allocate all pages from
6310 * [start, end) (that is remove them from page allocator). 6312 * [start, end) (that is remove them from page allocator).
6311 * 6313 *
6312 * The only problem is that pages at the beginning and at the 6314 * The only problem is that pages at the beginning and at the
6313 * end of the interesting range may not be aligned with pages that 6315 * end of the interesting range may not be aligned with pages that
6314 * the page allocator holds, i.e. they can be part of higher order 6316 * the page allocator holds, i.e. they can be part of higher order
6315 * pages. Because of this, we reserve the bigger range and 6317 * pages. Because of this, we reserve the bigger range and
6316 * once this is done free the pages we are not interested in. 6318 * once this is done free the pages we are not interested in.
6317 * 6319 *
6318 * We don't have to hold zone->lock here because the pages are 6320 * We don't have to hold zone->lock here because the pages are
6319 * isolated, thus they won't get removed from the buddy system. 6321 * isolated, thus they won't get removed from the buddy system.
6320 */ 6322 */
6321 6323
6322 lru_add_drain_all(); 6324 lru_add_drain_all();
6323 drain_all_pages(); 6325 drain_all_pages();
6324 6326
6325 order = 0; 6327 order = 0;
6326 outer_start = start; 6328 outer_start = start;
6327 while (!PageBuddy(pfn_to_page(outer_start))) { 6329 while (!PageBuddy(pfn_to_page(outer_start))) {
6328 if (++order >= MAX_ORDER) { 6330 if (++order >= MAX_ORDER) {
6329 ret = -EBUSY; 6331 ret = -EBUSY;
6330 goto done; 6332 goto done;
6331 } 6333 }
6332 outer_start &= ~0UL << order; 6334 outer_start &= ~0UL << order;
6333 } 6335 }
6334 6336
6335 /* Make sure the range is really isolated. */ 6337 /* Make sure the range is really isolated. */
6336 if (test_pages_isolated(outer_start, end, false)) { 6338 if (test_pages_isolated(outer_start, end, false)) {
6337 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 6339 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
6338 outer_start, end); 6340 outer_start, end);
6339 ret = -EBUSY; 6341 ret = -EBUSY;
6340 goto done; 6342 goto done;
6341 } 6343 }
6342 6344
6343 6345
6344 /* Grab isolated pages from freelists. */ 6346 /* Grab isolated pages from freelists. */
6345 outer_end = isolate_freepages_range(&cc, outer_start, end); 6347 outer_end = isolate_freepages_range(&cc, outer_start, end);
6346 if (!outer_end) { 6348 if (!outer_end) {
6347 ret = -EBUSY; 6349 ret = -EBUSY;
6348 goto done; 6350 goto done;
6349 } 6351 }
6350 6352
6351 /* Free head and tail (if any) */ 6353 /* Free head and tail (if any) */
6352 if (start != outer_start) 6354 if (start != outer_start)
6353 free_contig_range(outer_start, start - outer_start); 6355 free_contig_range(outer_start, start - outer_start);
6354 if (end != outer_end) 6356 if (end != outer_end)
6355 free_contig_range(end, outer_end - end); 6357 free_contig_range(end, outer_end - end);
6356 6358
6357 done: 6359 done:
6358 undo_isolate_page_range(pfn_max_align_down(start), 6360 undo_isolate_page_range(pfn_max_align_down(start),
6359 pfn_max_align_up(end), migratetype); 6361 pfn_max_align_up(end), migratetype);
6360 return ret; 6362 return ret;
6361 } 6363 }
6362 6364
6363 void free_contig_range(unsigned long pfn, unsigned nr_pages) 6365 void free_contig_range(unsigned long pfn, unsigned nr_pages)
6364 { 6366 {
6365 unsigned int count = 0; 6367 unsigned int count = 0;
6366 6368
6367 for (; nr_pages--; pfn++) { 6369 for (; nr_pages--; pfn++) {
6368 struct page *page = pfn_to_page(pfn); 6370 struct page *page = pfn_to_page(pfn);
6369 6371
6370 count += page_count(page) != 1; 6372 count += page_count(page) != 1;
6371 __free_page(page); 6373 __free_page(page);
6372 } 6374 }
6373 WARN(count != 0, "%d pages are still in use!\n", count); 6375 WARN(count != 0, "%d pages are still in use!\n", count);
6374 } 6376 }
6375 #endif 6377 #endif
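/*
 * Illustrative sketch (not part of the page_alloc.c diff above): how a
 * caller such as a CMA-style allocator might use alloc_contig_range()
 * and free_contig_range() under the contract documented in the kerneldoc.
 * The helper names, the PFN arguments and the assumption that the
 * underlying pageblocks are already MIGRATE_CMA are hypothetical.
 */
static struct page *grab_contig_pages(unsigned long base_pfn,
				      unsigned nr_pages)
{
	/*
	 * All pageblocks in [base_pfn, base_pfn + nr_pages) must share the
	 * MIGRATE_CMA (or MIGRATE_MOVABLE) migratetype and belong to a
	 * single zone, as the kerneldoc requires.
	 */
	int ret = alloc_contig_range(base_pfn, base_pfn + nr_pages,
				     MIGRATE_CMA);

	if (ret)	/* e.g. -EBUSY or -EINTR from the migration step */
		return NULL;
	return pfn_to_page(base_pfn);
}

static void release_contig_pages(struct page *page, unsigned nr_pages)
{
	/* Pages obtained above must be returned via free_contig_range(). */
	free_contig_range(page_to_pfn(page), nr_pages);
}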
6376 6378
6377 #ifdef CONFIG_MEMORY_HOTPLUG 6379 #ifdef CONFIG_MEMORY_HOTPLUG
6378 /* 6380 /*
6379 * The zone indicated has a new number of managed_pages; batch sizes and percpu 6381 * The zone indicated has a new number of managed_pages; batch sizes and percpu
6380 * page high values need to be recalculated. 6382 * page high values need to be recalculated.
6381 */ 6383 */
6382 void __meminit zone_pcp_update(struct zone *zone) 6384 void __meminit zone_pcp_update(struct zone *zone)
6383 { 6385 {
6384 unsigned cpu; 6386 unsigned cpu;
6385 mutex_lock(&pcp_batch_high_lock); 6387 mutex_lock(&pcp_batch_high_lock);
6386 for_each_possible_cpu(cpu) 6388 for_each_possible_cpu(cpu)
6387 pageset_set_high_and_batch(zone, 6389 pageset_set_high_and_batch(zone,
6388 per_cpu_ptr(zone->pageset, cpu)); 6390 per_cpu_ptr(zone->pageset, cpu));
6389 mutex_unlock(&pcp_batch_high_lock); 6391 mutex_unlock(&pcp_batch_high_lock);
6390 } 6392 }
6391 #endif 6393 #endif
6392 6394
6393 void zone_pcp_reset(struct zone *zone) 6395 void zone_pcp_reset(struct zone *zone)
6394 { 6396 {
6395 unsigned long flags; 6397 unsigned long flags;
6396 int cpu; 6398 int cpu;
6397 struct per_cpu_pageset *pset; 6399 struct per_cpu_pageset *pset;
6398 6400
6399 /* avoid races with drain_pages() */ 6401 /* avoid races with drain_pages() */
6400 local_irq_save(flags); 6402 local_irq_save(flags);
6401 if (zone->pageset != &boot_pageset) { 6403 if (zone->pageset != &boot_pageset) {
6402 for_each_online_cpu(cpu) { 6404 for_each_online_cpu(cpu) {
6403 pset = per_cpu_ptr(zone->pageset, cpu); 6405 pset = per_cpu_ptr(zone->pageset, cpu);
6404 drain_zonestat(zone, pset); 6406 drain_zonestat(zone, pset);
6405 } 6407 }
6406 free_percpu(zone->pageset); 6408 free_percpu(zone->pageset);
6407 zone->pageset = &boot_pageset; 6409 zone->pageset = &boot_pageset;
6408 } 6410 }
6409 local_irq_restore(flags); 6411 local_irq_restore(flags);
6410 } 6412 }
6411 6413
6412 #ifdef CONFIG_MEMORY_HOTREMOVE 6414 #ifdef CONFIG_MEMORY_HOTREMOVE
6413 /* 6415 /*
6414 * All pages in the range must be isolated before calling this. 6416 * All pages in the range must be isolated before calling this.
6415 */ 6417 */
6416 void 6418 void
6417 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 6419 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6418 { 6420 {
6419 struct page *page; 6421 struct page *page;
6420 struct zone *zone; 6422 struct zone *zone;
6421 int order, i; 6423 int order, i;
6422 unsigned long pfn; 6424 unsigned long pfn;
6423 unsigned long flags; 6425 unsigned long flags;
6424 /* find the first valid pfn */ 6426 /* find the first valid pfn */
6425 for (pfn = start_pfn; pfn < end_pfn; pfn++) 6427 for (pfn = start_pfn; pfn < end_pfn; pfn++)
6426 if (pfn_valid(pfn)) 6428 if (pfn_valid(pfn))
6427 break; 6429 break;
6428 if (pfn == end_pfn) 6430 if (pfn == end_pfn)
6429 return; 6431 return;
6430 zone = page_zone(pfn_to_page(pfn)); 6432 zone = page_zone(pfn_to_page(pfn));
6431 spin_lock_irqsave(&zone->lock, flags); 6433 spin_lock_irqsave(&zone->lock, flags);
6432 pfn = start_pfn; 6434 pfn = start_pfn;
6433 while (pfn < end_pfn) { 6435 while (pfn < end_pfn) {
6434 if (!pfn_valid(pfn)) { 6436 if (!pfn_valid(pfn)) {
6435 pfn++; 6437 pfn++;
6436 continue; 6438 continue;
6437 } 6439 }
6438 page = pfn_to_page(pfn); 6440 page = pfn_to_page(pfn);
6439 /* 6441 /*
6440 * The HWPoisoned page may not be in the buddy system, and 6442 * The HWPoisoned page may not be in the buddy system, and
6441 * page_count() is not 0. 6443 * page_count() is not 0.
6442 */ 6444 */
6443 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 6445 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6444 pfn++; 6446 pfn++;
6445 SetPageReserved(page); 6447 SetPageReserved(page);
6446 continue; 6448 continue;
6447 } 6449 }
6448 6450
6449 BUG_ON(page_count(page)); 6451 BUG_ON(page_count(page));
6450 BUG_ON(!PageBuddy(page)); 6452 BUG_ON(!PageBuddy(page));
6451 order = page_order(page); 6453 order = page_order(page);
6452 #ifdef CONFIG_DEBUG_VM 6454 #ifdef CONFIG_DEBUG_VM
6453 printk(KERN_INFO "remove from free list %lx %d %lx\n", 6455 printk(KERN_INFO "remove from free list %lx %d %lx\n",
6454 pfn, 1 << order, end_pfn); 6456 pfn, 1 << order, end_pfn);
6455 #endif 6457 #endif
6456 list_del(&page->lru); 6458 list_del(&page->lru);
6457 rmv_page_order(page); 6459 rmv_page_order(page);
6458 zone->free_area[order].nr_free--; 6460 zone->free_area[order].nr_free--;
6459 for (i = 0; i < (1 << order); i++) 6461 for (i = 0; i < (1 << order); i++)
6460 SetPageReserved((page+i)); 6462 SetPageReserved((page+i));
6461 pfn += (1 << order); 6463 pfn += (1 << order);
6462 } 6464 }
6463 spin_unlock_irqrestore(&zone->lock, flags); 6465 spin_unlock_irqrestore(&zone->lock, flags);
6464 } 6466 }
6465 #endif 6467 #endif
6466 6468
6467 #ifdef CONFIG_MEMORY_FAILURE 6469 #ifdef CONFIG_MEMORY_FAILURE
6468 bool is_free_buddy_page(struct page *page) 6470 bool is_free_buddy_page(struct page *page)
6469 { 6471 {
6470 struct zone *zone = page_zone(page); 6472 struct zone *zone = page_zone(page);
6471 unsigned long pfn = page_to_pfn(page); 6473 unsigned long pfn = page_to_pfn(page);
6472 unsigned long flags; 6474 unsigned long flags;
6473 int order; 6475 int order;
6474 6476
6475 spin_lock_irqsave(&zone->lock, flags); 6477 spin_lock_irqsave(&zone->lock, flags);
6476 for (order = 0; order < MAX_ORDER; order++) { 6478 for (order = 0; order < MAX_ORDER; order++) {
6477 struct page *page_head = page - (pfn & ((1 << order) - 1)); 6479 struct page *page_head = page - (pfn & ((1 << order) - 1));
6478 6480
6479 if (PageBuddy(page_head) && page_order(page_head) >= order) 6481 if (PageBuddy(page_head) && page_order(page_head) >= order)
6480 break; 6482 break;
6481 } 6483 }
6482 spin_unlock_irqrestore(&zone->lock, flags); 6484 spin_unlock_irqrestore(&zone->lock, flags);
6483 6485
6484 return order < MAX_ORDER; 6486 return order < MAX_ORDER;
6485 } 6487 }
6486 #endif 6488 #endif
6487 6489
6488 static const struct trace_print_flags pageflag_names[] = { 6490 static const struct trace_print_flags pageflag_names[] = {
6489 {1UL << PG_locked, "locked" }, 6491 {1UL << PG_locked, "locked" },
6490 {1UL << PG_error, "error" }, 6492 {1UL << PG_error, "error" },
6491 {1UL << PG_referenced, "referenced" }, 6493 {1UL << PG_referenced, "referenced" },
6492 {1UL << PG_uptodate, "uptodate" }, 6494 {1UL << PG_uptodate, "uptodate" },
6493 {1UL << PG_dirty, "dirty" }, 6495 {1UL << PG_dirty, "dirty" },
6494 {1UL << PG_lru, "lru" }, 6496 {1UL << PG_lru, "lru" },
6495 {1UL << PG_active, "active" }, 6497 {1UL << PG_active, "active" },
6496 {1UL << PG_slab, "slab" }, 6498 {1UL << PG_slab, "slab" },
6497 {1UL << PG_owner_priv_1, "owner_priv_1" }, 6499 {1UL << PG_owner_priv_1, "owner_priv_1" },
6498 {1UL << PG_arch_1, "arch_1" }, 6500 {1UL << PG_arch_1, "arch_1" },
6499 {1UL << PG_reserved, "reserved" }, 6501 {1UL << PG_reserved, "reserved" },
6500 {1UL << PG_private, "private" }, 6502 {1UL << PG_private, "private" },
6501 {1UL << PG_private_2, "private_2" }, 6503 {1UL << PG_private_2, "private_2" },
6502 {1UL << PG_writeback, "writeback" }, 6504 {1UL << PG_writeback, "writeback" },
6503 #ifdef CONFIG_PAGEFLAGS_EXTENDED 6505 #ifdef CONFIG_PAGEFLAGS_EXTENDED
6504 {1UL << PG_head, "head" }, 6506 {1UL << PG_head, "head" },
6505 {1UL << PG_tail, "tail" }, 6507 {1UL << PG_tail, "tail" },
6506 #else 6508 #else
6507 {1UL << PG_compound, "compound" }, 6509 {1UL << PG_compound, "compound" },
6508 #endif 6510 #endif
6509 {1UL << PG_swapcache, "swapcache" }, 6511 {1UL << PG_swapcache, "swapcache" },
6510 {1UL << PG_mappedtodisk, "mappedtodisk" }, 6512 {1UL << PG_mappedtodisk, "mappedtodisk" },
6511 {1UL << PG_reclaim, "reclaim" }, 6513 {1UL << PG_reclaim, "reclaim" },
6512 {1UL << PG_swapbacked, "swapbacked" }, 6514 {1UL << PG_swapbacked, "swapbacked" },
6513 {1UL << PG_unevictable, "unevictable" }, 6515 {1UL << PG_unevictable, "unevictable" },
6514 #ifdef CONFIG_MMU 6516 #ifdef CONFIG_MMU
6515 {1UL << PG_mlocked, "mlocked" }, 6517 {1UL << PG_mlocked, "mlocked" },
6516 #endif 6518 #endif
6517 #ifdef CONFIG_ARCH_USES_PG_UNCACHED 6519 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
6518 {1UL << PG_uncached, "uncached" }, 6520 {1UL << PG_uncached, "uncached" },
6519 #endif 6521 #endif
6520 #ifdef CONFIG_MEMORY_FAILURE 6522 #ifdef CONFIG_MEMORY_FAILURE
6521 {1UL << PG_hwpoison, "hwpoison" }, 6523 {1UL << PG_hwpoison, "hwpoison" },
6522 #endif 6524 #endif
6523 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6525 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6524 {1UL << PG_compound_lock, "compound_lock" }, 6526 {1UL << PG_compound_lock, "compound_lock" },
6525 #endif 6527 #endif
6526 }; 6528 };
6527 6529
6528 static void dump_page_flags(unsigned long flags) 6530 static void dump_page_flags(unsigned long flags)
6529 { 6531 {
6530 const char *delim = ""; 6532 const char *delim = "";
6531 unsigned long mask; 6533 unsigned long mask;
6532 int i; 6534 int i;
6533 6535
6534 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 6536 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6535 6537
6536 printk(KERN_ALERT "page flags: %#lx(", flags); 6538 printk(KERN_ALERT "page flags: %#lx(", flags);
6537 6539
6538 /* remove zone id */ 6540 /* remove zone id */
6539 flags &= (1UL << NR_PAGEFLAGS) - 1; 6541 flags &= (1UL << NR_PAGEFLAGS) - 1;
6540 6542
6541 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { 6543 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6542 6544
6543 mask = pageflag_names[i].mask; 6545 mask = pageflag_names[i].mask;
6544 if ((flags & mask) != mask) 6546 if ((flags & mask) != mask)
6545 continue; 6547 continue;
6546 6548
6547 flags &= ~mask; 6549 flags &= ~mask;
6548 printk("%s%s", delim, pageflag_names[i].name); 6550 printk("%s%s", delim, pageflag_names[i].name);
6549 delim = "|"; 6551 delim = "|";
6550 } 6552 }
6551 6553
6552 /* check for left over flags */ 6554 /* check for left over flags */
6553 if (flags) 6555 if (flags)
6554 printk("%s%#lx", delim, flags); 6556 printk("%s%#lx", delim, flags);
6555 6557
6556 printk(")\n"); 6558 printk(")\n");
6557 } 6559 }
6558 6560
6559 void dump_page(struct page *page) 6561 void dump_page(struct page *page)
6560 { 6562 {
6561 printk(KERN_ALERT 6563 printk(KERN_ALERT
6562 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6564 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6563 page, atomic_read(&page->_count), page_mapcount(page), 6565 page, atomic_read(&page->_count), page_mapcount(page),
6564 page->mapping, page->index); 6566 page->mapping, page->index);
6565 dump_page_flags(page->flags); 6567 dump_page_flags(page->flags);