Commit 4cd64dcede969cf30f47a6e6ba8e378e74d0790d

Authored by Vlastimil Babka
Committed by Jiri Slaby
1 parent 2ce666c175

mm/page_alloc: prevent MIGRATE_RESERVE pages from being misplaced

commit 5bcc9f86ef09a933255ee66bd899d4601785dad5 upstream.

For MIGRATE_RESERVE pages, it is useful that they do not get misplaced on
the free_list of another migratetype; otherwise they might get allocated
prematurely and e.g. fragment the MIGRATE_RESERVE pageblocks.  While this
cannot be avoided completely when allocating new MIGRATE_RESERVE
pageblocks in the min_free_kbytes sysctl handler, we should prevent the
misplacement where possible.

Currently, the misplacement can happen when a MIGRATE_RESERVE page is
allocated onto a pcplist through rmqueue_bulk() as a fallback for another
desired migratetype, and is then later freed back through
free_pcppages_bulk() without actually being used.  This happens because
free_pcppages_bulk() uses get_freepage_migratetype() to choose the
free_list, while rmqueue_bulk() calls set_freepage_migratetype() with the
*desired* migratetype rather than the page's original MIGRATE_RESERVE
migratetype.

This patch fixes the problem by moving the call to
set_freepage_migratetype() from rmqueue_bulk() down to
__rmqueue_smallest() and __rmqueue_fallback(), where the page's actual
migratetype (i.e. the free_list the page is taken from) is used.  Note
that this migratetype might differ from the pageblock's migratetype due
to freepage stealing decisions.  This is OK, as page stealing never uses
MIGRATE_RESERVE as a fallback, and also takes care to leave all
MIGRATE_CMA pages on the correct freelist.

As an additional benefit, the call to get_pageblock_migratetype() in
rmqueue_bulk() when CMA is enabled can be removed completely.  This
relies on the fact that MIGRATE_CMA pageblocks are created only during
system init, together with the guarantees above.  The related
is_migrate_isolate() check is also unnecessary, as memory isolation has
other means to move pages between freelists and to drain pcp lists
containing pages that should be isolated.  buffered_rmqueue() can also
benefit from calling get_freepage_migratetype() instead of
get_pageblock_migratetype().

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Yong-Taek Lee <ytk.lee@samsung.com>
Reported-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Suggested-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Suggested-by: Mel Gorman <mgorman@suse.de>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: "Wang, Yalin" <Yalin.Wang@sonymobile.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
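
[Editorial sketch, not part of the upstream commit] To make the bookkeeping issue concrete, below is a minimal userspace sketch of the before/after behaviour.  It is not kernel code: every identifier in it (enum values, struct fields, helper functions) is hypothetical, and it only models the migratetype tag that rmqueue_bulk() records on a page and that free_pcppages_bulk() later trusts when picking a free_list.

  /*
   * Illustrative userspace sketch only -- NOT kernel code.  All names are
   * made up.  It models the one piece of state that matters here: the
   * migratetype tag recorded when a page is pulled from a free_list onto
   * a pcplist, and later used to decide which free_list it goes back to.
   */
  #include <stdio.h>

  enum mt { MT_UNMOVABLE, MT_MOVABLE, MT_RESERVE, MT_NR };

  struct fake_page {
  	enum mt freelist_mt;	/* free_list the page was actually taken from */
  	enum mt tagged_mt;	/* what set_freepage_migratetype() recorded */
  };

  /* Stand-in for rmqueue_bulk(): a RESERVE page is used as fallback. */
  static void alloc_to_pcplist(struct fake_page *p, enum mt wanted, int buggy)
  {
  	p->freelist_mt = MT_RESERVE;		/* fallback source list */
  	p->tagged_mt = buggy ? wanted		/* old code: desired type */
  			     : p->freelist_mt;	/* patched: actual type */
  }

  /* Stand-in for free_pcppages_bulk(): it trusts the recorded tag. */
  static enum mt free_from_pcplist(const struct fake_page *p)
  {
  	return p->tagged_mt;
  }

  int main(void)
  {
  	static const char *names[MT_NR] = { "UNMOVABLE", "MOVABLE", "RESERVE" };
  	struct fake_page page;
  	int buggy;

  	for (buggy = 1; buggy >= 0; buggy--) {
  		alloc_to_pcplist(&page, MT_MOVABLE, buggy);
  		printf("%s: RESERVE page freed back to the %s free_list\n",
  		       buggy ? "before patch" : "after patch",
  		       names[free_from_pcplist(&page)]);
  	}
  	return 0;
  }

Compiled and run, the "before patch" case files the borrowed MIGRATE_RESERVE page on the MOVABLE free_list, while the "after patch" case returns it to the RESERVE free_list, which is the point of moving set_freepage_migratetype() down to where the source free_list is actually known.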

Showing 1 changed file with 13 additions and 10 deletions (inline diff)

1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/compaction.h> 54 #include <linux/compaction.h>
55 #include <trace/events/kmem.h> 55 #include <trace/events/kmem.h>
56 #include <linux/ftrace_event.h> 56 #include <linux/ftrace_event.h>
57 #include <linux/memcontrol.h> 57 #include <linux/memcontrol.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/mm_inline.h> 59 #include <linux/mm_inline.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/page-debug-flags.h> 61 #include <linux/page-debug-flags.h>
62 #include <linux/hugetlb.h> 62 #include <linux/hugetlb.h>
63 #include <linux/sched/rt.h> 63 #include <linux/sched/rt.h>
64 64
65 #include <asm/sections.h> 65 #include <asm/sections.h>
66 #include <asm/tlbflush.h> 66 #include <asm/tlbflush.h>
67 #include <asm/div64.h> 67 #include <asm/div64.h>
68 #include "internal.h" 68 #include "internal.h"
69 69
70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
71 static DEFINE_MUTEX(pcp_batch_high_lock); 71 static DEFINE_MUTEX(pcp_batch_high_lock);
72 #define MIN_PERCPU_PAGELIST_FRACTION (8) 72 #define MIN_PERCPU_PAGELIST_FRACTION (8)
73 73
74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
75 DEFINE_PER_CPU(int, numa_node); 75 DEFINE_PER_CPU(int, numa_node);
76 EXPORT_PER_CPU_SYMBOL(numa_node); 76 EXPORT_PER_CPU_SYMBOL(numa_node);
77 #endif 77 #endif
78 78
79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
80 /* 80 /*
81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
84 * defined in <linux/topology.h>. 84 * defined in <linux/topology.h>.
85 */ 85 */
86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
87 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 87 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
88 #endif 88 #endif
89 89
90 /* 90 /*
91 * Array of node states. 91 * Array of node states.
92 */ 92 */
93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
94 [N_POSSIBLE] = NODE_MASK_ALL, 94 [N_POSSIBLE] = NODE_MASK_ALL,
95 [N_ONLINE] = { { [0] = 1UL } }, 95 [N_ONLINE] = { { [0] = 1UL } },
96 #ifndef CONFIG_NUMA 96 #ifndef CONFIG_NUMA
97 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 97 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
98 #ifdef CONFIG_HIGHMEM 98 #ifdef CONFIG_HIGHMEM
99 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 99 [N_HIGH_MEMORY] = { { [0] = 1UL } },
100 #endif 100 #endif
101 #ifdef CONFIG_MOVABLE_NODE 101 #ifdef CONFIG_MOVABLE_NODE
102 [N_MEMORY] = { { [0] = 1UL } }, 102 [N_MEMORY] = { { [0] = 1UL } },
103 #endif 103 #endif
104 [N_CPU] = { { [0] = 1UL } }, 104 [N_CPU] = { { [0] = 1UL } },
105 #endif /* NUMA */ 105 #endif /* NUMA */
106 }; 106 };
107 EXPORT_SYMBOL(node_states); 107 EXPORT_SYMBOL(node_states);
108 108
109 /* Protect totalram_pages and zone->managed_pages */ 109 /* Protect totalram_pages and zone->managed_pages */
110 static DEFINE_SPINLOCK(managed_page_count_lock); 110 static DEFINE_SPINLOCK(managed_page_count_lock);
111 111
112 unsigned long totalram_pages __read_mostly; 112 unsigned long totalram_pages __read_mostly;
113 unsigned long totalreserve_pages __read_mostly; 113 unsigned long totalreserve_pages __read_mostly;
114 /* 114 /*
115 * When calculating the number of globally allowed dirty pages, there 115 * When calculating the number of globally allowed dirty pages, there
116 * is a certain number of per-zone reserves that should not be 116 * is a certain number of per-zone reserves that should not be
117 * considered dirtyable memory. This is the sum of those reserves 117 * considered dirtyable memory. This is the sum of those reserves
118 * over all existing zones that contribute dirtyable memory. 118 * over all existing zones that contribute dirtyable memory.
119 */ 119 */
120 unsigned long dirty_balance_reserve __read_mostly; 120 unsigned long dirty_balance_reserve __read_mostly;
121 121
122 int percpu_pagelist_fraction; 122 int percpu_pagelist_fraction;
123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
124 124
125 #ifdef CONFIG_PM_SLEEP 125 #ifdef CONFIG_PM_SLEEP
126 /* 126 /*
127 * The following functions are used by the suspend/hibernate code to temporarily 127 * The following functions are used by the suspend/hibernate code to temporarily
128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
129 * while devices are suspended. To avoid races with the suspend/hibernate code, 129 * while devices are suspended. To avoid races with the suspend/hibernate code,
130 * they should always be called with pm_mutex held (gfp_allowed_mask also should 130 * they should always be called with pm_mutex held (gfp_allowed_mask also should
131 * only be modified with pm_mutex held, unless the suspend/hibernate code is 131 * only be modified with pm_mutex held, unless the suspend/hibernate code is
132 * guaranteed not to run in parallel with that modification). 132 * guaranteed not to run in parallel with that modification).
133 */ 133 */
134 134
135 static gfp_t saved_gfp_mask; 135 static gfp_t saved_gfp_mask;
136 136
137 void pm_restore_gfp_mask(void) 137 void pm_restore_gfp_mask(void)
138 { 138 {
139 WARN_ON(!mutex_is_locked(&pm_mutex)); 139 WARN_ON(!mutex_is_locked(&pm_mutex));
140 if (saved_gfp_mask) { 140 if (saved_gfp_mask) {
141 gfp_allowed_mask = saved_gfp_mask; 141 gfp_allowed_mask = saved_gfp_mask;
142 saved_gfp_mask = 0; 142 saved_gfp_mask = 0;
143 } 143 }
144 } 144 }
145 145
146 void pm_restrict_gfp_mask(void) 146 void pm_restrict_gfp_mask(void)
147 { 147 {
148 WARN_ON(!mutex_is_locked(&pm_mutex)); 148 WARN_ON(!mutex_is_locked(&pm_mutex));
149 WARN_ON(saved_gfp_mask); 149 WARN_ON(saved_gfp_mask);
150 saved_gfp_mask = gfp_allowed_mask; 150 saved_gfp_mask = gfp_allowed_mask;
151 gfp_allowed_mask &= ~GFP_IOFS; 151 gfp_allowed_mask &= ~GFP_IOFS;
152 } 152 }
153 153
154 bool pm_suspended_storage(void) 154 bool pm_suspended_storage(void)
155 { 155 {
156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
157 return false; 157 return false;
158 return true; 158 return true;
159 } 159 }
160 #endif /* CONFIG_PM_SLEEP */ 160 #endif /* CONFIG_PM_SLEEP */
161 161
162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
163 int pageblock_order __read_mostly; 163 int pageblock_order __read_mostly;
164 #endif 164 #endif
165 165
166 static void __free_pages_ok(struct page *page, unsigned int order); 166 static void __free_pages_ok(struct page *page, unsigned int order);
167 167
168 /* 168 /*
169 * results with 256, 32 in the lowmem_reserve sysctl: 169 * results with 256, 32 in the lowmem_reserve sysctl:
170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
171 * 1G machine -> (16M dma, 784M normal, 224M high) 171 * 1G machine -> (16M dma, 784M normal, 224M high)
172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
175 * 175 *
176 * TBD: should special case ZONE_DMA32 machines here - in those we normally 176 * TBD: should special case ZONE_DMA32 machines here - in those we normally
177 * don't need any ZONE_NORMAL reservation 177 * don't need any ZONE_NORMAL reservation
178 */ 178 */
179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
180 #ifdef CONFIG_ZONE_DMA 180 #ifdef CONFIG_ZONE_DMA
181 256, 181 256,
182 #endif 182 #endif
183 #ifdef CONFIG_ZONE_DMA32 183 #ifdef CONFIG_ZONE_DMA32
184 256, 184 256,
185 #endif 185 #endif
186 #ifdef CONFIG_HIGHMEM 186 #ifdef CONFIG_HIGHMEM
187 32, 187 32,
188 #endif 188 #endif
189 32, 189 32,
190 }; 190 };
191 191
192 EXPORT_SYMBOL(totalram_pages); 192 EXPORT_SYMBOL(totalram_pages);
193 193
194 static char * const zone_names[MAX_NR_ZONES] = { 194 static char * const zone_names[MAX_NR_ZONES] = {
195 #ifdef CONFIG_ZONE_DMA 195 #ifdef CONFIG_ZONE_DMA
196 "DMA", 196 "DMA",
197 #endif 197 #endif
198 #ifdef CONFIG_ZONE_DMA32 198 #ifdef CONFIG_ZONE_DMA32
199 "DMA32", 199 "DMA32",
200 #endif 200 #endif
201 "Normal", 201 "Normal",
202 #ifdef CONFIG_HIGHMEM 202 #ifdef CONFIG_HIGHMEM
203 "HighMem", 203 "HighMem",
204 #endif 204 #endif
205 "Movable", 205 "Movable",
206 }; 206 };
207 207
208 int min_free_kbytes = 1024; 208 int min_free_kbytes = 1024;
209 int user_min_free_kbytes; 209 int user_min_free_kbytes;
210 210
211 static unsigned long __meminitdata nr_kernel_pages; 211 static unsigned long __meminitdata nr_kernel_pages;
212 static unsigned long __meminitdata nr_all_pages; 212 static unsigned long __meminitdata nr_all_pages;
213 static unsigned long __meminitdata dma_reserve; 213 static unsigned long __meminitdata dma_reserve;
214 214
215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
218 static unsigned long __initdata required_kernelcore; 218 static unsigned long __initdata required_kernelcore;
219 static unsigned long __initdata required_movablecore; 219 static unsigned long __initdata required_movablecore;
220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
221 221
222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
223 int movable_zone; 223 int movable_zone;
224 EXPORT_SYMBOL(movable_zone); 224 EXPORT_SYMBOL(movable_zone);
225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
226 226
227 #if MAX_NUMNODES > 1 227 #if MAX_NUMNODES > 1
228 int nr_node_ids __read_mostly = MAX_NUMNODES; 228 int nr_node_ids __read_mostly = MAX_NUMNODES;
229 int nr_online_nodes __read_mostly = 1; 229 int nr_online_nodes __read_mostly = 1;
230 EXPORT_SYMBOL(nr_node_ids); 230 EXPORT_SYMBOL(nr_node_ids);
231 EXPORT_SYMBOL(nr_online_nodes); 231 EXPORT_SYMBOL(nr_online_nodes);
232 #endif 232 #endif
233 233
234 int page_group_by_mobility_disabled __read_mostly; 234 int page_group_by_mobility_disabled __read_mostly;
235 235
236 void set_pageblock_migratetype(struct page *page, int migratetype) 236 void set_pageblock_migratetype(struct page *page, int migratetype)
237 { 237 {
238 238
239 if (unlikely(page_group_by_mobility_disabled)) 239 if (unlikely(page_group_by_mobility_disabled))
240 migratetype = MIGRATE_UNMOVABLE; 240 migratetype = MIGRATE_UNMOVABLE;
241 241
242 set_pageblock_flags_group(page, (unsigned long)migratetype, 242 set_pageblock_flags_group(page, (unsigned long)migratetype,
243 PB_migrate, PB_migrate_end); 243 PB_migrate, PB_migrate_end);
244 } 244 }
245 245
246 bool oom_killer_disabled __read_mostly; 246 bool oom_killer_disabled __read_mostly;
247 247
248 #ifdef CONFIG_DEBUG_VM 248 #ifdef CONFIG_DEBUG_VM
249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
250 { 250 {
251 int ret = 0; 251 int ret = 0;
252 unsigned seq; 252 unsigned seq;
253 unsigned long pfn = page_to_pfn(page); 253 unsigned long pfn = page_to_pfn(page);
254 unsigned long sp, start_pfn; 254 unsigned long sp, start_pfn;
255 255
256 do { 256 do {
257 seq = zone_span_seqbegin(zone); 257 seq = zone_span_seqbegin(zone);
258 start_pfn = zone->zone_start_pfn; 258 start_pfn = zone->zone_start_pfn;
259 sp = zone->spanned_pages; 259 sp = zone->spanned_pages;
260 if (!zone_spans_pfn(zone, pfn)) 260 if (!zone_spans_pfn(zone, pfn))
261 ret = 1; 261 ret = 1;
262 } while (zone_span_seqretry(zone, seq)); 262 } while (zone_span_seqretry(zone, seq));
263 263
264 if (ret) 264 if (ret)
265 pr_err("page %lu outside zone [ %lu - %lu ]\n", 265 pr_err("page %lu outside zone [ %lu - %lu ]\n",
266 pfn, start_pfn, start_pfn + sp); 266 pfn, start_pfn, start_pfn + sp);
267 267
268 return ret; 268 return ret;
269 } 269 }
270 270
271 static int page_is_consistent(struct zone *zone, struct page *page) 271 static int page_is_consistent(struct zone *zone, struct page *page)
272 { 272 {
273 if (!pfn_valid_within(page_to_pfn(page))) 273 if (!pfn_valid_within(page_to_pfn(page)))
274 return 0; 274 return 0;
275 if (zone != page_zone(page)) 275 if (zone != page_zone(page))
276 return 0; 276 return 0;
277 277
278 return 1; 278 return 1;
279 } 279 }
280 /* 280 /*
281 * Temporary debugging check for pages not lying within a given zone. 281 * Temporary debugging check for pages not lying within a given zone.
282 */ 282 */
283 static int bad_range(struct zone *zone, struct page *page) 283 static int bad_range(struct zone *zone, struct page *page)
284 { 284 {
285 if (page_outside_zone_boundaries(zone, page)) 285 if (page_outside_zone_boundaries(zone, page))
286 return 1; 286 return 1;
287 if (!page_is_consistent(zone, page)) 287 if (!page_is_consistent(zone, page))
288 return 1; 288 return 1;
289 289
290 return 0; 290 return 0;
291 } 291 }
292 #else 292 #else
293 static inline int bad_range(struct zone *zone, struct page *page) 293 static inline int bad_range(struct zone *zone, struct page *page)
294 { 294 {
295 return 0; 295 return 0;
296 } 296 }
297 #endif 297 #endif
298 298
299 static void bad_page(struct page *page) 299 static void bad_page(struct page *page)
300 { 300 {
301 static unsigned long resume; 301 static unsigned long resume;
302 static unsigned long nr_shown; 302 static unsigned long nr_shown;
303 static unsigned long nr_unshown; 303 static unsigned long nr_unshown;
304 304
305 /* Don't complain about poisoned pages */ 305 /* Don't complain about poisoned pages */
306 if (PageHWPoison(page)) { 306 if (PageHWPoison(page)) {
307 page_mapcount_reset(page); /* remove PageBuddy */ 307 page_mapcount_reset(page); /* remove PageBuddy */
308 return; 308 return;
309 } 309 }
310 310
311 /* 311 /*
312 * Allow a burst of 60 reports, then keep quiet for that minute; 312 * Allow a burst of 60 reports, then keep quiet for that minute;
313 * or allow a steady drip of one report per second. 313 * or allow a steady drip of one report per second.
314 */ 314 */
315 if (nr_shown == 60) { 315 if (nr_shown == 60) {
316 if (time_before(jiffies, resume)) { 316 if (time_before(jiffies, resume)) {
317 nr_unshown++; 317 nr_unshown++;
318 goto out; 318 goto out;
319 } 319 }
320 if (nr_unshown) { 320 if (nr_unshown) {
321 printk(KERN_ALERT 321 printk(KERN_ALERT
322 "BUG: Bad page state: %lu messages suppressed\n", 322 "BUG: Bad page state: %lu messages suppressed\n",
323 nr_unshown); 323 nr_unshown);
324 nr_unshown = 0; 324 nr_unshown = 0;
325 } 325 }
326 nr_shown = 0; 326 nr_shown = 0;
327 } 327 }
328 if (nr_shown++ == 0) 328 if (nr_shown++ == 0)
329 resume = jiffies + 60 * HZ; 329 resume = jiffies + 60 * HZ;
330 330
331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
332 current->comm, page_to_pfn(page)); 332 current->comm, page_to_pfn(page));
333 dump_page(page); 333 dump_page(page);
334 334
335 print_modules(); 335 print_modules();
336 dump_stack(); 336 dump_stack();
337 out: 337 out:
338 /* Leave bad fields for debug, except PageBuddy could make trouble */ 338 /* Leave bad fields for debug, except PageBuddy could make trouble */
339 page_mapcount_reset(page); /* remove PageBuddy */ 339 page_mapcount_reset(page); /* remove PageBuddy */
340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
341 } 341 }
342 342
343 /* 343 /*
344 * Higher-order pages are called "compound pages". They are structured thusly: 344 * Higher-order pages are called "compound pages". They are structured thusly:
345 * 345 *
346 * The first PAGE_SIZE page is called the "head page". 346 * The first PAGE_SIZE page is called the "head page".
347 * 347 *
348 * The remaining PAGE_SIZE pages are called "tail pages". 348 * The remaining PAGE_SIZE pages are called "tail pages".
349 * 349 *
350 * All pages have PG_compound set. All tail pages have their ->first_page 350 * All pages have PG_compound set. All tail pages have their ->first_page
351 * pointing at the head page. 351 * pointing at the head page.
352 * 352 *
353 * The first tail page's ->lru.next holds the address of the compound page's 353 * The first tail page's ->lru.next holds the address of the compound page's
354 * put_page() function. Its ->lru.prev holds the order of allocation. 354 * put_page() function. Its ->lru.prev holds the order of allocation.
355 * This usage means that zero-order pages may not be compound. 355 * This usage means that zero-order pages may not be compound.
356 */ 356 */
357 357
358 static void free_compound_page(struct page *page) 358 static void free_compound_page(struct page *page)
359 { 359 {
360 __free_pages_ok(page, compound_order(page)); 360 __free_pages_ok(page, compound_order(page));
361 } 361 }
362 362
363 void prep_compound_page(struct page *page, unsigned long order) 363 void prep_compound_page(struct page *page, unsigned long order)
364 { 364 {
365 int i; 365 int i;
366 int nr_pages = 1 << order; 366 int nr_pages = 1 << order;
367 367
368 set_compound_page_dtor(page, free_compound_page); 368 set_compound_page_dtor(page, free_compound_page);
369 set_compound_order(page, order); 369 set_compound_order(page, order);
370 __SetPageHead(page); 370 __SetPageHead(page);
371 for (i = 1; i < nr_pages; i++) { 371 for (i = 1; i < nr_pages; i++) {
372 struct page *p = page + i; 372 struct page *p = page + i;
373 set_page_count(p, 0); 373 set_page_count(p, 0);
374 p->first_page = page; 374 p->first_page = page;
375 /* Make sure p->first_page is always valid for PageTail() */ 375 /* Make sure p->first_page is always valid for PageTail() */
376 smp_wmb(); 376 smp_wmb();
377 __SetPageTail(p); 377 __SetPageTail(p);
378 } 378 }
379 } 379 }
380 380
381 /* update __split_huge_page_refcount if you change this function */ 381 /* update __split_huge_page_refcount if you change this function */
382 static int destroy_compound_page(struct page *page, unsigned long order) 382 static int destroy_compound_page(struct page *page, unsigned long order)
383 { 383 {
384 int i; 384 int i;
385 int nr_pages = 1 << order; 385 int nr_pages = 1 << order;
386 int bad = 0; 386 int bad = 0;
387 387
388 if (unlikely(compound_order(page) != order)) { 388 if (unlikely(compound_order(page) != order)) {
389 bad_page(page); 389 bad_page(page);
390 bad++; 390 bad++;
391 } 391 }
392 392
393 __ClearPageHead(page); 393 __ClearPageHead(page);
394 394
395 for (i = 1; i < nr_pages; i++) { 395 for (i = 1; i < nr_pages; i++) {
396 struct page *p = page + i; 396 struct page *p = page + i;
397 397
398 if (unlikely(!PageTail(p) || (p->first_page != page))) { 398 if (unlikely(!PageTail(p) || (p->first_page != page))) {
399 bad_page(page); 399 bad_page(page);
400 bad++; 400 bad++;
401 } 401 }
402 __ClearPageTail(p); 402 __ClearPageTail(p);
403 } 403 }
404 404
405 return bad; 405 return bad;
406 } 406 }
407 407
408 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 408 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
409 { 409 {
410 int i; 410 int i;
411 411
412 /* 412 /*
413 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 413 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
414 * and __GFP_HIGHMEM from hard or soft interrupt context. 414 * and __GFP_HIGHMEM from hard or soft interrupt context.
415 */ 415 */
416 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 416 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
417 for (i = 0; i < (1 << order); i++) 417 for (i = 0; i < (1 << order); i++)
418 clear_highpage(page + i); 418 clear_highpage(page + i);
419 } 419 }
420 420
421 #ifdef CONFIG_DEBUG_PAGEALLOC 421 #ifdef CONFIG_DEBUG_PAGEALLOC
422 unsigned int _debug_guardpage_minorder; 422 unsigned int _debug_guardpage_minorder;
423 423
424 static int __init debug_guardpage_minorder_setup(char *buf) 424 static int __init debug_guardpage_minorder_setup(char *buf)
425 { 425 {
426 unsigned long res; 426 unsigned long res;
427 427
428 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 428 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
429 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 429 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
430 return 0; 430 return 0;
431 } 431 }
432 _debug_guardpage_minorder = res; 432 _debug_guardpage_minorder = res;
433 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 433 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
434 return 0; 434 return 0;
435 } 435 }
436 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 436 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
437 437
438 static inline void set_page_guard_flag(struct page *page) 438 static inline void set_page_guard_flag(struct page *page)
439 { 439 {
440 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 440 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
441 } 441 }
442 442
443 static inline void clear_page_guard_flag(struct page *page) 443 static inline void clear_page_guard_flag(struct page *page)
444 { 444 {
445 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 445 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
446 } 446 }
447 #else 447 #else
448 static inline void set_page_guard_flag(struct page *page) { } 448 static inline void set_page_guard_flag(struct page *page) { }
449 static inline void clear_page_guard_flag(struct page *page) { } 449 static inline void clear_page_guard_flag(struct page *page) { }
450 #endif 450 #endif
451 451
452 static inline void set_page_order(struct page *page, int order) 452 static inline void set_page_order(struct page *page, int order)
453 { 453 {
454 set_page_private(page, order); 454 set_page_private(page, order);
455 __SetPageBuddy(page); 455 __SetPageBuddy(page);
456 } 456 }
457 457
458 static inline void rmv_page_order(struct page *page) 458 static inline void rmv_page_order(struct page *page)
459 { 459 {
460 __ClearPageBuddy(page); 460 __ClearPageBuddy(page);
461 set_page_private(page, 0); 461 set_page_private(page, 0);
462 } 462 }
463 463
464 /* 464 /*
465 * Locate the struct page for both the matching buddy in our 465 * Locate the struct page for both the matching buddy in our
466 * pair (buddy1) and the combined O(n+1) page they form (page). 466 * pair (buddy1) and the combined O(n+1) page they form (page).
467 * 467 *
468 * 1) Any buddy B1 will have an order O twin B2 which satisfies 468 * 1) Any buddy B1 will have an order O twin B2 which satisfies
469 * the following equation: 469 * the following equation:
470 * B2 = B1 ^ (1 << O) 470 * B2 = B1 ^ (1 << O)
471 * For example, if the starting buddy (buddy2) is #8 its order 471 * For example, if the starting buddy (buddy2) is #8 its order
472 * 1 buddy is #10: 472 * 1 buddy is #10:
473 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 473 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
474 * 474 *
475 * 2) Any buddy B will have an order O+1 parent P which 475 * 2) Any buddy B will have an order O+1 parent P which
476 * satisfies the following equation: 476 * satisfies the following equation:
477 * P = B & ~(1 << O) 477 * P = B & ~(1 << O)
478 * 478 *
479 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 479 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
480 */ 480 */
481 static inline unsigned long 481 static inline unsigned long
482 __find_buddy_index(unsigned long page_idx, unsigned int order) 482 __find_buddy_index(unsigned long page_idx, unsigned int order)
483 { 483 {
484 return page_idx ^ (1 << order); 484 return page_idx ^ (1 << order);
485 } 485 }
486 486
487 /* 487 /*
488 * This function checks whether a page is free && is the buddy 488 * This function checks whether a page is free && is the buddy
489 * we can do coalesce a page and its buddy if 489 * we can do coalesce a page and its buddy if
490 * (a) the buddy is not in a hole && 490 * (a) the buddy is not in a hole &&
491 * (b) the buddy is in the buddy system && 491 * (b) the buddy is in the buddy system &&
492 * (c) a page and its buddy have the same order && 492 * (c) a page and its buddy have the same order &&
493 * (d) a page and its buddy are in the same zone. 493 * (d) a page and its buddy are in the same zone.
494 * 494 *
495 * For recording whether a page is in the buddy system, we set ->_mapcount 495 * For recording whether a page is in the buddy system, we set ->_mapcount
496 * PAGE_BUDDY_MAPCOUNT_VALUE. 496 * PAGE_BUDDY_MAPCOUNT_VALUE.
497 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is 497 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
498 * serialized by zone->lock. 498 * serialized by zone->lock.
499 * 499 *
500 * For recording page's order, we use page_private(page). 500 * For recording page's order, we use page_private(page).
501 */ 501 */
502 static inline int page_is_buddy(struct page *page, struct page *buddy, 502 static inline int page_is_buddy(struct page *page, struct page *buddy,
503 int order) 503 int order)
504 { 504 {
505 if (!pfn_valid_within(page_to_pfn(buddy))) 505 if (!pfn_valid_within(page_to_pfn(buddy)))
506 return 0; 506 return 0;
507 507
508 if (page_zone_id(page) != page_zone_id(buddy)) 508 if (page_zone_id(page) != page_zone_id(buddy))
509 return 0; 509 return 0;
510 510
511 if (page_is_guard(buddy) && page_order(buddy) == order) { 511 if (page_is_guard(buddy) && page_order(buddy) == order) {
512 VM_BUG_ON(page_count(buddy) != 0); 512 VM_BUG_ON(page_count(buddy) != 0);
513 return 1; 513 return 1;
514 } 514 }
515 515
516 if (PageBuddy(buddy) && page_order(buddy) == order) { 516 if (PageBuddy(buddy) && page_order(buddy) == order) {
517 VM_BUG_ON(page_count(buddy) != 0); 517 VM_BUG_ON(page_count(buddy) != 0);
518 return 1; 518 return 1;
519 } 519 }
520 return 0; 520 return 0;
521 } 521 }
522 522
523 /* 523 /*
524 * Freeing function for a buddy system allocator. 524 * Freeing function for a buddy system allocator.
525 * 525 *
526 * The concept of a buddy system is to maintain direct-mapped table 526 * The concept of a buddy system is to maintain direct-mapped table
527 * (containing bit values) for memory blocks of various "orders". 527 * (containing bit values) for memory blocks of various "orders".
528 * The bottom level table contains the map for the smallest allocatable 528 * The bottom level table contains the map for the smallest allocatable
529 * units of memory (here, pages), and each level above it describes 529 * units of memory (here, pages), and each level above it describes
530 * pairs of units from the levels below, hence, "buddies". 530 * pairs of units from the levels below, hence, "buddies".
531 * At a high level, all that happens here is marking the table entry 531 * At a high level, all that happens here is marking the table entry
532 * at the bottom level available, and propagating the changes upward 532 * at the bottom level available, and propagating the changes upward
533 * as necessary, plus some accounting needed to play nicely with other 533 * as necessary, plus some accounting needed to play nicely with other
534 * parts of the VM system. 534 * parts of the VM system.
535 * At each level, we keep a list of pages, which are heads of continuous 535 * At each level, we keep a list of pages, which are heads of continuous
536 * free pages of length of (1 << order) and marked with _mapcount 536 * free pages of length of (1 << order) and marked with _mapcount
537 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) 537 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
538 * field. 538 * field.
539 * So when we are allocating or freeing one, we can derive the state of the 539 * So when we are allocating or freeing one, we can derive the state of the
540 * other. That is, if we allocate a small block, and both were 540 * other. That is, if we allocate a small block, and both were
541 * free, the remainder of the region must be split into blocks. 541 * free, the remainder of the region must be split into blocks.
542 * If a block is freed, and its buddy is also free, then this 542 * If a block is freed, and its buddy is also free, then this
543 * triggers coalescing into a block of larger size. 543 * triggers coalescing into a block of larger size.
544 * 544 *
545 * -- nyc 545 * -- nyc
546 */ 546 */
547 547
548 static inline void __free_one_page(struct page *page, 548 static inline void __free_one_page(struct page *page,
549 struct zone *zone, unsigned int order, 549 struct zone *zone, unsigned int order,
550 int migratetype) 550 int migratetype)
551 { 551 {
552 unsigned long page_idx; 552 unsigned long page_idx;
553 unsigned long combined_idx; 553 unsigned long combined_idx;
554 unsigned long uninitialized_var(buddy_idx); 554 unsigned long uninitialized_var(buddy_idx);
555 struct page *buddy; 555 struct page *buddy;
556 556
557 VM_BUG_ON(!zone_is_initialized(zone)); 557 VM_BUG_ON(!zone_is_initialized(zone));
558 558
559 if (unlikely(PageCompound(page))) 559 if (unlikely(PageCompound(page)))
560 if (unlikely(destroy_compound_page(page, order))) 560 if (unlikely(destroy_compound_page(page, order)))
561 return; 561 return;
562 562
563 VM_BUG_ON(migratetype == -1); 563 VM_BUG_ON(migratetype == -1);
564 564
565 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 565 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
566 566
567 VM_BUG_ON(page_idx & ((1 << order) - 1)); 567 VM_BUG_ON(page_idx & ((1 << order) - 1));
568 VM_BUG_ON(bad_range(zone, page)); 568 VM_BUG_ON(bad_range(zone, page));
569 569
570 while (order < MAX_ORDER-1) { 570 while (order < MAX_ORDER-1) {
571 buddy_idx = __find_buddy_index(page_idx, order); 571 buddy_idx = __find_buddy_index(page_idx, order);
572 buddy = page + (buddy_idx - page_idx); 572 buddy = page + (buddy_idx - page_idx);
573 if (!page_is_buddy(page, buddy, order)) 573 if (!page_is_buddy(page, buddy, order))
574 break; 574 break;
575 /* 575 /*
576 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 576 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
577 * merge with it and move up one order. 577 * merge with it and move up one order.
578 */ 578 */
579 if (page_is_guard(buddy)) { 579 if (page_is_guard(buddy)) {
580 clear_page_guard_flag(buddy); 580 clear_page_guard_flag(buddy);
581 set_page_private(page, 0); 581 set_page_private(page, 0);
582 __mod_zone_freepage_state(zone, 1 << order, 582 __mod_zone_freepage_state(zone, 1 << order,
583 migratetype); 583 migratetype);
584 } else { 584 } else {
585 list_del(&buddy->lru); 585 list_del(&buddy->lru);
586 zone->free_area[order].nr_free--; 586 zone->free_area[order].nr_free--;
587 rmv_page_order(buddy); 587 rmv_page_order(buddy);
588 } 588 }
589 combined_idx = buddy_idx & page_idx; 589 combined_idx = buddy_idx & page_idx;
590 page = page + (combined_idx - page_idx); 590 page = page + (combined_idx - page_idx);
591 page_idx = combined_idx; 591 page_idx = combined_idx;
592 order++; 592 order++;
593 } 593 }
594 set_page_order(page, order); 594 set_page_order(page, order);
595 595
596 /* 596 /*
597 * If this is not the largest possible page, check if the buddy 597 * If this is not the largest possible page, check if the buddy
598 * of the next-highest order is free. If it is, it's possible 598 * of the next-highest order is free. If it is, it's possible
599 * that pages are being freed that will coalesce soon. In case, 599 * that pages are being freed that will coalesce soon. In case,
600 * that is happening, add the free page to the tail of the list 600 * that is happening, add the free page to the tail of the list
601 * so it's less likely to be used soon and more likely to be merged 601 * so it's less likely to be used soon and more likely to be merged
602 * as a higher order page 602 * as a higher order page
603 */ 603 */
604 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 604 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
605 struct page *higher_page, *higher_buddy; 605 struct page *higher_page, *higher_buddy;
606 combined_idx = buddy_idx & page_idx; 606 combined_idx = buddy_idx & page_idx;
607 higher_page = page + (combined_idx - page_idx); 607 higher_page = page + (combined_idx - page_idx);
608 buddy_idx = __find_buddy_index(combined_idx, order + 1); 608 buddy_idx = __find_buddy_index(combined_idx, order + 1);
609 higher_buddy = higher_page + (buddy_idx - combined_idx); 609 higher_buddy = higher_page + (buddy_idx - combined_idx);
610 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 610 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
611 list_add_tail(&page->lru, 611 list_add_tail(&page->lru,
612 &zone->free_area[order].free_list[migratetype]); 612 &zone->free_area[order].free_list[migratetype]);
613 goto out; 613 goto out;
614 } 614 }
615 } 615 }
616 616
617 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 617 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
618 out: 618 out:
619 zone->free_area[order].nr_free++; 619 zone->free_area[order].nr_free++;
620 } 620 }
621 621
622 static inline int free_pages_check(struct page *page) 622 static inline int free_pages_check(struct page *page)
623 { 623 {
624 if (unlikely(page_mapcount(page) | 624 if (unlikely(page_mapcount(page) |
625 (page->mapping != NULL) | 625 (page->mapping != NULL) |
626 (atomic_read(&page->_count) != 0) | 626 (atomic_read(&page->_count) != 0) |
627 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 627 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
628 (mem_cgroup_bad_page_check(page)))) { 628 (mem_cgroup_bad_page_check(page)))) {
629 bad_page(page); 629 bad_page(page);
630 return 1; 630 return 1;
631 } 631 }
632 page_nid_reset_last(page); 632 page_nid_reset_last(page);
633 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 633 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
634 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 634 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
635 return 0; 635 return 0;
636 } 636 }
637 637
638 /* 638 /*
639 * Frees a number of pages from the PCP lists 639 * Frees a number of pages from the PCP lists
640 * Assumes all pages on list are in same zone, and of same order. 640 * Assumes all pages on list are in same zone, and of same order.
641 * count is the number of pages to free. 641 * count is the number of pages to free.
642 * 642 *
643 * If the zone was previously in an "all pages pinned" state then look to 643 * If the zone was previously in an "all pages pinned" state then look to
644 * see if this freeing clears that state. 644 * see if this freeing clears that state.
645 * 645 *
646 * And clear the zone's pages_scanned counter, to hold off the "all pages are 646 * And clear the zone's pages_scanned counter, to hold off the "all pages are
647 * pinned" detection logic. 647 * pinned" detection logic.
648 */ 648 */
649 static void free_pcppages_bulk(struct zone *zone, int count, 649 static void free_pcppages_bulk(struct zone *zone, int count,
650 struct per_cpu_pages *pcp) 650 struct per_cpu_pages *pcp)
651 { 651 {
652 int migratetype = 0; 652 int migratetype = 0;
653 int batch_free = 0; 653 int batch_free = 0;
654 int to_free = count; 654 int to_free = count;
655 655
656 spin_lock(&zone->lock); 656 spin_lock(&zone->lock);
657 zone->pages_scanned = 0; 657 zone->pages_scanned = 0;
658 658
659 while (to_free) { 659 while (to_free) {
660 struct page *page; 660 struct page *page;
661 struct list_head *list; 661 struct list_head *list;
662 662
663 /* 663 /*
664 * Remove pages from lists in a round-robin fashion. A 664 * Remove pages from lists in a round-robin fashion. A
665 * batch_free count is maintained that is incremented when an 665 * batch_free count is maintained that is incremented when an
666 * empty list is encountered. This is so more pages are freed 666 * empty list is encountered. This is so more pages are freed
667 * off fuller lists instead of spinning excessively around empty 667 * off fuller lists instead of spinning excessively around empty
668 * lists 668 * lists
669 */ 669 */
670 do { 670 do {
671 batch_free++; 671 batch_free++;
672 if (++migratetype == MIGRATE_PCPTYPES) 672 if (++migratetype == MIGRATE_PCPTYPES)
673 migratetype = 0; 673 migratetype = 0;
674 list = &pcp->lists[migratetype]; 674 list = &pcp->lists[migratetype];
675 } while (list_empty(list)); 675 } while (list_empty(list));
676 676
677 /* This is the only non-empty list. Free them all. */ 677 /* This is the only non-empty list. Free them all. */
678 if (batch_free == MIGRATE_PCPTYPES) 678 if (batch_free == MIGRATE_PCPTYPES)
679 batch_free = to_free; 679 batch_free = to_free;
680 680
681 do { 681 do {
682 int mt; /* migratetype of the to-be-freed page */ 682 int mt; /* migratetype of the to-be-freed page */
683 683
684 page = list_entry(list->prev, struct page, lru); 684 page = list_entry(list->prev, struct page, lru);
685 /* must delete as __free_one_page list manipulates */ 685 /* must delete as __free_one_page list manipulates */
686 list_del(&page->lru); 686 list_del(&page->lru);
687 mt = get_freepage_migratetype(page); 687 mt = get_freepage_migratetype(page);
688 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 688 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
689 __free_one_page(page, zone, 0, mt); 689 __free_one_page(page, zone, 0, mt);
690 trace_mm_page_pcpu_drain(page, 0, mt); 690 trace_mm_page_pcpu_drain(page, 0, mt);
691 if (likely(!is_migrate_isolate_page(page))) { 691 if (likely(!is_migrate_isolate_page(page))) {
692 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 692 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
693 if (is_migrate_cma(mt)) 693 if (is_migrate_cma(mt))
694 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 694 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
695 } 695 }
696 } while (--to_free && --batch_free && !list_empty(list)); 696 } while (--to_free && --batch_free && !list_empty(list));
697 } 697 }
698 spin_unlock(&zone->lock); 698 spin_unlock(&zone->lock);
699 } 699 }
700 700
701 static void free_one_page(struct zone *zone, struct page *page, int order, 701 static void free_one_page(struct zone *zone, struct page *page, int order,
702 int migratetype) 702 int migratetype)
703 { 703 {
704 spin_lock(&zone->lock); 704 spin_lock(&zone->lock);
705 zone->pages_scanned = 0; 705 zone->pages_scanned = 0;
706 706
707 __free_one_page(page, zone, order, migratetype); 707 __free_one_page(page, zone, order, migratetype);
708 if (unlikely(!is_migrate_isolate(migratetype))) 708 if (unlikely(!is_migrate_isolate(migratetype)))
709 __mod_zone_freepage_state(zone, 1 << order, migratetype); 709 __mod_zone_freepage_state(zone, 1 << order, migratetype);
710 spin_unlock(&zone->lock); 710 spin_unlock(&zone->lock);
711 } 711 }
712 712
713 static bool free_pages_prepare(struct page *page, unsigned int order) 713 static bool free_pages_prepare(struct page *page, unsigned int order)
714 { 714 {
715 int i; 715 int i;
716 int bad = 0; 716 int bad = 0;
717 717
718 trace_mm_page_free(page, order); 718 trace_mm_page_free(page, order);
719 kmemcheck_free_shadow(page, order); 719 kmemcheck_free_shadow(page, order);
720 720
721 if (PageAnon(page)) 721 if (PageAnon(page))
722 page->mapping = NULL; 722 page->mapping = NULL;
723 for (i = 0; i < (1 << order); i++) 723 for (i = 0; i < (1 << order); i++)
724 bad += free_pages_check(page + i); 724 bad += free_pages_check(page + i);
725 if (bad) 725 if (bad)
726 return false; 726 return false;
727 727
728 if (!PageHighMem(page)) { 728 if (!PageHighMem(page)) {
729 debug_check_no_locks_freed(page_address(page), 729 debug_check_no_locks_freed(page_address(page),
730 PAGE_SIZE << order); 730 PAGE_SIZE << order);
731 debug_check_no_obj_freed(page_address(page), 731 debug_check_no_obj_freed(page_address(page),
732 PAGE_SIZE << order); 732 PAGE_SIZE << order);
733 } 733 }
734 arch_free_page(page, order); 734 arch_free_page(page, order);
735 kernel_map_pages(page, 1 << order, 0); 735 kernel_map_pages(page, 1 << order, 0);
736 736
737 return true; 737 return true;
738 } 738 }
739 739
740 static void __free_pages_ok(struct page *page, unsigned int order) 740 static void __free_pages_ok(struct page *page, unsigned int order)
741 { 741 {
742 unsigned long flags; 742 unsigned long flags;
743 int migratetype; 743 int migratetype;
744 744
745 if (!free_pages_prepare(page, order)) 745 if (!free_pages_prepare(page, order))
746 return; 746 return;
747 747
748 local_irq_save(flags); 748 local_irq_save(flags);
749 __count_vm_events(PGFREE, 1 << order); 749 __count_vm_events(PGFREE, 1 << order);
750 migratetype = get_pageblock_migratetype(page); 750 migratetype = get_pageblock_migratetype(page);
751 set_freepage_migratetype(page, migratetype); 751 set_freepage_migratetype(page, migratetype);
752 free_one_page(page_zone(page), page, order, migratetype); 752 free_one_page(page_zone(page), page, order, migratetype);
753 local_irq_restore(flags); 753 local_irq_restore(flags);
754 } 754 }
755 755
756 void __init __free_pages_bootmem(struct page *page, unsigned int order) 756 void __init __free_pages_bootmem(struct page *page, unsigned int order)
757 { 757 {
758 unsigned int nr_pages = 1 << order; 758 unsigned int nr_pages = 1 << order;
759 struct page *p = page; 759 struct page *p = page;
760 unsigned int loop; 760 unsigned int loop;
761 761
762 prefetchw(p); 762 prefetchw(p);
763 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 763 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
764 prefetchw(p + 1); 764 prefetchw(p + 1);
765 __ClearPageReserved(p); 765 __ClearPageReserved(p);
766 set_page_count(p, 0); 766 set_page_count(p, 0);
767 } 767 }
768 __ClearPageReserved(p); 768 __ClearPageReserved(p);
769 set_page_count(p, 0); 769 set_page_count(p, 0);
770 770
771 page_zone(page)->managed_pages += nr_pages; 771 page_zone(page)->managed_pages += nr_pages;
772 set_page_refcounted(page); 772 set_page_refcounted(page);
773 __free_pages(page, order); 773 __free_pages(page, order);
774 } 774 }
775 775
776 #ifdef CONFIG_CMA 776 #ifdef CONFIG_CMA
777 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 777 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
778 void __init init_cma_reserved_pageblock(struct page *page) 778 void __init init_cma_reserved_pageblock(struct page *page)
779 { 779 {
780 unsigned i = pageblock_nr_pages; 780 unsigned i = pageblock_nr_pages;
781 struct page *p = page; 781 struct page *p = page;
782 782
783 do { 783 do {
784 __ClearPageReserved(p); 784 __ClearPageReserved(p);
785 set_page_count(p, 0); 785 set_page_count(p, 0);
786 } while (++p, --i); 786 } while (++p, --i);
787 787
788 set_pageblock_migratetype(page, MIGRATE_CMA); 788 set_pageblock_migratetype(page, MIGRATE_CMA);
789 789
790 if (pageblock_order >= MAX_ORDER) { 790 if (pageblock_order >= MAX_ORDER) {
791 i = pageblock_nr_pages; 791 i = pageblock_nr_pages;
792 p = page; 792 p = page;
793 do { 793 do {
794 set_page_refcounted(p); 794 set_page_refcounted(p);
795 __free_pages(p, MAX_ORDER - 1); 795 __free_pages(p, MAX_ORDER - 1);
796 p += MAX_ORDER_NR_PAGES; 796 p += MAX_ORDER_NR_PAGES;
797 } while (i -= MAX_ORDER_NR_PAGES); 797 } while (i -= MAX_ORDER_NR_PAGES);
798 } else { 798 } else {
799 set_page_refcounted(page); 799 set_page_refcounted(page);
800 __free_pages(page, pageblock_order); 800 __free_pages(page, pageblock_order);
801 } 801 }
802 802
803 adjust_managed_page_count(page, pageblock_nr_pages); 803 adjust_managed_page_count(page, pageblock_nr_pages);
804 } 804 }
805 #endif 805 #endif
806 806
807 /* 807 /*
808 * The order of subdivision here is critical for the IO subsystem. 808 * The order of subdivision here is critical for the IO subsystem.
809 * Please do not alter this order without good reasons and regression 809 * Please do not alter this order without good reasons and regression
810 * testing. Specifically, as large blocks of memory are subdivided, 810 * testing. Specifically, as large blocks of memory are subdivided,
811 * the order in which smaller blocks are delivered depends on the order 811 * the order in which smaller blocks are delivered depends on the order
812 * they're subdivided in this function. This is the primary factor 812 * they're subdivided in this function. This is the primary factor
813 * influencing the order in which pages are delivered to the IO 813 * influencing the order in which pages are delivered to the IO
814 * subsystem according to empirical testing, and this is also justified 814 * subsystem according to empirical testing, and this is also justified
815 * by considering the behavior of a buddy system containing a single 815 * by considering the behavior of a buddy system containing a single
816 * large block of memory acted on by a series of small allocations. 816 * large block of memory acted on by a series of small allocations.
817 * This behavior is a critical factor in sglist merging's success. 817 * This behavior is a critical factor in sglist merging's success.
818 * 818 *
819 * -- nyc 819 * -- nyc
820 */ 820 */
821 static inline void expand(struct zone *zone, struct page *page, 821 static inline void expand(struct zone *zone, struct page *page,
822 int low, int high, struct free_area *area, 822 int low, int high, struct free_area *area,
823 int migratetype) 823 int migratetype)
824 { 824 {
825 unsigned long size = 1 << high; 825 unsigned long size = 1 << high;
826 826
827 while (high > low) { 827 while (high > low) {
828 area--; 828 area--;
829 high--; 829 high--;
830 size >>= 1; 830 size >>= 1;
831 VM_BUG_ON(bad_range(zone, &page[size])); 831 VM_BUG_ON(bad_range(zone, &page[size]));
832 832
833 #ifdef CONFIG_DEBUG_PAGEALLOC 833 #ifdef CONFIG_DEBUG_PAGEALLOC
834 if (high < debug_guardpage_minorder()) { 834 if (high < debug_guardpage_minorder()) {
835 /* 835 /*
836 * Mark as guard pages (or page), that will allow to 836 * Mark as guard pages (or page), that will allow to
837 * merge back to allocator when buddy will be freed. 837 * merge back to allocator when buddy will be freed.
838 * Corresponding page table entries will not be touched, 838 * Corresponding page table entries will not be touched,
839 * pages will stay not present in virtual address space 839 * pages will stay not present in virtual address space
840 */ 840 */
841 INIT_LIST_HEAD(&page[size].lru); 841 INIT_LIST_HEAD(&page[size].lru);
842 set_page_guard_flag(&page[size]); 842 set_page_guard_flag(&page[size]);
843 set_page_private(&page[size], high); 843 set_page_private(&page[size], high);
844 /* Guard pages are not available for any usage */ 844 /* Guard pages are not available for any usage */
845 __mod_zone_freepage_state(zone, -(1 << high), 845 __mod_zone_freepage_state(zone, -(1 << high),
846 migratetype); 846 migratetype);
847 continue; 847 continue;
848 } 848 }
849 #endif 849 #endif
850 list_add(&page[size].lru, &area->free_list[migratetype]); 850 list_add(&page[size].lru, &area->free_list[migratetype]);
851 area->nr_free++; 851 area->nr_free++;
852 set_page_order(&page[size], high); 852 set_page_order(&page[size], high);
853 } 853 }
854 } 854 }
855 855
856 /* 856 /*
857 * This page is about to be returned from the page allocator 857 * This page is about to be returned from the page allocator
858 */ 858 */
859 static inline int check_new_page(struct page *page) 859 static inline int check_new_page(struct page *page)
860 { 860 {
861 if (unlikely(page_mapcount(page) | 861 if (unlikely(page_mapcount(page) |
862 (page->mapping != NULL) | 862 (page->mapping != NULL) |
863 (atomic_read(&page->_count) != 0) | 863 (atomic_read(&page->_count) != 0) |
864 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 864 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
865 (mem_cgroup_bad_page_check(page)))) { 865 (mem_cgroup_bad_page_check(page)))) {
866 bad_page(page); 866 bad_page(page);
867 return 1; 867 return 1;
868 } 868 }
869 return 0; 869 return 0;
870 } 870 }
871 871
872 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 872 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
873 { 873 {
874 int i; 874 int i;
875 875
876 for (i = 0; i < (1 << order); i++) { 876 for (i = 0; i < (1 << order); i++) {
877 struct page *p = page + i; 877 struct page *p = page + i;
878 if (unlikely(check_new_page(p))) 878 if (unlikely(check_new_page(p)))
879 return 1; 879 return 1;
880 } 880 }
881 881
882 set_page_private(page, 0); 882 set_page_private(page, 0);
883 set_page_refcounted(page); 883 set_page_refcounted(page);
884 884
885 arch_alloc_page(page, order); 885 arch_alloc_page(page, order);
886 kernel_map_pages(page, 1 << order, 1); 886 kernel_map_pages(page, 1 << order, 1);
887 887
888 if (gfp_flags & __GFP_ZERO) 888 if (gfp_flags & __GFP_ZERO)
889 prep_zero_page(page, order, gfp_flags); 889 prep_zero_page(page, order, gfp_flags);
890 890
891 if (order && (gfp_flags & __GFP_COMP)) 891 if (order && (gfp_flags & __GFP_COMP))
892 prep_compound_page(page, order); 892 prep_compound_page(page, order);
893 893
894 return 0; 894 return 0;
895 } 895 }
896 896
897 /* 897 /*
898 * Go through the free lists for the given migratetype and remove 898 * Go through the free lists for the given migratetype and remove
899 * the smallest available page from the freelists 899 * the smallest available page from the freelists
900 */ 900 */
901 static inline 901 static inline
902 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 902 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
903 int migratetype) 903 int migratetype)
904 { 904 {
905 unsigned int current_order; 905 unsigned int current_order;
906 struct free_area *area; 906 struct free_area *area;
907 struct page *page; 907 struct page *page;
908 908
909 /* Find a page of the appropriate size in the preferred list */ 909 /* Find a page of the appropriate size in the preferred list */
910 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 910 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
911 area = &(zone->free_area[current_order]); 911 area = &(zone->free_area[current_order]);
912 if (list_empty(&area->free_list[migratetype])) 912 if (list_empty(&area->free_list[migratetype]))
913 continue; 913 continue;
914 914
915 page = list_entry(area->free_list[migratetype].next, 915 page = list_entry(area->free_list[migratetype].next,
916 struct page, lru); 916 struct page, lru);
917 list_del(&page->lru); 917 list_del(&page->lru);
918 rmv_page_order(page); 918 rmv_page_order(page);
919 area->nr_free--; 919 area->nr_free--;
920 expand(zone, page, order, current_order, area, migratetype); 920 expand(zone, page, order, current_order, area, migratetype);
921 set_freepage_migratetype(page, migratetype);
921 return page; 922 return page;
922 } 923 }
923 924
924 return NULL; 925 return NULL;
925 } 926 }
926 927
927 928
928 /* 929 /*
929 * This array describes the order lists are fallen back to when 930 * This array describes the order lists are fallen back to when
930 * the free lists for the desirable migrate type are depleted 931 * the free lists for the desirable migrate type are depleted
931 */ 932 */
932 static int fallbacks[MIGRATE_TYPES][4] = { 933 static int fallbacks[MIGRATE_TYPES][4] = {
933 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 934 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
934 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 935 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
935 #ifdef CONFIG_CMA 936 #ifdef CONFIG_CMA
936 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 937 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
937 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 938 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
938 #else 939 #else
939 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 940 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
940 #endif 941 #endif
941 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 942 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
942 #ifdef CONFIG_MEMORY_ISOLATION 943 #ifdef CONFIG_MEMORY_ISOLATION
943 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 944 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
944 #endif 945 #endif
945 }; 946 };
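
As an aside, the table above reads as "for this start type, try these other free lists in this order, and stop at MIGRATE_RESERVE". A minimal standalone illustration of that lookup (CMA and memory isolation left out for brevity; not kernel code) follows.

/* Print the fallback order encoded in a simplified fallbacks[] table. */
#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,
       MIGRATE_RESERVE, MIGRATE_TYPES };

static const char *names[MIGRATE_TYPES] = {
        "UNMOVABLE", "RECLAIMABLE", "MOVABLE", "RESERVE",
};

static const int fallbacks[MIGRATE_TYPES][3] = {
        [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
        [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
        [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
        [MIGRATE_RESERVE]     = { MIGRATE_RESERVE },   /* never used as a start type */
};

int main(void)
{
        for (int start = MIGRATE_UNMOVABLE; start <= MIGRATE_MOVABLE; start++) {
                printf("%s falls back to:", names[start]);
                for (int i = 0; fallbacks[start][i] != MIGRATE_RESERVE; i++)
                        printf(" %s", names[fallbacks[start][i]]);
                printf(" (MIGRATE_RESERVE itself is handled separately)\n");
        }
        return 0;
}
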
946 947
947 /* 948 /*
948 * Move the free pages in a range to the free lists of the requested type. 949 * Move the free pages in a range to the free lists of the requested type.
949 * Note that start_page and end_page are not aligned on a pageblock 950 * Note that start_page and end_page are not aligned on a pageblock
950 * boundary. If alignment is required, use move_freepages_block() 951 * boundary. If alignment is required, use move_freepages_block()
951 */ 952 */
952 int move_freepages(struct zone *zone, 953 int move_freepages(struct zone *zone,
953 struct page *start_page, struct page *end_page, 954 struct page *start_page, struct page *end_page,
954 int migratetype) 955 int migratetype)
955 { 956 {
956 struct page *page; 957 struct page *page;
957 unsigned long order; 958 unsigned long order;
958 int pages_moved = 0; 959 int pages_moved = 0;
959 960
960 #ifndef CONFIG_HOLES_IN_ZONE 961 #ifndef CONFIG_HOLES_IN_ZONE
961 /* 962 /*
962 * page_zone is not safe to call in this context when 963 * page_zone is not safe to call in this context when
963 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 964 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
964 * anyway as we check zone boundaries in move_freepages_block(). 965 * anyway as we check zone boundaries in move_freepages_block().
965 * Remove at a later date when no bug reports exist related to 966 * Remove at a later date when no bug reports exist related to
966 * grouping pages by mobility 967 * grouping pages by mobility
967 */ 968 */
968 BUG_ON(page_zone(start_page) != page_zone(end_page)); 969 BUG_ON(page_zone(start_page) != page_zone(end_page));
969 #endif 970 #endif
970 971
971 for (page = start_page; page <= end_page;) { 972 for (page = start_page; page <= end_page;) {
972 /* Make sure we are not inadvertently changing nodes */ 973 /* Make sure we are not inadvertently changing nodes */
973 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 974 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
974 975
975 if (!pfn_valid_within(page_to_pfn(page))) { 976 if (!pfn_valid_within(page_to_pfn(page))) {
976 page++; 977 page++;
977 continue; 978 continue;
978 } 979 }
979 980
980 if (!PageBuddy(page)) { 981 if (!PageBuddy(page)) {
981 page++; 982 page++;
982 continue; 983 continue;
983 } 984 }
984 985
985 order = page_order(page); 986 order = page_order(page);
986 list_move(&page->lru, 987 list_move(&page->lru,
987 &zone->free_area[order].free_list[migratetype]); 988 &zone->free_area[order].free_list[migratetype]);
988 set_freepage_migratetype(page, migratetype); 989 set_freepage_migratetype(page, migratetype);
989 page += 1 << order; 990 page += 1 << order;
990 pages_moved += 1 << order; 991 pages_moved += 1 << order;
991 } 992 }
992 993
993 return pages_moved; 994 return pages_moved;
994 } 995 }
995 996
996 int move_freepages_block(struct zone *zone, struct page *page, 997 int move_freepages_block(struct zone *zone, struct page *page,
997 int migratetype) 998 int migratetype)
998 { 999 {
999 unsigned long start_pfn, end_pfn; 1000 unsigned long start_pfn, end_pfn;
1000 struct page *start_page, *end_page; 1001 struct page *start_page, *end_page;
1001 1002
1002 start_pfn = page_to_pfn(page); 1003 start_pfn = page_to_pfn(page);
1003 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 1004 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
1004 start_page = pfn_to_page(start_pfn); 1005 start_page = pfn_to_page(start_pfn);
1005 end_page = start_page + pageblock_nr_pages - 1; 1006 end_page = start_page + pageblock_nr_pages - 1;
1006 end_pfn = start_pfn + pageblock_nr_pages - 1; 1007 end_pfn = start_pfn + pageblock_nr_pages - 1;
1007 1008
1008 /* Do not cross zone boundaries */ 1009 /* Do not cross zone boundaries */
1009 if (!zone_spans_pfn(zone, start_pfn)) 1010 if (!zone_spans_pfn(zone, start_pfn))
1010 start_page = page; 1011 start_page = page;
1011 if (!zone_spans_pfn(zone, end_pfn)) 1012 if (!zone_spans_pfn(zone, end_pfn))
1012 return 0; 1013 return 0;
1013 1014
1014 return move_freepages(zone, start_page, end_page, migratetype); 1015 return move_freepages(zone, start_page, end_page, migratetype);
1015 } 1016 }
1016 1017
1017 static void change_pageblock_range(struct page *pageblock_page, 1018 static void change_pageblock_range(struct page *pageblock_page,
1018 int start_order, int migratetype) 1019 int start_order, int migratetype)
1019 { 1020 {
1020 int nr_pageblocks = 1 << (start_order - pageblock_order); 1021 int nr_pageblocks = 1 << (start_order - pageblock_order);
1021 1022
1022 while (nr_pageblocks--) { 1023 while (nr_pageblocks--) {
1023 set_pageblock_migratetype(pageblock_page, migratetype); 1024 set_pageblock_migratetype(pageblock_page, migratetype);
1024 pageblock_page += pageblock_nr_pages; 1025 pageblock_page += pageblock_nr_pages;
1025 } 1026 }
1026 } 1027 }
1027 1028
1028 /* 1029 /*
1029 * If breaking a large block of pages, move all free pages to the preferred 1030 * If breaking a large block of pages, move all free pages to the preferred
1030 * allocation list. If falling back for a reclaimable kernel allocation, be 1031 * allocation list. If falling back for a reclaimable kernel allocation, be
1031 * more aggressive about taking ownership of free pages. 1032 * more aggressive about taking ownership of free pages.
1032 * 1033 *
1033 * On the other hand, never change migration type of MIGRATE_CMA pageblocks 1034 * On the other hand, never change migration type of MIGRATE_CMA pageblocks
1034 * nor move CMA pages to different free lists. We don't want unmovable pages 1035 * nor move CMA pages to different free lists. We don't want unmovable pages
1035 * to be allocated from MIGRATE_CMA areas. 1036 * to be allocated from MIGRATE_CMA areas.
1036 * 1037 *
1037 * Returns the new migratetype of the pageblock (or the same old migratetype 1038 * Returns the new migratetype of the pageblock (or the same old migratetype
1038 * if it was unchanged). 1039 * if it was unchanged).
1039 */ 1040 */
1040 static int try_to_steal_freepages(struct zone *zone, struct page *page, 1041 static int try_to_steal_freepages(struct zone *zone, struct page *page,
1041 int start_type, int fallback_type) 1042 int start_type, int fallback_type)
1042 { 1043 {
1043 int current_order = page_order(page); 1044 int current_order = page_order(page);
1044 1045
1045 /* 1046 /*
1046 * When borrowing from MIGRATE_CMA, we need to release the excess 1047 * When borrowing from MIGRATE_CMA, we need to release the excess
1047 * buddy pages to CMA itself. 1048 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1049 * is set to CMA so it is returned to the correct freelist in case
1050 * the page ends up not actually being allocated from the pcp lists.
1048 */ 1051 */
1049 if (is_migrate_cma(fallback_type)) 1052 if (is_migrate_cma(fallback_type))
1050 return fallback_type; 1053 return fallback_type;
1051 1054
1052 /* Take ownership for orders >= pageblock_order */ 1055 /* Take ownership for orders >= pageblock_order */
1053 if (current_order >= pageblock_order) { 1056 if (current_order >= pageblock_order) {
1054 change_pageblock_range(page, current_order, start_type); 1057 change_pageblock_range(page, current_order, start_type);
1055 return start_type; 1058 return start_type;
1056 } 1059 }
1057 1060
1058 if (current_order >= pageblock_order / 2 || 1061 if (current_order >= pageblock_order / 2 ||
1059 start_type == MIGRATE_RECLAIMABLE || 1062 start_type == MIGRATE_RECLAIMABLE ||
1060 page_group_by_mobility_disabled) { 1063 page_group_by_mobility_disabled) {
1061 int pages; 1064 int pages;
1062 1065
1063 pages = move_freepages_block(zone, page, start_type); 1066 pages = move_freepages_block(zone, page, start_type);
1064 1067
1065 /* Claim the whole block if over half of it is free */ 1068 /* Claim the whole block if over half of it is free */
1066 if (pages >= (1 << (pageblock_order-1)) || 1069 if (pages >= (1 << (pageblock_order-1)) ||
1067 page_group_by_mobility_disabled) { 1070 page_group_by_mobility_disabled) {
1068 1071
1069 set_pageblock_migratetype(page, start_type); 1072 set_pageblock_migratetype(page, start_type);
1070 return start_type; 1073 return start_type;
1071 } 1074 }
1072 1075
1073 } 1076 }
1074 1077
1075 return fallback_type; 1078 return fallback_type;
1076 } 1079 }
1077 1080
1078 /* Remove an element from the buddy allocator from the fallback list */ 1081 /* Remove an element from the buddy allocator from the fallback list */
1079 static inline struct page * 1082 static inline struct page *
1080 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 1083 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1081 { 1084 {
1082 struct free_area *area; 1085 struct free_area *area;
1083 int current_order; 1086 int current_order;
1084 struct page *page; 1087 struct page *page;
1085 int migratetype, new_type, i; 1088 int migratetype, new_type, i;
1086 1089
1087 /* Find the largest possible block of pages in the other list */ 1090 /* Find the largest possible block of pages in the other list */
1088 for (current_order = MAX_ORDER-1; current_order >= order; 1091 for (current_order = MAX_ORDER-1; current_order >= order;
1089 --current_order) { 1092 --current_order) {
1090 for (i = 0;; i++) { 1093 for (i = 0;; i++) {
1091 migratetype = fallbacks[start_migratetype][i]; 1094 migratetype = fallbacks[start_migratetype][i];
1092 1095
1093 /* MIGRATE_RESERVE handled later if necessary */ 1096 /* MIGRATE_RESERVE handled later if necessary */
1094 if (migratetype == MIGRATE_RESERVE) 1097 if (migratetype == MIGRATE_RESERVE)
1095 break; 1098 break;
1096 1099
1097 area = &(zone->free_area[current_order]); 1100 area = &(zone->free_area[current_order]);
1098 if (list_empty(&area->free_list[migratetype])) 1101 if (list_empty(&area->free_list[migratetype]))
1099 continue; 1102 continue;
1100 1103
1101 page = list_entry(area->free_list[migratetype].next, 1104 page = list_entry(area->free_list[migratetype].next,
1102 struct page, lru); 1105 struct page, lru);
1103 area->nr_free--; 1106 area->nr_free--;
1104 1107
1105 new_type = try_to_steal_freepages(zone, page, 1108 new_type = try_to_steal_freepages(zone, page,
1106 start_migratetype, 1109 start_migratetype,
1107 migratetype); 1110 migratetype);
1108 1111
1109 /* Remove the page from the freelists */ 1112 /* Remove the page from the freelists */
1110 list_del(&page->lru); 1113 list_del(&page->lru);
1111 rmv_page_order(page); 1114 rmv_page_order(page);
1112 1115
1113 expand(zone, page, order, current_order, area, 1116 expand(zone, page, order, current_order, area,
1114 new_type); 1117 new_type);
1118 /* The freepage_migratetype may differ from pageblock's
1119 * migratetype depending on the decisions in
1120 * try_to_steal_freepages. This is OK as long as it does
1121 * not differ for MIGRATE_CMA type.
1122 */
1123 set_freepage_migratetype(page, new_type);
1115 1124
1116 trace_mm_page_alloc_extfrag(page, order, current_order, 1125 trace_mm_page_alloc_extfrag(page, order, current_order,
1117 start_migratetype, migratetype, new_type); 1126 start_migratetype, migratetype, new_type);
1118 1127
1119 return page; 1128 return page;
1120 } 1129 }
1121 } 1130 }
1122 1131
1123 return NULL; 1132 return NULL;
1124 } 1133 }
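
This set_freepage_migratetype() call is the crux of the change: the page now records the free list it was actually taken from, and that recorded value is what the free path (e.g. free_pcppages_bulk()) consults when the page comes back unused from a pcp list. A deliberately simplified userspace model of that effect, with made-up struct and function names, is sketched below.

/* Simplified model (not kernel code) of why recording the freepage
 * migratetype at rmqueue time matters. */
#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE, MIGRATE_TYPES };

struct model_page {
        int pageblock_mt;       /* migratetype of the surrounding pageblock */
        int freepage_mt;        /* what set_freepage_migratetype() recorded */
};

static int nr_on_list[MIGRATE_TYPES];

/* Mirrors the free path's choice of free list: it looks only at the
 * recorded freepage migratetype, not at pageblock_mt. */
static void free_from_pcp(struct model_page *page)
{
        nr_on_list[page->freepage_mt]++;
}

int main(void)
{
        /* A MIGRATE_RESERVE page was grabbed as a fallback for a MOVABLE
         * request, parked on a pcp list, then freed without being used. */
        struct model_page page = { .pageblock_mt = MIGRATE_RESERVE };

        page.freepage_mt = MIGRATE_MOVABLE;     /* old behaviour: desired type */
        free_from_pcp(&page);                   /* -> misplaced on MOVABLE list */

        page.freepage_mt = MIGRATE_RESERVE;     /* patched behaviour: actual type */
        free_from_pcp(&page);                   /* -> returns to RESERVE list */

        printf("movable list: %d, reserve list: %d\n",
               nr_on_list[MIGRATE_MOVABLE], nr_on_list[MIGRATE_RESERVE]);
        return 0;
}
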
1125 1134
1126 /* 1135 /*
1127 * Do the hard work of removing an element from the buddy allocator. 1136 * Do the hard work of removing an element from the buddy allocator.
1128 * Call me with the zone->lock already held. 1137 * Call me with the zone->lock already held.
1129 */ 1138 */
1130 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1139 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1131 int migratetype) 1140 int migratetype)
1132 { 1141 {
1133 struct page *page; 1142 struct page *page;
1134 1143
1135 retry_reserve: 1144 retry_reserve:
1136 page = __rmqueue_smallest(zone, order, migratetype); 1145 page = __rmqueue_smallest(zone, order, migratetype);
1137 1146
1138 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1147 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1139 page = __rmqueue_fallback(zone, order, migratetype); 1148 page = __rmqueue_fallback(zone, order, migratetype);
1140 1149
1141 /* 1150 /*
1142 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1151 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1143 * is used because __rmqueue_smallest is an inline function 1152 * is used because __rmqueue_smallest is an inline function
1144 * and we want just one call site 1153 * and we want just one call site
1145 */ 1154 */
1146 if (!page) { 1155 if (!page) {
1147 migratetype = MIGRATE_RESERVE; 1156 migratetype = MIGRATE_RESERVE;
1148 goto retry_reserve; 1157 goto retry_reserve;
1149 } 1158 }
1150 } 1159 }
1151 1160
1152 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1161 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1153 return page; 1162 return page;
1154 } 1163 }
1155 1164
1156 /* 1165 /*
1157 * Obtain a specified number of elements from the buddy allocator, all under 1166 * Obtain a specified number of elements from the buddy allocator, all under
1158 * a single hold of the lock, for efficiency. Add them to the supplied list. 1167 * a single hold of the lock, for efficiency. Add them to the supplied list.
1159 * Returns the number of new pages which were placed at *list. 1168 * Returns the number of new pages which were placed at *list.
1160 */ 1169 */
1161 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1170 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1162 unsigned long count, struct list_head *list, 1171 unsigned long count, struct list_head *list,
1163 int migratetype, int cold) 1172 int migratetype, int cold)
1164 { 1173 {
1165 int mt = migratetype, i; 1174 int i;
1166 1175
1167 spin_lock(&zone->lock); 1176 spin_lock(&zone->lock);
1168 for (i = 0; i < count; ++i) { 1177 for (i = 0; i < count; ++i) {
1169 struct page *page = __rmqueue(zone, order, migratetype); 1178 struct page *page = __rmqueue(zone, order, migratetype);
1170 if (unlikely(page == NULL)) 1179 if (unlikely(page == NULL))
1171 break; 1180 break;
1172 1181
1173 /* 1182 /*
1174 * Split buddy pages returned by expand() are received here 1183 * Split buddy pages returned by expand() are received here
1175 * in physical page order. The page is added to the caller's 1184 * in physical page order. The page is added to the caller's
1176 * list and the list head then moves forward. From the caller's 1185 * list and the list head then moves forward. From the caller's
1177 * perspective, the linked list is ordered by page number in 1186 * perspective, the linked list is ordered by page number in
1178 * some conditions. This is useful for IO devices that can 1187 * some conditions. This is useful for IO devices that can
1179 * merge IO requests if the physical pages are ordered 1188 * merge IO requests if the physical pages are ordered
1180 * properly. 1189 * properly.
1181 */ 1190 */
1182 if (likely(cold == 0)) 1191 if (likely(cold == 0))
1183 list_add(&page->lru, list); 1192 list_add(&page->lru, list);
1184 else 1193 else
1185 list_add_tail(&page->lru, list); 1194 list_add_tail(&page->lru, list);
1186 if (IS_ENABLED(CONFIG_CMA)) {
1187 mt = get_pageblock_migratetype(page);
1188 if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
1189 mt = migratetype;
1190 }
1191 set_freepage_migratetype(page, mt);
1192 list = &page->lru; 1195 list = &page->lru;
1193 if (is_migrate_cma(mt)) 1196 if (is_migrate_cma(get_freepage_migratetype(page)))
1194 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1197 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1195 -(1 << order)); 1198 -(1 << order));
1196 } 1199 }
1197 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1200 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1198 spin_unlock(&zone->lock); 1201 spin_unlock(&zone->lock);
1199 return i; 1202 return i;
1200 } 1203 }
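
The hot/cold placement in the loop above pairs with buffered_rmqueue() further down, which takes list->next for ordinary requests and list->prev for __GFP_COLD ones. A small userspace sketch (hand-rolled list_head, illustrative names, no locking) of that head/tail behaviour:

/* Hot pages go to the front of the pcp list and are handed out first;
 * cold pages go to the tail and are consumed last. */
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *new, struct list_head *head)
{                                       /* insert right after head (hot) */
        new->next = head->next;
        new->prev = head;
        head->next->prev = new;
        head->next = new;
}

static void list_add_tail(struct list_head *new, struct list_head *head)
{                                       /* insert right before head (cold) */
        new->prev = head->prev;
        new->next = head;
        head->prev->next = new;
        head->prev = new;
}

struct model_page { struct list_head lru; const char *name; };

int main(void)
{
        struct list_head pcp;
        struct model_page hot = { .name = "hot" }, cold = { .name = "cold" };

        list_init(&pcp);
        list_add(&hot.lru, &pcp);       /* cold == 0 path in rmqueue_bulk() */
        list_add_tail(&cold.lru, &pcp); /* cold == 1 path */

        /* lru is the first member, so the cast stands in for container_of. */
        struct model_page *first = (struct model_page *)pcp.next;
        struct model_page *last  = (struct model_page *)pcp.prev;

        printf("head of pcp list: %s, tail: %s\n", first->name, last->name);
        return 0;
}
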
1201 1204
1202 #ifdef CONFIG_NUMA 1205 #ifdef CONFIG_NUMA
1203 /* 1206 /*
1204 * Called from the vmstat counter updater to drain pagesets of this 1207 * Called from the vmstat counter updater to drain pagesets of this
1205 * currently executing processor on remote nodes after they have 1208 * currently executing processor on remote nodes after they have
1206 * expired. 1209 * expired.
1207 * 1210 *
1208 * Note that this function must be called with the thread pinned to 1211 * Note that this function must be called with the thread pinned to
1209 * a single processor. 1212 * a single processor.
1210 */ 1213 */
1211 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1214 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1212 { 1215 {
1213 unsigned long flags; 1216 unsigned long flags;
1214 int to_drain; 1217 int to_drain;
1215 unsigned long batch; 1218 unsigned long batch;
1216 1219
1217 local_irq_save(flags); 1220 local_irq_save(flags);
1218 batch = ACCESS_ONCE(pcp->batch); 1221 batch = ACCESS_ONCE(pcp->batch);
1219 if (pcp->count >= batch) 1222 if (pcp->count >= batch)
1220 to_drain = batch; 1223 to_drain = batch;
1221 else 1224 else
1222 to_drain = pcp->count; 1225 to_drain = pcp->count;
1223 if (to_drain > 0) { 1226 if (to_drain > 0) {
1224 free_pcppages_bulk(zone, to_drain, pcp); 1227 free_pcppages_bulk(zone, to_drain, pcp);
1225 pcp->count -= to_drain; 1228 pcp->count -= to_drain;
1226 } 1229 }
1227 local_irq_restore(flags); 1230 local_irq_restore(flags);
1228 } 1231 }
1229 #endif 1232 #endif
1230 1233
1231 /* 1234 /*
1232 * Drain pages of the indicated processor. 1235 * Drain pages of the indicated processor.
1233 * 1236 *
1234 * The processor must either be the current processor and the 1237 * The processor must either be the current processor and the
1235 * thread pinned to the current processor or a processor that 1238 * thread pinned to the current processor or a processor that
1236 * is not online. 1239 * is not online.
1237 */ 1240 */
1238 static void drain_pages(unsigned int cpu) 1241 static void drain_pages(unsigned int cpu)
1239 { 1242 {
1240 unsigned long flags; 1243 unsigned long flags;
1241 struct zone *zone; 1244 struct zone *zone;
1242 1245
1243 for_each_populated_zone(zone) { 1246 for_each_populated_zone(zone) {
1244 struct per_cpu_pageset *pset; 1247 struct per_cpu_pageset *pset;
1245 struct per_cpu_pages *pcp; 1248 struct per_cpu_pages *pcp;
1246 1249
1247 local_irq_save(flags); 1250 local_irq_save(flags);
1248 pset = per_cpu_ptr(zone->pageset, cpu); 1251 pset = per_cpu_ptr(zone->pageset, cpu);
1249 1252
1250 pcp = &pset->pcp; 1253 pcp = &pset->pcp;
1251 if (pcp->count) { 1254 if (pcp->count) {
1252 free_pcppages_bulk(zone, pcp->count, pcp); 1255 free_pcppages_bulk(zone, pcp->count, pcp);
1253 pcp->count = 0; 1256 pcp->count = 0;
1254 } 1257 }
1255 local_irq_restore(flags); 1258 local_irq_restore(flags);
1256 } 1259 }
1257 } 1260 }
1258 1261
1259 /* 1262 /*
1260 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1263 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1261 */ 1264 */
1262 void drain_local_pages(void *arg) 1265 void drain_local_pages(void *arg)
1263 { 1266 {
1264 drain_pages(smp_processor_id()); 1267 drain_pages(smp_processor_id());
1265 } 1268 }
1266 1269
1267 /* 1270 /*
1268 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1271 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1269 * 1272 *
1270 * Note that this code is protected against sending an IPI to an offline 1273 * Note that this code is protected against sending an IPI to an offline
1271 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1274 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1272 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1275 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1273 * nothing keeps CPUs from showing up after we populated the cpumask and 1276 * nothing keeps CPUs from showing up after we populated the cpumask and
1274 * before the call to on_each_cpu_mask(). 1277 * before the call to on_each_cpu_mask().
1275 */ 1278 */
1276 void drain_all_pages(void) 1279 void drain_all_pages(void)
1277 { 1280 {
1278 int cpu; 1281 int cpu;
1279 struct per_cpu_pageset *pcp; 1282 struct per_cpu_pageset *pcp;
1280 struct zone *zone; 1283 struct zone *zone;
1281 1284
1282 /* 1285 /*
1283 * Allocate in the BSS so we won't require allocation in 1286 * Allocate in the BSS so we won't require allocation in
1284 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1287 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1285 */ 1288 */
1286 static cpumask_t cpus_with_pcps; 1289 static cpumask_t cpus_with_pcps;
1287 1290
1288 /* 1291 /*
1289 * We don't care about racing with CPU hotplug event 1292 * We don't care about racing with CPU hotplug event
1290 * as offline notification will cause the notified 1293 * as offline notification will cause the notified
1291 * cpu to drain that CPU pcps and on_each_cpu_mask 1294 * cpu to drain that CPU pcps and on_each_cpu_mask
1292 * disables preemption as part of its processing 1295 * disables preemption as part of its processing
1293 */ 1296 */
1294 for_each_online_cpu(cpu) { 1297 for_each_online_cpu(cpu) {
1295 bool has_pcps = false; 1298 bool has_pcps = false;
1296 for_each_populated_zone(zone) { 1299 for_each_populated_zone(zone) {
1297 pcp = per_cpu_ptr(zone->pageset, cpu); 1300 pcp = per_cpu_ptr(zone->pageset, cpu);
1298 if (pcp->pcp.count) { 1301 if (pcp->pcp.count) {
1299 has_pcps = true; 1302 has_pcps = true;
1300 break; 1303 break;
1301 } 1304 }
1302 } 1305 }
1303 if (has_pcps) 1306 if (has_pcps)
1304 cpumask_set_cpu(cpu, &cpus_with_pcps); 1307 cpumask_set_cpu(cpu, &cpus_with_pcps);
1305 else 1308 else
1306 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1309 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1307 } 1310 }
1308 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1311 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1309 } 1312 }
1310 1313
1311 #ifdef CONFIG_HIBERNATION 1314 #ifdef CONFIG_HIBERNATION
1312 1315
1313 void mark_free_pages(struct zone *zone) 1316 void mark_free_pages(struct zone *zone)
1314 { 1317 {
1315 unsigned long pfn, max_zone_pfn; 1318 unsigned long pfn, max_zone_pfn;
1316 unsigned long flags; 1319 unsigned long flags;
1317 int order, t; 1320 int order, t;
1318 struct list_head *curr; 1321 struct list_head *curr;
1319 1322
1320 if (zone_is_empty(zone)) 1323 if (zone_is_empty(zone))
1321 return; 1324 return;
1322 1325
1323 spin_lock_irqsave(&zone->lock, flags); 1326 spin_lock_irqsave(&zone->lock, flags);
1324 1327
1325 max_zone_pfn = zone_end_pfn(zone); 1328 max_zone_pfn = zone_end_pfn(zone);
1326 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1329 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1327 if (pfn_valid(pfn)) { 1330 if (pfn_valid(pfn)) {
1328 struct page *page = pfn_to_page(pfn); 1331 struct page *page = pfn_to_page(pfn);
1329 1332
1330 if (!swsusp_page_is_forbidden(page)) 1333 if (!swsusp_page_is_forbidden(page))
1331 swsusp_unset_page_free(page); 1334 swsusp_unset_page_free(page);
1332 } 1335 }
1333 1336
1334 for_each_migratetype_order(order, t) { 1337 for_each_migratetype_order(order, t) {
1335 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1338 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1336 unsigned long i; 1339 unsigned long i;
1337 1340
1338 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1341 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1339 for (i = 0; i < (1UL << order); i++) 1342 for (i = 0; i < (1UL << order); i++)
1340 swsusp_set_page_free(pfn_to_page(pfn + i)); 1343 swsusp_set_page_free(pfn_to_page(pfn + i));
1341 } 1344 }
1342 } 1345 }
1343 spin_unlock_irqrestore(&zone->lock, flags); 1346 spin_unlock_irqrestore(&zone->lock, flags);
1344 } 1347 }
1345 #endif /* CONFIG_PM */ 1348 #endif /* CONFIG_PM */
1346 1349
1347 /* 1350 /*
1348 * Free a 0-order page 1351 * Free a 0-order page
1349 * cold == 1 ? free a cold page : free a hot page 1352 * cold == 1 ? free a cold page : free a hot page
1350 */ 1353 */
1351 void free_hot_cold_page(struct page *page, int cold) 1354 void free_hot_cold_page(struct page *page, int cold)
1352 { 1355 {
1353 struct zone *zone = page_zone(page); 1356 struct zone *zone = page_zone(page);
1354 struct per_cpu_pages *pcp; 1357 struct per_cpu_pages *pcp;
1355 unsigned long flags; 1358 unsigned long flags;
1356 int migratetype; 1359 int migratetype;
1357 1360
1358 if (!free_pages_prepare(page, 0)) 1361 if (!free_pages_prepare(page, 0))
1359 return; 1362 return;
1360 1363
1361 migratetype = get_pageblock_migratetype(page); 1364 migratetype = get_pageblock_migratetype(page);
1362 set_freepage_migratetype(page, migratetype); 1365 set_freepage_migratetype(page, migratetype);
1363 local_irq_save(flags); 1366 local_irq_save(flags);
1364 __count_vm_event(PGFREE); 1367 __count_vm_event(PGFREE);
1365 1368
1366 /* 1369 /*
1367 * We only track unmovable, reclaimable and movable on pcp lists. 1370 * We only track unmovable, reclaimable and movable on pcp lists.
1368 * Free ISOLATE pages back to the allocator because they are being 1371 * Free ISOLATE pages back to the allocator because they are being
1369 * offlined, but treat RESERVE as movable pages so we can get those 1372 * offlined, but treat RESERVE as movable pages so we can get those
1370 * areas back if necessary. Otherwise, we may have to free 1373 * areas back if necessary. Otherwise, we may have to free
1371 * excessively into the page allocator 1374 * excessively into the page allocator
1372 */ 1375 */
1373 if (migratetype >= MIGRATE_PCPTYPES) { 1376 if (migratetype >= MIGRATE_PCPTYPES) {
1374 if (unlikely(is_migrate_isolate(migratetype))) { 1377 if (unlikely(is_migrate_isolate(migratetype))) {
1375 free_one_page(zone, page, 0, migratetype); 1378 free_one_page(zone, page, 0, migratetype);
1376 goto out; 1379 goto out;
1377 } 1380 }
1378 migratetype = MIGRATE_MOVABLE; 1381 migratetype = MIGRATE_MOVABLE;
1379 } 1382 }
1380 1383
1381 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1384 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1382 if (cold) 1385 if (cold)
1383 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1386 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1384 else 1387 else
1385 list_add(&page->lru, &pcp->lists[migratetype]); 1388 list_add(&page->lru, &pcp->lists[migratetype]);
1386 pcp->count++; 1389 pcp->count++;
1387 if (pcp->count >= pcp->high) { 1390 if (pcp->count >= pcp->high) {
1388 unsigned long batch = ACCESS_ONCE(pcp->batch); 1391 unsigned long batch = ACCESS_ONCE(pcp->batch);
1389 free_pcppages_bulk(zone, batch, pcp); 1392 free_pcppages_bulk(zone, batch, pcp);
1390 pcp->count -= batch; 1393 pcp->count -= batch;
1391 } 1394 }
1392 1395
1393 out: 1396 out:
1394 local_irq_restore(flags); 1397 local_irq_restore(flags);
1395 } 1398 }
1396 1399
1397 /* 1400 /*
1398 * Free a list of 0-order pages 1401 * Free a list of 0-order pages
1399 */ 1402 */
1400 void free_hot_cold_page_list(struct list_head *list, int cold) 1403 void free_hot_cold_page_list(struct list_head *list, int cold)
1401 { 1404 {
1402 struct page *page, *next; 1405 struct page *page, *next;
1403 1406
1404 list_for_each_entry_safe(page, next, list, lru) { 1407 list_for_each_entry_safe(page, next, list, lru) {
1405 trace_mm_page_free_batched(page, cold); 1408 trace_mm_page_free_batched(page, cold);
1406 free_hot_cold_page(page, cold); 1409 free_hot_cold_page(page, cold);
1407 } 1410 }
1408 } 1411 }
1409 1412
1410 /* 1413 /*
1411 * split_page takes a non-compound higher-order page, and splits it into 1414 * split_page takes a non-compound higher-order page, and splits it into
1412 * n (1<<order) sub-pages: page[0..n-1] 1415 * n (1<<order) sub-pages: page[0..n-1]
1413 * Each sub-page must be freed individually. 1416 * Each sub-page must be freed individually.
1414 * 1417 *
1415 * Note: this is probably too low level an operation for use in drivers. 1418 * Note: this is probably too low level an operation for use in drivers.
1416 * Please consult with lkml before using this in your driver. 1419 * Please consult with lkml before using this in your driver.
1417 */ 1420 */
1418 void split_page(struct page *page, unsigned int order) 1421 void split_page(struct page *page, unsigned int order)
1419 { 1422 {
1420 int i; 1423 int i;
1421 1424
1422 VM_BUG_ON(PageCompound(page)); 1425 VM_BUG_ON(PageCompound(page));
1423 VM_BUG_ON(!page_count(page)); 1426 VM_BUG_ON(!page_count(page));
1424 1427
1425 #ifdef CONFIG_KMEMCHECK 1428 #ifdef CONFIG_KMEMCHECK
1426 /* 1429 /*
1427 * Split shadow pages too, because free(page[0]) would 1430 * Split shadow pages too, because free(page[0]) would
1428 * otherwise free the whole shadow. 1431 * otherwise free the whole shadow.
1429 */ 1432 */
1430 if (kmemcheck_page_is_tracked(page)) 1433 if (kmemcheck_page_is_tracked(page))
1431 split_page(virt_to_page(page[0].shadow), order); 1434 split_page(virt_to_page(page[0].shadow), order);
1432 #endif 1435 #endif
1433 1436
1434 for (i = 1; i < (1 << order); i++) 1437 for (i = 1; i < (1 << order); i++)
1435 set_page_refcounted(page + i); 1438 set_page_refcounted(page + i);
1436 } 1439 }
1437 EXPORT_SYMBOL_GPL(split_page); 1440 EXPORT_SYMBOL_GPL(split_page);
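
A hedged usage sketch of split_page() (hypothetical in-kernel caller, minimal error handling), showing the "each sub-page must be freed individually" rule from the comment above:

/* Illustrative only: allocate a higher-order block, split it, then free
 * the resulting order-0 pages one by one. */
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static int example_split_and_free(unsigned int order)
{
        struct page *page = alloc_pages(GFP_KERNEL, order);
        unsigned int i;

        if (!page)
                return -ENOMEM;

        split_page(page, order);        /* page[0 .. (1 << order) - 1] are now
                                         * independent order-0 pages */
        for (i = 0; i < (1U << order); i++)
                __free_page(page + i);  /* each must be freed individually */

        return 0;
}
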
1438 1441
1439 static int __isolate_free_page(struct page *page, unsigned int order) 1442 static int __isolate_free_page(struct page *page, unsigned int order)
1440 { 1443 {
1441 unsigned long watermark; 1444 unsigned long watermark;
1442 struct zone *zone; 1445 struct zone *zone;
1443 int mt; 1446 int mt;
1444 1447
1445 BUG_ON(!PageBuddy(page)); 1448 BUG_ON(!PageBuddy(page));
1446 1449
1447 zone = page_zone(page); 1450 zone = page_zone(page);
1448 mt = get_pageblock_migratetype(page); 1451 mt = get_pageblock_migratetype(page);
1449 1452
1450 if (!is_migrate_isolate(mt)) { 1453 if (!is_migrate_isolate(mt)) {
1451 /* Obey watermarks as if the page was being allocated */ 1454 /* Obey watermarks as if the page was being allocated */
1452 watermark = low_wmark_pages(zone) + (1 << order); 1455 watermark = low_wmark_pages(zone) + (1 << order);
1453 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1456 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1454 return 0; 1457 return 0;
1455 1458
1456 __mod_zone_freepage_state(zone, -(1UL << order), mt); 1459 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1457 } 1460 }
1458 1461
1459 /* Remove page from free list */ 1462 /* Remove page from free list */
1460 list_del(&page->lru); 1463 list_del(&page->lru);
1461 zone->free_area[order].nr_free--; 1464 zone->free_area[order].nr_free--;
1462 rmv_page_order(page); 1465 rmv_page_order(page);
1463 1466
1464 /* Set the pageblock if the isolated page is at least a pageblock */ 1467 /* Set the pageblock if the isolated page is at least a pageblock */
1465 if (order >= pageblock_order - 1) { 1468 if (order >= pageblock_order - 1) {
1466 struct page *endpage = page + (1 << order) - 1; 1469 struct page *endpage = page + (1 << order) - 1;
1467 for (; page < endpage; page += pageblock_nr_pages) { 1470 for (; page < endpage; page += pageblock_nr_pages) {
1468 int mt = get_pageblock_migratetype(page); 1471 int mt = get_pageblock_migratetype(page);
1469 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) 1472 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
1470 set_pageblock_migratetype(page, 1473 set_pageblock_migratetype(page,
1471 MIGRATE_MOVABLE); 1474 MIGRATE_MOVABLE);
1472 } 1475 }
1473 } 1476 }
1474 1477
1475 return 1UL << order; 1478 return 1UL << order;
1476 } 1479 }
1477 1480
1478 /* 1481 /*
1479 * Similar to split_page except the page is already free. As this is only 1482 * Similar to split_page except the page is already free. As this is only
1480 * being used for migration, the migratetype of the block also changes. 1483 * being used for migration, the migratetype of the block also changes.
1481 * As this is called with interrupts disabled, the caller is responsible 1484 * As this is called with interrupts disabled, the caller is responsible
1482 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1485 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1483 * are enabled. 1486 * are enabled.
1484 * 1487 *
1485 * Note: this is probably too low level an operation for use in drivers. 1488 * Note: this is probably too low level an operation for use in drivers.
1486 * Please consult with lkml before using this in your driver. 1489 * Please consult with lkml before using this in your driver.
1487 */ 1490 */
1488 int split_free_page(struct page *page) 1491 int split_free_page(struct page *page)
1489 { 1492 {
1490 unsigned int order; 1493 unsigned int order;
1491 int nr_pages; 1494 int nr_pages;
1492 1495
1493 order = page_order(page); 1496 order = page_order(page);
1494 1497
1495 nr_pages = __isolate_free_page(page, order); 1498 nr_pages = __isolate_free_page(page, order);
1496 if (!nr_pages) 1499 if (!nr_pages)
1497 return 0; 1500 return 0;
1498 1501
1499 /* Split into individual pages */ 1502 /* Split into individual pages */
1500 set_page_refcounted(page); 1503 set_page_refcounted(page);
1501 split_page(page, order); 1504 split_page(page, order);
1502 return nr_pages; 1505 return nr_pages;
1503 } 1506 }
1504 1507
1505 /* 1508 /*
1506 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1509 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1507 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1510 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1508 * or two. 1511 * or two.
1509 */ 1512 */
1510 static inline 1513 static inline
1511 struct page *buffered_rmqueue(struct zone *preferred_zone, 1514 struct page *buffered_rmqueue(struct zone *preferred_zone,
1512 struct zone *zone, int order, gfp_t gfp_flags, 1515 struct zone *zone, int order, gfp_t gfp_flags,
1513 int migratetype) 1516 int migratetype)
1514 { 1517 {
1515 unsigned long flags; 1518 unsigned long flags;
1516 struct page *page; 1519 struct page *page;
1517 int cold = !!(gfp_flags & __GFP_COLD); 1520 int cold = !!(gfp_flags & __GFP_COLD);
1518 1521
1519 again: 1522 again:
1520 if (likely(order == 0)) { 1523 if (likely(order == 0)) {
1521 struct per_cpu_pages *pcp; 1524 struct per_cpu_pages *pcp;
1522 struct list_head *list; 1525 struct list_head *list;
1523 1526
1524 local_irq_save(flags); 1527 local_irq_save(flags);
1525 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1528 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1526 list = &pcp->lists[migratetype]; 1529 list = &pcp->lists[migratetype];
1527 if (list_empty(list)) { 1530 if (list_empty(list)) {
1528 pcp->count += rmqueue_bulk(zone, 0, 1531 pcp->count += rmqueue_bulk(zone, 0,
1529 pcp->batch, list, 1532 pcp->batch, list,
1530 migratetype, cold); 1533 migratetype, cold);
1531 if (unlikely(list_empty(list))) 1534 if (unlikely(list_empty(list)))
1532 goto failed; 1535 goto failed;
1533 } 1536 }
1534 1537
1535 if (cold) 1538 if (cold)
1536 page = list_entry(list->prev, struct page, lru); 1539 page = list_entry(list->prev, struct page, lru);
1537 else 1540 else
1538 page = list_entry(list->next, struct page, lru); 1541 page = list_entry(list->next, struct page, lru);
1539 1542
1540 list_del(&page->lru); 1543 list_del(&page->lru);
1541 pcp->count--; 1544 pcp->count--;
1542 } else { 1545 } else {
1543 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1546 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1544 /* 1547 /*
1545 * __GFP_NOFAIL is not to be used in new code. 1548 * __GFP_NOFAIL is not to be used in new code.
1546 * 1549 *
1547 * All __GFP_NOFAIL callers should be fixed so that they 1550 * All __GFP_NOFAIL callers should be fixed so that they
1548 * properly detect and handle allocation failures. 1551 * properly detect and handle allocation failures.
1549 * 1552 *
1550 * We most definitely don't want callers attempting to 1553 * We most definitely don't want callers attempting to
1551 * allocate greater than order-1 page units with 1554 * allocate greater than order-1 page units with
1552 * __GFP_NOFAIL. 1555 * __GFP_NOFAIL.
1553 */ 1556 */
1554 WARN_ON_ONCE(order > 1); 1557 WARN_ON_ONCE(order > 1);
1555 } 1558 }
1556 spin_lock_irqsave(&zone->lock, flags); 1559 spin_lock_irqsave(&zone->lock, flags);
1557 page = __rmqueue(zone, order, migratetype); 1560 page = __rmqueue(zone, order, migratetype);
1558 spin_unlock(&zone->lock); 1561 spin_unlock(&zone->lock);
1559 if (!page) 1562 if (!page)
1560 goto failed; 1563 goto failed;
1561 __mod_zone_freepage_state(zone, -(1 << order), 1564 __mod_zone_freepage_state(zone, -(1 << order),
1562 get_pageblock_migratetype(page)); 1565 get_freepage_migratetype(page));
1563 } 1566 }
1564 1567
1565 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1568 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1566 1569
1567 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1570 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1568 zone_statistics(preferred_zone, zone, gfp_flags); 1571 zone_statistics(preferred_zone, zone, gfp_flags);
1569 local_irq_restore(flags); 1572 local_irq_restore(flags);
1570 1573
1571 VM_BUG_ON(bad_range(zone, page)); 1574 VM_BUG_ON(bad_range(zone, page));
1572 if (prep_new_page(page, order, gfp_flags)) 1575 if (prep_new_page(page, order, gfp_flags))
1573 goto again; 1576 goto again;
1574 return page; 1577 return page;
1575 1578
1576 failed: 1579 failed:
1577 local_irq_restore(flags); 1580 local_irq_restore(flags);
1578 return NULL; 1581 return NULL;
1579 } 1582 }
1580 1583
1581 #ifdef CONFIG_FAIL_PAGE_ALLOC 1584 #ifdef CONFIG_FAIL_PAGE_ALLOC
1582 1585
1583 static struct { 1586 static struct {
1584 struct fault_attr attr; 1587 struct fault_attr attr;
1585 1588
1586 u32 ignore_gfp_highmem; 1589 u32 ignore_gfp_highmem;
1587 u32 ignore_gfp_wait; 1590 u32 ignore_gfp_wait;
1588 u32 min_order; 1591 u32 min_order;
1589 } fail_page_alloc = { 1592 } fail_page_alloc = {
1590 .attr = FAULT_ATTR_INITIALIZER, 1593 .attr = FAULT_ATTR_INITIALIZER,
1591 .ignore_gfp_wait = 1, 1594 .ignore_gfp_wait = 1,
1592 .ignore_gfp_highmem = 1, 1595 .ignore_gfp_highmem = 1,
1593 .min_order = 1, 1596 .min_order = 1,
1594 }; 1597 };
1595 1598
1596 static int __init setup_fail_page_alloc(char *str) 1599 static int __init setup_fail_page_alloc(char *str)
1597 { 1600 {
1598 return setup_fault_attr(&fail_page_alloc.attr, str); 1601 return setup_fault_attr(&fail_page_alloc.attr, str);
1599 } 1602 }
1600 __setup("fail_page_alloc=", setup_fail_page_alloc); 1603 __setup("fail_page_alloc=", setup_fail_page_alloc);
1601 1604
1602 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1605 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1603 { 1606 {
1604 if (order < fail_page_alloc.min_order) 1607 if (order < fail_page_alloc.min_order)
1605 return false; 1608 return false;
1606 if (gfp_mask & __GFP_NOFAIL) 1609 if (gfp_mask & __GFP_NOFAIL)
1607 return false; 1610 return false;
1608 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1611 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1609 return false; 1612 return false;
1610 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1613 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1611 return false; 1614 return false;
1612 1615
1613 return should_fail(&fail_page_alloc.attr, 1 << order); 1616 return should_fail(&fail_page_alloc.attr, 1 << order);
1614 } 1617 }
1615 1618
1616 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1619 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1617 1620
1618 static int __init fail_page_alloc_debugfs(void) 1621 static int __init fail_page_alloc_debugfs(void)
1619 { 1622 {
1620 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1623 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1621 struct dentry *dir; 1624 struct dentry *dir;
1622 1625
1623 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1626 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1624 &fail_page_alloc.attr); 1627 &fail_page_alloc.attr);
1625 if (IS_ERR(dir)) 1628 if (IS_ERR(dir))
1626 return PTR_ERR(dir); 1629 return PTR_ERR(dir);
1627 1630
1628 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1631 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1629 &fail_page_alloc.ignore_gfp_wait)) 1632 &fail_page_alloc.ignore_gfp_wait))
1630 goto fail; 1633 goto fail;
1631 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1634 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1632 &fail_page_alloc.ignore_gfp_highmem)) 1635 &fail_page_alloc.ignore_gfp_highmem))
1633 goto fail; 1636 goto fail;
1634 if (!debugfs_create_u32("min-order", mode, dir, 1637 if (!debugfs_create_u32("min-order", mode, dir,
1635 &fail_page_alloc.min_order)) 1638 &fail_page_alloc.min_order))
1636 goto fail; 1639 goto fail;
1637 1640
1638 return 0; 1641 return 0;
1639 fail: 1642 fail:
1640 debugfs_remove_recursive(dir); 1643 debugfs_remove_recursive(dir);
1641 1644
1642 return -ENOMEM; 1645 return -ENOMEM;
1643 } 1646 }
1644 1647
1645 late_initcall(fail_page_alloc_debugfs); 1648 late_initcall(fail_page_alloc_debugfs);
1646 1649
1647 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1650 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1648 1651
1649 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1652 #else /* CONFIG_FAIL_PAGE_ALLOC */
1650 1653
1651 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1654 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1652 { 1655 {
1653 return false; 1656 return false;
1654 } 1657 }
1655 1658
1656 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1659 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1657 1660
1658 /* 1661 /*
1659 * Return true if free pages are above 'mark'. This takes into account the order 1662 * Return true if free pages are above 'mark'. This takes into account the order
1660 * of the allocation. 1663 * of the allocation.
1661 */ 1664 */
1662 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1665 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1663 int classzone_idx, int alloc_flags, long free_pages) 1666 int classzone_idx, int alloc_flags, long free_pages)
1664 { 1667 {
1665 /* free_pages may go negative - that's OK */ 1668 /* free_pages may go negative - that's OK */
1666 long min = mark; 1669 long min = mark;
1667 long lowmem_reserve = z->lowmem_reserve[classzone_idx]; 1670 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1668 int o; 1671 int o;
1669 long free_cma = 0; 1672 long free_cma = 0;
1670 1673
1671 free_pages -= (1 << order) - 1; 1674 free_pages -= (1 << order) - 1;
1672 if (alloc_flags & ALLOC_HIGH) 1675 if (alloc_flags & ALLOC_HIGH)
1673 min -= min / 2; 1676 min -= min / 2;
1674 if (alloc_flags & ALLOC_HARDER) 1677 if (alloc_flags & ALLOC_HARDER)
1675 min -= min / 4; 1678 min -= min / 4;
1676 #ifdef CONFIG_CMA 1679 #ifdef CONFIG_CMA
1677 /* If allocation can't use CMA areas don't use free CMA pages */ 1680 /* If allocation can't use CMA areas don't use free CMA pages */
1678 if (!(alloc_flags & ALLOC_CMA)) 1681 if (!(alloc_flags & ALLOC_CMA))
1679 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); 1682 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
1680 #endif 1683 #endif
1681 1684
1682 if (free_pages - free_cma <= min + lowmem_reserve) 1685 if (free_pages - free_cma <= min + lowmem_reserve)
1683 return false; 1686 return false;
1684 for (o = 0; o < order; o++) { 1687 for (o = 0; o < order; o++) {
1685 /* At the next order, this order's pages become unavailable */ 1688 /* At the next order, this order's pages become unavailable */
1686 free_pages -= z->free_area[o].nr_free << o; 1689 free_pages -= z->free_area[o].nr_free << o;
1687 1690
1688 /* Require fewer higher order pages to be free */ 1691 /* Require fewer higher order pages to be free */
1689 min >>= 1; 1692 min >>= 1;
1690 1693
1691 if (free_pages <= min) 1694 if (free_pages <= min)
1692 return false; 1695 return false;
1693 } 1696 }
1694 return true; 1697 return true;
1695 } 1698 }
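
A standalone numeric walk-through of the same watermark arithmetic may help (made-up numbers; the ALLOC_HIGH/ALLOC_HARDER adjustments and the CMA correction are omitted, and lowmem_reserve is passed in directly):

/* Simplified model of __zone_watermark_ok(): an order-0 headroom check
 * first, then per-order checks with the required reserve halved each step. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

static bool zone_watermark_ok_model(int order, long mark, long lowmem_reserve,
                                    long free_pages,
                                    const long nr_free[MAX_ORDER])
{
        long min = mark;

        free_pages -= (1L << order) - 1;
        if (free_pages <= min + lowmem_reserve)
                return false;

        for (int o = 0; o < order; o++) {
                free_pages -= nr_free[o] << o;  /* lower orders don't help */
                min >>= 1;                      /* demand less of higher orders */
                if (free_pages <= min)
                        return false;
        }
        return true;
}

int main(void)
{
        long nr_free[MAX_ORDER] = { [0] = 300, [1] = 40, [2] = 10, [3] = 2 };
        long free_pages = 300 + 40 * 2 + 10 * 4 + 2 * 8;   /* 436 free pages */

        printf("order-0 ok: %d\n",
               zone_watermark_ok_model(0, 128, 0, free_pages, nr_free));
        printf("order-3 ok: %d\n",
               zone_watermark_ok_model(3, 128, 0, free_pages, nr_free));
        return 0;
}

With these numbers the zone easily passes an order-0 check but fails at order 3, because almost all of its free memory sits in order-0 and order-1 pages.
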
1696 1699
1697 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1700 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1698 int classzone_idx, int alloc_flags) 1701 int classzone_idx, int alloc_flags)
1699 { 1702 {
1700 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1703 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1701 zone_page_state(z, NR_FREE_PAGES)); 1704 zone_page_state(z, NR_FREE_PAGES));
1702 } 1705 }
1703 1706
1704 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 1707 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1705 int classzone_idx, int alloc_flags) 1708 int classzone_idx, int alloc_flags)
1706 { 1709 {
1707 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1710 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1708 1711
1709 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1712 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1710 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1713 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1711 1714
1712 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1715 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1713 free_pages); 1716 free_pages);
1714 } 1717 }
1715 1718
1716 #ifdef CONFIG_NUMA 1719 #ifdef CONFIG_NUMA
1717 /* 1720 /*
1718 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1721 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1719 * skip over zones that are not allowed by the cpuset, or that have 1722 * skip over zones that are not allowed by the cpuset, or that have
1720 * been recently (in the last second) found to be nearly full. See further 1723 * been recently (in the last second) found to be nearly full. See further
1721 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1724 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1722 * that have to skip over a lot of full or unallowed zones. 1725 * that have to skip over a lot of full or unallowed zones.
1723 * 1726 *
1724 * If the zonelist cache is present in the passed in zonelist, then 1727 * If the zonelist cache is present in the passed in zonelist, then
1725 * returns a pointer to the allowed node mask (either the current 1728 * returns a pointer to the allowed node mask (either the current
1726 * tasks mems_allowed, or node_states[N_MEMORY].) 1729 * tasks mems_allowed, or node_states[N_MEMORY].)
1727 * 1730 *
1728 * If the zonelist cache is not available for this zonelist, does 1731 * If the zonelist cache is not available for this zonelist, does
1729 * nothing and returns NULL. 1732 * nothing and returns NULL.
1730 * 1733 *
1731 * If the fullzones BITMAP in the zonelist cache is stale (more than 1734 * If the fullzones BITMAP in the zonelist cache is stale (more than
1732 * a second since last zap'd) then we zap it out (clear its bits.) 1735 * a second since last zap'd) then we zap it out (clear its bits.)
1733 * 1736 *
1734 * We hold off even calling zlc_setup, until after we've checked the 1737 * We hold off even calling zlc_setup, until after we've checked the
1735 * first zone in the zonelist, on the theory that most allocations will 1738 * first zone in the zonelist, on the theory that most allocations will
1736 * be satisfied from that first zone, so best to examine that zone as 1739 * be satisfied from that first zone, so best to examine that zone as
1737 * quickly as we can. 1740 * quickly as we can.
1738 */ 1741 */
1739 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1742 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1740 { 1743 {
1741 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1744 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1742 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1745 nodemask_t *allowednodes; /* zonelist_cache approximation */
1743 1746
1744 zlc = zonelist->zlcache_ptr; 1747 zlc = zonelist->zlcache_ptr;
1745 if (!zlc) 1748 if (!zlc)
1746 return NULL; 1749 return NULL;
1747 1750
1748 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1751 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1749 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1752 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1750 zlc->last_full_zap = jiffies; 1753 zlc->last_full_zap = jiffies;
1751 } 1754 }
1752 1755
1753 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1756 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1754 &cpuset_current_mems_allowed : 1757 &cpuset_current_mems_allowed :
1755 &node_states[N_MEMORY]; 1758 &node_states[N_MEMORY];
1756 return allowednodes; 1759 return allowednodes;
1757 } 1760 }
1758 1761
1759 /* 1762 /*
1760 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1763 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1761 * if it is worth looking at further for free memory: 1764 * if it is worth looking at further for free memory:
1762 * 1) Check that the zone isn't thought to be full (doesn't have its 1765 * 1) Check that the zone isn't thought to be full (doesn't have its
1763 * bit set in the zonelist_cache fullzones BITMAP). 1766 * bit set in the zonelist_cache fullzones BITMAP).
1764 * 2) Check that the zones node (obtained from the zonelist_cache 1767 * 2) Check that the zones node (obtained from the zonelist_cache
1765 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1768 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1766 * Return true (non-zero) if zone is worth looking at further, or 1769 * Return true (non-zero) if zone is worth looking at further, or
1767 * else return false (zero) if it is not. 1770 * else return false (zero) if it is not.
1768 * 1771 *
1769 * This check -ignores- the distinction between various watermarks, 1772 * This check -ignores- the distinction between various watermarks,
1770 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1773 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1771 * found to be full for any variation of these watermarks, it will 1774 * found to be full for any variation of these watermarks, it will
1772 * be considered full for up to one second by all requests, unless 1775 * be considered full for up to one second by all requests, unless
1773 * we are so low on memory on all allowed nodes that we are forced 1776 * we are so low on memory on all allowed nodes that we are forced
1774 * into the second scan of the zonelist. 1777 * into the second scan of the zonelist.
1775 * 1778 *
1776 * In the second scan we ignore this zonelist cache and exactly 1779 * In the second scan we ignore this zonelist cache and exactly
1777 * apply the watermarks to all zones, even if it is slower to do so. 1780 * apply the watermarks to all zones, even if it is slower to do so.
1778 * We are low on memory in the second scan, and should leave no stone 1781 * We are low on memory in the second scan, and should leave no stone
1779 * unturned looking for a free page. 1782 * unturned looking for a free page.
1780 */ 1783 */
1781 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1784 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1782 nodemask_t *allowednodes) 1785 nodemask_t *allowednodes)
1783 { 1786 {
1784 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1787 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1785 int i; /* index of *z in zonelist zones */ 1788 int i; /* index of *z in zonelist zones */
1786 int n; /* node that zone *z is on */ 1789 int n; /* node that zone *z is on */
1787 1790
1788 zlc = zonelist->zlcache_ptr; 1791 zlc = zonelist->zlcache_ptr;
1789 if (!zlc) 1792 if (!zlc)
1790 return 1; 1793 return 1;
1791 1794
1792 i = z - zonelist->_zonerefs; 1795 i = z - zonelist->_zonerefs;
1793 n = zlc->z_to_n[i]; 1796 n = zlc->z_to_n[i];
1794 1797
1795 /* This zone is worth trying if it is allowed but not full */ 1798 /* This zone is worth trying if it is allowed but not full */
1796 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1799 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1797 } 1800 }
1798 1801
1799 /* 1802 /*
1800 * Given 'z' scanning a zonelist, set the corresponding bit in 1803 * Given 'z' scanning a zonelist, set the corresponding bit in
1801 * zlc->fullzones, so that subsequent attempts to allocate a page 1804 * zlc->fullzones, so that subsequent attempts to allocate a page
1802 * from that zone don't waste time re-examining it. 1805 * from that zone don't waste time re-examining it.
1803 */ 1806 */
1804 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1807 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1805 { 1808 {
1806 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1809 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1807 int i; /* index of *z in zonelist zones */ 1810 int i; /* index of *z in zonelist zones */
1808 1811
1809 zlc = zonelist->zlcache_ptr; 1812 zlc = zonelist->zlcache_ptr;
1810 if (!zlc) 1813 if (!zlc)
1811 return; 1814 return;
1812 1815
1813 i = z - zonelist->_zonerefs; 1816 i = z - zonelist->_zonerefs;
1814 1817
1815 set_bit(i, zlc->fullzones); 1818 set_bit(i, zlc->fullzones);
1816 } 1819 }
1817 1820
1818 /* 1821 /*
1819 * clear all zones full, called after direct reclaim makes progress so that 1822 * clear all zones full, called after direct reclaim makes progress so that
1820 * a zone that was recently full is not skipped over for up to a second 1823 * a zone that was recently full is not skipped over for up to a second
1821 */ 1824 */
1822 static void zlc_clear_zones_full(struct zonelist *zonelist) 1825 static void zlc_clear_zones_full(struct zonelist *zonelist)
1823 { 1826 {
1824 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1827 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1825 1828
1826 zlc = zonelist->zlcache_ptr; 1829 zlc = zonelist->zlcache_ptr;
1827 if (!zlc) 1830 if (!zlc)
1828 return; 1831 return;
1829 1832
1830 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1833 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1831 } 1834 }
1832 1835
1833 static bool zone_local(struct zone *local_zone, struct zone *zone) 1836 static bool zone_local(struct zone *local_zone, struct zone *zone)
1834 { 1837 {
1835 return local_zone->node == zone->node; 1838 return local_zone->node == zone->node;
1836 } 1839 }
1837 1840
1838 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1841 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1839 { 1842 {
1840 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1843 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1841 } 1844 }
1842 1845
1843 static void __paginginit init_zone_allows_reclaim(int nid) 1846 static void __paginginit init_zone_allows_reclaim(int nid)
1844 { 1847 {
1845 int i; 1848 int i;
1846 1849
1847 for_each_node_state(i, N_MEMORY) 1850 for_each_node_state(i, N_MEMORY)
1848 if (node_distance(nid, i) <= RECLAIM_DISTANCE) 1851 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1849 node_set(i, NODE_DATA(nid)->reclaim_nodes); 1852 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1850 else 1853 else
1851 zone_reclaim_mode = 1; 1854 zone_reclaim_mode = 1;
1852 } 1855 }
1853 1856
1854 #else /* CONFIG_NUMA */ 1857 #else /* CONFIG_NUMA */
1855 1858
1856 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1859 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1857 { 1860 {
1858 return NULL; 1861 return NULL;
1859 } 1862 }
1860 1863
1861 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1864 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1862 nodemask_t *allowednodes) 1865 nodemask_t *allowednodes)
1863 { 1866 {
1864 return 1; 1867 return 1;
1865 } 1868 }
1866 1869
1867 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1870 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1868 { 1871 {
1869 } 1872 }
1870 1873
1871 static void zlc_clear_zones_full(struct zonelist *zonelist) 1874 static void zlc_clear_zones_full(struct zonelist *zonelist)
1872 { 1875 {
1873 } 1876 }
1874 1877
1875 static bool zone_local(struct zone *local_zone, struct zone *zone) 1878 static bool zone_local(struct zone *local_zone, struct zone *zone)
1876 { 1879 {
1877 return true; 1880 return true;
1878 } 1881 }
1879 1882
1880 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1883 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1881 { 1884 {
1882 return true; 1885 return true;
1883 } 1886 }
1884 1887
1885 static inline void init_zone_allows_reclaim(int nid) 1888 static inline void init_zone_allows_reclaim(int nid)
1886 { 1889 {
1887 } 1890 }
1888 #endif /* CONFIG_NUMA */ 1891 #endif /* CONFIG_NUMA */
1889 1892
1890 /* 1893 /*
1891 * get_page_from_freelist goes through the zonelist trying to allocate 1894 * get_page_from_freelist goes through the zonelist trying to allocate
1892 * a page. 1895 * a page.
1893 */ 1896 */
1894 static struct page * 1897 static struct page *
1895 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1898 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1896 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1899 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1897 struct zone *preferred_zone, int migratetype) 1900 struct zone *preferred_zone, int migratetype)
1898 { 1901 {
1899 struct zoneref *z; 1902 struct zoneref *z;
1900 struct page *page = NULL; 1903 struct page *page = NULL;
1901 int classzone_idx; 1904 int classzone_idx;
1902 struct zone *zone; 1905 struct zone *zone;
1903 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1906 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1904 int zlc_active = 0; /* set if using zonelist_cache */ 1907 int zlc_active = 0; /* set if using zonelist_cache */
1905 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1908 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1906 1909
1907 classzone_idx = zone_idx(preferred_zone); 1910 classzone_idx = zone_idx(preferred_zone);
1908 zonelist_scan: 1911 zonelist_scan:
1909 /* 1912 /*
1910 * Scan zonelist, looking for a zone with enough free. 1913 * Scan zonelist, looking for a zone with enough free.
1911 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 1914 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
1912 */ 1915 */
1913 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1916 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1914 high_zoneidx, nodemask) { 1917 high_zoneidx, nodemask) {
1915 unsigned long mark; 1918 unsigned long mark;
1916 1919
1917 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1920 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1918 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1921 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1919 continue; 1922 continue;
1920 if ((alloc_flags & ALLOC_CPUSET) && 1923 if ((alloc_flags & ALLOC_CPUSET) &&
1921 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1924 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1922 continue; 1925 continue;
1923 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1926 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1924 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) 1927 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
1925 goto try_this_zone; 1928 goto try_this_zone;
1926 /* 1929 /*
1927 * Distribute pages in proportion to the individual 1930 * Distribute pages in proportion to the individual
1928 * zone size to ensure fair page aging. The zone a 1931 * zone size to ensure fair page aging. The zone a
1929 * page was allocated in should have no effect on the 1932 * page was allocated in should have no effect on the
1930 * time the page has in memory before being reclaimed. 1933 * time the page has in memory before being reclaimed.
1931 */ 1934 */
1932 if (alloc_flags & ALLOC_FAIR) { 1935 if (alloc_flags & ALLOC_FAIR) {
1933 if (!zone_local(preferred_zone, zone)) 1936 if (!zone_local(preferred_zone, zone))
1934 continue; 1937 continue;
1935 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) 1938 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1936 continue; 1939 continue;
1937 } 1940 }
1938 /* 1941 /*
1939 * When allocating a page cache page for writing, we 1942 * When allocating a page cache page for writing, we
1940 * want to get it from a zone that is within its dirty 1943 * want to get it from a zone that is within its dirty
1941 * limit, such that no single zone holds more than its 1944 * limit, such that no single zone holds more than its
1942 * proportional share of globally allowed dirty pages. 1945 * proportional share of globally allowed dirty pages.
1943 * The dirty limits take into account the zone's 1946 * The dirty limits take into account the zone's
1944 * lowmem reserves and high watermark so that kswapd 1947 * lowmem reserves and high watermark so that kswapd
1945 * should be able to balance it without having to 1948 * should be able to balance it without having to
1946 * write pages from its LRU list. 1949 * write pages from its LRU list.
1947 * 1950 *
1948 * This may look like it could increase pressure on 1951 * This may look like it could increase pressure on
1949 * lower zones by failing allocations in higher zones 1952 * lower zones by failing allocations in higher zones
1950 * before they are full. But the pages that do spill 1953 * before they are full. But the pages that do spill
1951 * over are limited as the lower zones are protected 1954 * over are limited as the lower zones are protected
1952 * by this very same mechanism. It should not become 1955 * by this very same mechanism. It should not become
1953 * a practical burden to them. 1956 * a practical burden to them.
1954 * 1957 *
1955 * XXX: For now, allow allocations to potentially 1958 * XXX: For now, allow allocations to potentially
1956 * exceed the per-zone dirty limit in the slowpath 1959 * exceed the per-zone dirty limit in the slowpath
1957 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1960 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1958 * which is important when on a NUMA setup the allowed 1961 * which is important when on a NUMA setup the allowed
1959 * zones are together not big enough to reach the 1962 * zones are together not big enough to reach the
1960 * global limit. The proper fix for these situations 1963 * global limit. The proper fix for these situations
1961 * will require awareness of zones in the 1964 * will require awareness of zones in the
1962 * dirty-throttling and the flusher threads. 1965 * dirty-throttling and the flusher threads.
1963 */ 1966 */
1964 if ((alloc_flags & ALLOC_WMARK_LOW) && 1967 if ((alloc_flags & ALLOC_WMARK_LOW) &&
1965 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 1968 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1966 goto this_zone_full; 1969 goto this_zone_full;
1967 1970
1968 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1971 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1969 if (!zone_watermark_ok(zone, order, mark, 1972 if (!zone_watermark_ok(zone, order, mark,
1970 classzone_idx, alloc_flags)) { 1973 classzone_idx, alloc_flags)) {
1971 int ret; 1974 int ret;
1972 1975
1973 if (IS_ENABLED(CONFIG_NUMA) && 1976 if (IS_ENABLED(CONFIG_NUMA) &&
1974 !did_zlc_setup && nr_online_nodes > 1) { 1977 !did_zlc_setup && nr_online_nodes > 1) {
1975 /* 1978 /*
1976 * we do zlc_setup if there are multiple nodes 1979 * we do zlc_setup if there are multiple nodes
1977 * and before considering the first zone allowed 1980 * and before considering the first zone allowed
1978 * by the cpuset. 1981 * by the cpuset.
1979 */ 1982 */
1980 allowednodes = zlc_setup(zonelist, alloc_flags); 1983 allowednodes = zlc_setup(zonelist, alloc_flags);
1981 zlc_active = 1; 1984 zlc_active = 1;
1982 did_zlc_setup = 1; 1985 did_zlc_setup = 1;
1983 } 1986 }
1984 1987
1985 if (zone_reclaim_mode == 0 || 1988 if (zone_reclaim_mode == 0 ||
1986 !zone_allows_reclaim(preferred_zone, zone)) 1989 !zone_allows_reclaim(preferred_zone, zone))
1987 goto this_zone_full; 1990 goto this_zone_full;
1988 1991
1989 /* 1992 /*
1990 * As we may have just activated ZLC, check if the first 1993 * As we may have just activated ZLC, check if the first
1991 * eligible zone has failed zone_reclaim recently. 1994 * eligible zone has failed zone_reclaim recently.
1992 */ 1995 */
1993 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1996 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1994 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1997 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1995 continue; 1998 continue;
1996 1999
1997 ret = zone_reclaim(zone, gfp_mask, order); 2000 ret = zone_reclaim(zone, gfp_mask, order);
1998 switch (ret) { 2001 switch (ret) {
1999 case ZONE_RECLAIM_NOSCAN: 2002 case ZONE_RECLAIM_NOSCAN:
2000 /* did not scan */ 2003 /* did not scan */
2001 continue; 2004 continue;
2002 case ZONE_RECLAIM_FULL: 2005 case ZONE_RECLAIM_FULL:
2003 /* scanned but unreclaimable */ 2006 /* scanned but unreclaimable */
2004 continue; 2007 continue;
2005 default: 2008 default:
2006 /* did we reclaim enough */ 2009 /* did we reclaim enough */
2007 if (zone_watermark_ok(zone, order, mark, 2010 if (zone_watermark_ok(zone, order, mark,
2008 classzone_idx, alloc_flags)) 2011 classzone_idx, alloc_flags))
2009 goto try_this_zone; 2012 goto try_this_zone;
2010 2013
2011 /* 2014 /*
2012 * Failed to reclaim enough to meet watermark. 2015 * Failed to reclaim enough to meet watermark.
2013 * Only mark the zone full if checking the min 2016 * Only mark the zone full if checking the min
2014 * watermark or if we failed to reclaim just 2017 * watermark or if we failed to reclaim just
2015 * 1<<order pages or else the page allocator 2018 * 1<<order pages or else the page allocator
2016 * fastpath will prematurely mark zones full 2019 * fastpath will prematurely mark zones full
2017 * when the watermark is between the low and 2020 * when the watermark is between the low and
2018 * min watermarks. 2021 * min watermarks.
2019 */ 2022 */
2020 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || 2023 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
2021 ret == ZONE_RECLAIM_SOME) 2024 ret == ZONE_RECLAIM_SOME)
2022 goto this_zone_full; 2025 goto this_zone_full;
2023 2026
2024 continue; 2027 continue;
2025 } 2028 }
2026 } 2029 }
2027 2030
2028 try_this_zone: 2031 try_this_zone:
2029 page = buffered_rmqueue(preferred_zone, zone, order, 2032 page = buffered_rmqueue(preferred_zone, zone, order,
2030 gfp_mask, migratetype); 2033 gfp_mask, migratetype);
2031 if (page) 2034 if (page)
2032 break; 2035 break;
2033 this_zone_full: 2036 this_zone_full:
2034 if (IS_ENABLED(CONFIG_NUMA)) 2037 if (IS_ENABLED(CONFIG_NUMA))
2035 zlc_mark_zone_full(zonelist, z); 2038 zlc_mark_zone_full(zonelist, z);
2036 } 2039 }
2037 2040
2038 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { 2041 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
2039 /* Disable zlc cache for second zonelist scan */ 2042 /* Disable zlc cache for second zonelist scan */
2040 zlc_active = 0; 2043 zlc_active = 0;
2041 goto zonelist_scan; 2044 goto zonelist_scan;
2042 } 2045 }
2043 2046
2044 if (page) 2047 if (page)
2045 /* 2048 /*
2046 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was 2049 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2047 * necessary to allocate the page. The expectation is 2050 * necessary to allocate the page. The expectation is
2048 * that the caller is taking steps that will free more 2051 * that the caller is taking steps that will free more
2049 * memory. The caller should avoid the page being used 2052 * memory. The caller should avoid the page being used
2050 * for !PFMEMALLOC purposes. 2053 * for !PFMEMALLOC purposes.
2051 */ 2054 */
2052 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 2055 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2053 2056
2054 return page; 2057 return page;
2055 } 2058 }
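
Within get_page_from_freelist(), the ALLOC_FAIR branch only considers local zones whose NR_ALLOC_BATCH credit is still positive, which is what spreads allocations across local zones before the fairness pass is dropped. The userspace sketch below shows that selection rule under stated assumptions; toy_zone, pick_zone_fair and the explicit batch decrement are illustrative only (the kernel charges the batch elsewhere in the allocation path).

/*
 * Userspace sketch (not kernel code) of the ALLOC_FAIR pass: while the
 * fair pass is active, only local zones with a positive batch credit are
 * eligible, so allocations are spread across local zones until their
 * credits run out and the caller retries without the fairness constraint.
 */
#include <stdio.h>

struct toy_zone {
	const char *name;
	int node;
	long alloc_batch;   /* analogue of NR_ALLOC_BATCH */
};

static struct toy_zone *pick_zone_fair(struct toy_zone *zones, int nr,
				       int preferred_node)
{
	for (int i = 0; i < nr; i++) {
		if (zones[i].node != preferred_node)   /* zone_local() check */
			continue;
		if (zones[i].alloc_batch <= 0)         /* batch exhausted */
			continue;
		zones[i].alloc_batch--;                /* charge this allocation */
		return &zones[i];
	}
	return NULL;    /* caller would retry without ALLOC_FAIR */
}

int main(void)
{
	struct toy_zone zones[] = {
		{ "Normal(node0)", 0, 2 },
		{ "DMA32(node0)",  0, 1 },
		{ "Normal(node1)", 1, 4 },
	};

	for (int i = 0; i < 5; i++) {
		struct toy_zone *z = pick_zone_fair(zones, 3, 0);
		printf("alloc %d -> %s\n", i, z ? z->name : "fair pass failed");
	}
	return 0;
}
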
2056 2059
2057 /* 2060 /*
2058 * Large machines with many possible nodes should not always dump per-node 2061 * Large machines with many possible nodes should not always dump per-node
2059 * meminfo in irq context. 2062 * meminfo in irq context.
2060 */ 2063 */
2061 static inline bool should_suppress_show_mem(void) 2064 static inline bool should_suppress_show_mem(void)
2062 { 2065 {
2063 bool ret = false; 2066 bool ret = false;
2064 2067
2065 #if NODES_SHIFT > 8 2068 #if NODES_SHIFT > 8
2066 ret = in_interrupt(); 2069 ret = in_interrupt();
2067 #endif 2070 #endif
2068 return ret; 2071 return ret;
2069 } 2072 }
2070 2073
2071 static DEFINE_RATELIMIT_STATE(nopage_rs, 2074 static DEFINE_RATELIMIT_STATE(nopage_rs,
2072 DEFAULT_RATELIMIT_INTERVAL, 2075 DEFAULT_RATELIMIT_INTERVAL,
2073 DEFAULT_RATELIMIT_BURST); 2076 DEFAULT_RATELIMIT_BURST);
2074 2077
2075 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 2078 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2076 { 2079 {
2077 unsigned int filter = SHOW_MEM_FILTER_NODES; 2080 unsigned int filter = SHOW_MEM_FILTER_NODES;
2078 2081
2079 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 2082 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
2080 debug_guardpage_minorder() > 0) 2083 debug_guardpage_minorder() > 0)
2081 return; 2084 return;
2082 2085
2083 /* 2086 /*
2084 * Walking all memory to count page types is very expensive and should 2087 * Walking all memory to count page types is very expensive and should
2085 * be inhibited in non-blockable contexts. 2088 * be inhibited in non-blockable contexts.
2086 */ 2089 */
2087 if (!(gfp_mask & __GFP_WAIT)) 2090 if (!(gfp_mask & __GFP_WAIT))
2088 filter |= SHOW_MEM_FILTER_PAGE_COUNT; 2091 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2089 2092
2090 /* 2093 /*
2091 * This documents exceptions given to allocations in certain 2094 * This documents exceptions given to allocations in certain
2092 * contexts that are allowed to allocate outside current's set 2095 * contexts that are allowed to allocate outside current's set
2093 * of allowed nodes. 2096 * of allowed nodes.
2094 */ 2097 */
2095 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2098 if (!(gfp_mask & __GFP_NOMEMALLOC))
2096 if (test_thread_flag(TIF_MEMDIE) || 2099 if (test_thread_flag(TIF_MEMDIE) ||
2097 (current->flags & (PF_MEMALLOC | PF_EXITING))) 2100 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2098 filter &= ~SHOW_MEM_FILTER_NODES; 2101 filter &= ~SHOW_MEM_FILTER_NODES;
2099 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 2102 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
2100 filter &= ~SHOW_MEM_FILTER_NODES; 2103 filter &= ~SHOW_MEM_FILTER_NODES;
2101 2104
2102 if (fmt) { 2105 if (fmt) {
2103 struct va_format vaf; 2106 struct va_format vaf;
2104 va_list args; 2107 va_list args;
2105 2108
2106 va_start(args, fmt); 2109 va_start(args, fmt);
2107 2110
2108 vaf.fmt = fmt; 2111 vaf.fmt = fmt;
2109 vaf.va = &args; 2112 vaf.va = &args;
2110 2113
2111 pr_warn("%pV", &vaf); 2114 pr_warn("%pV", &vaf);
2112 2115
2113 va_end(args); 2116 va_end(args);
2114 } 2117 }
2115 2118
2116 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 2119 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
2117 current->comm, order, gfp_mask); 2120 current->comm, order, gfp_mask);
2118 2121
2119 dump_stack(); 2122 dump_stack();
2120 if (!should_suppress_show_mem()) 2123 if (!should_suppress_show_mem())
2121 show_mem(filter); 2124 show_mem(filter);
2122 } 2125 }
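
warn_alloc_failed() drops messages that fail the __ratelimit() check, so a flood of failing allocations cannot spam the log. A minimal userspace sketch of such an interval/burst limiter follows; struct toy_ratelimit and its policy are assumptions for illustration, not the kernel's ratelimit implementation.

/*
 * Userspace sketch (not kernel code) of the ratelimit idea used by
 * warn_alloc_failed(): allow at most `burst` messages per `interval`
 * seconds and silently drop the rest.
 */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct toy_ratelimit {
	time_t window_start;
	int interval;	/* seconds per window */
	int burst;	/* messages allowed per window */
	int printed;
};

static bool toy_ratelimit_ok(struct toy_ratelimit *rs)
{
	time_t now = time(NULL);

	if (now - rs->window_start >= rs->interval) {
		rs->window_start = now;	/* start a new window */
		rs->printed = 0;
	}
	if (rs->printed >= rs->burst)
		return false;		/* suppress this message */
	rs->printed++;
	return true;
}

int main(void)
{
	struct toy_ratelimit rs = { .window_start = 0, .interval = 5, .burst = 2 };

	for (int i = 0; i < 5; i++)
		if (toy_ratelimit_ok(&rs))
			printf("warning %d printed\n", i);
		else
			printf("warning %d suppressed\n", i);
	return 0;
}
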
2123 2126
2124 static inline int 2127 static inline int
2125 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 2128 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2126 unsigned long did_some_progress, 2129 unsigned long did_some_progress,
2127 unsigned long pages_reclaimed) 2130 unsigned long pages_reclaimed)
2128 { 2131 {
2129 /* Do not loop if specifically requested */ 2132 /* Do not loop if specifically requested */
2130 if (gfp_mask & __GFP_NORETRY) 2133 if (gfp_mask & __GFP_NORETRY)
2131 return 0; 2134 return 0;
2132 2135
2133 /* Always retry if specifically requested */ 2136 /* Always retry if specifically requested */
2134 if (gfp_mask & __GFP_NOFAIL) 2137 if (gfp_mask & __GFP_NOFAIL)
2135 return 1; 2138 return 1;
2136 2139
2137 /* 2140 /*
2138 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 2141 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2139 * making forward progress without invoking OOM. Suspend also disables 2142 * making forward progress without invoking OOM. Suspend also disables
2140 * storage devices so kswapd will not help. Bail if we are suspending. 2143 * storage devices so kswapd will not help. Bail if we are suspending.
2141 */ 2144 */
2142 if (!did_some_progress && pm_suspended_storage()) 2145 if (!did_some_progress && pm_suspended_storage())
2143 return 0; 2146 return 0;
2144 2147
2145 /* 2148 /*
2146 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 2149 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2147 * means __GFP_NOFAIL, but that may not be true in other 2150 * means __GFP_NOFAIL, but that may not be true in other
2148 * implementations. 2151 * implementations.
2149 */ 2152 */
2150 if (order <= PAGE_ALLOC_COSTLY_ORDER) 2153 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2151 return 1; 2154 return 1;
2152 2155
2153 /* 2156 /*
2154 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 2157 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2155 * specified, then we retry until we no longer reclaim any pages 2158 * specified, then we retry until we no longer reclaim any pages
2156 * (above), or we've reclaimed an order of pages at least as 2159 * (above), or we've reclaimed an order of pages at least as
2157 * large as the allocation's order. In both cases, if the 2160 * large as the allocation's order. In both cases, if the
2158 * allocation still fails, we stop retrying. 2161 * allocation still fails, we stop retrying.
2159 */ 2162 */
2160 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2163 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2161 return 1; 2164 return 1;
2162 2165
2163 return 0; 2166 return 0;
2164 } 2167 }
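
The heuristic above means that orders up to PAGE_ALLOC_COSTLY_ORDER effectively behave as __GFP_NOFAIL, while costlier orders only retry with __GFP_REPEAT and only until roughly 1 << order pages have been reclaimed. The userspace sketch below mirrors just that decision (the pm_suspended_storage() case is omitted); the TOY_* flag values are illustrative, not the kernel's.

/*
 * Userspace sketch (not kernel code) of the should_alloc_retry() heuristic.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_COSTLY_ORDER 3
#define TOY_GFP_NORETRY  0x1u
#define TOY_GFP_NOFAIL   0x2u
#define TOY_GFP_REPEAT   0x4u

static bool toy_should_retry(unsigned gfp, unsigned order,
			     unsigned long pages_reclaimed)
{
	if (gfp & TOY_GFP_NORETRY)
		return false;
	if (gfp & TOY_GFP_NOFAIL)
		return true;
	if (order <= TOY_COSTLY_ORDER)		/* implicit "no fail" range */
		return true;
	if ((gfp & TOY_GFP_REPEAT) && pages_reclaimed < (1ul << order))
		return true;
	return false;
}

int main(void)
{
	printf("order 2, plain:          %d\n", toy_should_retry(0, 2, 1000));
	printf("order 5, plain:          %d\n", toy_should_retry(0, 5, 0));
	printf("order 5, REPEAT, 10 pg:  %d\n", toy_should_retry(TOY_GFP_REPEAT, 5, 10));
	printf("order 5, REPEAT, 64 pg:  %d\n", toy_should_retry(TOY_GFP_REPEAT, 5, 64));
	return 0;
}
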
2165 2168
2166 static inline struct page * 2169 static inline struct page *
2167 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2170 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2168 struct zonelist *zonelist, enum zone_type high_zoneidx, 2171 struct zonelist *zonelist, enum zone_type high_zoneidx,
2169 nodemask_t *nodemask, struct zone *preferred_zone, 2172 nodemask_t *nodemask, struct zone *preferred_zone,
2170 int migratetype) 2173 int migratetype)
2171 { 2174 {
2172 struct page *page; 2175 struct page *page;
2173 2176
2174 /* Acquire the OOM killer lock for the zones in zonelist */ 2177 /* Acquire the OOM killer lock for the zones in zonelist */
2175 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2178 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2176 schedule_timeout_uninterruptible(1); 2179 schedule_timeout_uninterruptible(1);
2177 return NULL; 2180 return NULL;
2178 } 2181 }
2179 2182
2180 /* 2183 /*
2181 * Go through the zonelist yet one more time, keep very high watermark 2184 * Go through the zonelist yet one more time, keep very high watermark
2182 * here, this is only to catch a parallel oom killing, we must fail if 2185 * here, this is only to catch a parallel oom killing, we must fail if
2183 * we're still under heavy pressure. 2186 * we're still under heavy pressure.
2184 */ 2187 */
2185 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2188 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2186 order, zonelist, high_zoneidx, 2189 order, zonelist, high_zoneidx,
2187 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2190 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2188 preferred_zone, migratetype); 2191 preferred_zone, migratetype);
2189 if (page) 2192 if (page)
2190 goto out; 2193 goto out;
2191 2194
2192 if (!(gfp_mask & __GFP_NOFAIL)) { 2195 if (!(gfp_mask & __GFP_NOFAIL)) {
2193 /* The OOM killer will not help higher order allocs */ 2196 /* The OOM killer will not help higher order allocs */
2194 if (order > PAGE_ALLOC_COSTLY_ORDER) 2197 if (order > PAGE_ALLOC_COSTLY_ORDER)
2195 goto out; 2198 goto out;
2196 /* The OOM killer does not needlessly kill tasks for lowmem */ 2199 /* The OOM killer does not needlessly kill tasks for lowmem */
2197 if (high_zoneidx < ZONE_NORMAL) 2200 if (high_zoneidx < ZONE_NORMAL)
2198 goto out; 2201 goto out;
2199 /* 2202 /*
2200 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2203 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2201 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2204 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2202 * The caller should handle page allocation failure by itself if 2205 * The caller should handle page allocation failure by itself if
2203 * it specifies __GFP_THISNODE. 2206 * it specifies __GFP_THISNODE.
2204 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2207 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2205 */ 2208 */
2206 if (gfp_mask & __GFP_THISNODE) 2209 if (gfp_mask & __GFP_THISNODE)
2207 goto out; 2210 goto out;
2208 } 2211 }
2209 /* Exhausted what can be done so it's blamo time */ 2212 /* Exhausted what can be done so it's blamo time */
2210 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2213 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2211 2214
2212 out: 2215 out:
2213 clear_zonelist_oom(zonelist, gfp_mask); 2216 clear_zonelist_oom(zonelist, gfp_mask);
2214 return page; 2217 return page;
2215 } 2218 }
2216 2219
2217 #ifdef CONFIG_COMPACTION 2220 #ifdef CONFIG_COMPACTION
2218 /* Try memory compaction for high-order allocations before reclaim */ 2221 /* Try memory compaction for high-order allocations before reclaim */
2219 static struct page * 2222 static struct page *
2220 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2223 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2221 struct zonelist *zonelist, enum zone_type high_zoneidx, 2224 struct zonelist *zonelist, enum zone_type high_zoneidx,
2222 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2225 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2223 int migratetype, enum migrate_mode mode, 2226 int migratetype, enum migrate_mode mode,
2224 bool *contended_compaction, bool *deferred_compaction, 2227 bool *contended_compaction, bool *deferred_compaction,
2225 unsigned long *did_some_progress) 2228 unsigned long *did_some_progress)
2226 { 2229 {
2227 if (!order) 2230 if (!order)
2228 return NULL; 2231 return NULL;
2229 2232
2230 if (compaction_deferred(preferred_zone, order)) { 2233 if (compaction_deferred(preferred_zone, order)) {
2231 *deferred_compaction = true; 2234 *deferred_compaction = true;
2232 return NULL; 2235 return NULL;
2233 } 2236 }
2234 2237
2235 current->flags |= PF_MEMALLOC; 2238 current->flags |= PF_MEMALLOC;
2236 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2239 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2237 nodemask, mode, 2240 nodemask, mode,
2238 contended_compaction); 2241 contended_compaction);
2239 current->flags &= ~PF_MEMALLOC; 2242 current->flags &= ~PF_MEMALLOC;
2240 2243
2241 if (*did_some_progress != COMPACT_SKIPPED) { 2244 if (*did_some_progress != COMPACT_SKIPPED) {
2242 struct page *page; 2245 struct page *page;
2243 2246
2244 /* Page migration frees to the PCP lists but we want merging */ 2247 /* Page migration frees to the PCP lists but we want merging */
2245 drain_pages(get_cpu()); 2248 drain_pages(get_cpu());
2246 put_cpu(); 2249 put_cpu();
2247 2250
2248 page = get_page_from_freelist(gfp_mask, nodemask, 2251 page = get_page_from_freelist(gfp_mask, nodemask,
2249 order, zonelist, high_zoneidx, 2252 order, zonelist, high_zoneidx,
2250 alloc_flags & ~ALLOC_NO_WATERMARKS, 2253 alloc_flags & ~ALLOC_NO_WATERMARKS,
2251 preferred_zone, migratetype); 2254 preferred_zone, migratetype);
2252 if (page) { 2255 if (page) {
2253 preferred_zone->compact_blockskip_flush = false; 2256 preferred_zone->compact_blockskip_flush = false;
2254 compaction_defer_reset(preferred_zone, order, true); 2257 compaction_defer_reset(preferred_zone, order, true);
2255 count_vm_event(COMPACTSUCCESS); 2258 count_vm_event(COMPACTSUCCESS);
2256 return page; 2259 return page;
2257 } 2260 }
2258 2261
2259 /* 2262 /*
2260 * It's bad if a compaction run occurs and fails. 2263 * It's bad if a compaction run occurs and fails.
2261 * The most likely reason is that pages exist, 2264 * The most likely reason is that pages exist,
2262 * but not enough to satisfy watermarks. 2265 * but not enough to satisfy watermarks.
2263 */ 2266 */
2264 count_vm_event(COMPACTFAIL); 2267 count_vm_event(COMPACTFAIL);
2265 2268
2266 /* 2269 /*
2267 * As async compaction considers a subset of pageblocks, only 2270 * As async compaction considers a subset of pageblocks, only
2268 * defer if the failure was a sync compaction failure. 2271 * defer if the failure was a sync compaction failure.
2269 */ 2272 */
2270 if (mode != MIGRATE_ASYNC) 2273 if (mode != MIGRATE_ASYNC)
2271 defer_compaction(preferred_zone, order); 2274 defer_compaction(preferred_zone, order);
2272 2275
2273 cond_resched(); 2276 cond_resched();
2274 } 2277 }
2275 2278
2276 return NULL; 2279 return NULL;
2277 } 2280 }
2278 #else 2281 #else
2279 static inline struct page * 2282 static inline struct page *
2280 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2283 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2281 struct zonelist *zonelist, enum zone_type high_zoneidx, 2284 struct zonelist *zonelist, enum zone_type high_zoneidx,
2282 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2285 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2283 int migratetype, enum migrate_mode mode, bool *contended_compaction, 2286 int migratetype, enum migrate_mode mode, bool *contended_compaction,
2284 bool *deferred_compaction, unsigned long *did_some_progress) 2287 bool *deferred_compaction, unsigned long *did_some_progress)
2285 { 2288 {
2286 return NULL; 2289 return NULL;
2287 } 2290 }
2288 #endif /* CONFIG_COMPACTION */ 2291 #endif /* CONFIG_COMPACTION */
2289 2292
2290 /* Perform direct synchronous page reclaim */ 2293 /* Perform direct synchronous page reclaim */
2291 static int 2294 static int
2292 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2295 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2293 nodemask_t *nodemask) 2296 nodemask_t *nodemask)
2294 { 2297 {
2295 struct reclaim_state reclaim_state; 2298 struct reclaim_state reclaim_state;
2296 int progress; 2299 int progress;
2297 2300
2298 cond_resched(); 2301 cond_resched();
2299 2302
2300 /* We now go into synchronous reclaim */ 2303 /* We now go into synchronous reclaim */
2301 cpuset_memory_pressure_bump(); 2304 cpuset_memory_pressure_bump();
2302 current->flags |= PF_MEMALLOC; 2305 current->flags |= PF_MEMALLOC;
2303 lockdep_set_current_reclaim_state(gfp_mask); 2306 lockdep_set_current_reclaim_state(gfp_mask);
2304 reclaim_state.reclaimed_slab = 0; 2307 reclaim_state.reclaimed_slab = 0;
2305 current->reclaim_state = &reclaim_state; 2308 current->reclaim_state = &reclaim_state;
2306 2309
2307 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2310 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2308 2311
2309 current->reclaim_state = NULL; 2312 current->reclaim_state = NULL;
2310 lockdep_clear_current_reclaim_state(); 2313 lockdep_clear_current_reclaim_state();
2311 current->flags &= ~PF_MEMALLOC; 2314 current->flags &= ~PF_MEMALLOC;
2312 2315
2313 cond_resched(); 2316 cond_resched();
2314 2317
2315 return progress; 2318 return progress;
2316 } 2319 }
2317 2320
2318 /* The really slow allocator path where we enter direct reclaim */ 2321 /* The really slow allocator path where we enter direct reclaim */
2319 static inline struct page * 2322 static inline struct page *
2320 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2323 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2321 struct zonelist *zonelist, enum zone_type high_zoneidx, 2324 struct zonelist *zonelist, enum zone_type high_zoneidx,
2322 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2325 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2323 int migratetype, unsigned long *did_some_progress) 2326 int migratetype, unsigned long *did_some_progress)
2324 { 2327 {
2325 struct page *page = NULL; 2328 struct page *page = NULL;
2326 bool drained = false; 2329 bool drained = false;
2327 2330
2328 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2331 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2329 nodemask); 2332 nodemask);
2330 if (unlikely(!(*did_some_progress))) 2333 if (unlikely(!(*did_some_progress)))
2331 return NULL; 2334 return NULL;
2332 2335
2333 /* After successful reclaim, reconsider all zones for allocation */ 2336 /* After successful reclaim, reconsider all zones for allocation */
2334 if (IS_ENABLED(CONFIG_NUMA)) 2337 if (IS_ENABLED(CONFIG_NUMA))
2335 zlc_clear_zones_full(zonelist); 2338 zlc_clear_zones_full(zonelist);
2336 2339
2337 retry: 2340 retry:
2338 page = get_page_from_freelist(gfp_mask, nodemask, order, 2341 page = get_page_from_freelist(gfp_mask, nodemask, order,
2339 zonelist, high_zoneidx, 2342 zonelist, high_zoneidx,
2340 alloc_flags & ~ALLOC_NO_WATERMARKS, 2343 alloc_flags & ~ALLOC_NO_WATERMARKS,
2341 preferred_zone, migratetype); 2344 preferred_zone, migratetype);
2342 2345
2343 /* 2346 /*
2344 * If an allocation failed after direct reclaim, it could be because 2347 * If an allocation failed after direct reclaim, it could be because
2345 * pages are pinned on the per-cpu lists. Drain them and try again 2348 * pages are pinned on the per-cpu lists. Drain them and try again
2346 */ 2349 */
2347 if (!page && !drained) { 2350 if (!page && !drained) {
2348 drain_all_pages(); 2351 drain_all_pages();
2349 drained = true; 2352 drained = true;
2350 goto retry; 2353 goto retry;
2351 } 2354 }
2352 2355
2353 return page; 2356 return page;
2354 } 2357 }
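
The function above retries a failed post-reclaim allocation exactly once after draining the per-cpu lists, since freshly reclaimed pages may still be parked there. A small userspace sketch of that drain-once-then-retry pattern follows; pcp_cached, drain_all_pages_toy() and the counters are illustrative assumptions.

/*
 * Userspace sketch (not kernel code) of the drain-and-retry pattern in
 * __alloc_pages_direct_reclaim(): a failed allocation is retried exactly
 * once after flushing the per-cpu caches back to the free lists.
 */
#include <stdbool.h>
#include <stdio.h>

static int pcp_cached = 3;	/* pretend pages pinned on per-cpu lists */
static int free_pages = 0;

static bool try_alloc(void)
{
	if (free_pages > 0) {
		free_pages--;
		return true;
	}
	return false;
}

static void drain_all_pages_toy(void)
{
	free_pages += pcp_cached;	/* give pcp pages back to the free lists */
	pcp_cached = 0;
}

int main(void)
{
	bool drained = false;
	bool ok;

retry:
	ok = try_alloc();
	if (!ok && !drained) {
		drain_all_pages_toy();
		drained = true;
		goto retry;
	}
	printf("allocation %s (drained: %d)\n", ok ? "succeeded" : "failed", drained);
	return 0;
}
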
2355 2358
2356 /* 2359 /*
2357 * This is called in the allocator slow-path if the allocation request is of 2360 * This is called in the allocator slow-path if the allocation request is of
2358 * sufficient urgency to ignore watermarks and take other desperate measures 2361 * sufficient urgency to ignore watermarks and take other desperate measures
2359 */ 2362 */
2360 static inline struct page * 2363 static inline struct page *
2361 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2364 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2362 struct zonelist *zonelist, enum zone_type high_zoneidx, 2365 struct zonelist *zonelist, enum zone_type high_zoneidx,
2363 nodemask_t *nodemask, struct zone *preferred_zone, 2366 nodemask_t *nodemask, struct zone *preferred_zone,
2364 int migratetype) 2367 int migratetype)
2365 { 2368 {
2366 struct page *page; 2369 struct page *page;
2367 2370
2368 do { 2371 do {
2369 page = get_page_from_freelist(gfp_mask, nodemask, order, 2372 page = get_page_from_freelist(gfp_mask, nodemask, order,
2370 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2373 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2371 preferred_zone, migratetype); 2374 preferred_zone, migratetype);
2372 2375
2373 if (!page && gfp_mask & __GFP_NOFAIL) 2376 if (!page && gfp_mask & __GFP_NOFAIL)
2374 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2377 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2375 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2378 } while (!page && (gfp_mask & __GFP_NOFAIL));
2376 2379
2377 return page; 2380 return page;
2378 } 2381 }
2379 2382
2380 static void reset_alloc_batches(struct zonelist *zonelist, 2383 static void reset_alloc_batches(struct zonelist *zonelist,
2381 enum zone_type high_zoneidx, 2384 enum zone_type high_zoneidx,
2382 struct zone *preferred_zone) 2385 struct zone *preferred_zone)
2383 { 2386 {
2384 struct zoneref *z; 2387 struct zoneref *z;
2385 struct zone *zone; 2388 struct zone *zone;
2386 2389
2387 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 2390 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2388 /* 2391 /*
2389 * Only reset the batches of zones that were actually 2392 * Only reset the batches of zones that were actually
2390 * considered in the fairness pass, we don't want to 2393 * considered in the fairness pass, we don't want to
2391 * trash fairness information for zones that are not 2394 * trash fairness information for zones that are not
2392 * actually part of this zonelist's round-robin cycle. 2395 * actually part of this zonelist's round-robin cycle.
2393 */ 2396 */
2394 if (!zone_local(preferred_zone, zone)) 2397 if (!zone_local(preferred_zone, zone))
2395 continue; 2398 continue;
2396 mod_zone_page_state(zone, NR_ALLOC_BATCH, 2399 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2397 high_wmark_pages(zone) - low_wmark_pages(zone) - 2400 high_wmark_pages(zone) - low_wmark_pages(zone) -
2398 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); 2401 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
2399 } 2402 }
2400 } 2403 }
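
reset_alloc_batches() restores each local zone's batch to high_wmark - low_wmark by adding the difference between that target and the current counter value, which works even when the counter has gone negative. A tiny userspace sketch of the arithmetic, with made-up watermark numbers:

/*
 * Userspace sketch (not kernel code) of the reset_alloc_batches() arithmetic:
 * adding (target - current) to the counter sets it back to
 * target = high_wmark - low_wmark without needing a "set" primitive.
 */
#include <stdio.h>

int main(void)
{
	long high_wmark = 1200, low_wmark = 1000;
	long batch = -37;		/* overdrawn after the fairness pass */

	long target = high_wmark - low_wmark;
	batch += target - batch;	/* what mod_zone_page_state() applies */

	printf("batch reset to %ld (target %ld)\n", batch, target);
	return 0;
}
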
2401 2404
2402 static void wake_all_kswapds(unsigned int order, 2405 static void wake_all_kswapds(unsigned int order,
2403 struct zonelist *zonelist, 2406 struct zonelist *zonelist,
2404 enum zone_type high_zoneidx, 2407 enum zone_type high_zoneidx,
2405 struct zone *preferred_zone) 2408 struct zone *preferred_zone)
2406 { 2409 {
2407 struct zoneref *z; 2410 struct zoneref *z;
2408 struct zone *zone; 2411 struct zone *zone;
2409 2412
2410 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2413 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2411 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2414 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2412 } 2415 }
2413 2416
2414 static inline int 2417 static inline int
2415 gfp_to_alloc_flags(gfp_t gfp_mask) 2418 gfp_to_alloc_flags(gfp_t gfp_mask)
2416 { 2419 {
2417 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2420 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2418 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); 2421 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
2419 2422
2420 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2423 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2421 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2424 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2422 2425
2423 /* 2426 /*
2424 * The caller may dip into page reserves a bit more if the caller 2427 * The caller may dip into page reserves a bit more if the caller
2425 * cannot run direct reclaim, or if the caller has realtime scheduling 2428 * cannot run direct reclaim, or if the caller has realtime scheduling
2426 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2429 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2427 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). 2430 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
2428 */ 2431 */
2429 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2432 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2430 2433
2431 if (atomic) { 2434 if (atomic) {
2432 /* 2435 /*
2433 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 2436 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
2434 * if it can't schedule. 2437 * if it can't schedule.
2435 */ 2438 */
2436 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2439 if (!(gfp_mask & __GFP_NOMEMALLOC))
2437 alloc_flags |= ALLOC_HARDER; 2440 alloc_flags |= ALLOC_HARDER;
2438 /* 2441 /*
2439 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 2442 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
2440 * comment for __cpuset_node_allowed_softwall(). 2443 * comment for __cpuset_node_allowed_softwall().
2441 */ 2444 */
2442 alloc_flags &= ~ALLOC_CPUSET; 2445 alloc_flags &= ~ALLOC_CPUSET;
2443 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2446 } else if (unlikely(rt_task(current)) && !in_interrupt())
2444 alloc_flags |= ALLOC_HARDER; 2447 alloc_flags |= ALLOC_HARDER;
2445 2448
2446 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2449 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2447 if (gfp_mask & __GFP_MEMALLOC) 2450 if (gfp_mask & __GFP_MEMALLOC)
2448 alloc_flags |= ALLOC_NO_WATERMARKS; 2451 alloc_flags |= ALLOC_NO_WATERMARKS;
2449 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 2452 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2450 alloc_flags |= ALLOC_NO_WATERMARKS; 2453 alloc_flags |= ALLOC_NO_WATERMARKS;
2451 else if (!in_interrupt() && 2454 else if (!in_interrupt() &&
2452 ((current->flags & PF_MEMALLOC) || 2455 ((current->flags & PF_MEMALLOC) ||
2453 unlikely(test_thread_flag(TIF_MEMDIE)))) 2456 unlikely(test_thread_flag(TIF_MEMDIE))))
2454 alloc_flags |= ALLOC_NO_WATERMARKS; 2457 alloc_flags |= ALLOC_NO_WATERMARKS;
2455 } 2458 }
2456 #ifdef CONFIG_CMA 2459 #ifdef CONFIG_CMA
2457 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2460 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2458 alloc_flags |= ALLOC_CMA; 2461 alloc_flags |= ALLOC_CMA;
2459 #endif 2462 #endif
2460 return alloc_flags; 2463 return alloc_flags;
2461 } 2464 }
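
gfp_to_alloc_flags() relies on __GFP_HIGH and ALLOC_HIGH having the same bit value (enforced by the BUILD_BUG_ON above), so the request flag can be ORed straight into alloc_flags without a branch. The userspace sketch below shows the same trick with illustrative TOY_* constants; it is not the kernel's flag layout.

/*
 * Userspace sketch (not kernel code) of the "same bit value" trick:
 * because the toy GFP_HIGH and ALLOC_HIGH flags share one bit, masking
 * and ORing transfers the flag branchlessly.
 */
#include <assert.h>
#include <stdio.h>

#define TOY_GFP_HIGH   0x20u
#define TOY_ALLOC_HIGH 0x20u	/* deliberately identical, checked below */

static unsigned toy_gfp_to_alloc_flags(unsigned gfp_mask)
{
	unsigned alloc_flags = 0;

	/* Compile-time check in the kernel (BUILD_BUG_ON); runtime assert here. */
	assert(TOY_GFP_HIGH == TOY_ALLOC_HIGH);

	alloc_flags |= (gfp_mask & TOY_GFP_HIGH);	/* branchless transfer */
	return alloc_flags;
}

int main(void)
{
	printf("flags without GFP_HIGH: %#x\n", toy_gfp_to_alloc_flags(0x3));
	printf("flags with GFP_HIGH:    %#x\n", toy_gfp_to_alloc_flags(0x3 | TOY_GFP_HIGH));
	return 0;
}
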
2462 2465
2463 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 2466 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2464 { 2467 {
2465 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2468 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2466 } 2469 }
2467 2470
2468 static inline struct page * 2471 static inline struct page *
2469 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2472 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2470 struct zonelist *zonelist, enum zone_type high_zoneidx, 2473 struct zonelist *zonelist, enum zone_type high_zoneidx,
2471 nodemask_t *nodemask, struct zone *preferred_zone, 2474 nodemask_t *nodemask, struct zone *preferred_zone,
2472 int migratetype) 2475 int migratetype)
2473 { 2476 {
2474 const gfp_t wait = gfp_mask & __GFP_WAIT; 2477 const gfp_t wait = gfp_mask & __GFP_WAIT;
2475 struct page *page = NULL; 2478 struct page *page = NULL;
2476 int alloc_flags; 2479 int alloc_flags;
2477 unsigned long pages_reclaimed = 0; 2480 unsigned long pages_reclaimed = 0;
2478 unsigned long did_some_progress; 2481 unsigned long did_some_progress;
2479 enum migrate_mode migration_mode = MIGRATE_ASYNC; 2482 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2480 bool deferred_compaction = false; 2483 bool deferred_compaction = false;
2481 bool contended_compaction = false; 2484 bool contended_compaction = false;
2482 2485
2483 /* 2486 /*
2484 * In the slowpath, we sanity check order to avoid ever trying to 2487 * In the slowpath, we sanity check order to avoid ever trying to
2485 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2488 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2486 * be using allocators in order of preference for an area that is 2489 * be using allocators in order of preference for an area that is
2487 * too large. 2490 * too large.
2488 */ 2491 */
2489 if (order >= MAX_ORDER) { 2492 if (order >= MAX_ORDER) {
2490 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2493 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2491 return NULL; 2494 return NULL;
2492 } 2495 }
2493 2496
2494 /* 2497 /*
2495 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2498 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2496 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2499 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2497 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2500 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
2498 * using a larger set of nodes after it has established that the 2501 * using a larger set of nodes after it has established that the
2499 * allowed per node queues are empty and that nodes are 2502 * allowed per node queues are empty and that nodes are
2500 * over allocated. 2503 * over allocated.
2501 */ 2504 */
2502 if (IS_ENABLED(CONFIG_NUMA) && 2505 if (IS_ENABLED(CONFIG_NUMA) &&
2503 (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2506 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2504 goto nopage; 2507 goto nopage;
2505 2508
2506 restart: 2509 restart:
2507 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2510 if (!(gfp_mask & __GFP_NO_KSWAPD))
2508 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); 2511 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
2509 2512
2510 /* 2513 /*
2511 * OK, we're below the kswapd watermark and have kicked background 2514 * OK, we're below the kswapd watermark and have kicked background
2512 * reclaim. Now things get more complex, so set up alloc_flags according 2515 * reclaim. Now things get more complex, so set up alloc_flags according
2513 * to how we want to proceed. 2516 * to how we want to proceed.
2514 */ 2517 */
2515 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2518 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2516 2519
2517 /* 2520 /*
2518 * Find the true preferred zone if the allocation is unconstrained by 2521 * Find the true preferred zone if the allocation is unconstrained by
2519 * cpusets. 2522 * cpusets.
2520 */ 2523 */
2521 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) 2524 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2522 first_zones_zonelist(zonelist, high_zoneidx, NULL, 2525 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2523 &preferred_zone); 2526 &preferred_zone);
2524 2527
2525 rebalance: 2528 rebalance:
2526 /* This is the last chance, in general, before the goto nopage. */ 2529 /* This is the last chance, in general, before the goto nopage. */
2527 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2530 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2528 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2531 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2529 preferred_zone, migratetype); 2532 preferred_zone, migratetype);
2530 if (page) 2533 if (page)
2531 goto got_pg; 2534 goto got_pg;
2532 2535
2533 /* Allocate without watermarks if the context allows */ 2536 /* Allocate without watermarks if the context allows */
2534 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2537 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2535 /* 2538 /*
2536 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds 2539 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2538 * the allocation is high priority and these types of 2541 * the allocation is high priority and these types of
2539 * allocations are system rather than user oriented 2542 * allocations are system rather than user oriented
2539 */ 2542 */
2540 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2543 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2541 2544
2542 page = __alloc_pages_high_priority(gfp_mask, order, 2545 page = __alloc_pages_high_priority(gfp_mask, order,
2543 zonelist, high_zoneidx, nodemask, 2546 zonelist, high_zoneidx, nodemask,
2544 preferred_zone, migratetype); 2547 preferred_zone, migratetype);
2545 if (page) { 2548 if (page) {
2546 goto got_pg; 2549 goto got_pg;
2547 } 2550 }
2548 } 2551 }
2549 2552
2550 /* Atomic allocations - we can't balance anything */ 2553 /* Atomic allocations - we can't balance anything */
2551 if (!wait) 2554 if (!wait)
2552 goto nopage; 2555 goto nopage;
2553 2556
2554 /* Avoid recursion of direct reclaim */ 2557 /* Avoid recursion of direct reclaim */
2555 if (current->flags & PF_MEMALLOC) 2558 if (current->flags & PF_MEMALLOC)
2556 goto nopage; 2559 goto nopage;
2557 2560
2558 /* Avoid allocations with no watermarks from looping endlessly */ 2561 /* Avoid allocations with no watermarks from looping endlessly */
2559 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2562 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2560 goto nopage; 2563 goto nopage;
2561 2564
2562 /* 2565 /*
2563 * Try direct compaction. The first pass is asynchronous. Subsequent 2566 * Try direct compaction. The first pass is asynchronous. Subsequent
2564 * attempts after direct reclaim are synchronous 2567 * attempts after direct reclaim are synchronous
2565 */ 2568 */
2566 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2569 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2567 high_zoneidx, nodemask, alloc_flags, 2570 high_zoneidx, nodemask, alloc_flags,
2568 preferred_zone, migratetype, 2571 preferred_zone, migratetype,
2569 migration_mode, &contended_compaction, 2572 migration_mode, &contended_compaction,
2570 &deferred_compaction, 2573 &deferred_compaction,
2571 &did_some_progress); 2574 &did_some_progress);
2572 if (page) 2575 if (page)
2573 goto got_pg; 2576 goto got_pg;
2574 migration_mode = MIGRATE_SYNC_LIGHT; 2577 migration_mode = MIGRATE_SYNC_LIGHT;
2575 2578
2576 /* 2579 /*
2577 * If compaction is deferred for high-order allocations, it is because 2580 * If compaction is deferred for high-order allocations, it is because
2578 * sync compaction recently failed. If this is the case and the caller 2581 * sync compaction recently failed. If this is the case and the caller
2579 * requested a movable allocation that does not heavily disrupt the 2582 * requested a movable allocation that does not heavily disrupt the
2580 * system then fail the allocation instead of entering direct reclaim. 2583 * system then fail the allocation instead of entering direct reclaim.
2581 */ 2584 */
2582 if ((deferred_compaction || contended_compaction) && 2585 if ((deferred_compaction || contended_compaction) &&
2583 (gfp_mask & __GFP_NO_KSWAPD)) 2586 (gfp_mask & __GFP_NO_KSWAPD))
2584 goto nopage; 2587 goto nopage;
2585 2588
2586 /* Try direct reclaim and then allocating */ 2589 /* Try direct reclaim and then allocating */
2587 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2590 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2588 zonelist, high_zoneidx, 2591 zonelist, high_zoneidx,
2589 nodemask, 2592 nodemask,
2590 alloc_flags, preferred_zone, 2593 alloc_flags, preferred_zone,
2591 migratetype, &did_some_progress); 2594 migratetype, &did_some_progress);
2592 if (page) 2595 if (page)
2593 goto got_pg; 2596 goto got_pg;
2594 2597
2595 /* 2598 /*
2596 * If we failed to make any progress reclaiming, then we are 2599 * If we failed to make any progress reclaiming, then we are
2597 * running out of options and have to consider going OOM 2600 * running out of options and have to consider going OOM
2598 */ 2601 */
2599 if (!did_some_progress) { 2602 if (!did_some_progress) {
2600 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2603 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2601 if (oom_killer_disabled) 2604 if (oom_killer_disabled)
2602 goto nopage; 2605 goto nopage;
2603 /* Coredumps can quickly deplete all memory reserves */ 2606 /* Coredumps can quickly deplete all memory reserves */
2604 if ((current->flags & PF_DUMPCORE) && 2607 if ((current->flags & PF_DUMPCORE) &&
2605 !(gfp_mask & __GFP_NOFAIL)) 2608 !(gfp_mask & __GFP_NOFAIL))
2606 goto nopage; 2609 goto nopage;
2607 page = __alloc_pages_may_oom(gfp_mask, order, 2610 page = __alloc_pages_may_oom(gfp_mask, order,
2608 zonelist, high_zoneidx, 2611 zonelist, high_zoneidx,
2609 nodemask, preferred_zone, 2612 nodemask, preferred_zone,
2610 migratetype); 2613 migratetype);
2611 if (page) 2614 if (page)
2612 goto got_pg; 2615 goto got_pg;
2613 2616
2614 if (!(gfp_mask & __GFP_NOFAIL)) { 2617 if (!(gfp_mask & __GFP_NOFAIL)) {
2615 /* 2618 /*
2616 * The oom killer is not called for high-order 2619 * The oom killer is not called for high-order
2617 * allocations that may fail, so if no progress 2620 * allocations that may fail, so if no progress
2618 * is being made, there are no other options and 2621 * is being made, there are no other options and
2619 * retrying is unlikely to help. 2622 * retrying is unlikely to help.
2620 */ 2623 */
2621 if (order > PAGE_ALLOC_COSTLY_ORDER) 2624 if (order > PAGE_ALLOC_COSTLY_ORDER)
2622 goto nopage; 2625 goto nopage;
2623 /* 2626 /*
2624 * The oom killer is not called for lowmem 2627 * The oom killer is not called for lowmem
2625 * allocations to prevent needlessly killing 2628 * allocations to prevent needlessly killing
2626 * innocent tasks. 2629 * innocent tasks.
2627 */ 2630 */
2628 if (high_zoneidx < ZONE_NORMAL) 2631 if (high_zoneidx < ZONE_NORMAL)
2629 goto nopage; 2632 goto nopage;
2630 } 2633 }
2631 2634
2632 goto restart; 2635 goto restart;
2633 } 2636 }
2634 } 2637 }
2635 2638
2636 /* Check if we should retry the allocation */ 2639 /* Check if we should retry the allocation */
2637 pages_reclaimed += did_some_progress; 2640 pages_reclaimed += did_some_progress;
2638 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2641 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2639 pages_reclaimed)) { 2642 pages_reclaimed)) {
2640 /* Wait for some write requests to complete then retry */ 2643 /* Wait for some write requests to complete then retry */
2641 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2644 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2642 goto rebalance; 2645 goto rebalance;
2643 } else { 2646 } else {
2644 /* 2647 /*
2645 * High-order allocations do not necessarily loop after 2648 * High-order allocations do not necessarily loop after
2646 * direct reclaim and reclaim/compaction depends on compaction 2649 * direct reclaim and reclaim/compaction depends on compaction
2647 * being called after reclaim so call directly if necessary 2650 * being called after reclaim so call directly if necessary
2648 */ 2651 */
2649 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2652 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2650 high_zoneidx, nodemask, alloc_flags, 2653 high_zoneidx, nodemask, alloc_flags,
2651 preferred_zone, migratetype, 2654 preferred_zone, migratetype,
2652 migration_mode, &contended_compaction, 2655 migration_mode, &contended_compaction,
2653 &deferred_compaction, 2656 &deferred_compaction,
2654 &did_some_progress); 2657 &did_some_progress);
2655 if (page) 2658 if (page)
2656 goto got_pg; 2659 goto got_pg;
2657 } 2660 }
2658 2661
2659 nopage: 2662 nopage:
2660 warn_alloc_failed(gfp_mask, order, NULL); 2663 warn_alloc_failed(gfp_mask, order, NULL);
2661 return page; 2664 return page;
2662 got_pg: 2665 got_pg:
2663 if (kmemcheck_enabled) 2666 if (kmemcheck_enabled)
2664 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2667 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2665 2668
2666 return page; 2669 return page;
2667 } 2670 }
2668 2671
2669 /* 2672 /*
2670 * This is the 'heart' of the zoned buddy allocator. 2673 * This is the 'heart' of the zoned buddy allocator.
2671 */ 2674 */
2672 struct page * 2675 struct page *
2673 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2676 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2674 struct zonelist *zonelist, nodemask_t *nodemask) 2677 struct zonelist *zonelist, nodemask_t *nodemask)
2675 { 2678 {
2676 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2679 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2677 struct zone *preferred_zone; 2680 struct zone *preferred_zone;
2678 struct page *page = NULL; 2681 struct page *page = NULL;
2679 int migratetype = allocflags_to_migratetype(gfp_mask); 2682 int migratetype = allocflags_to_migratetype(gfp_mask);
2680 unsigned int cpuset_mems_cookie; 2683 unsigned int cpuset_mems_cookie;
2681 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2684 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2682 struct mem_cgroup *memcg = NULL; 2685 struct mem_cgroup *memcg = NULL;
2683 2686
2684 gfp_mask &= gfp_allowed_mask; 2687 gfp_mask &= gfp_allowed_mask;
2685 2688
2686 lockdep_trace_alloc(gfp_mask); 2689 lockdep_trace_alloc(gfp_mask);
2687 2690
2688 might_sleep_if(gfp_mask & __GFP_WAIT); 2691 might_sleep_if(gfp_mask & __GFP_WAIT);
2689 2692
2690 if (should_fail_alloc_page(gfp_mask, order)) 2693 if (should_fail_alloc_page(gfp_mask, order))
2691 return NULL; 2694 return NULL;
2692 2695
2693 /* 2696 /*
2694 * Check the zones suitable for the gfp_mask contain at least one 2697 * Check the zones suitable for the gfp_mask contain at least one
2695 * valid zone. It's possible to have an empty zonelist as a result 2698 * valid zone. It's possible to have an empty zonelist as a result
2696 * of GFP_THISNODE and a memoryless node 2699 * of GFP_THISNODE and a memoryless node
2697 */ 2700 */
2698 if (unlikely(!zonelist->_zonerefs->zone)) 2701 if (unlikely(!zonelist->_zonerefs->zone))
2699 return NULL; 2702 return NULL;
2700 2703
2701 /* 2704 /*
2702 * Will only have any effect when __GFP_KMEMCG is set. This is 2705 * Will only have any effect when __GFP_KMEMCG is set. This is
2703 * verified in the (always inline) callee 2706 * verified in the (always inline) callee
2704 */ 2707 */
2705 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) 2708 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2706 return NULL; 2709 return NULL;
2707 2710
2708 retry_cpuset: 2711 retry_cpuset:
2709 cpuset_mems_cookie = read_mems_allowed_begin(); 2712 cpuset_mems_cookie = read_mems_allowed_begin();
2710 2713
2711 /* The preferred zone is used for statistics later */ 2714 /* The preferred zone is used for statistics later */
2712 first_zones_zonelist(zonelist, high_zoneidx, 2715 first_zones_zonelist(zonelist, high_zoneidx,
2713 nodemask ? : &cpuset_current_mems_allowed, 2716 nodemask ? : &cpuset_current_mems_allowed,
2714 &preferred_zone); 2717 &preferred_zone);
2715 if (!preferred_zone) 2718 if (!preferred_zone)
2716 goto out; 2719 goto out;
2717 2720
2718 #ifdef CONFIG_CMA 2721 #ifdef CONFIG_CMA
2719 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2722 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2720 alloc_flags |= ALLOC_CMA; 2723 alloc_flags |= ALLOC_CMA;
2721 #endif 2724 #endif
2722 retry: 2725 retry:
2723 /* First allocation attempt */ 2726 /* First allocation attempt */
2724 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2727 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2725 zonelist, high_zoneidx, alloc_flags, 2728 zonelist, high_zoneidx, alloc_flags,
2726 preferred_zone, migratetype); 2729 preferred_zone, migratetype);
2727 if (unlikely(!page)) { 2730 if (unlikely(!page)) {
2728 /* 2731 /*
2729 * The first pass makes sure allocations are spread 2732 * The first pass makes sure allocations are spread
2730 * fairly within the local node. However, the local 2733 * fairly within the local node. However, the local
2731 * node might have free pages left after the fairness 2734 * node might have free pages left after the fairness
2732 * batches are exhausted, and remote zones haven't 2735 * batches are exhausted, and remote zones haven't
2733 * even been considered yet. Try once more without 2736 * even been considered yet. Try once more without
2734 * fairness, and include remote zones now, before 2737 * fairness, and include remote zones now, before
2735 * entering the slowpath and waking kswapd: prefer 2738 * entering the slowpath and waking kswapd: prefer
2736 * spilling to a remote zone over swapping locally. 2739 * spilling to a remote zone over swapping locally.
2737 */ 2740 */
2738 if (alloc_flags & ALLOC_FAIR) { 2741 if (alloc_flags & ALLOC_FAIR) {
2739 reset_alloc_batches(zonelist, high_zoneidx, 2742 reset_alloc_batches(zonelist, high_zoneidx,
2740 preferred_zone); 2743 preferred_zone);
2741 alloc_flags &= ~ALLOC_FAIR; 2744 alloc_flags &= ~ALLOC_FAIR;
2742 goto retry; 2745 goto retry;
2743 } 2746 }
2744 /* 2747 /*
2745 * Runtime PM, block IO and its error handling path 2748 * Runtime PM, block IO and its error handling path
2746 * can deadlock because I/O on the device might not 2749 * can deadlock because I/O on the device might not
2747 * complete. 2750 * complete.
2748 */ 2751 */
2749 gfp_mask = memalloc_noio_flags(gfp_mask); 2752 gfp_mask = memalloc_noio_flags(gfp_mask);
2750 page = __alloc_pages_slowpath(gfp_mask, order, 2753 page = __alloc_pages_slowpath(gfp_mask, order,
2751 zonelist, high_zoneidx, nodemask, 2754 zonelist, high_zoneidx, nodemask,
2752 preferred_zone, migratetype); 2755 preferred_zone, migratetype);
2753 } 2756 }
2754 2757
2755 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2758 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2756 2759
2757 out: 2760 out:
2758 /* 2761 /*
2759 * When updating a task's mems_allowed, it is possible to race with 2762 * When updating a task's mems_allowed, it is possible to race with
2760 * parallel threads in such a way that an allocation can fail while 2763 * parallel threads in such a way that an allocation can fail while
2761 * the mask is being updated. If a page allocation is about to fail, 2764 * the mask is being updated. If a page allocation is about to fail,
2762 * check if the cpuset changed during allocation and if so, retry. 2765 * check if the cpuset changed during allocation and if so, retry.
2763 */ 2766 */
2764 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2767 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2765 goto retry_cpuset; 2768 goto retry_cpuset;
2766 2769
2767 memcg_kmem_commit_charge(page, memcg, order); 2770 memcg_kmem_commit_charge(page, memcg, order);
2768 2771
2769 return page; 2772 return page;
2770 } 2773 }
2771 EXPORT_SYMBOL(__alloc_pages_nodemask); 2774 EXPORT_SYMBOL(__alloc_pages_nodemask);
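Editorial aside (not part of this diff): callers normally reach __alloc_pages_nodemask() through wrappers such as alloc_pages(), which supply the zonelist and nodemask for the current context. A minimal sketch under that assumption; example_grab_pages() is a hypothetical helper, not kernel code.

#include <linux/gfp.h>

static struct page *example_grab_pages(void)
{
	/* order-2 request: four contiguous pages, pre-zeroed */
	struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 2);

	if (!page)
		return NULL;	/* even the slowpath could not satisfy it */

	return page;		/* release later with __free_pages(page, 2) */
}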
2772 2775
2773 /* 2776 /*
2774 * Common helper functions. 2777 * Common helper functions.
2775 */ 2778 */
2776 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 2779 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2777 { 2780 {
2778 struct page *page; 2781 struct page *page;
2779 2782
2780 /* 2783 /*
2781 * __get_free_pages() returns a 32-bit address, which cannot represent 2784 * __get_free_pages() returns a 32-bit address, which cannot represent
2782 * a highmem page 2785 * a highmem page
2783 */ 2786 */
2784 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 2787 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2785 2788
2786 page = alloc_pages(gfp_mask, order); 2789 page = alloc_pages(gfp_mask, order);
2787 if (!page) 2790 if (!page)
2788 return 0; 2791 return 0;
2789 return (unsigned long) page_address(page); 2792 return (unsigned long) page_address(page);
2790 } 2793 }
2791 EXPORT_SYMBOL(__get_free_pages); 2794 EXPORT_SYMBOL(__get_free_pages);
2792 2795
2793 unsigned long get_zeroed_page(gfp_t gfp_mask) 2796 unsigned long get_zeroed_page(gfp_t gfp_mask)
2794 { 2797 {
2795 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 2798 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2796 } 2799 }
2797 EXPORT_SYMBOL(get_zeroed_page); 2800 EXPORT_SYMBOL(get_zeroed_page);
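A usage sketch (editorial): these two helpers hand back kernel virtual addresses rather than struct page pointers, so they cannot serve highmem and pair with free_pages()/free_page() defined just below. The caller context is hypothetical and assumed to allow GFP_KERNEL.

	unsigned long buf = __get_free_pages(GFP_KERNEL, 1);	/* two pages */
	unsigned long zeroed = get_zeroed_page(GFP_KERNEL);	/* one zeroed page */

	if (buf)
		free_pages(buf, 1);	/* order must match the allocation */
	if (zeroed)
		free_page(zeroed);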
2798 2801
2799 void __free_pages(struct page *page, unsigned int order) 2802 void __free_pages(struct page *page, unsigned int order)
2800 { 2803 {
2801 if (put_page_testzero(page)) { 2804 if (put_page_testzero(page)) {
2802 if (order == 0) 2805 if (order == 0)
2803 free_hot_cold_page(page, 0); 2806 free_hot_cold_page(page, 0);
2804 else 2807 else
2805 __free_pages_ok(page, order); 2808 __free_pages_ok(page, order);
2806 } 2809 }
2807 } 2810 }
2808 2811
2809 EXPORT_SYMBOL(__free_pages); 2812 EXPORT_SYMBOL(__free_pages);
2810 2813
2811 void free_pages(unsigned long addr, unsigned int order) 2814 void free_pages(unsigned long addr, unsigned int order)
2812 { 2815 {
2813 if (addr != 0) { 2816 if (addr != 0) {
2814 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2817 VM_BUG_ON(!virt_addr_valid((void *)addr));
2815 __free_pages(virt_to_page((void *)addr), order); 2818 __free_pages(virt_to_page((void *)addr), order);
2816 } 2819 }
2817 } 2820 }
2818 2821
2819 EXPORT_SYMBOL(free_pages); 2822 EXPORT_SYMBOL(free_pages);
2820 2823
2821 /* 2824 /*
2822 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free 2825 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
2823 * pages allocated with __GFP_KMEMCG. 2826 * pages allocated with __GFP_KMEMCG.
2824 * 2827 *
2825 * Those pages are accounted to a particular memcg, embedded in the 2828 * Those pages are accounted to a particular memcg, embedded in the
2826 * corresponding page_cgroup. To avoid adding a hit in the allocator to search 2829 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
2827 * for that information only to find out that it is NULL for users who have no 2830 * for that information only to find out that it is NULL for users who have no
2828 * interest in that whatsoever, we provide these functions. 2831 * interest in that whatsoever, we provide these functions.
2829 * 2832 *
2830 * The caller knows better which flags it relies on. 2833 * The caller knows better which flags it relies on.
2831 */ 2834 */
2832 void __free_memcg_kmem_pages(struct page *page, unsigned int order) 2835 void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2833 { 2836 {
2834 memcg_kmem_uncharge_pages(page, order); 2837 memcg_kmem_uncharge_pages(page, order);
2835 __free_pages(page, order); 2838 __free_pages(page, order);
2836 } 2839 }
2837 2840
2838 void free_memcg_kmem_pages(unsigned long addr, unsigned int order) 2841 void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2839 { 2842 {
2840 if (addr != 0) { 2843 if (addr != 0) {
2841 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2844 VM_BUG_ON(!virt_addr_valid((void *)addr));
2842 __free_memcg_kmem_pages(virt_to_page((void *)addr), order); 2845 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2843 } 2846 }
2844 } 2847 }
2845 2848
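As the comment above notes, these free helpers exist only for pages that were charged to a memcg at allocation time via __GFP_KMEMCG. A hedged sketch of the intended pairing, assuming such an allocation; the surrounding caller is hypothetical:

	/* charge the current memcg on allocation ... */
	unsigned long addr = __get_free_pages(GFP_KERNEL | __GFP_KMEMCG, 0);

	/* ... and uncharge it again on the free path */
	if (addr)
		free_memcg_kmem_pages(addr, 0);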
2846 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2849 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2847 { 2850 {
2848 if (addr) { 2851 if (addr) {
2849 unsigned long alloc_end = addr + (PAGE_SIZE << order); 2852 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2850 unsigned long used = addr + PAGE_ALIGN(size); 2853 unsigned long used = addr + PAGE_ALIGN(size);
2851 2854
2852 split_page(virt_to_page((void *)addr), order); 2855 split_page(virt_to_page((void *)addr), order);
2853 while (used < alloc_end) { 2856 while (used < alloc_end) {
2854 free_page(used); 2857 free_page(used);
2855 used += PAGE_SIZE; 2858 used += PAGE_SIZE;
2856 } 2859 }
2857 } 2860 }
2858 return (void *)addr; 2861 return (void *)addr;
2859 } 2862 }
2860 2863
2861 /** 2864 /**
2862 * alloc_pages_exact - allocate an exact number of physically-contiguous pages. 2865 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
2863 * @size: the number of bytes to allocate 2866 * @size: the number of bytes to allocate
2864 * @gfp_mask: GFP flags for the allocation 2867 * @gfp_mask: GFP flags for the allocation
2865 * 2868 *
2866 * This function is similar to alloc_pages(), except that it allocates the 2869 * This function is similar to alloc_pages(), except that it allocates the
2867 * minimum number of pages to satisfy the request. alloc_pages() can only 2870 * minimum number of pages to satisfy the request. alloc_pages() can only
2868 * allocate memory in power-of-two pages. 2871 * allocate memory in power-of-two pages.
2869 * 2872 *
2870 * This function is also limited by MAX_ORDER. 2873 * This function is also limited by MAX_ORDER.
2871 * 2874 *
2872 * Memory allocated by this function must be released by free_pages_exact(). 2875 * Memory allocated by this function must be released by free_pages_exact().
2873 */ 2876 */
2874 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 2877 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2875 { 2878 {
2876 unsigned int order = get_order(size); 2879 unsigned int order = get_order(size);
2877 unsigned long addr; 2880 unsigned long addr;
2878 2881
2879 addr = __get_free_pages(gfp_mask, order); 2882 addr = __get_free_pages(gfp_mask, order);
2880 return make_alloc_exact(addr, order, size); 2883 return make_alloc_exact(addr, order, size);
2881 } 2884 }
2882 EXPORT_SYMBOL(alloc_pages_exact); 2885 EXPORT_SYMBOL(alloc_pages_exact);
2883 2886
2884 /** 2887 /**
2885 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 2888 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2886 * pages on a node. 2889 * pages on a node.
2887 * @nid: the preferred node ID where memory should be allocated 2890 * @nid: the preferred node ID where memory should be allocated
2888 * @size: the number of bytes to allocate 2891 * @size: the number of bytes to allocate
2889 * @gfp_mask: GFP flags for the allocation 2892 * @gfp_mask: GFP flags for the allocation
2890 * 2893 *
2891 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 2894 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2892 * back. 2895 * back.
2893 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2896 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2894 * but is not exact. 2897 * but is not exact.
2895 */ 2898 */
2896 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2899 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2897 { 2900 {
2898 unsigned order = get_order(size); 2901 unsigned order = get_order(size);
2899 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2902 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2900 if (!p) 2903 if (!p)
2901 return NULL; 2904 return NULL;
2902 return make_alloc_exact((unsigned long)page_address(p), order, size); 2905 return make_alloc_exact((unsigned long)page_address(p), order, size);
2903 } 2906 }
2904 EXPORT_SYMBOL(alloc_pages_exact_nid); 2907 EXPORT_SYMBOL(alloc_pages_exact_nid);
2905 2908
2906 /** 2909 /**
2907 * free_pages_exact - release memory allocated via alloc_pages_exact() 2910 * free_pages_exact - release memory allocated via alloc_pages_exact()
2908 * @virt: the value returned by alloc_pages_exact. 2911 * @virt: the value returned by alloc_pages_exact.
2909 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2912 * @size: size of allocation, same value as passed to alloc_pages_exact().
2910 * 2913 *
2911 * Release the memory allocated by a previous call to alloc_pages_exact. 2914 * Release the memory allocated by a previous call to alloc_pages_exact.
2912 */ 2915 */
2913 void free_pages_exact(void *virt, size_t size) 2916 void free_pages_exact(void *virt, size_t size)
2914 { 2917 {
2915 unsigned long addr = (unsigned long)virt; 2918 unsigned long addr = (unsigned long)virt;
2916 unsigned long end = addr + PAGE_ALIGN(size); 2919 unsigned long end = addr + PAGE_ALIGN(size);
2917 2920
2918 while (addr < end) { 2921 while (addr < end) {
2919 free_page(addr); 2922 free_page(addr);
2920 addr += PAGE_SIZE; 2923 addr += PAGE_SIZE;
2921 } 2924 }
2922 } 2925 }
2923 EXPORT_SYMBOL(free_pages_exact); 2926 EXPORT_SYMBOL(free_pages_exact);
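A round-trip sketch (editorial, sizes hypothetical): with 4 KiB pages, a 10 KiB request is raised to an order-2 (16 KiB) allocation, make_alloc_exact() splits it and immediately gives back the unused fourth page, and the caller ends up with exactly three pages.

	void *buf = alloc_pages_exact(10 * 1024, GFP_KERNEL);

	if (buf) {
		/* ... use 10 KiB of physically contiguous memory ... */
		free_pages_exact(buf, 10 * 1024);	/* same size as requested */
	}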
2924 2927
2925 /** 2928 /**
2926 * nr_free_zone_pages - count number of pages beyond high watermark 2929 * nr_free_zone_pages - count number of pages beyond high watermark
2927 * @offset: The zone index of the highest zone 2930 * @offset: The zone index of the highest zone
2928 * 2931 *
2929 * nr_free_zone_pages() counts the number of pages which are beyond the 2932 * nr_free_zone_pages() counts the number of pages which are beyond the
2930 * high watermark within all zones at or below a given zone index. For each 2933 * high watermark within all zones at or below a given zone index. For each
2931 * zone, the number of pages is calculated as: 2934 * zone, the number of pages is calculated as:
2932 * managed_pages - high_pages 2935 * managed_pages - high_pages
2933 */ 2936 */
2934 static unsigned long nr_free_zone_pages(int offset) 2937 static unsigned long nr_free_zone_pages(int offset)
2935 { 2938 {
2936 struct zoneref *z; 2939 struct zoneref *z;
2937 struct zone *zone; 2940 struct zone *zone;
2938 2941
2939 /* Just pick one node, since fallback list is circular */ 2942 /* Just pick one node, since fallback list is circular */
2940 unsigned long sum = 0; 2943 unsigned long sum = 0;
2941 2944
2942 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2945 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2943 2946
2944 for_each_zone_zonelist(zone, z, zonelist, offset) { 2947 for_each_zone_zonelist(zone, z, zonelist, offset) {
2945 unsigned long size = zone->managed_pages; 2948 unsigned long size = zone->managed_pages;
2946 unsigned long high = high_wmark_pages(zone); 2949 unsigned long high = high_wmark_pages(zone);
2947 if (size > high) 2950 if (size > high)
2948 sum += size - high; 2951 sum += size - high;
2949 } 2952 }
2950 2953
2951 return sum; 2954 return sum;
2952 } 2955 }
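Worked example of the per-zone formula above (numbers hypothetical): a zone with managed_pages = 262144 and a high watermark of 4096 pages contributes 262144 - 4096 = 258048 pages to the sum, while a zone already at or below its high watermark contributes nothing because of the size > high guard.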
2953 2956
2954 /** 2957 /**
2955 * nr_free_buffer_pages - count number of pages beyond high watermark 2958 * nr_free_buffer_pages - count number of pages beyond high watermark
2956 * 2959 *
2957 * nr_free_buffer_pages() counts the number of pages which are beyond the high 2960 * nr_free_buffer_pages() counts the number of pages which are beyond the high
2958 * watermark within ZONE_DMA and ZONE_NORMAL. 2961 * watermark within ZONE_DMA and ZONE_NORMAL.
2959 */ 2962 */
2960 unsigned long nr_free_buffer_pages(void) 2963 unsigned long nr_free_buffer_pages(void)
2961 { 2964 {
2962 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2965 return nr_free_zone_pages(gfp_zone(GFP_USER));
2963 } 2966 }
2964 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2967 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2965 2968
2966 /** 2969 /**
2967 * nr_free_pagecache_pages - count number of pages beyond high watermark 2970 * nr_free_pagecache_pages - count number of pages beyond high watermark
2968 * 2971 *
2969 * nr_free_pagecache_pages() counts the number of pages which are beyond the 2972 * nr_free_pagecache_pages() counts the number of pages which are beyond the
2970 * high watermark within all zones. 2973 * high watermark within all zones.
2971 */ 2974 */
2972 unsigned long nr_free_pagecache_pages(void) 2975 unsigned long nr_free_pagecache_pages(void)
2973 { 2976 {
2974 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 2977 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2975 } 2978 }
2976 2979
2977 static inline void show_node(struct zone *zone) 2980 static inline void show_node(struct zone *zone)
2978 { 2981 {
2979 if (IS_ENABLED(CONFIG_NUMA)) 2982 if (IS_ENABLED(CONFIG_NUMA))
2980 printk("Node %d ", zone_to_nid(zone)); 2983 printk("Node %d ", zone_to_nid(zone));
2981 } 2984 }
2982 2985
2983 void si_meminfo(struct sysinfo *val) 2986 void si_meminfo(struct sysinfo *val)
2984 { 2987 {
2985 val->totalram = totalram_pages; 2988 val->totalram = totalram_pages;
2986 val->sharedram = 0; 2989 val->sharedram = 0;
2987 val->freeram = global_page_state(NR_FREE_PAGES); 2990 val->freeram = global_page_state(NR_FREE_PAGES);
2988 val->bufferram = nr_blockdev_pages(); 2991 val->bufferram = nr_blockdev_pages();
2989 val->totalhigh = totalhigh_pages; 2992 val->totalhigh = totalhigh_pages;
2990 val->freehigh = nr_free_highpages(); 2993 val->freehigh = nr_free_highpages();
2991 val->mem_unit = PAGE_SIZE; 2994 val->mem_unit = PAGE_SIZE;
2992 } 2995 }
2993 2996
2994 EXPORT_SYMBOL(si_meminfo); 2997 EXPORT_SYMBOL(si_meminfo);
2995 2998
2996 #ifdef CONFIG_NUMA 2999 #ifdef CONFIG_NUMA
2997 void si_meminfo_node(struct sysinfo *val, int nid) 3000 void si_meminfo_node(struct sysinfo *val, int nid)
2998 { 3001 {
2999 int zone_type; /* needs to be signed */ 3002 int zone_type; /* needs to be signed */
3000 unsigned long managed_pages = 0; 3003 unsigned long managed_pages = 0;
3001 pg_data_t *pgdat = NODE_DATA(nid); 3004 pg_data_t *pgdat = NODE_DATA(nid);
3002 3005
3003 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 3006 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
3004 managed_pages += pgdat->node_zones[zone_type].managed_pages; 3007 managed_pages += pgdat->node_zones[zone_type].managed_pages;
3005 val->totalram = managed_pages; 3008 val->totalram = managed_pages;
3006 val->freeram = node_page_state(nid, NR_FREE_PAGES); 3009 val->freeram = node_page_state(nid, NR_FREE_PAGES);
3007 #ifdef CONFIG_HIGHMEM 3010 #ifdef CONFIG_HIGHMEM
3008 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 3011 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
3009 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 3012 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
3010 NR_FREE_PAGES); 3013 NR_FREE_PAGES);
3011 #else 3014 #else
3012 val->totalhigh = 0; 3015 val->totalhigh = 0;
3013 val->freehigh = 0; 3016 val->freehigh = 0;
3014 #endif 3017 #endif
3015 val->mem_unit = PAGE_SIZE; 3018 val->mem_unit = PAGE_SIZE;
3016 } 3019 }
3017 #endif 3020 #endif
3018 3021
3019 /* 3022 /*
3020 * Determine whether the node should be displayed or not, depending on whether 3023 * Determine whether the node should be displayed or not, depending on whether
3021 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 3024 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
3022 */ 3025 */
3023 bool skip_free_areas_node(unsigned int flags, int nid) 3026 bool skip_free_areas_node(unsigned int flags, int nid)
3024 { 3027 {
3025 bool ret = false; 3028 bool ret = false;
3026 unsigned int cpuset_mems_cookie; 3029 unsigned int cpuset_mems_cookie;
3027 3030
3028 if (!(flags & SHOW_MEM_FILTER_NODES)) 3031 if (!(flags & SHOW_MEM_FILTER_NODES))
3029 goto out; 3032 goto out;
3030 3033
3031 do { 3034 do {
3032 cpuset_mems_cookie = read_mems_allowed_begin(); 3035 cpuset_mems_cookie = read_mems_allowed_begin();
3033 ret = !node_isset(nid, cpuset_current_mems_allowed); 3036 ret = !node_isset(nid, cpuset_current_mems_allowed);
3034 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 3037 } while (read_mems_allowed_retry(cpuset_mems_cookie));
3035 out: 3038 out:
3036 return ret; 3039 return ret;
3037 } 3040 }
3038 3041
3039 #define K(x) ((x) << (PAGE_SHIFT-10)) 3042 #define K(x) ((x) << (PAGE_SHIFT-10))
3040 3043
3041 static void show_migration_types(unsigned char type) 3044 static void show_migration_types(unsigned char type)
3042 { 3045 {
3043 static const char types[MIGRATE_TYPES] = { 3046 static const char types[MIGRATE_TYPES] = {
3044 [MIGRATE_UNMOVABLE] = 'U', 3047 [MIGRATE_UNMOVABLE] = 'U',
3045 [MIGRATE_RECLAIMABLE] = 'E', 3048 [MIGRATE_RECLAIMABLE] = 'E',
3046 [MIGRATE_MOVABLE] = 'M', 3049 [MIGRATE_MOVABLE] = 'M',
3047 [MIGRATE_RESERVE] = 'R', 3050 [MIGRATE_RESERVE] = 'R',
3048 #ifdef CONFIG_CMA 3051 #ifdef CONFIG_CMA
3049 [MIGRATE_CMA] = 'C', 3052 [MIGRATE_CMA] = 'C',
3050 #endif 3053 #endif
3051 #ifdef CONFIG_MEMORY_ISOLATION 3054 #ifdef CONFIG_MEMORY_ISOLATION
3052 [MIGRATE_ISOLATE] = 'I', 3055 [MIGRATE_ISOLATE] = 'I',
3053 #endif 3056 #endif
3054 }; 3057 };
3055 char tmp[MIGRATE_TYPES + 1]; 3058 char tmp[MIGRATE_TYPES + 1];
3056 char *p = tmp; 3059 char *p = tmp;
3057 int i; 3060 int i;
3058 3061
3059 for (i = 0; i < MIGRATE_TYPES; i++) { 3062 for (i = 0; i < MIGRATE_TYPES; i++) {
3060 if (type & (1 << i)) 3063 if (type & (1 << i))
3061 *p++ = types[i]; 3064 *p++ = types[i];
3062 } 3065 }
3063 3066
3064 *p = '\0'; 3067 *p = '\0';
3065 printk("(%s) ", tmp); 3068 printk("(%s) ", tmp);
3066 } 3069 }
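For example (values illustrative): if a given order has pages on both the unmovable and movable free lists, the caller passes type = (1 << MIGRATE_UNMOVABLE) | (1 << MIGRATE_MOVABLE) and the annotation printed next to that order is "(UM) "; an order holding only CMA pages would show "(C) " on CONFIG_CMA kernels.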
3067 3070
3068 /* 3071 /*
3069 * Show free area list (used inside shift_scroll-lock stuff) 3072 * Show free area list (used inside shift_scroll-lock stuff)
3070 * We also calculate the percentage fragmentation. We do this by counting the 3073 * We also calculate the percentage fragmentation. We do this by counting the
3071 * memory on each free list with the exception of the first item on the list. 3074 * memory on each free list with the exception of the first item on the list.
3072 * Suppresses nodes that are not allowed by current's cpuset if 3075 * Suppresses nodes that are not allowed by current's cpuset if
3073 * SHOW_MEM_FILTER_NODES is passed. 3076 * SHOW_MEM_FILTER_NODES is passed.
3074 */ 3077 */
3075 void show_free_areas(unsigned int filter) 3078 void show_free_areas(unsigned int filter)
3076 { 3079 {
3077 int cpu; 3080 int cpu;
3078 struct zone *zone; 3081 struct zone *zone;
3079 3082
3080 for_each_populated_zone(zone) { 3083 for_each_populated_zone(zone) {
3081 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3084 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3082 continue; 3085 continue;
3083 show_node(zone); 3086 show_node(zone);
3084 printk("%s per-cpu:\n", zone->name); 3087 printk("%s per-cpu:\n", zone->name);
3085 3088
3086 for_each_online_cpu(cpu) { 3089 for_each_online_cpu(cpu) {
3087 struct per_cpu_pageset *pageset; 3090 struct per_cpu_pageset *pageset;
3088 3091
3089 pageset = per_cpu_ptr(zone->pageset, cpu); 3092 pageset = per_cpu_ptr(zone->pageset, cpu);
3090 3093
3091 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 3094 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
3092 cpu, pageset->pcp.high, 3095 cpu, pageset->pcp.high,
3093 pageset->pcp.batch, pageset->pcp.count); 3096 pageset->pcp.batch, pageset->pcp.count);
3094 } 3097 }
3095 } 3098 }
3096 3099
3097 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 3100 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
3098 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 3101 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
3099 " unevictable:%lu" 3102 " unevictable:%lu"
3100 " dirty:%lu writeback:%lu unstable:%lu\n" 3103 " dirty:%lu writeback:%lu unstable:%lu\n"
3101 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 3104 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
3102 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 3105 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
3103 " free_cma:%lu\n", 3106 " free_cma:%lu\n",
3104 global_page_state(NR_ACTIVE_ANON), 3107 global_page_state(NR_ACTIVE_ANON),
3105 global_page_state(NR_INACTIVE_ANON), 3108 global_page_state(NR_INACTIVE_ANON),
3106 global_page_state(NR_ISOLATED_ANON), 3109 global_page_state(NR_ISOLATED_ANON),
3107 global_page_state(NR_ACTIVE_FILE), 3110 global_page_state(NR_ACTIVE_FILE),
3108 global_page_state(NR_INACTIVE_FILE), 3111 global_page_state(NR_INACTIVE_FILE),
3109 global_page_state(NR_ISOLATED_FILE), 3112 global_page_state(NR_ISOLATED_FILE),
3110 global_page_state(NR_UNEVICTABLE), 3113 global_page_state(NR_UNEVICTABLE),
3111 global_page_state(NR_FILE_DIRTY), 3114 global_page_state(NR_FILE_DIRTY),
3112 global_page_state(NR_WRITEBACK), 3115 global_page_state(NR_WRITEBACK),
3113 global_page_state(NR_UNSTABLE_NFS), 3116 global_page_state(NR_UNSTABLE_NFS),
3114 global_page_state(NR_FREE_PAGES), 3117 global_page_state(NR_FREE_PAGES),
3115 global_page_state(NR_SLAB_RECLAIMABLE), 3118 global_page_state(NR_SLAB_RECLAIMABLE),
3116 global_page_state(NR_SLAB_UNRECLAIMABLE), 3119 global_page_state(NR_SLAB_UNRECLAIMABLE),
3117 global_page_state(NR_FILE_MAPPED), 3120 global_page_state(NR_FILE_MAPPED),
3118 global_page_state(NR_SHMEM), 3121 global_page_state(NR_SHMEM),
3119 global_page_state(NR_PAGETABLE), 3122 global_page_state(NR_PAGETABLE),
3120 global_page_state(NR_BOUNCE), 3123 global_page_state(NR_BOUNCE),
3121 global_page_state(NR_FREE_CMA_PAGES)); 3124 global_page_state(NR_FREE_CMA_PAGES));
3122 3125
3123 for_each_populated_zone(zone) { 3126 for_each_populated_zone(zone) {
3124 int i; 3127 int i;
3125 3128
3126 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3129 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3127 continue; 3130 continue;
3128 show_node(zone); 3131 show_node(zone);
3129 printk("%s" 3132 printk("%s"
3130 " free:%lukB" 3133 " free:%lukB"
3131 " min:%lukB" 3134 " min:%lukB"
3132 " low:%lukB" 3135 " low:%lukB"
3133 " high:%lukB" 3136 " high:%lukB"
3134 " active_anon:%lukB" 3137 " active_anon:%lukB"
3135 " inactive_anon:%lukB" 3138 " inactive_anon:%lukB"
3136 " active_file:%lukB" 3139 " active_file:%lukB"
3137 " inactive_file:%lukB" 3140 " inactive_file:%lukB"
3138 " unevictable:%lukB" 3141 " unevictable:%lukB"
3139 " isolated(anon):%lukB" 3142 " isolated(anon):%lukB"
3140 " isolated(file):%lukB" 3143 " isolated(file):%lukB"
3141 " present:%lukB" 3144 " present:%lukB"
3142 " managed:%lukB" 3145 " managed:%lukB"
3143 " mlocked:%lukB" 3146 " mlocked:%lukB"
3144 " dirty:%lukB" 3147 " dirty:%lukB"
3145 " writeback:%lukB" 3148 " writeback:%lukB"
3146 " mapped:%lukB" 3149 " mapped:%lukB"
3147 " shmem:%lukB" 3150 " shmem:%lukB"
3148 " slab_reclaimable:%lukB" 3151 " slab_reclaimable:%lukB"
3149 " slab_unreclaimable:%lukB" 3152 " slab_unreclaimable:%lukB"
3150 " kernel_stack:%lukB" 3153 " kernel_stack:%lukB"
3151 " pagetables:%lukB" 3154 " pagetables:%lukB"
3152 " unstable:%lukB" 3155 " unstable:%lukB"
3153 " bounce:%lukB" 3156 " bounce:%lukB"
3154 " free_cma:%lukB" 3157 " free_cma:%lukB"
3155 " writeback_tmp:%lukB" 3158 " writeback_tmp:%lukB"
3156 " pages_scanned:%lu" 3159 " pages_scanned:%lu"
3157 " all_unreclaimable? %s" 3160 " all_unreclaimable? %s"
3158 "\n", 3161 "\n",
3159 zone->name, 3162 zone->name,
3160 K(zone_page_state(zone, NR_FREE_PAGES)), 3163 K(zone_page_state(zone, NR_FREE_PAGES)),
3161 K(min_wmark_pages(zone)), 3164 K(min_wmark_pages(zone)),
3162 K(low_wmark_pages(zone)), 3165 K(low_wmark_pages(zone)),
3163 K(high_wmark_pages(zone)), 3166 K(high_wmark_pages(zone)),
3164 K(zone_page_state(zone, NR_ACTIVE_ANON)), 3167 K(zone_page_state(zone, NR_ACTIVE_ANON)),
3165 K(zone_page_state(zone, NR_INACTIVE_ANON)), 3168 K(zone_page_state(zone, NR_INACTIVE_ANON)),
3166 K(zone_page_state(zone, NR_ACTIVE_FILE)), 3169 K(zone_page_state(zone, NR_ACTIVE_FILE)),
3167 K(zone_page_state(zone, NR_INACTIVE_FILE)), 3170 K(zone_page_state(zone, NR_INACTIVE_FILE)),
3168 K(zone_page_state(zone, NR_UNEVICTABLE)), 3171 K(zone_page_state(zone, NR_UNEVICTABLE)),
3169 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3172 K(zone_page_state(zone, NR_ISOLATED_ANON)),
3170 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3173 K(zone_page_state(zone, NR_ISOLATED_FILE)),
3171 K(zone->present_pages), 3174 K(zone->present_pages),
3172 K(zone->managed_pages), 3175 K(zone->managed_pages),
3173 K(zone_page_state(zone, NR_MLOCK)), 3176 K(zone_page_state(zone, NR_MLOCK)),
3174 K(zone_page_state(zone, NR_FILE_DIRTY)), 3177 K(zone_page_state(zone, NR_FILE_DIRTY)),
3175 K(zone_page_state(zone, NR_WRITEBACK)), 3178 K(zone_page_state(zone, NR_WRITEBACK)),
3176 K(zone_page_state(zone, NR_FILE_MAPPED)), 3179 K(zone_page_state(zone, NR_FILE_MAPPED)),
3177 K(zone_page_state(zone, NR_SHMEM)), 3180 K(zone_page_state(zone, NR_SHMEM)),
3178 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 3181 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
3179 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 3182 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
3180 zone_page_state(zone, NR_KERNEL_STACK) * 3183 zone_page_state(zone, NR_KERNEL_STACK) *
3181 THREAD_SIZE / 1024, 3184 THREAD_SIZE / 1024,
3182 K(zone_page_state(zone, NR_PAGETABLE)), 3185 K(zone_page_state(zone, NR_PAGETABLE)),
3183 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 3186 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
3184 K(zone_page_state(zone, NR_BOUNCE)), 3187 K(zone_page_state(zone, NR_BOUNCE)),
3185 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3188 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3186 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3189 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3187 zone->pages_scanned, 3190 zone->pages_scanned,
3188 (!zone_reclaimable(zone) ? "yes" : "no") 3191 (!zone_reclaimable(zone) ? "yes" : "no")
3189 ); 3192 );
3190 printk("lowmem_reserve[]:"); 3193 printk("lowmem_reserve[]:");
3191 for (i = 0; i < MAX_NR_ZONES; i++) 3194 for (i = 0; i < MAX_NR_ZONES; i++)
3192 printk(" %lu", zone->lowmem_reserve[i]); 3195 printk(" %lu", zone->lowmem_reserve[i]);
3193 printk("\n"); 3196 printk("\n");
3194 } 3197 }
3195 3198
3196 for_each_populated_zone(zone) { 3199 for_each_populated_zone(zone) {
3197 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3200 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3198 unsigned char types[MAX_ORDER]; 3201 unsigned char types[MAX_ORDER];
3199 3202
3200 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3203 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3201 continue; 3204 continue;
3202 show_node(zone); 3205 show_node(zone);
3203 printk("%s: ", zone->name); 3206 printk("%s: ", zone->name);
3204 3207
3205 spin_lock_irqsave(&zone->lock, flags); 3208 spin_lock_irqsave(&zone->lock, flags);
3206 for (order = 0; order < MAX_ORDER; order++) { 3209 for (order = 0; order < MAX_ORDER; order++) {
3207 struct free_area *area = &zone->free_area[order]; 3210 struct free_area *area = &zone->free_area[order];
3208 int type; 3211 int type;
3209 3212
3210 nr[order] = area->nr_free; 3213 nr[order] = area->nr_free;
3211 total += nr[order] << order; 3214 total += nr[order] << order;
3212 3215
3213 types[order] = 0; 3216 types[order] = 0;
3214 for (type = 0; type < MIGRATE_TYPES; type++) { 3217 for (type = 0; type < MIGRATE_TYPES; type++) {
3215 if (!list_empty(&area->free_list[type])) 3218 if (!list_empty(&area->free_list[type]))
3216 types[order] |= 1 << type; 3219 types[order] |= 1 << type;
3217 } 3220 }
3218 } 3221 }
3219 spin_unlock_irqrestore(&zone->lock, flags); 3222 spin_unlock_irqrestore(&zone->lock, flags);
3220 for (order = 0; order < MAX_ORDER; order++) { 3223 for (order = 0; order < MAX_ORDER; order++) {
3221 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3224 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3222 if (nr[order]) 3225 if (nr[order])
3223 show_migration_types(types[order]); 3226 show_migration_types(types[order]);
3224 } 3227 }
3225 printk("= %lukB\n", K(total)); 3228 printk("= %lukB\n", K(total));
3226 } 3229 }
3227 3230
3228 hugetlb_show_meminfo(); 3231 hugetlb_show_meminfo();
3229 3232
3230 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 3233 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
3231 3234
3232 show_swap_cache_info(); 3235 show_swap_cache_info();
3233 } 3236 }
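Editorial note on how this output is normally seen: the dump is produced via show_mem(), e.g. when the OOM killer fires or when SysRq-m is triggered, and the SHOW_MEM_FILTER_NODES filter decides whether zones on nodes outside the current task's cpuset are suppressed, per skip_free_areas_node() above.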
3234 3237
3235 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 3238 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3236 { 3239 {
3237 zoneref->zone = zone; 3240 zoneref->zone = zone;
3238 zoneref->zone_idx = zone_idx(zone); 3241 zoneref->zone_idx = zone_idx(zone);
3239 } 3242 }
3240 3243
3241 /* 3244 /*
3242 * Builds allocation fallback zone lists. 3245 * Builds allocation fallback zone lists.
3243 * 3246 *
3244 * Add all populated zones of a node to the zonelist. 3247 * Add all populated zones of a node to the zonelist.
3245 */ 3248 */
3246 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 3249 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3247 int nr_zones) 3250 int nr_zones)
3248 { 3251 {
3249 struct zone *zone; 3252 struct zone *zone;
3250 enum zone_type zone_type = MAX_NR_ZONES; 3253 enum zone_type zone_type = MAX_NR_ZONES;
3251 3254
3252 do { 3255 do {
3253 zone_type--; 3256 zone_type--;
3254 zone = pgdat->node_zones + zone_type; 3257 zone = pgdat->node_zones + zone_type;
3255 if (populated_zone(zone)) { 3258 if (populated_zone(zone)) {
3256 zoneref_set_zone(zone, 3259 zoneref_set_zone(zone,
3257 &zonelist->_zonerefs[nr_zones++]); 3260 &zonelist->_zonerefs[nr_zones++]);
3258 check_highest_zone(zone_type); 3261 check_highest_zone(zone_type);
3259 } 3262 }
3260 } while (zone_type); 3263 } while (zone_type);
3261 3264
3262 return nr_zones; 3265 return nr_zones;
3263 } 3266 }
3264 3267
3265 3268
3266 /* 3269 /*
3267 * zonelist_order: 3270 * zonelist_order:
3268 * 0 = automatic detection of better ordering. 3271 * 0 = automatic detection of better ordering.
3269 * 1 = order by ([node] distance, -zonetype) 3272 * 1 = order by ([node] distance, -zonetype)
3270 * 2 = order by (-zonetype, [node] distance) 3273 * 2 = order by (-zonetype, [node] distance)
3271 * 3274 *
3272 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 3275 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
3273 * the same zonelist. So only NUMA can configure this param. 3276 * the same zonelist. So only NUMA can configure this param.
3274 */ 3277 */
3275 #define ZONELIST_ORDER_DEFAULT 0 3278 #define ZONELIST_ORDER_DEFAULT 0
3276 #define ZONELIST_ORDER_NODE 1 3279 #define ZONELIST_ORDER_NODE 1
3277 #define ZONELIST_ORDER_ZONE 2 3280 #define ZONELIST_ORDER_ZONE 2
3278 3281
3279 /* zonelist order in the kernel. 3282 /* zonelist order in the kernel.
3280 * set_zonelist_order() will set this to NODE or ZONE. 3283 * set_zonelist_order() will set this to NODE or ZONE.
3281 */ 3284 */
3282 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 3285 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
3283 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 3286 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
3284 3287
3285 3288
3286 #ifdef CONFIG_NUMA 3289 #ifdef CONFIG_NUMA
3287 /* The value the user specified, possibly changed by config */ 3290 /* The value the user specified, possibly changed by config */
3288 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3291 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3289 /* string for sysctl */ 3292 /* string for sysctl */
3290 #define NUMA_ZONELIST_ORDER_LEN 16 3293 #define NUMA_ZONELIST_ORDER_LEN 16
3291 char numa_zonelist_order[16] = "default"; 3294 char numa_zonelist_order[16] = "default";
3292 3295
3293 /* 3296 /*
3294 * interface for configuring zonelist ordering. 3297 * interface for configuring zonelist ordering.
3295 * command line option "numa_zonelist_order" 3298 * command line option "numa_zonelist_order"
3296 * = "[dD]efault - default, automatic configuration. 3299 * = "[dD]efault - default, automatic configuration.
3297 * = "[nN]ode - order by node locality, then by zone within node 3300 * = "[nN]ode - order by node locality, then by zone within node
3298 * = "[zZ]one - order by zone, then by locality within zone 3301 * = "[zZ]one - order by zone, then by locality within zone
3299 */ 3302 */
3300 3303
3301 static int __parse_numa_zonelist_order(char *s) 3304 static int __parse_numa_zonelist_order(char *s)
3302 { 3305 {
3303 if (*s == 'd' || *s == 'D') { 3306 if (*s == 'd' || *s == 'D') {
3304 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3307 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3305 } else if (*s == 'n' || *s == 'N') { 3308 } else if (*s == 'n' || *s == 'N') {
3306 user_zonelist_order = ZONELIST_ORDER_NODE; 3309 user_zonelist_order = ZONELIST_ORDER_NODE;
3307 } else if (*s == 'z' || *s == 'Z') { 3310 } else if (*s == 'z' || *s == 'Z') {
3308 user_zonelist_order = ZONELIST_ORDER_ZONE; 3311 user_zonelist_order = ZONELIST_ORDER_ZONE;
3309 } else { 3312 } else {
3310 printk(KERN_WARNING 3313 printk(KERN_WARNING
3311 "Ignoring invalid numa_zonelist_order value: " 3314 "Ignoring invalid numa_zonelist_order value: "
3312 "%s\n", s); 3315 "%s\n", s);
3313 return -EINVAL; 3316 return -EINVAL;
3314 } 3317 }
3315 return 0; 3318 return 0;
3316 } 3319 }
3317 3320
3318 static __init int setup_numa_zonelist_order(char *s) 3321 static __init int setup_numa_zonelist_order(char *s)
3319 { 3322 {
3320 int ret; 3323 int ret;
3321 3324
3322 if (!s) 3325 if (!s)
3323 return 0; 3326 return 0;
3324 3327
3325 ret = __parse_numa_zonelist_order(s); 3328 ret = __parse_numa_zonelist_order(s);
3326 if (ret == 0) 3329 if (ret == 0)
3327 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 3330 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3328 3331
3329 return ret; 3332 return ret;
3330 } 3333 }
3331 early_param("numa_zonelist_order", setup_numa_zonelist_order); 3334 early_param("numa_zonelist_order", setup_numa_zonelist_order);
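In practice (hedged, spellings per the parser above): the ordering can be chosen at boot with numa_zonelist_order=d|n|z (or Default/Node/Zone), and on NUMA kernels of this vintage it can also be changed at runtime through the numa_zonelist_order sysctl handled just below, e.g. by writing "zone" to /proc/sys/vm/numa_zonelist_order.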
3332 3335
3333 /* 3336 /*
3334 * sysctl handler for numa_zonelist_order 3337 * sysctl handler for numa_zonelist_order
3335 */ 3338 */
3336 int numa_zonelist_order_handler(ctl_table *table, int write, 3339 int numa_zonelist_order_handler(ctl_table *table, int write,
3337 void __user *buffer, size_t *length, 3340 void __user *buffer, size_t *length,
3338 loff_t *ppos) 3341 loff_t *ppos)
3339 { 3342 {
3340 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 3343 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3341 int ret; 3344 int ret;
3342 static DEFINE_MUTEX(zl_order_mutex); 3345 static DEFINE_MUTEX(zl_order_mutex);
3343 3346
3344 mutex_lock(&zl_order_mutex); 3347 mutex_lock(&zl_order_mutex);
3345 if (write) { 3348 if (write) {
3346 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { 3349 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
3347 ret = -EINVAL; 3350 ret = -EINVAL;
3348 goto out; 3351 goto out;
3349 } 3352 }
3350 strcpy(saved_string, (char *)table->data); 3353 strcpy(saved_string, (char *)table->data);
3351 } 3354 }
3352 ret = proc_dostring(table, write, buffer, length, ppos); 3355 ret = proc_dostring(table, write, buffer, length, ppos);
3353 if (ret) 3356 if (ret)
3354 goto out; 3357 goto out;
3355 if (write) { 3358 if (write) {
3356 int oldval = user_zonelist_order; 3359 int oldval = user_zonelist_order;
3357 3360
3358 ret = __parse_numa_zonelist_order((char *)table->data); 3361 ret = __parse_numa_zonelist_order((char *)table->data);
3359 if (ret) { 3362 if (ret) {
3360 /* 3363 /*
3361 * bogus value. restore saved string 3364 * bogus value. restore saved string
3362 */ 3365 */
3363 strncpy((char *)table->data, saved_string, 3366 strncpy((char *)table->data, saved_string,
3364 NUMA_ZONELIST_ORDER_LEN); 3367 NUMA_ZONELIST_ORDER_LEN);
3365 user_zonelist_order = oldval; 3368 user_zonelist_order = oldval;
3366 } else if (oldval != user_zonelist_order) { 3369 } else if (oldval != user_zonelist_order) {
3367 mutex_lock(&zonelists_mutex); 3370 mutex_lock(&zonelists_mutex);
3368 build_all_zonelists(NULL, NULL); 3371 build_all_zonelists(NULL, NULL);
3369 mutex_unlock(&zonelists_mutex); 3372 mutex_unlock(&zonelists_mutex);
3370 } 3373 }
3371 } 3374 }
3372 out: 3375 out:
3373 mutex_unlock(&zl_order_mutex); 3376 mutex_unlock(&zl_order_mutex);
3374 return ret; 3377 return ret;
3375 } 3378 }
3376 3379
3377 3380
3378 #define MAX_NODE_LOAD (nr_online_nodes) 3381 #define MAX_NODE_LOAD (nr_online_nodes)
3379 static int node_load[MAX_NUMNODES]; 3382 static int node_load[MAX_NUMNODES];
3380 3383
3381 /** 3384 /**
3382 * find_next_best_node - find the next node that should appear in a given node's fallback list 3385 * find_next_best_node - find the next node that should appear in a given node's fallback list
3383 * @node: node whose fallback list we're appending 3386 * @node: node whose fallback list we're appending
3384 * @used_node_mask: nodemask_t of already used nodes 3387 * @used_node_mask: nodemask_t of already used nodes
3385 * 3388 *
3386 * We use a number of factors to determine which is the next node that should 3389 * We use a number of factors to determine which is the next node that should
3387 * appear on a given node's fallback list. The node should not have appeared 3390 * appear on a given node's fallback list. The node should not have appeared
3388 * already in @node's fallback list, and it should be the next closest node 3391 * already in @node's fallback list, and it should be the next closest node
3389 * according to the distance array (which contains arbitrary distance values 3392 * according to the distance array (which contains arbitrary distance values
3390 * from each node to each node in the system), and should also prefer nodes 3393 * from each node to each node in the system), and should also prefer nodes
3391 * with no CPUs, since presumably they'll have very little allocation pressure 3394 * with no CPUs, since presumably they'll have very little allocation pressure
3392 * on them otherwise. 3395 * on them otherwise.
3393 * It returns -1 if no node is found. 3396 * It returns -1 if no node is found.
3394 */ 3397 */
3395 static int find_next_best_node(int node, nodemask_t *used_node_mask) 3398 static int find_next_best_node(int node, nodemask_t *used_node_mask)
3396 { 3399 {
3397 int n, val; 3400 int n, val;
3398 int min_val = INT_MAX; 3401 int min_val = INT_MAX;
3399 int best_node = NUMA_NO_NODE; 3402 int best_node = NUMA_NO_NODE;
3400 const struct cpumask *tmp = cpumask_of_node(0); 3403 const struct cpumask *tmp = cpumask_of_node(0);
3401 3404
3402 /* Use the local node if we haven't already */ 3405 /* Use the local node if we haven't already */
3403 if (!node_isset(node, *used_node_mask)) { 3406 if (!node_isset(node, *used_node_mask)) {
3404 node_set(node, *used_node_mask); 3407 node_set(node, *used_node_mask);
3405 return node; 3408 return node;
3406 } 3409 }
3407 3410
3408 for_each_node_state(n, N_MEMORY) { 3411 for_each_node_state(n, N_MEMORY) {
3409 3412
3410 /* Don't want a node to appear more than once */ 3413 /* Don't want a node to appear more than once */
3411 if (node_isset(n, *used_node_mask)) 3414 if (node_isset(n, *used_node_mask))
3412 continue; 3415 continue;
3413 3416
3414 /* Use the distance array to find the distance */ 3417 /* Use the distance array to find the distance */
3415 val = node_distance(node, n); 3418 val = node_distance(node, n);
3416 3419
3417 /* Penalize nodes under us ("prefer the next node") */ 3420 /* Penalize nodes under us ("prefer the next node") */
3418 val += (n < node); 3421 val += (n < node);
3419 3422
3420 /* Give preference to headless and unused nodes */ 3423 /* Give preference to headless and unused nodes */
3421 tmp = cpumask_of_node(n); 3424 tmp = cpumask_of_node(n);
3422 if (!cpumask_empty(tmp)) 3425 if (!cpumask_empty(tmp))
3423 val += PENALTY_FOR_NODE_WITH_CPUS; 3426 val += PENALTY_FOR_NODE_WITH_CPUS;
3424 3427
3425 /* Slight preference for less loaded node */ 3428 /* Slight preference for less loaded node */
3426 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 3429 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3427 val += node_load[n]; 3430 val += node_load[n];
3428 3431
3429 if (val < min_val) { 3432 if (val < min_val) {
3430 min_val = val; 3433 min_val = val;
3431 best_node = n; 3434 best_node = n;
3432 } 3435 }
3433 } 3436 }
3434 3437
3435 if (best_node >= 0) 3438 if (best_node >= 0)
3436 node_set(best_node, *used_node_mask); 3439 node_set(best_node, *used_node_mask);
3437 3440
3438 return best_node; 3441 return best_node;
3439 } 3442 }
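A worked example of the scoring, with hypothetical distances: suppose node 0 is building its list, nodes 1 and 2 both sit at distance 20, node 1 has CPUs while node 2 is memory-only, and node_load[] is still zero for both. Node 1 scores (20 + PENALTY_FOR_NODE_WITH_CPUS) scaled by MAX_NODE_LOAD*MAX_NUMNODES, node 2 scores a plain 20 scaled by the same factor, so the headless node 2 wins the min_val comparison and is appended first; the (n < node) term never triggers here since no candidate id is below 0.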
3440 3443
3441 3444
3442 /* 3445 /*
3443 * Build zonelists ordered by node and zones within node. 3446 * Build zonelists ordered by node and zones within node.
3444 * This results in maximum locality--normal zone overflows into local 3447 * This results in maximum locality--normal zone overflows into local
3445 * DMA zone, if any--but risks exhausting DMA zone. 3448 * DMA zone, if any--but risks exhausting DMA zone.
3446 */ 3449 */
3447 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 3450 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3448 { 3451 {
3449 int j; 3452 int j;
3450 struct zonelist *zonelist; 3453 struct zonelist *zonelist;
3451 3454
3452 zonelist = &pgdat->node_zonelists[0]; 3455 zonelist = &pgdat->node_zonelists[0];
3453 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3456 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3454 ; 3457 ;
3455 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3458 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3456 zonelist->_zonerefs[j].zone = NULL; 3459 zonelist->_zonerefs[j].zone = NULL;
3457 zonelist->_zonerefs[j].zone_idx = 0; 3460 zonelist->_zonerefs[j].zone_idx = 0;
3458 } 3461 }
3459 3462
3460 /* 3463 /*
3461 * Build gfp_thisnode zonelists 3464 * Build gfp_thisnode zonelists
3462 */ 3465 */
3463 static void build_thisnode_zonelists(pg_data_t *pgdat) 3466 static void build_thisnode_zonelists(pg_data_t *pgdat)
3464 { 3467 {
3465 int j; 3468 int j;
3466 struct zonelist *zonelist; 3469 struct zonelist *zonelist;
3467 3470
3468 zonelist = &pgdat->node_zonelists[1]; 3471 zonelist = &pgdat->node_zonelists[1];
3469 j = build_zonelists_node(pgdat, zonelist, 0); 3472 j = build_zonelists_node(pgdat, zonelist, 0);
3470 zonelist->_zonerefs[j].zone = NULL; 3473 zonelist->_zonerefs[j].zone = NULL;
3471 zonelist->_zonerefs[j].zone_idx = 0; 3474 zonelist->_zonerefs[j].zone_idx = 0;
3472 } 3475 }
3473 3476
3474 /* 3477 /*
3475 * Build zonelists ordered by zone and nodes within zones. 3478 * Build zonelists ordered by zone and nodes within zones.
3476 * This results in conserving DMA zone[s] until all Normal memory is 3479 * This results in conserving DMA zone[s] until all Normal memory is
3477 * exhausted, but results in overflowing to remote node while memory 3480 * exhausted, but results in overflowing to remote node while memory
3478 * may still exist in local DMA zone. 3481 * may still exist in local DMA zone.
3479 */ 3482 */
3480 static int node_order[MAX_NUMNODES]; 3483 static int node_order[MAX_NUMNODES];
3481 3484
3482 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 3485 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3483 { 3486 {
3484 int pos, j, node; 3487 int pos, j, node;
3485 int zone_type; /* needs to be signed */ 3488 int zone_type; /* needs to be signed */
3486 struct zone *z; 3489 struct zone *z;
3487 struct zonelist *zonelist; 3490 struct zonelist *zonelist;
3488 3491
3489 zonelist = &pgdat->node_zonelists[0]; 3492 zonelist = &pgdat->node_zonelists[0];
3490 pos = 0; 3493 pos = 0;
3491 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 3494 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3492 for (j = 0; j < nr_nodes; j++) { 3495 for (j = 0; j < nr_nodes; j++) {
3493 node = node_order[j]; 3496 node = node_order[j];
3494 z = &NODE_DATA(node)->node_zones[zone_type]; 3497 z = &NODE_DATA(node)->node_zones[zone_type];
3495 if (populated_zone(z)) { 3498 if (populated_zone(z)) {
3496 zoneref_set_zone(z, 3499 zoneref_set_zone(z,
3497 &zonelist->_zonerefs[pos++]); 3500 &zonelist->_zonerefs[pos++]);
3498 check_highest_zone(zone_type); 3501 check_highest_zone(zone_type);
3499 } 3502 }
3500 } 3503 }
3501 } 3504 }
3502 zonelist->_zonerefs[pos].zone = NULL; 3505 zonelist->_zonerefs[pos].zone = NULL;
3503 zonelist->_zonerefs[pos].zone_idx = 0; 3506 zonelist->_zonerefs[pos].zone_idx = 0;
3504 } 3507 }
3505 3508
3506 static int default_zonelist_order(void) 3509 static int default_zonelist_order(void)
3507 { 3510 {
3508 int nid, zone_type; 3511 int nid, zone_type;
3509 unsigned long low_kmem_size, total_size; 3512 unsigned long low_kmem_size, total_size;
3510 struct zone *z; 3513 struct zone *z;
3511 int average_size; 3514 int average_size;
3512 /* 3515 /*
3513 * ZONE_DMA and ZONE_DMA32 can be very small area in the system. 3516 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
3514 * If they are really small and used heavily, the system can fall 3517 * If they are really small and used heavily, the system can fall
3515 * into OOM very easily. 3518 * into OOM very easily.
3516 * This function detects ZONE_DMA/DMA32 size and configures zone order. 3519 * This function detects ZONE_DMA/DMA32 size and configures zone order.
3517 */ 3520 */
3518 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ 3521 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
3519 low_kmem_size = 0; 3522 low_kmem_size = 0;
3520 total_size = 0; 3523 total_size = 0;
3521 for_each_online_node(nid) { 3524 for_each_online_node(nid) {
3522 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3525 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3523 z = &NODE_DATA(nid)->node_zones[zone_type]; 3526 z = &NODE_DATA(nid)->node_zones[zone_type];
3524 if (populated_zone(z)) { 3527 if (populated_zone(z)) {
3525 if (zone_type < ZONE_NORMAL) 3528 if (zone_type < ZONE_NORMAL)
3526 low_kmem_size += z->managed_pages; 3529 low_kmem_size += z->managed_pages;
3527 total_size += z->managed_pages; 3530 total_size += z->managed_pages;
3528 } else if (zone_type == ZONE_NORMAL) { 3531 } else if (zone_type == ZONE_NORMAL) {
3529 /* 3532 /*
3530 * If any node has only lowmem, then node order 3533 * If any node has only lowmem, then node order
3531 * is preferred to allow kernel allocations 3534 * is preferred to allow kernel allocations
3532 * locally; otherwise, they can easily infringe 3535 * locally; otherwise, they can easily infringe
3533 * on other nodes when there is an abundance of 3536 * on other nodes when there is an abundance of
3534 * lowmem available to allocate from. 3537 * lowmem available to allocate from.
3535 */ 3538 */
3536 return ZONELIST_ORDER_NODE; 3539 return ZONELIST_ORDER_NODE;
3537 } 3540 }
3538 } 3541 }
3539 } 3542 }
3540 if (!low_kmem_size || /* there is no DMA area. */ 3543 if (!low_kmem_size || /* there is no DMA area. */
3541 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 3544 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3542 return ZONELIST_ORDER_NODE; 3545 return ZONELIST_ORDER_NODE;
3543 /* 3546 /*
3544 * look into each node's config. 3547 * look into each node's config.
3545 * If there is a node whose DMA/DMA32 memory makes up a very big share of 3548 * If there is a node whose DMA/DMA32 memory makes up a very big share of
3546 * its local memory, NODE_ORDER may be suitable. 3549 * its local memory, NODE_ORDER may be suitable.
3547 */ 3550 */
3548 average_size = total_size / 3551 average_size = total_size /
3549 (nodes_weight(node_states[N_MEMORY]) + 1); 3552 (nodes_weight(node_states[N_MEMORY]) + 1);
3550 for_each_online_node(nid) { 3553 for_each_online_node(nid) {
3551 low_kmem_size = 0; 3554 low_kmem_size = 0;
3552 total_size = 0; 3555 total_size = 0;
3553 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3556 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3554 z = &NODE_DATA(nid)->node_zones[zone_type]; 3557 z = &NODE_DATA(nid)->node_zones[zone_type];
3555 if (populated_zone(z)) { 3558 if (populated_zone(z)) {
3556 if (zone_type < ZONE_NORMAL) 3559 if (zone_type < ZONE_NORMAL)
3557 low_kmem_size += z->present_pages; 3560 low_kmem_size += z->present_pages;
3558 total_size += z->present_pages; 3561 total_size += z->present_pages;
3559 } 3562 }
3560 } 3563 }
3561 if (low_kmem_size && 3564 if (low_kmem_size &&
3562 total_size > average_size && /* ignore small node */ 3565 total_size > average_size && /* ignore small node */
3563 low_kmem_size > total_size * 70/100) 3566 low_kmem_size > total_size * 70/100)
3564 return ZONELIST_ORDER_NODE; 3567 return ZONELIST_ORDER_NODE;
3565 } 3568 }
3566 return ZONELIST_ORDER_ZONE; 3569 return ZONELIST_ORDER_ZONE;
3567 } 3570 }
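Worked example of the heuristic (sizes hypothetical): if there is no lowmem at all, if DMA/DMA32 exceeds half of all memory, or if any node lacks a populated ZONE_NORMAL entirely, Node ordering is chosen outright. Otherwise a node only forces Node ordering when it is bigger than the (slightly discounted) average node size and more than 70% of its own pages are lowmem, e.g. a 4 GB node with 3.5 GB of DMA32; smaller or more balanced configurations fall through to Zone ordering, which conserves the scarce low zones.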
3568 3571
3569 static void set_zonelist_order(void) 3572 static void set_zonelist_order(void)
3570 { 3573 {
3571 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 3574 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3572 current_zonelist_order = default_zonelist_order(); 3575 current_zonelist_order = default_zonelist_order();
3573 else 3576 else
3574 current_zonelist_order = user_zonelist_order; 3577 current_zonelist_order = user_zonelist_order;
3575 } 3578 }
3576 3579
3577 static void build_zonelists(pg_data_t *pgdat) 3580 static void build_zonelists(pg_data_t *pgdat)
3578 { 3581 {
3579 int j, node, load; 3582 int j, node, load;
3580 enum zone_type i; 3583 enum zone_type i;
3581 nodemask_t used_mask; 3584 nodemask_t used_mask;
3582 int local_node, prev_node; 3585 int local_node, prev_node;
3583 struct zonelist *zonelist; 3586 struct zonelist *zonelist;
3584 int order = current_zonelist_order; 3587 int order = current_zonelist_order;
3585 3588
3586 /* initialize zonelists */ 3589 /* initialize zonelists */
3587 for (i = 0; i < MAX_ZONELISTS; i++) { 3590 for (i = 0; i < MAX_ZONELISTS; i++) {
3588 zonelist = pgdat->node_zonelists + i; 3591 zonelist = pgdat->node_zonelists + i;
3589 zonelist->_zonerefs[0].zone = NULL; 3592 zonelist->_zonerefs[0].zone = NULL;
3590 zonelist->_zonerefs[0].zone_idx = 0; 3593 zonelist->_zonerefs[0].zone_idx = 0;
3591 } 3594 }
3592 3595
3593 /* NUMA-aware ordering of nodes */ 3596 /* NUMA-aware ordering of nodes */
3594 local_node = pgdat->node_id; 3597 local_node = pgdat->node_id;
3595 load = nr_online_nodes; 3598 load = nr_online_nodes;
3596 prev_node = local_node; 3599 prev_node = local_node;
3597 nodes_clear(used_mask); 3600 nodes_clear(used_mask);
3598 3601
3599 memset(node_order, 0, sizeof(node_order)); 3602 memset(node_order, 0, sizeof(node_order));
3600 j = 0; 3603 j = 0;
3601 3604
3602 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3605 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3603 /* 3606 /*
3604 * We don't want to pressure a particular node. 3607 * We don't want to pressure a particular node.
3605 * So we add a penalty to the first node in the same 3608 * So we add a penalty to the first node in the same
3606 * distance group to make it round-robin. 3609 * distance group to make it round-robin.
3607 */ 3610 */
3608 if (node_distance(local_node, node) != 3611 if (node_distance(local_node, node) !=
3609 node_distance(local_node, prev_node)) 3612 node_distance(local_node, prev_node))
3610 node_load[node] = load; 3613 node_load[node] = load;
3611 3614
3612 prev_node = node; 3615 prev_node = node;
3613 load--; 3616 load--;
3614 if (order == ZONELIST_ORDER_NODE) 3617 if (order == ZONELIST_ORDER_NODE)
3615 build_zonelists_in_node_order(pgdat, node); 3618 build_zonelists_in_node_order(pgdat, node);
3616 else 3619 else
3617 node_order[j++] = node; /* remember order */ 3620 node_order[j++] = node; /* remember order */
3618 } 3621 }
3619 3622
3620 if (order == ZONELIST_ORDER_ZONE) { 3623 if (order == ZONELIST_ORDER_ZONE) {
3621 /* calculate node order -- i.e., DMA last! */ 3624 /* calculate node order -- i.e., DMA last! */
3622 build_zonelists_in_zone_order(pgdat, j); 3625 build_zonelists_in_zone_order(pgdat, j);
3623 } 3626 }
3624 3627
3625 build_thisnode_zonelists(pgdat); 3628 build_thisnode_zonelists(pgdat);
3626 } 3629 }
3627 3630
3628 /* Construct the zonelist performance cache - see further mmzone.h */ 3631 /* Construct the zonelist performance cache - see further mmzone.h */
3629 static void build_zonelist_cache(pg_data_t *pgdat) 3632 static void build_zonelist_cache(pg_data_t *pgdat)
3630 { 3633 {
3631 struct zonelist *zonelist; 3634 struct zonelist *zonelist;
3632 struct zonelist_cache *zlc; 3635 struct zonelist_cache *zlc;
3633 struct zoneref *z; 3636 struct zoneref *z;
3634 3637
3635 zonelist = &pgdat->node_zonelists[0]; 3638 zonelist = &pgdat->node_zonelists[0];
3636 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 3639 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3637 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 3640 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3638 for (z = zonelist->_zonerefs; z->zone; z++) 3641 for (z = zonelist->_zonerefs; z->zone; z++)
3639 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 3642 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3640 } 3643 }
3641 3644
3642 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3645 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3643 /* 3646 /*
3644 * Return node id of node used for "local" allocations. 3647 * Return node id of node used for "local" allocations.
3645 * I.e., first node id of first zone in arg node's generic zonelist. 3648 * I.e., first node id of first zone in arg node's generic zonelist.
3646 * Used for initializing percpu 'numa_mem', which is used primarily 3649 * Used for initializing percpu 'numa_mem', which is used primarily
3647 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3650 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3648 */ 3651 */
3649 int local_memory_node(int node) 3652 int local_memory_node(int node)
3650 { 3653 {
3651 struct zone *zone; 3654 struct zone *zone;
3652 3655
3653 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3656 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3654 gfp_zone(GFP_KERNEL), 3657 gfp_zone(GFP_KERNEL),
3655 NULL, 3658 NULL,
3656 &zone); 3659 &zone);
3657 return zone->node; 3660 return zone->node;
3658 } 3661 }
3659 #endif 3662 #endif
3660 3663
3661 #else /* CONFIG_NUMA */ 3664 #else /* CONFIG_NUMA */
3662 3665
3663 static void set_zonelist_order(void) 3666 static void set_zonelist_order(void)
3664 { 3667 {
3665 current_zonelist_order = ZONELIST_ORDER_ZONE; 3668 current_zonelist_order = ZONELIST_ORDER_ZONE;
3666 } 3669 }
3667 3670
3668 static void build_zonelists(pg_data_t *pgdat) 3671 static void build_zonelists(pg_data_t *pgdat)
3669 { 3672 {
3670 int node, local_node; 3673 int node, local_node;
3671 enum zone_type j; 3674 enum zone_type j;
3672 struct zonelist *zonelist; 3675 struct zonelist *zonelist;
3673 3676
3674 local_node = pgdat->node_id; 3677 local_node = pgdat->node_id;
3675 3678
3676 zonelist = &pgdat->node_zonelists[0]; 3679 zonelist = &pgdat->node_zonelists[0];
3677 j = build_zonelists_node(pgdat, zonelist, 0); 3680 j = build_zonelists_node(pgdat, zonelist, 0);
3678 3681
3679 /* 3682 /*
3680 * Now we build the zonelist so that it contains the zones 3683 * Now we build the zonelist so that it contains the zones
3681 * of all the other nodes. 3684 * of all the other nodes.
3682 * We don't want to pressure a particular node, so when 3685 * We don't want to pressure a particular node, so when
3683 * building the zones for node N, we make sure that the 3686 * building the zones for node N, we make sure that the
3684 * zones coming right after the local ones are those from 3687 * zones coming right after the local ones are those from
3685 * node N+1 (modulo N) 3688 * node N+1 (modulo N)
3686 */ 3689 */
3687 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3690 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3688 if (!node_online(node)) 3691 if (!node_online(node))
3689 continue; 3692 continue;
3690 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3693 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3691 } 3694 }
3692 for (node = 0; node < local_node; node++) { 3695 for (node = 0; node < local_node; node++) {
3693 if (!node_online(node)) 3696 if (!node_online(node))
3694 continue; 3697 continue;
3695 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3698 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3696 } 3699 }
3697 3700
3698 zonelist->_zonerefs[j].zone = NULL; 3701 zonelist->_zonerefs[j].zone = NULL;
3699 zonelist->_zonerefs[j].zone_idx = 0; 3702 zonelist->_zonerefs[j].zone_idx = 0;
3700 } 3703 }
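/*
 * Illustrative sketch (not part of page_alloc.c): the non-NUMA
 * build_zonelists() above appends the local node's zones first and then
 * walks the remaining online nodes starting at local_node + 1, wrapping
 * around to 0, so each node gets a different "next" node and no single
 * node is preferred. A minimal userspace rendering of that rotation, with
 * MAX_NUMNODES and node_online() replaced by example stand-ins:
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_NUMNODES 8

/* Example stand-in for node_online(): pretend nodes 0, 1, 2 and 5 exist. */
static bool node_online(int node)
{
	return node == 0 || node == 1 || node == 2 || node == 5;
}

static void print_remote_node_order(int local_node)
{
	int node;

	printf("node %d falls back to:", local_node);
	for (node = local_node + 1; node < MAX_NUMNODES; node++)
		if (node_online(node))
			printf(" %d", node);
	for (node = 0; node < local_node; node++)
		if (node_online(node))
			printf(" %d", node);
	printf("\n");
}

int main(void)
{
	print_remote_node_order(1);	/* -> 2 5 0 */
	print_remote_node_order(5);	/* -> 0 1 2 */
	return 0;
}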
3701 3704
3702 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3705 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3703 static void build_zonelist_cache(pg_data_t *pgdat) 3706 static void build_zonelist_cache(pg_data_t *pgdat)
3704 { 3707 {
3705 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3708 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3706 } 3709 }
3707 3710
3708 #endif /* CONFIG_NUMA */ 3711 #endif /* CONFIG_NUMA */
3709 3712
3710 /* 3713 /*
3711 * Boot pageset table. One per cpu which is going to be used for all 3714 * Boot pageset table. One per cpu which is going to be used for all
3712 * zones and all nodes. The parameters will be set in such a way 3715 * zones and all nodes. The parameters will be set in such a way
3713 * that an item put on a list will immediately be handed over to 3716 * that an item put on a list will immediately be handed over to
3714 * the buddy list. This is safe since pageset manipulation is done 3717 * the buddy list. This is safe since pageset manipulation is done
3715 * with interrupts disabled. 3718 * with interrupts disabled.
3716 * 3719 *
3717 * The boot_pagesets must be kept even after bootup is complete for 3720 * The boot_pagesets must be kept even after bootup is complete for
3718 * unused processors and/or zones. They do play a role for bootstrapping 3721 * unused processors and/or zones. They do play a role for bootstrapping
3719 * hotplugged processors. 3722 * hotplugged processors.
3720 * 3723 *
3721 * zoneinfo_show() and maybe other functions do 3724 * zoneinfo_show() and maybe other functions do
3722 * not check if the processor is online before following the pageset pointer. 3725 * not check if the processor is online before following the pageset pointer.
3723 * Other parts of the kernel may not check if the zone is available. 3726 * Other parts of the kernel may not check if the zone is available.
3724 */ 3727 */
3725 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3728 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3726 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3729 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3727 static void setup_zone_pageset(struct zone *zone); 3730 static void setup_zone_pageset(struct zone *zone);
3728 3731
3729 /* 3732 /*
3730 * Global mutex to protect against size modification of zonelists 3733 * Global mutex to protect against size modification of zonelists
3731 * as well as to serialize pageset setup for the new populated zone. 3734 * as well as to serialize pageset setup for the new populated zone.
3732 */ 3735 */
3733 DEFINE_MUTEX(zonelists_mutex); 3736 DEFINE_MUTEX(zonelists_mutex);
3734 3737
3735 /* return type is int just to satisfy stop_machine() */ 3738 /* return type is int just to satisfy stop_machine() */
3736 static int __build_all_zonelists(void *data) 3739 static int __build_all_zonelists(void *data)
3737 { 3740 {
3738 int nid; 3741 int nid;
3739 int cpu; 3742 int cpu;
3740 pg_data_t *self = data; 3743 pg_data_t *self = data;
3741 3744
3742 #ifdef CONFIG_NUMA 3745 #ifdef CONFIG_NUMA
3743 memset(node_load, 0, sizeof(node_load)); 3746 memset(node_load, 0, sizeof(node_load));
3744 #endif 3747 #endif
3745 3748
3746 if (self && !node_online(self->node_id)) { 3749 if (self && !node_online(self->node_id)) {
3747 build_zonelists(self); 3750 build_zonelists(self);
3748 build_zonelist_cache(self); 3751 build_zonelist_cache(self);
3749 } 3752 }
3750 3753
3751 for_each_online_node(nid) { 3754 for_each_online_node(nid) {
3752 pg_data_t *pgdat = NODE_DATA(nid); 3755 pg_data_t *pgdat = NODE_DATA(nid);
3753 3756
3754 build_zonelists(pgdat); 3757 build_zonelists(pgdat);
3755 build_zonelist_cache(pgdat); 3758 build_zonelist_cache(pgdat);
3756 } 3759 }
3757 3760
3758 /* 3761 /*
3759 * Initialize the boot_pagesets that are going to be used 3762 * Initialize the boot_pagesets that are going to be used
3760 * for bootstrapping processors. The real pagesets for 3763 * for bootstrapping processors. The real pagesets for
3761 * each zone will be allocated later when the per cpu 3764 * each zone will be allocated later when the per cpu
3762 * allocator is available. 3765 * allocator is available.
3763 * 3766 *
3764 * boot_pagesets are used also for bootstrapping offline 3767 * boot_pagesets are used also for bootstrapping offline
3765 * cpus if the system is already booted because the pagesets 3768 * cpus if the system is already booted because the pagesets
3766 * are needed to initialize allocators on a specific cpu too. 3769 * are needed to initialize allocators on a specific cpu too.
3767 * F.e. the percpu allocator needs the page allocator which 3770 * F.e. the percpu allocator needs the page allocator which
3768 * needs the percpu allocator in order to allocate its pagesets 3771 * needs the percpu allocator in order to allocate its pagesets
3769 * (a chicken-egg dilemma). 3772 * (a chicken-egg dilemma).
3770 */ 3773 */
3771 for_each_possible_cpu(cpu) { 3774 for_each_possible_cpu(cpu) {
3772 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3775 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3773 3776
3774 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3777 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3775 /* 3778 /*
3776 * We now know the "local memory node" for each node-- 3779 * We now know the "local memory node" for each node--
3777 * i.e., the node of the first zone in the generic zonelist. 3780 * i.e., the node of the first zone in the generic zonelist.
3778 * Set up numa_mem percpu variable for on-line cpus. During 3781 * Set up numa_mem percpu variable for on-line cpus. During
3779 * boot, only the boot cpu should be on-line; we'll init the 3782 * boot, only the boot cpu should be on-line; we'll init the
3780 * secondary cpus' numa_mem as they come on-line. During 3783 * secondary cpus' numa_mem as they come on-line. During
3781 * node/memory hotplug, we'll fixup all on-line cpus. 3784 * node/memory hotplug, we'll fixup all on-line cpus.
3782 */ 3785 */
3783 if (cpu_online(cpu)) 3786 if (cpu_online(cpu))
3784 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3787 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3785 #endif 3788 #endif
3786 } 3789 }
3787 3790
3788 return 0; 3791 return 0;
3789 } 3792 }
3790 3793
3791 /* 3794 /*
3792 * Called with zonelists_mutex held always 3795 * Called with zonelists_mutex held always
3793 * unless system_state == SYSTEM_BOOTING. 3796 * unless system_state == SYSTEM_BOOTING.
3794 */ 3797 */
3795 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3798 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3796 { 3799 {
3797 set_zonelist_order(); 3800 set_zonelist_order();
3798 3801
3799 if (system_state == SYSTEM_BOOTING) { 3802 if (system_state == SYSTEM_BOOTING) {
3800 __build_all_zonelists(NULL); 3803 __build_all_zonelists(NULL);
3801 mminit_verify_zonelist(); 3804 mminit_verify_zonelist();
3802 cpuset_init_current_mems_allowed(); 3805 cpuset_init_current_mems_allowed();
3803 } else { 3806 } else {
3804 #ifdef CONFIG_MEMORY_HOTPLUG 3807 #ifdef CONFIG_MEMORY_HOTPLUG
3805 if (zone) 3808 if (zone)
3806 setup_zone_pageset(zone); 3809 setup_zone_pageset(zone);
3807 #endif 3810 #endif
3808 /* we have to stop all cpus to guarantee there is no user 3811 /* we have to stop all cpus to guarantee there is no user
3809 of zonelist */ 3812 of zonelist */
3810 stop_machine(__build_all_zonelists, pgdat, NULL); 3813 stop_machine(__build_all_zonelists, pgdat, NULL);
3811 /* cpuset refresh routine should be here */ 3814 /* cpuset refresh routine should be here */
3812 } 3815 }
3813 vm_total_pages = nr_free_pagecache_pages(); 3816 vm_total_pages = nr_free_pagecache_pages();
3814 /* 3817 /*
3815 * Disable grouping by mobility if the number of pages in the 3818 * Disable grouping by mobility if the number of pages in the
3816 * system is too low to allow the mechanism to work. It would be 3819 * system is too low to allow the mechanism to work. It would be
3817 * more accurate, but expensive to check per-zone. This check is 3820 * more accurate, but expensive to check per-zone. This check is
3818 * made on memory-hotadd so a system can start with mobility 3821 * made on memory-hotadd so a system can start with mobility
3819 * disabled and enable it later 3822 * disabled and enable it later
3820 */ 3823 */
3821 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3824 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3822 page_group_by_mobility_disabled = 1; 3825 page_group_by_mobility_disabled = 1;
3823 else 3826 else
3824 page_group_by_mobility_disabled = 0; 3827 page_group_by_mobility_disabled = 0;
3825 3828
3826 printk("Built %i zonelists in %s order, mobility grouping %s. " 3829 printk("Built %i zonelists in %s order, mobility grouping %s. "
3827 "Total pages: %ld\n", 3830 "Total pages: %ld\n",
3828 nr_online_nodes, 3831 nr_online_nodes,
3829 zonelist_order_name[current_zonelist_order], 3832 zonelist_order_name[current_zonelist_order],
3830 page_group_by_mobility_disabled ? "off" : "on", 3833 page_group_by_mobility_disabled ? "off" : "on",
3831 vm_total_pages); 3834 vm_total_pages);
3832 #ifdef CONFIG_NUMA 3835 #ifdef CONFIG_NUMA
3833 printk("Policy zone: %s\n", zone_names[policy_zone]); 3836 printk("Policy zone: %s\n", zone_names[policy_zone]);
3834 #endif 3837 #endif
3835 } 3838 }
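/*
 * Illustrative sketch (not part of page_alloc.c): build_all_zonelists()
 * above disables grouping by mobility when the whole system holds fewer
 * pages than one pageblock per migratetype. The constants below are
 * example assumptions (2 MiB pageblocks of 4 KiB pages, five migratetypes);
 * the real values depend on architecture and config.
 */
#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES	512UL
#define NR_MIGRATE_TYPES	5UL

static bool mobility_grouping_enabled(unsigned long vm_total_pages)
{
	return vm_total_pages >= PAGEBLOCK_NR_PAGES * NR_MIGRATE_TYPES;
}

int main(void)
{
	printf("8 MiB of pages:  %s\n",
	       mobility_grouping_enabled(2048) ? "on" : "off");	/* off */
	printf("64 MiB of pages: %s\n",
	       mobility_grouping_enabled(16384) ? "on" : "off");	/* on */
	return 0;
}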
3836 3839
3837 /* 3840 /*
3838 * Helper functions to size the waitqueue hash table. 3841 * Helper functions to size the waitqueue hash table.
3839 * Essentially these want to choose hash table sizes sufficiently 3842 * Essentially these want to choose hash table sizes sufficiently
3840 * large so that collisions trying to wait on pages are rare. 3843 * large so that collisions trying to wait on pages are rare.
3841 * But in fact, the number of active page waitqueues on typical 3844 * But in fact, the number of active page waitqueues on typical
3842 * systems is ridiculously low, less than 200. So this is even 3845 * systems is ridiculously low, less than 200. So this is even
3843 * conservative, even though it seems large. 3846 * conservative, even though it seems large.
3844 * 3847 *
3845 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3848 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3846 * waitqueues, i.e. the size of the waitq table given the number of pages. 3849 * waitqueues, i.e. the size of the waitq table given the number of pages.
3847 */ 3850 */
3848 #define PAGES_PER_WAITQUEUE 256 3851 #define PAGES_PER_WAITQUEUE 256
3849 3852
3850 #ifndef CONFIG_MEMORY_HOTPLUG 3853 #ifndef CONFIG_MEMORY_HOTPLUG
3851 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3854 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3852 { 3855 {
3853 unsigned long size = 1; 3856 unsigned long size = 1;
3854 3857
3855 pages /= PAGES_PER_WAITQUEUE; 3858 pages /= PAGES_PER_WAITQUEUE;
3856 3859
3857 while (size < pages) 3860 while (size < pages)
3858 size <<= 1; 3861 size <<= 1;
3859 3862
3860 /* 3863 /*
3861 * Once we have dozens or even hundreds of threads sleeping 3864 * Once we have dozens or even hundreds of threads sleeping
3862 * on IO we've got bigger problems than wait queue collision. 3865 * on IO we've got bigger problems than wait queue collision.
3863 * Limit the size of the wait table to a reasonable size. 3866 * Limit the size of the wait table to a reasonable size.
3864 */ 3867 */
3865 size = min(size, 4096UL); 3868 size = min(size, 4096UL);
3866 3869
3867 return max(size, 4UL); 3870 return max(size, 4UL);
3868 } 3871 }
3869 #else 3872 #else
3870 /* 3873 /*
3871 * A zone's size might be changed by hot-add, so it is not possible to determine 3874 * A zone's size might be changed by hot-add, so it is not possible to determine
3872 * a suitable size for its wait_table. So we use the maximum size now. 3875 * a suitable size for its wait_table. So we use the maximum size now.
3873 * 3876 *
3874 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3877 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3875 * 3878 *
3876 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3879 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3877 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3880 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3878 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3881 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3879 * 3882 *
3880 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 3883 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
3881 * or more by the traditional way. (See above). It equals: 3884 * or more by the traditional way. (See above). It equals:
3882 * 3885 *
3883 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3886 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3884 * ia64(16K page size) : = ( 8G + 4M)byte. 3887 * ia64(16K page size) : = ( 8G + 4M)byte.
3885 * powerpc (64K page size) : = (32G +16M)byte. 3888 * powerpc (64K page size) : = (32G +16M)byte.
3886 */ 3889 */
3887 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3890 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3888 { 3891 {
3889 return 4096UL; 3892 return 4096UL;
3890 } 3893 }
3891 #endif 3894 #endif
3892 3895
3893 /* 3896 /*
3894 * This is an integer logarithm so that shifts can be used later 3897 * This is an integer logarithm so that shifts can be used later
3895 * to extract the more random high bits from the multiplicative 3898 * to extract the more random high bits from the multiplicative
3896 * hash function before the remainder is taken. 3899 * hash function before the remainder is taken.
3897 */ 3900 */
3898 static inline unsigned long wait_table_bits(unsigned long size) 3901 static inline unsigned long wait_table_bits(unsigned long size)
3899 { 3902 {
3900 return ffz(~size); 3903 return ffz(~size);
3901 } 3904 }
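/*
 * Illustrative sketch (not part of page_alloc.c): the !CONFIG_MEMORY_HOTPLUG
 * sizing above rounds pages / PAGES_PER_WAITQUEUE up to a power of two and
 * clamps it to [4, 4096]; wait_table_bits() then recovers log2(size), which
 * is what ffz(~size) yields for a power-of-two size. A userspace rendering
 * of both steps:
 */
#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

static unsigned long waitq_hash_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096UL)
		size = 4096UL;
	return size < 4UL ? 4UL : size;
}

/* log2 of a power-of-two size, the portable equivalent of ffz(~size). */
static unsigned long waitq_bits(unsigned long size)
{
	unsigned long bits = 0;

	while (size > 1UL) {
		size >>= 1;
		bits++;
	}
	return bits;
}

int main(void)
{
	unsigned long pages = 262144UL;	/* a 1 GiB zone of 4 KiB pages */
	unsigned long size = waitq_hash_entries(pages);

	/* prints: 262144 pages -> 1024 waitqueues (10 bits) */
	printf("%lu pages -> %lu waitqueues (%lu bits)\n",
	       pages, size, waitq_bits(size));
	return 0;
}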
3902 3905
3903 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3906 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3904 3907
3905 /* 3908 /*
3906 * Check if a pageblock contains reserved pages 3909 * Check if a pageblock contains reserved pages
3907 */ 3910 */
3908 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3911 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3909 { 3912 {
3910 unsigned long pfn; 3913 unsigned long pfn;
3911 3914
3912 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3915 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3913 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3916 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3914 return 1; 3917 return 1;
3915 } 3918 }
3916 return 0; 3919 return 0;
3917 } 3920 }
3918 3921
3919 /* 3922 /*
3920 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3923 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3921 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3924 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3922 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3925 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3923 * higher will lead to a bigger reserve which will get freed as contiguous 3926 * higher will lead to a bigger reserve which will get freed as contiguous
3924 * blocks as reclaim kicks in 3927 * blocks as reclaim kicks in
3925 */ 3928 */
3926 static void setup_zone_migrate_reserve(struct zone *zone) 3929 static void setup_zone_migrate_reserve(struct zone *zone)
3927 { 3930 {
3928 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3931 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3929 struct page *page; 3932 struct page *page;
3930 unsigned long block_migratetype; 3933 unsigned long block_migratetype;
3931 int reserve; 3934 int reserve;
3932 int old_reserve; 3935 int old_reserve;
3933 3936
3934 /* 3937 /*
3935 * Get the start pfn, end pfn and the number of blocks to reserve 3938 * Get the start pfn, end pfn and the number of blocks to reserve
3936 * We have to be careful to be aligned to pageblock_nr_pages to 3939 * We have to be careful to be aligned to pageblock_nr_pages to
3937 * make sure that we always check pfn_valid for the first page in 3940 * make sure that we always check pfn_valid for the first page in
3938 * the block. 3941 * the block.
3939 */ 3942 */
3940 start_pfn = zone->zone_start_pfn; 3943 start_pfn = zone->zone_start_pfn;
3941 end_pfn = zone_end_pfn(zone); 3944 end_pfn = zone_end_pfn(zone);
3942 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3945 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3943 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3946 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3944 pageblock_order; 3947 pageblock_order;
3945 3948
3946 /* 3949 /*
3947 * Reserve blocks are generally in place to help high-order atomic 3950 * Reserve blocks are generally in place to help high-order atomic
3948 * allocations that are short-lived. A min_free_kbytes value that 3951 * allocations that are short-lived. A min_free_kbytes value that
3949 * would result in more than 2 reserve blocks for atomic allocations 3952 * would result in more than 2 reserve blocks for atomic allocations
3950 * is assumed to be in place to help anti-fragmentation for the 3953 * is assumed to be in place to help anti-fragmentation for the
3951 * future allocation of hugepages at runtime. 3954 * future allocation of hugepages at runtime.
3952 */ 3955 */
3953 reserve = min(2, reserve); 3956 reserve = min(2, reserve);
3954 old_reserve = zone->nr_migrate_reserve_block; 3957 old_reserve = zone->nr_migrate_reserve_block;
3955 3958
3956 /* On memory hot-add, we almost always need to do nothing */ 3959 /* On memory hot-add, we almost always need to do nothing */
3957 if (reserve == old_reserve) 3960 if (reserve == old_reserve)
3958 return; 3961 return;
3959 zone->nr_migrate_reserve_block = reserve; 3962 zone->nr_migrate_reserve_block = reserve;
3960 3963
3961 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3964 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3962 if (!pfn_valid(pfn)) 3965 if (!pfn_valid(pfn))
3963 continue; 3966 continue;
3964 page = pfn_to_page(pfn); 3967 page = pfn_to_page(pfn);
3965 3968
3966 /* Watch out for overlapping nodes */ 3969 /* Watch out for overlapping nodes */
3967 if (page_to_nid(page) != zone_to_nid(zone)) 3970 if (page_to_nid(page) != zone_to_nid(zone))
3968 continue; 3971 continue;
3969 3972
3970 block_migratetype = get_pageblock_migratetype(page); 3973 block_migratetype = get_pageblock_migratetype(page);
3971 3974
3972 /* Only test what is necessary when the reserves are not met */ 3975 /* Only test what is necessary when the reserves are not met */
3973 if (reserve > 0) { 3976 if (reserve > 0) {
3974 /* 3977 /*
3975 * Blocks with reserved pages will never free, skip 3978 * Blocks with reserved pages will never free, skip
3976 * them. 3979 * them.
3977 */ 3980 */
3978 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 3981 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3979 if (pageblock_is_reserved(pfn, block_end_pfn)) 3982 if (pageblock_is_reserved(pfn, block_end_pfn))
3980 continue; 3983 continue;
3981 3984
3982 /* If this block is reserved, account for it */ 3985 /* If this block is reserved, account for it */
3983 if (block_migratetype == MIGRATE_RESERVE) { 3986 if (block_migratetype == MIGRATE_RESERVE) {
3984 reserve--; 3987 reserve--;
3985 continue; 3988 continue;
3986 } 3989 }
3987 3990
3988 /* Suitable for reserving if this block is movable */ 3991 /* Suitable for reserving if this block is movable */
3989 if (block_migratetype == MIGRATE_MOVABLE) { 3992 if (block_migratetype == MIGRATE_MOVABLE) {
3990 set_pageblock_migratetype(page, 3993 set_pageblock_migratetype(page,
3991 MIGRATE_RESERVE); 3994 MIGRATE_RESERVE);
3992 move_freepages_block(zone, page, 3995 move_freepages_block(zone, page,
3993 MIGRATE_RESERVE); 3996 MIGRATE_RESERVE);
3994 reserve--; 3997 reserve--;
3995 continue; 3998 continue;
3996 } 3999 }
3997 } else if (!old_reserve) { 4000 } else if (!old_reserve) {
3998 /* 4001 /*
3999 * At boot time we don't need to scan the whole zone 4002 * At boot time we don't need to scan the whole zone
4000 * for turning off MIGRATE_RESERVE. 4003 * for turning off MIGRATE_RESERVE.
4001 */ 4004 */
4002 break; 4005 break;
4003 } 4006 }
4004 4007
4005 /* 4008 /*
4006 * If the reserve is met and this is a previous reserved block, 4009 * If the reserve is met and this is a previous reserved block,
4007 * take it back 4010 * take it back
4008 */ 4011 */
4009 if (block_migratetype == MIGRATE_RESERVE) { 4012 if (block_migratetype == MIGRATE_RESERVE) {
4010 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4013 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4011 move_freepages_block(zone, page, MIGRATE_MOVABLE); 4014 move_freepages_block(zone, page, MIGRATE_MOVABLE);
4012 } 4015 }
4013 } 4016 }
4014 } 4017 }
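/*
 * Illustrative sketch (not part of page_alloc.c): the reserve size computed
 * at the top of setup_zone_migrate_reserve() is the zone's min watermark
 * rounded up to whole pageblocks and capped at two blocks. The pageblock
 * geometry below is an example assumption (order-9 blocks of 4 KiB pages).
 */
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES	512UL
#define PAGEBLOCK_ORDER		9

static unsigned long migrate_reserve_blocks(unsigned long min_wmark_pages)
{
	unsigned long rounded = (min_wmark_pages + PAGEBLOCK_NR_PAGES - 1) /
				PAGEBLOCK_NR_PAGES * PAGEBLOCK_NR_PAGES;
	unsigned long reserve = rounded >> PAGEBLOCK_ORDER;

	return reserve < 2UL ? reserve : 2UL;	/* min(2, reserve) */
}

int main(void)
{
	printf("min watermark  128 pages -> %lu reserve block(s)\n",
	       migrate_reserve_blocks(128));	/* 1 */
	printf("min watermark 4096 pages -> %lu reserve block(s)\n",
	       migrate_reserve_blocks(4096));	/* capped at 2 */
	return 0;
}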
4015 4018
4016 /* 4019 /*
4017 * Initially all pages are reserved - free ones are freed 4020 * Initially all pages are reserved - free ones are freed
4018 * up by free_all_bootmem() once the early boot process is 4021 * up by free_all_bootmem() once the early boot process is
4019 * done. Non-atomic initialization, single-pass. 4022 * done. Non-atomic initialization, single-pass.
4020 */ 4023 */
4021 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 4024 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4022 unsigned long start_pfn, enum memmap_context context) 4025 unsigned long start_pfn, enum memmap_context context)
4023 { 4026 {
4024 struct page *page; 4027 struct page *page;
4025 unsigned long end_pfn = start_pfn + size; 4028 unsigned long end_pfn = start_pfn + size;
4026 unsigned long pfn; 4029 unsigned long pfn;
4027 struct zone *z; 4030 struct zone *z;
4028 4031
4029 if (highest_memmap_pfn < end_pfn - 1) 4032 if (highest_memmap_pfn < end_pfn - 1)
4030 highest_memmap_pfn = end_pfn - 1; 4033 highest_memmap_pfn = end_pfn - 1;
4031 4034
4032 z = &NODE_DATA(nid)->node_zones[zone]; 4035 z = &NODE_DATA(nid)->node_zones[zone];
4033 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 4036 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
4034 /* 4037 /*
4035 * There can be holes in boot-time mem_map[]s 4038 * There can be holes in boot-time mem_map[]s
4036 * handed to this function. They do not 4039 * handed to this function. They do not
4037 * exist on hotplugged memory. 4040 * exist on hotplugged memory.
4038 */ 4041 */
4039 if (context == MEMMAP_EARLY) { 4042 if (context == MEMMAP_EARLY) {
4040 if (!early_pfn_valid(pfn)) 4043 if (!early_pfn_valid(pfn))
4041 continue; 4044 continue;
4042 if (!early_pfn_in_nid(pfn, nid)) 4045 if (!early_pfn_in_nid(pfn, nid))
4043 continue; 4046 continue;
4044 } 4047 }
4045 page = pfn_to_page(pfn); 4048 page = pfn_to_page(pfn);
4046 set_page_links(page, zone, nid, pfn); 4049 set_page_links(page, zone, nid, pfn);
4047 mminit_verify_page_links(page, zone, nid, pfn); 4050 mminit_verify_page_links(page, zone, nid, pfn);
4048 init_page_count(page); 4051 init_page_count(page);
4049 page_mapcount_reset(page); 4052 page_mapcount_reset(page);
4050 page_nid_reset_last(page); 4053 page_nid_reset_last(page);
4051 SetPageReserved(page); 4054 SetPageReserved(page);
4052 /* 4055 /*
4053 * Mark the block movable so that blocks are reserved for 4056 * Mark the block movable so that blocks are reserved for
4054 * movable at startup. This will force kernel allocations 4057 * movable at startup. This will force kernel allocations
4055 * to reserve their blocks rather than leaking throughout 4058 * to reserve their blocks rather than leaking throughout
4056 * the address space during boot when many long-lived 4059 * the address space during boot when many long-lived
4057 * kernel allocations are made. Later some blocks near 4060 * kernel allocations are made. Later some blocks near
4058 * the start are marked MIGRATE_RESERVE by 4061 * the start are marked MIGRATE_RESERVE by
4059 * setup_zone_migrate_reserve() 4062 * setup_zone_migrate_reserve()
4060 * 4063 *
4061 * bitmap is created for the zone's valid pfn range, but the memmap 4064 * bitmap is created for the zone's valid pfn range, but the memmap
4062 * can also be created for invalid pages (for alignment), so check 4065 * can also be created for invalid pages (for alignment), so check
4063 * here that we do not call set_pageblock_migratetype() against a 4066 * here that we do not call set_pageblock_migratetype() against a
4064 * pfn out of the zone. 4067 * pfn out of the zone.
4065 */ 4068 */
4066 if ((z->zone_start_pfn <= pfn) 4069 if ((z->zone_start_pfn <= pfn)
4067 && (pfn < zone_end_pfn(z)) 4070 && (pfn < zone_end_pfn(z))
4068 && !(pfn & (pageblock_nr_pages - 1))) 4071 && !(pfn & (pageblock_nr_pages - 1)))
4069 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4072 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4070 4073
4071 INIT_LIST_HEAD(&page->lru); 4074 INIT_LIST_HEAD(&page->lru);
4072 #ifdef WANT_PAGE_VIRTUAL 4075 #ifdef WANT_PAGE_VIRTUAL
4073 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 4076 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
4074 if (!is_highmem_idx(zone)) 4077 if (!is_highmem_idx(zone))
4075 set_page_address(page, __va(pfn << PAGE_SHIFT)); 4078 set_page_address(page, __va(pfn << PAGE_SHIFT));
4076 #endif 4079 #endif
4077 } 4080 }
4078 } 4081 }
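/*
 * Illustrative sketch (not part of page_alloc.c): memmap_init_zone() above
 * only marks a pageblock MIGRATE_MOVABLE when the pfn lies inside the zone
 * and sits on a pageblock boundary, which the mask test below detects.
 * The pageblock size is an example assumption (order-9, i.e. 512 pages).
 */
#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL

static bool is_pageblock_start(unsigned long pfn)
{
	return (pfn & (PAGEBLOCK_NR_PAGES - 1)) == 0;
}

int main(void)
{
	printf("pfn 1024 starts a pageblock: %d\n", is_pageblock_start(1024));	/* 1 */
	printf("pfn 1030 starts a pageblock: %d\n", is_pageblock_start(1030));	/* 0 */
	return 0;
}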
4079 4082
4080 static void __meminit zone_init_free_lists(struct zone *zone) 4083 static void __meminit zone_init_free_lists(struct zone *zone)
4081 { 4084 {
4082 int order, t; 4085 int order, t;
4083 for_each_migratetype_order(order, t) { 4086 for_each_migratetype_order(order, t) {
4084 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 4087 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
4085 zone->free_area[order].nr_free = 0; 4088 zone->free_area[order].nr_free = 0;
4086 } 4089 }
4087 } 4090 }
4088 4091
4089 #ifndef __HAVE_ARCH_MEMMAP_INIT 4092 #ifndef __HAVE_ARCH_MEMMAP_INIT
4090 #define memmap_init(size, nid, zone, start_pfn) \ 4093 #define memmap_init(size, nid, zone, start_pfn) \
4091 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 4094 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
4092 #endif 4095 #endif
4093 4096
4094 static int zone_batchsize(struct zone *zone) 4097 static int zone_batchsize(struct zone *zone)
4095 { 4098 {
4096 #ifdef CONFIG_MMU 4099 #ifdef CONFIG_MMU
4097 int batch; 4100 int batch;
4098 4101
4099 /* 4102 /*
4100 * The per-cpu-pages pools are set to around 1000th of the 4103 * The per-cpu-pages pools are set to around 1000th of the
4101 * size of the zone. But no more than 1/2 of a meg. 4104 * size of the zone. But no more than 1/2 of a meg.
4102 * 4105 *
4103 * OK, so we don't know how big the cache is. So guess. 4106 * OK, so we don't know how big the cache is. So guess.
4104 */ 4107 */
4105 batch = zone->managed_pages / 1024; 4108 batch = zone->managed_pages / 1024;
4106 if (batch * PAGE_SIZE > 512 * 1024) 4109 if (batch * PAGE_SIZE > 512 * 1024)
4107 batch = (512 * 1024) / PAGE_SIZE; 4110 batch = (512 * 1024) / PAGE_SIZE;
4108 batch /= 4; /* We effectively *= 4 below */ 4111 batch /= 4; /* We effectively *= 4 below */
4109 if (batch < 1) 4112 if (batch < 1)
4110 batch = 1; 4113 batch = 1;
4111 4114
4112 /* 4115 /*
4113 * Clamp the batch to a 2^n - 1 value. Having a power 4116 * Clamp the batch to a 2^n - 1 value. Having a power
4114 * of 2 value was found to be more likely to have 4117 * of 2 value was found to be more likely to have
4115 * suboptimal cache aliasing properties in some cases. 4118 * suboptimal cache aliasing properties in some cases.
4116 * 4119 *
4117 * For example if 2 tasks are alternately allocating 4120 * For example if 2 tasks are alternately allocating
4118 * batches of pages, one task can end up with a lot 4121 * batches of pages, one task can end up with a lot
4119 * of pages of one half of the possible page colors 4122 * of pages of one half of the possible page colors
4120 * and the other with pages of the other colors. 4123 * and the other with pages of the other colors.
4121 */ 4124 */
4122 batch = rounddown_pow_of_two(batch + batch/2) - 1; 4125 batch = rounddown_pow_of_two(batch + batch/2) - 1;
4123 4126
4124 return batch; 4127 return batch;
4125 4128
4126 #else 4129 #else
4127 /* The deferral and batching of frees should be suppressed under NOMMU 4130 /* The deferral and batching of frees should be suppressed under NOMMU
4128 * conditions. 4131 * conditions.
4129 * 4132 *
4130 * The problem is that NOMMU needs to be able to allocate large chunks 4133 * The problem is that NOMMU needs to be able to allocate large chunks
4131 * of contiguous memory as there's no hardware page translation to 4134 * of contiguous memory as there's no hardware page translation to
4132 * assemble apparent contiguous memory from discontiguous pages. 4135 * assemble apparent contiguous memory from discontiguous pages.
4133 * 4136 *
4134 * Queueing large contiguous runs of pages for batching, however, 4137 * Queueing large contiguous runs of pages for batching, however,
4135 * causes the pages to actually be freed in smaller chunks. As there 4138 * causes the pages to actually be freed in smaller chunks. As there
4136 * can be a significant delay between the individual batches being 4139 * can be a significant delay between the individual batches being
4137 * recycled, this leads to the once large chunks of space being 4140 * recycled, this leads to the once large chunks of space being
4138 * fragmented and becoming unavailable for high-order allocations. 4141 * fragmented and becoming unavailable for high-order allocations.
4139 */ 4142 */
4140 return 0; 4143 return 0;
4141 #endif 4144 #endif
4142 } 4145 }
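/*
 * Illustrative sketch (not part of page_alloc.c): the CONFIG_MMU branch of
 * zone_batchsize() above takes ~1/1024 of the zone, caps it at 512 KiB
 * worth of pages, divides by four and clamps the result to one below a
 * power of two. Reproduced here with an assumed 4 KiB page size:
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned long rounddown_pow_of_two(unsigned long x)
{
	unsigned long p = 1;

	while (p <= x / 2)
		p <<= 1;
	return p;
}

static int batchsize(unsigned long managed_pages)
{
	unsigned long batch = managed_pages / 1024;

	if (batch * PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / PAGE_SIZE;
	batch /= 4;		/* pcp->high becomes roughly 6 * batch */
	if (batch < 1)
		batch = 1;
	/* clamp to 2^n - 1 to avoid pathological cache colouring */
	return (int)(rounddown_pow_of_two(batch + batch / 2) - 1);
}

int main(void)
{
	printf("128 MiB zone -> batch %d\n", batchsize(32768));		/* 7 */
	printf("  1 GiB zone -> batch %d\n", batchsize(262144));	/* 31 */
	return 0;
}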
4143 4146
4144 /* 4147 /*
4145 * pcp->high and pcp->batch values are related and dependent on one another: 4148 * pcp->high and pcp->batch values are related and dependent on one another:
4146 * ->batch must never be higher than ->high. 4149 * ->batch must never be higher than ->high.
4147 * The following function updates them in a safe manner without read side 4150 * The following function updates them in a safe manner without read side
4148 * locking. 4151 * locking.
4149 * 4152 *
4150 * Any new users of pcp->batch and pcp->high should ensure they can cope with 4153 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4151 * those fields changing asynchronously (according to the above rule). 4154 * those fields changing asynchronously (according to the above rule).
4152 * 4155 *
4153 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function 4156 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
4154 * outside of boot time (or some other assurance that no concurrent updaters 4157 * outside of boot time (or some other assurance that no concurrent updaters
4155 * exist). 4158 * exist).
4156 */ 4159 */
4157 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 4160 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4158 unsigned long batch) 4161 unsigned long batch)
4159 { 4162 {
4160 /* start with a fail safe value for batch */ 4163 /* start with a fail safe value for batch */
4161 pcp->batch = 1; 4164 pcp->batch = 1;
4162 smp_wmb(); 4165 smp_wmb();
4163 4166
4164 /* Update high, then batch, in order */ 4167 /* Update high, then batch, in order */
4165 pcp->high = high; 4168 pcp->high = high;
4166 smp_wmb(); 4169 smp_wmb();
4167 4170
4168 pcp->batch = batch; 4171 pcp->batch = batch;
4169 } 4172 }
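/*
 * Illustrative sketch (not part of page_alloc.c): pageset_update() above
 * first parks ->batch at a fail-safe value of 1, then publishes the new
 * ->high, then the new ->batch, with a write barrier after each of the
 * first two stores, so the new batch can never become visible before the
 * new high; readers must still tolerate the fields changing underneath
 * them. The userspace analogue below uses C11 release fences in place of
 * smp_wmb().
 */
#include <stdatomic.h>
#include <stdio.h>

struct pcp {
	atomic_ulong high;
	atomic_ulong batch;
};

static void pcp_update(struct pcp *p, unsigned long high, unsigned long batch)
{
	atomic_store_explicit(&p->batch, 1UL, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* ~ smp_wmb() */

	atomic_store_explicit(&p->high, high, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* ~ smp_wmb() */

	atomic_store_explicit(&p->batch, batch, memory_order_relaxed);
}

int main(void)
{
	struct pcp p;

	atomic_init(&p.high, 6UL);
	atomic_init(&p.batch, 1UL);
	pcp_update(&p, 186UL, 31UL);
	printf("high=%lu batch=%lu\n",
	       (unsigned long)atomic_load(&p.high),
	       (unsigned long)atomic_load(&p.batch));	/* high=186 batch=31 */
	return 0;
}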
4170 4173
4171 /* a companion to pageset_set_high() */ 4174 /* a companion to pageset_set_high() */
4172 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) 4175 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4173 { 4176 {
4174 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); 4177 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4175 } 4178 }
4176 4179
4177 static void pageset_init(struct per_cpu_pageset *p) 4180 static void pageset_init(struct per_cpu_pageset *p)
4178 { 4181 {
4179 struct per_cpu_pages *pcp; 4182 struct per_cpu_pages *pcp;
4180 int migratetype; 4183 int migratetype;
4181 4184
4182 memset(p, 0, sizeof(*p)); 4185 memset(p, 0, sizeof(*p));
4183 4186
4184 pcp = &p->pcp; 4187 pcp = &p->pcp;
4185 pcp->count = 0; 4188 pcp->count = 0;
4186 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4189 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4187 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4190 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4188 } 4191 }
4189 4192
4190 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4193 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4191 { 4194 {
4192 pageset_init(p); 4195 pageset_init(p);
4193 pageset_set_batch(p, batch); 4196 pageset_set_batch(p, batch);
4194 } 4197 }
4195 4198
4196 /* 4199 /*
4197 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist 4200 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
4198 * to the value high for the pageset p. 4201 * to the value high for the pageset p.
4199 */ 4202 */
4200 static void pageset_set_high(struct per_cpu_pageset *p, 4203 static void pageset_set_high(struct per_cpu_pageset *p,
4201 unsigned long high) 4204 unsigned long high)
4202 { 4205 {
4203 unsigned long batch = max(1UL, high / 4); 4206 unsigned long batch = max(1UL, high / 4);
4204 if ((high / 4) > (PAGE_SHIFT * 8)) 4207 if ((high / 4) > (PAGE_SHIFT * 8))
4205 batch = PAGE_SHIFT * 8; 4208 batch = PAGE_SHIFT * 8;
4206 4209
4207 pageset_update(&p->pcp, high, batch); 4210 pageset_update(&p->pcp, high, batch);
4208 } 4211 }
4209 4212
4210 static void pageset_set_high_and_batch(struct zone *zone, 4213 static void pageset_set_high_and_batch(struct zone *zone,
4211 struct per_cpu_pageset *pcp) 4214 struct per_cpu_pageset *pcp)
4212 { 4215 {
4213 if (percpu_pagelist_fraction) 4216 if (percpu_pagelist_fraction)
4214 pageset_set_high(pcp, 4217 pageset_set_high(pcp,
4215 (zone->managed_pages / 4218 (zone->managed_pages /
4216 percpu_pagelist_fraction)); 4219 percpu_pagelist_fraction));
4217 else 4220 else
4218 pageset_set_batch(pcp, zone_batchsize(zone)); 4221 pageset_set_batch(pcp, zone_batchsize(zone));
4219 } 4222 }
4220 4223
4221 static void __meminit zone_pageset_init(struct zone *zone, int cpu) 4224 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4222 { 4225 {
4223 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4226 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4224 4227
4225 pageset_init(pcp); 4228 pageset_init(pcp);
4226 pageset_set_high_and_batch(zone, pcp); 4229 pageset_set_high_and_batch(zone, pcp);
4227 } 4230 }
4228 4231
4229 static void __meminit setup_zone_pageset(struct zone *zone) 4232 static void __meminit setup_zone_pageset(struct zone *zone)
4230 { 4233 {
4231 int cpu; 4234 int cpu;
4232 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4235 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4233 for_each_possible_cpu(cpu) 4236 for_each_possible_cpu(cpu)
4234 zone_pageset_init(zone, cpu); 4237 zone_pageset_init(zone, cpu);
4235 } 4238 }
4236 4239
4237 /* 4240 /*
4238 * Allocate per cpu pagesets and initialize them. 4241 * Allocate per cpu pagesets and initialize them.
4239 * Before this call only boot pagesets were available. 4242 * Before this call only boot pagesets were available.
4240 */ 4243 */
4241 void __init setup_per_cpu_pageset(void) 4244 void __init setup_per_cpu_pageset(void)
4242 { 4245 {
4243 struct zone *zone; 4246 struct zone *zone;
4244 4247
4245 for_each_populated_zone(zone) 4248 for_each_populated_zone(zone)
4246 setup_zone_pageset(zone); 4249 setup_zone_pageset(zone);
4247 } 4250 }
4248 4251
4249 static noinline __init_refok 4252 static noinline __init_refok
4250 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4253 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4251 { 4254 {
4252 int i; 4255 int i;
4253 struct pglist_data *pgdat = zone->zone_pgdat; 4256 struct pglist_data *pgdat = zone->zone_pgdat;
4254 size_t alloc_size; 4257 size_t alloc_size;
4255 4258
4256 /* 4259 /*
4257 * The per-page waitqueue mechanism uses hashed waitqueues 4260 * The per-page waitqueue mechanism uses hashed waitqueues
4258 * per zone. 4261 * per zone.
4259 */ 4262 */
4260 zone->wait_table_hash_nr_entries = 4263 zone->wait_table_hash_nr_entries =
4261 wait_table_hash_nr_entries(zone_size_pages); 4264 wait_table_hash_nr_entries(zone_size_pages);
4262 zone->wait_table_bits = 4265 zone->wait_table_bits =
4263 wait_table_bits(zone->wait_table_hash_nr_entries); 4266 wait_table_bits(zone->wait_table_hash_nr_entries);
4264 alloc_size = zone->wait_table_hash_nr_entries 4267 alloc_size = zone->wait_table_hash_nr_entries
4265 * sizeof(wait_queue_head_t); 4268 * sizeof(wait_queue_head_t);
4266 4269
4267 if (!slab_is_available()) { 4270 if (!slab_is_available()) {
4268 zone->wait_table = (wait_queue_head_t *) 4271 zone->wait_table = (wait_queue_head_t *)
4269 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4272 alloc_bootmem_node_nopanic(pgdat, alloc_size);
4270 } else { 4273 } else {
4271 /* 4274 /*
4272 * This case means that a zone whose size was 0 gets new memory 4275 * This case means that a zone whose size was 0 gets new memory
4273 * via memory hot-add. 4276 * via memory hot-add.
4274 * But it may be the case that a new node was hot-added. In 4277 * But it may be the case that a new node was hot-added. In
4275 * this case vmalloc() will not be able to use this new node's 4278 * this case vmalloc() will not be able to use this new node's
4276 * memory - this wait_table must be initialized to use this new 4279 * memory - this wait_table must be initialized to use this new
4277 * node itself as well. 4280 * node itself as well.
4278 * To use this new node's memory, further consideration will be 4281 * To use this new node's memory, further consideration will be
4279 * necessary. 4282 * necessary.
4280 */ 4283 */
4281 zone->wait_table = vmalloc(alloc_size); 4284 zone->wait_table = vmalloc(alloc_size);
4282 } 4285 }
4283 if (!zone->wait_table) 4286 if (!zone->wait_table)
4284 return -ENOMEM; 4287 return -ENOMEM;
4285 4288
4286 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4289 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4287 init_waitqueue_head(zone->wait_table + i); 4290 init_waitqueue_head(zone->wait_table + i);
4288 4291
4289 return 0; 4292 return 0;
4290 } 4293 }
4291 4294
4292 static __meminit void zone_pcp_init(struct zone *zone) 4295 static __meminit void zone_pcp_init(struct zone *zone)
4293 { 4296 {
4294 /* 4297 /*
4295 * per cpu subsystem is not up at this point. The following code 4298 * per cpu subsystem is not up at this point. The following code
4296 * relies on the ability of the linker to provide the 4299 * relies on the ability of the linker to provide the
4297 * offset of a (static) per cpu variable into the per cpu area. 4300 * offset of a (static) per cpu variable into the per cpu area.
4298 */ 4301 */
4299 zone->pageset = &boot_pageset; 4302 zone->pageset = &boot_pageset;
4300 4303
4301 if (zone->present_pages) 4304 if (zone->present_pages)
4302 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4305 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4303 zone->name, zone->present_pages, 4306 zone->name, zone->present_pages,
4304 zone_batchsize(zone)); 4307 zone_batchsize(zone));
4305 } 4308 }
4306 4309
4307 int __meminit init_currently_empty_zone(struct zone *zone, 4310 int __meminit init_currently_empty_zone(struct zone *zone,
4308 unsigned long zone_start_pfn, 4311 unsigned long zone_start_pfn,
4309 unsigned long size, 4312 unsigned long size,
4310 enum memmap_context context) 4313 enum memmap_context context)
4311 { 4314 {
4312 struct pglist_data *pgdat = zone->zone_pgdat; 4315 struct pglist_data *pgdat = zone->zone_pgdat;
4313 int ret; 4316 int ret;
4314 ret = zone_wait_table_init(zone, size); 4317 ret = zone_wait_table_init(zone, size);
4315 if (ret) 4318 if (ret)
4316 return ret; 4319 return ret;
4317 pgdat->nr_zones = zone_idx(zone) + 1; 4320 pgdat->nr_zones = zone_idx(zone) + 1;
4318 4321
4319 zone->zone_start_pfn = zone_start_pfn; 4322 zone->zone_start_pfn = zone_start_pfn;
4320 4323
4321 mminit_dprintk(MMINIT_TRACE, "memmap_init", 4324 mminit_dprintk(MMINIT_TRACE, "memmap_init",
4322 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 4325 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
4323 pgdat->node_id, 4326 pgdat->node_id,
4324 (unsigned long)zone_idx(zone), 4327 (unsigned long)zone_idx(zone),
4325 zone_start_pfn, (zone_start_pfn + size)); 4328 zone_start_pfn, (zone_start_pfn + size));
4326 4329
4327 zone_init_free_lists(zone); 4330 zone_init_free_lists(zone);
4328 4331
4329 return 0; 4332 return 0;
4330 } 4333 }
4331 4334
4332 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4335 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4333 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4336 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4334 /* 4337 /*
4335 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4338 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
4336 * Architectures may implement their own version but if add_active_range() 4339 * Architectures may implement their own version but if add_active_range()
4337 * was used and there are no special requirements, this is a convenient 4340 * was used and there are no special requirements, this is a convenient
4338 * alternative 4341 * alternative
4339 */ 4342 */
4340 int __meminit __early_pfn_to_nid(unsigned long pfn) 4343 int __meminit __early_pfn_to_nid(unsigned long pfn)
4341 { 4344 {
4342 unsigned long start_pfn, end_pfn; 4345 unsigned long start_pfn, end_pfn;
4343 int nid; 4346 int nid;
4344 /* 4347 /*
4345 * NOTE: The following SMP-unsafe globals are only used early in boot 4348 * NOTE: The following SMP-unsafe globals are only used early in boot
4346 * when the kernel is running single-threaded. 4349 * when the kernel is running single-threaded.
4347 */ 4350 */
4348 static unsigned long __meminitdata last_start_pfn, last_end_pfn; 4351 static unsigned long __meminitdata last_start_pfn, last_end_pfn;
4349 static int __meminitdata last_nid; 4352 static int __meminitdata last_nid;
4350 4353
4351 if (last_start_pfn <= pfn && pfn < last_end_pfn) 4354 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4352 return last_nid; 4355 return last_nid;
4353 4356
4354 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 4357 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
4355 if (nid != -1) { 4358 if (nid != -1) {
4356 last_start_pfn = start_pfn; 4359 last_start_pfn = start_pfn;
4357 last_end_pfn = end_pfn; 4360 last_end_pfn = end_pfn;
4358 last_nid = nid; 4361 last_nid = nid;
4359 } 4362 }
4360 4363
4361 return nid; 4364 return nid;
4362 } 4365 }
4363 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4366 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
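/*
 * Illustrative sketch (not part of page_alloc.c): __early_pfn_to_nid()
 * above memoises the last matching [start_pfn, end_pfn) range so that the
 * many consecutive lookups made during boot skip the memblock search.
 * slow_lookup() below is a made-up stand-in for memblock_search_pfn_nid().
 */
#include <stdio.h>

static int slow_lookup(unsigned long pfn, unsigned long *start, unsigned long *end)
{
	if (pfn < 0x40000UL) {			/* example range on node 0 */
		*start = 0;
		*end = 0x40000UL;
		return 0;
	}
	if (pfn < 0x80000UL) {			/* example range on node 1 */
		*start = 0x40000UL;
		*end = 0x80000UL;
		return 1;
	}
	return -1;
}

/* Single-entry cache; like the kernel's, only safe while single-threaded. */
static int pfn_to_nid_cached(unsigned long pfn)
{
	static unsigned long last_start, last_end;
	static int last_nid = -1;
	unsigned long start, end;
	int nid;

	if (last_start <= pfn && pfn < last_end)
		return last_nid;

	nid = slow_lookup(pfn, &start, &end);
	if (nid != -1) {
		last_start = start;
		last_end = end;
		last_nid = nid;
	}
	return nid;
}

int main(void)
{
	printf("%d %d %d\n",
	       pfn_to_nid_cached(0x100),	/* 0, searches */
	       pfn_to_nid_cached(0x200),	/* 0, cache hit */
	       pfn_to_nid_cached(0x50000));	/* 1, searches again */
	return 0;
}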
4364 4367
4365 int __meminit early_pfn_to_nid(unsigned long pfn) 4368 int __meminit early_pfn_to_nid(unsigned long pfn)
4366 { 4369 {
4367 int nid; 4370 int nid;
4368 4371
4369 nid = __early_pfn_to_nid(pfn); 4372 nid = __early_pfn_to_nid(pfn);
4370 if (nid >= 0) 4373 if (nid >= 0)
4371 return nid; 4374 return nid;
4372 /* just returns 0 */ 4375 /* just returns 0 */
4373 return 0; 4376 return 0;
4374 } 4377 }
4375 4378
4376 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4379 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4377 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4380 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4378 { 4381 {
4379 int nid; 4382 int nid;
4380 4383
4381 nid = __early_pfn_to_nid(pfn); 4384 nid = __early_pfn_to_nid(pfn);
4382 if (nid >= 0 && nid != node) 4385 if (nid >= 0 && nid != node)
4383 return false; 4386 return false;
4384 return true; 4387 return true;
4385 } 4388 }
4386 #endif 4389 #endif
4387 4390
4388 /** 4391 /**
4389 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4392 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4390 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4393 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4391 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4394 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4392 * 4395 *
4393 * If an architecture guarantees that all ranges registered with 4396 * If an architecture guarantees that all ranges registered with
4394 * add_active_ranges() contain no holes and may be freed, 4397 * add_active_ranges() contain no holes and may be freed,
4395 * this function may be used instead of calling free_bootmem() manually. 4398 * this function may be used instead of calling free_bootmem() manually.
4396 */ 4399 */
4397 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4400 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4398 { 4401 {
4399 unsigned long start_pfn, end_pfn; 4402 unsigned long start_pfn, end_pfn;
4400 int i, this_nid; 4403 int i, this_nid;
4401 4404
4402 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4405 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4403 start_pfn = min(start_pfn, max_low_pfn); 4406 start_pfn = min(start_pfn, max_low_pfn);
4404 end_pfn = min(end_pfn, max_low_pfn); 4407 end_pfn = min(end_pfn, max_low_pfn);
4405 4408
4406 if (start_pfn < end_pfn) 4409 if (start_pfn < end_pfn)
4407 free_bootmem_node(NODE_DATA(this_nid), 4410 free_bootmem_node(NODE_DATA(this_nid),
4408 PFN_PHYS(start_pfn), 4411 PFN_PHYS(start_pfn),
4409 (end_pfn - start_pfn) << PAGE_SHIFT); 4412 (end_pfn - start_pfn) << PAGE_SHIFT);
4410 } 4413 }
4411 } 4414 }
4412 4415
4413 /** 4416 /**
4414 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4417 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4415 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4418 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4416 * 4419 *
4417 * If an architecture guarantees that all ranges registered with 4420 * If an architecture guarantees that all ranges registered with
4418 * add_active_ranges() contain no holes and may be freed, this 4421 * add_active_ranges() contain no holes and may be freed, this
4419 * function may be used instead of calling memory_present() manually. 4422 * function may be used instead of calling memory_present() manually.
4420 */ 4423 */
4421 void __init sparse_memory_present_with_active_regions(int nid) 4424 void __init sparse_memory_present_with_active_regions(int nid)
4422 { 4425 {
4423 unsigned long start_pfn, end_pfn; 4426 unsigned long start_pfn, end_pfn;
4424 int i, this_nid; 4427 int i, this_nid;
4425 4428
4426 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4429 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4427 memory_present(this_nid, start_pfn, end_pfn); 4430 memory_present(this_nid, start_pfn, end_pfn);
4428 } 4431 }
4429 4432
4430 /** 4433 /**
4431 * get_pfn_range_for_nid - Return the start and end page frames for a node 4434 * get_pfn_range_for_nid - Return the start and end page frames for a node
4432 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4435 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4433 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4436 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4434 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4437 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4435 * 4438 *
4436 * It returns the start and end page frame of a node based on information 4439 * It returns the start and end page frame of a node based on information
4437 * provided by an arch calling add_active_range(). If called for a node 4440 * provided by an arch calling add_active_range(). If called for a node
4438 * with no available memory, a warning is printed and the start and end 4441 * with no available memory, a warning is printed and the start and end
4439 * PFNs will be 0. 4442 * PFNs will be 0.
4440 */ 4443 */
4441 void __meminit get_pfn_range_for_nid(unsigned int nid, 4444 void __meminit get_pfn_range_for_nid(unsigned int nid,
4442 unsigned long *start_pfn, unsigned long *end_pfn) 4445 unsigned long *start_pfn, unsigned long *end_pfn)
4443 { 4446 {
4444 unsigned long this_start_pfn, this_end_pfn; 4447 unsigned long this_start_pfn, this_end_pfn;
4445 int i; 4448 int i;
4446 4449
4447 *start_pfn = -1UL; 4450 *start_pfn = -1UL;
4448 *end_pfn = 0; 4451 *end_pfn = 0;
4449 4452
4450 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4453 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4451 *start_pfn = min(*start_pfn, this_start_pfn); 4454 *start_pfn = min(*start_pfn, this_start_pfn);
4452 *end_pfn = max(*end_pfn, this_end_pfn); 4455 *end_pfn = max(*end_pfn, this_end_pfn);
4453 } 4456 }
4454 4457
4455 if (*start_pfn == -1UL) 4458 if (*start_pfn == -1UL)
4456 *start_pfn = 0; 4459 *start_pfn = 0;
4457 } 4460 }
4458 4461
4459 /* 4462 /*
4460 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4463 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4461 * assumption is made that zones within a node are ordered by monotonically 4464 * assumption is made that zones within a node are ordered by monotonically
4462 * increasing memory addresses so that the "highest" populated zone is used 4465 * increasing memory addresses so that the "highest" populated zone is used
4463 */ 4466 */
4464 static void __init find_usable_zone_for_movable(void) 4467 static void __init find_usable_zone_for_movable(void)
4465 { 4468 {
4466 int zone_index; 4469 int zone_index;
4467 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4470 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4468 if (zone_index == ZONE_MOVABLE) 4471 if (zone_index == ZONE_MOVABLE)
4469 continue; 4472 continue;
4470 4473
4471 if (arch_zone_highest_possible_pfn[zone_index] > 4474 if (arch_zone_highest_possible_pfn[zone_index] >
4472 arch_zone_lowest_possible_pfn[zone_index]) 4475 arch_zone_lowest_possible_pfn[zone_index])
4473 break; 4476 break;
4474 } 4477 }
4475 4478
4476 VM_BUG_ON(zone_index == -1); 4479 VM_BUG_ON(zone_index == -1);
4477 movable_zone = zone_index; 4480 movable_zone = zone_index;
4478 } 4481 }
4479 4482
4480 /* 4483 /*
4481 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4484 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4482 * because it is sized independent of architecture. Unlike the other zones, 4485 * because it is sized independent of architecture. Unlike the other zones,
4483 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4486 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4484 * in each node depending on the size of each node and how evenly kernelcore 4487 * in each node depending on the size of each node and how evenly kernelcore
4485 * is distributed. This helper function adjusts the zone ranges 4488 * is distributed. This helper function adjusts the zone ranges
4486 * provided by the architecture for a given node by using the end of the 4489 * provided by the architecture for a given node by using the end of the
4487 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4490 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4488 * zones within a node are in order of monotonically increasing memory addresses 4491 * zones within a node are in order of monotonically increasing memory addresses
4489 */ 4492 */
4490 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4493 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4491 unsigned long zone_type, 4494 unsigned long zone_type,
4492 unsigned long node_start_pfn, 4495 unsigned long node_start_pfn,
4493 unsigned long node_end_pfn, 4496 unsigned long node_end_pfn,
4494 unsigned long *zone_start_pfn, 4497 unsigned long *zone_start_pfn,
4495 unsigned long *zone_end_pfn) 4498 unsigned long *zone_end_pfn)
4496 { 4499 {
4497 /* Only adjust if ZONE_MOVABLE is on this node */ 4500 /* Only adjust if ZONE_MOVABLE is on this node */
4498 if (zone_movable_pfn[nid]) { 4501 if (zone_movable_pfn[nid]) {
4499 /* Size ZONE_MOVABLE */ 4502 /* Size ZONE_MOVABLE */
4500 if (zone_type == ZONE_MOVABLE) { 4503 if (zone_type == ZONE_MOVABLE) {
4501 *zone_start_pfn = zone_movable_pfn[nid]; 4504 *zone_start_pfn = zone_movable_pfn[nid];
4502 *zone_end_pfn = min(node_end_pfn, 4505 *zone_end_pfn = min(node_end_pfn,
4503 arch_zone_highest_possible_pfn[movable_zone]); 4506 arch_zone_highest_possible_pfn[movable_zone]);
4504 4507
4505 /* Adjust for ZONE_MOVABLE starting within this range */ 4508 /* Adjust for ZONE_MOVABLE starting within this range */
4506 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4509 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4507 *zone_end_pfn > zone_movable_pfn[nid]) { 4510 *zone_end_pfn > zone_movable_pfn[nid]) {
4508 *zone_end_pfn = zone_movable_pfn[nid]; 4511 *zone_end_pfn = zone_movable_pfn[nid];
4509 4512
4510 /* Check if this whole range is within ZONE_MOVABLE */ 4513 /* Check if this whole range is within ZONE_MOVABLE */
4511 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4514 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4512 *zone_start_pfn = *zone_end_pfn; 4515 *zone_start_pfn = *zone_end_pfn;
4513 } 4516 }
4514 } 4517 }
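/*
 * A minimal userspace sketch of the middle case above: a zone that straddles
 * the start of ZONE_MOVABLE gets cut short at zone_movable_pfn. The PFN
 * values are invented for illustration and not taken from real hardware.
 */
#include <stdio.h>

int main(void)
{
	/* invented: a zone [0x40000, 0x100000) on a node whose ZONE_MOVABLE
	 * begins at PFN 0xc0000 */
	unsigned long zone_start_pfn = 0x40000, zone_end_pfn = 0x100000;
	unsigned long zone_movable_pfn = 0xc0000;

	if (zone_start_pfn < zone_movable_pfn && zone_end_pfn > zone_movable_pfn)
		zone_end_pfn = zone_movable_pfn;	/* zone is cut short */
	else if (zone_start_pfn >= zone_movable_pfn)
		zone_start_pfn = zone_end_pfn;		/* zone becomes empty */

	/* prints: adjusted zone: [0x40000, 0xc0000) */
	printf("adjusted zone: [%#lx, %#lx)\n", zone_start_pfn, zone_end_pfn);
	return 0;
}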
4515 4518
4516 /* 4519 /*
4517 * Return the number of pages a zone spans in a node, including holes 4520 * Return the number of pages a zone spans in a node, including holes
4518 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4521 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4519 */ 4522 */
4520 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4523 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4521 unsigned long zone_type, 4524 unsigned long zone_type,
4522 unsigned long node_start_pfn, 4525 unsigned long node_start_pfn,
4523 unsigned long node_end_pfn, 4526 unsigned long node_end_pfn,
4524 unsigned long *ignored) 4527 unsigned long *ignored)
4525 { 4528 {
4526 unsigned long zone_start_pfn, zone_end_pfn; 4529 unsigned long zone_start_pfn, zone_end_pfn;
4527 4530
4528 /* Get the start and end of the zone */ 4531 /* Get the start and end of the zone */
4529 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4532 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4530 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4533 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4531 adjust_zone_range_for_zone_movable(nid, zone_type, 4534 adjust_zone_range_for_zone_movable(nid, zone_type,
4532 node_start_pfn, node_end_pfn, 4535 node_start_pfn, node_end_pfn,
4533 &zone_start_pfn, &zone_end_pfn); 4536 &zone_start_pfn, &zone_end_pfn);
4534 4537
4535 /* Check that this node has pages within the zone's required range */ 4538 /* Check that this node has pages within the zone's required range */
4536 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4539 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4537 return 0; 4540 return 0;
4538 4541
4539 /* Move the zone boundaries inside the node if necessary */ 4542 /* Move the zone boundaries inside the node if necessary */
4540 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4543 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4541 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4544 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4542 4545
4543 /* Return the spanned pages */ 4546 /* Return the spanned pages */
4544 return zone_end_pfn - zone_start_pfn; 4547 return zone_end_pfn - zone_start_pfn;
4545 } 4548 }
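/*
 * A toy userspace sketch of the clamping in zone_spanned_pages_in_node():
 * the zone boundaries are moved inside the node before the span is counted.
 * All PFN values below are invented for the example.
 */
#include <stdio.h>

int main(void)
{
	unsigned long zone_start_pfn = 0x10000, zone_end_pfn = 0x80000;
	unsigned long node_start_pfn = 0x40000, node_end_pfn = 0x60000;

	/* no overlap at all -> the zone spans zero pages on this node */
	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) {
		printf("0 spanned pages\n");
		return 0;
	}

	if (zone_end_pfn > node_end_pfn)
		zone_end_pfn = node_end_pfn;
	if (zone_start_pfn < node_start_pfn)
		zone_start_pfn = node_start_pfn;

	/* prints 131072 (0x20000) spanned pages */
	printf("%lu spanned pages\n", zone_end_pfn - zone_start_pfn);
	return 0;
}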
4546 4549
4547 /* 4550 /*
4548 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4551 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4549 * then all holes in the requested range will be accounted for. 4552 * then all holes in the requested range will be accounted for.
4550 */ 4553 */
4551 unsigned long __meminit __absent_pages_in_range(int nid, 4554 unsigned long __meminit __absent_pages_in_range(int nid,
4552 unsigned long range_start_pfn, 4555 unsigned long range_start_pfn,
4553 unsigned long range_end_pfn) 4556 unsigned long range_end_pfn)
4554 { 4557 {
4555 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4558 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4556 unsigned long start_pfn, end_pfn; 4559 unsigned long start_pfn, end_pfn;
4557 int i; 4560 int i;
4558 4561
4559 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4562 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4560 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4563 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4561 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4564 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4562 nr_absent -= end_pfn - start_pfn; 4565 nr_absent -= end_pfn - start_pfn;
4563 } 4566 }
4564 return nr_absent; 4567 return nr_absent;
4565 } 4568 }
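/*
 * A worked example of the hole accounting: the holes in a PFN range are the
 * range size minus every (clamped) chunk of real memory inside it. The two
 * memory ranges below are invented for illustration.
 */
#include <stdio.h>

static unsigned long clampul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	unsigned long mem_start[] = { 0, 700 }, mem_end[] = { 500, 900 };
	unsigned long range_start = 0, range_end = 1000;
	unsigned long nr_absent = range_end - range_start;
	int i;

	for (i = 0; i < 2; i++) {
		unsigned long s = clampul(mem_start[i], range_start, range_end);
		unsigned long e = clampul(mem_end[i], range_start, range_end);
		nr_absent -= e - s;
	}
	/* 1000 - 500 - 200 = 300 pages of holes */
	printf("%lu pages of holes\n", nr_absent);
	return 0;
}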
4566 4569
4567 /** 4570 /**
4568 * absent_pages_in_range - Return number of page frames in holes within a range 4571 * absent_pages_in_range - Return number of page frames in holes within a range
4569 * @start_pfn: The start PFN to start searching for holes 4572 * @start_pfn: The start PFN to start searching for holes
4570 * @end_pfn: The end PFN to stop searching for holes 4573 * @end_pfn: The end PFN to stop searching for holes
4571 * 4574 *
4572 * It returns the number of page frames in memory holes within a range. 4575 * It returns the number of page frames in memory holes within a range.
4573 */ 4576 */
4574 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4577 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4575 unsigned long end_pfn) 4578 unsigned long end_pfn)
4576 { 4579 {
4577 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4580 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4578 } 4581 }
4579 4582
4580 /* Return the number of page frames in holes in a zone on a node */ 4583 /* Return the number of page frames in holes in a zone on a node */
4581 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4584 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4582 unsigned long zone_type, 4585 unsigned long zone_type,
4583 unsigned long node_start_pfn, 4586 unsigned long node_start_pfn,
4584 unsigned long node_end_pfn, 4587 unsigned long node_end_pfn,
4585 unsigned long *ignored) 4588 unsigned long *ignored)
4586 { 4589 {
4587 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4590 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4588 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4591 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4589 unsigned long zone_start_pfn, zone_end_pfn; 4592 unsigned long zone_start_pfn, zone_end_pfn;
4590 4593
4591 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4594 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4592 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4595 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4593 4596
4594 adjust_zone_range_for_zone_movable(nid, zone_type, 4597 adjust_zone_range_for_zone_movable(nid, zone_type,
4595 node_start_pfn, node_end_pfn, 4598 node_start_pfn, node_end_pfn,
4596 &zone_start_pfn, &zone_end_pfn); 4599 &zone_start_pfn, &zone_end_pfn);
4597 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4600 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4598 } 4601 }
4599 4602
4600 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4603 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4601 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4604 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4602 unsigned long zone_type, 4605 unsigned long zone_type,
4603 unsigned long node_start_pfn, 4606 unsigned long node_start_pfn,
4604 unsigned long node_end_pfn, 4607 unsigned long node_end_pfn,
4605 unsigned long *zones_size) 4608 unsigned long *zones_size)
4606 { 4609 {
4607 return zones_size[zone_type]; 4610 return zones_size[zone_type];
4608 } 4611 }
4609 4612
4610 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4613 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4611 unsigned long zone_type, 4614 unsigned long zone_type,
4612 unsigned long node_start_pfn, 4615 unsigned long node_start_pfn,
4613 unsigned long node_end_pfn, 4616 unsigned long node_end_pfn,
4614 unsigned long *zholes_size) 4617 unsigned long *zholes_size)
4615 { 4618 {
4616 if (!zholes_size) 4619 if (!zholes_size)
4617 return 0; 4620 return 0;
4618 4621
4619 return zholes_size[zone_type]; 4622 return zholes_size[zone_type];
4620 } 4623 }
4621 4624
4622 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4625 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4623 4626
4624 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4627 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4625 unsigned long node_start_pfn, 4628 unsigned long node_start_pfn,
4626 unsigned long node_end_pfn, 4629 unsigned long node_end_pfn,
4627 unsigned long *zones_size, 4630 unsigned long *zones_size,
4628 unsigned long *zholes_size) 4631 unsigned long *zholes_size)
4629 { 4632 {
4630 unsigned long realtotalpages, totalpages = 0; 4633 unsigned long realtotalpages, totalpages = 0;
4631 enum zone_type i; 4634 enum zone_type i;
4632 4635
4633 for (i = 0; i < MAX_NR_ZONES; i++) 4636 for (i = 0; i < MAX_NR_ZONES; i++)
4634 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4637 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4635 node_start_pfn, 4638 node_start_pfn,
4636 node_end_pfn, 4639 node_end_pfn,
4637 zones_size); 4640 zones_size);
4638 pgdat->node_spanned_pages = totalpages; 4641 pgdat->node_spanned_pages = totalpages;
4639 4642
4640 realtotalpages = totalpages; 4643 realtotalpages = totalpages;
4641 for (i = 0; i < MAX_NR_ZONES; i++) 4644 for (i = 0; i < MAX_NR_ZONES; i++)
4642 realtotalpages -= 4645 realtotalpages -=
4643 zone_absent_pages_in_node(pgdat->node_id, i, 4646 zone_absent_pages_in_node(pgdat->node_id, i,
4644 node_start_pfn, node_end_pfn, 4647 node_start_pfn, node_end_pfn,
4645 zholes_size); 4648 zholes_size);
4646 pgdat->node_present_pages = realtotalpages; 4649 pgdat->node_present_pages = realtotalpages;
4647 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4650 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4648 realtotalpages); 4651 realtotalpages);
4649 } 4652 }
4650 4653
4651 #ifndef CONFIG_SPARSEMEM 4654 #ifndef CONFIG_SPARSEMEM
4652 /* 4655 /*
4653 * Calculate the size of the zone->blockflags rounded to an unsigned long 4656 * Calculate the size of the zone->blockflags rounded to an unsigned long
4654 * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding 4657 * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding
4655 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, and finally 4658 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, and finally
4656 * round what is now in bits to nearest long in bits, then return it in 4659 * round what is now in bits to nearest long in bits, then return it in
4657 * bytes. 4660 * bytes.
4658 */ 4661 */
4659 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 4662 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4660 { 4663 {
4661 unsigned long usemapsize; 4664 unsigned long usemapsize;
4662 4665
4663 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 4666 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4664 usemapsize = roundup(zonesize, pageblock_nr_pages); 4667 usemapsize = roundup(zonesize, pageblock_nr_pages);
4665 usemapsize = usemapsize >> pageblock_order; 4668 usemapsize = usemapsize >> pageblock_order;
4666 usemapsize *= NR_PAGEBLOCK_BITS; 4669 usemapsize *= NR_PAGEBLOCK_BITS;
4667 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4670 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4668 4671
4669 return usemapsize / 8; 4672 return usemapsize / 8;
4670 } 4673 }
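/*
 * A worked instance of the usemap sizing above, assuming pageblock_order = 9
 * (512 pages per pageblock), NR_PAGEBLOCK_BITS = 4 and 64-bit longs; the
 * zone start and size are invented for the example.
 */
#include <stdio.h>

#define EX_PAGEBLOCK_NR_PAGES	512UL	/* assumed pageblock_order = 9 */
#define EX_NR_PAGEBLOCK_BITS	4UL

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

int main(void)
{
	unsigned long zone_start_pfn = 0x100, zonesize = 1UL << 20;
	unsigned long usemapsize;

	zonesize += zone_start_pfn & (EX_PAGEBLOCK_NR_PAGES - 1);
	usemapsize = roundup_ul(zonesize, EX_PAGEBLOCK_NR_PAGES);
	usemapsize >>= 9;			/* assumed pageblock_order */
	usemapsize *= EX_NR_PAGEBLOCK_BITS;
	usemapsize = roundup_ul(usemapsize, 8 * sizeof(unsigned long));

	/* prints 1032 bytes on an LP64 machine */
	printf("%lu bytes of pageblock flags\n", usemapsize / 8);
	return 0;
}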
4671 4674
4672 static void __init setup_usemap(struct pglist_data *pgdat, 4675 static void __init setup_usemap(struct pglist_data *pgdat,
4673 struct zone *zone, 4676 struct zone *zone,
4674 unsigned long zone_start_pfn, 4677 unsigned long zone_start_pfn,
4675 unsigned long zonesize) 4678 unsigned long zonesize)
4676 { 4679 {
4677 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4680 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4678 zone->pageblock_flags = NULL; 4681 zone->pageblock_flags = NULL;
4679 if (usemapsize) 4682 if (usemapsize)
4680 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4683 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4681 usemapsize); 4684 usemapsize);
4682 } 4685 }
4683 #else 4686 #else
4684 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4687 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4685 unsigned long zone_start_pfn, unsigned long zonesize) {} 4688 unsigned long zone_start_pfn, unsigned long zonesize) {}
4686 #endif /* CONFIG_SPARSEMEM */ 4689 #endif /* CONFIG_SPARSEMEM */
4687 4690
4688 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4691 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4689 4692
4690 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4693 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4691 void __paginginit set_pageblock_order(void) 4694 void __paginginit set_pageblock_order(void)
4692 { 4695 {
4693 unsigned int order; 4696 unsigned int order;
4694 4697
4695 /* Check that pageblock_nr_pages has not already been set up */ 4698 /* Check that pageblock_nr_pages has not already been set up */
4696 if (pageblock_order) 4699 if (pageblock_order)
4697 return; 4700 return;
4698 4701
4699 if (HPAGE_SHIFT > PAGE_SHIFT) 4702 if (HPAGE_SHIFT > PAGE_SHIFT)
4700 order = HUGETLB_PAGE_ORDER; 4703 order = HUGETLB_PAGE_ORDER;
4701 else 4704 else
4702 order = MAX_ORDER - 1; 4705 order = MAX_ORDER - 1;
4703 4706
4704 /* 4707 /*
4705 * Assume the largest contiguous order of interest is a huge page. 4708 * Assume the largest contiguous order of interest is a huge page.
4706 * This value may be variable depending on boot parameters on IA64 and 4709 * This value may be variable depending on boot parameters on IA64 and
4707 * powerpc. 4710 * powerpc.
4708 */ 4711 */
4709 pageblock_order = order; 4712 pageblock_order = order;
4710 } 4713 }
4711 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4714 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4712 4715
4713 /* 4716 /*
4714 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4717 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4715 * is unused as pageblock_order is set at compile-time. See 4718 * is unused as pageblock_order is set at compile-time. See
4716 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4719 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4717 * the kernel config 4720 * the kernel config
4718 */ 4721 */
4719 void __paginginit set_pageblock_order(void) 4722 void __paginginit set_pageblock_order(void)
4720 { 4723 {
4721 } 4724 }
4722 4725
4723 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4726 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4724 4727
4725 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 4728 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4726 unsigned long present_pages) 4729 unsigned long present_pages)
4727 { 4730 {
4728 unsigned long pages = spanned_pages; 4731 unsigned long pages = spanned_pages;
4729 4732
4730 /* 4733 /*
4731 * Provide a more accurate estimation if there are holes within 4734 * Provide a more accurate estimation if there are holes within
4732 * the zone and SPARSEMEM is in use. If there are holes within the 4735 * the zone and SPARSEMEM is in use. If there are holes within the
4733 * zone, each populated memory region may cost us one or two extra 4736 * zone, each populated memory region may cost us one or two extra
4734 * memmap pages due to alignment because memmap pages for each 4737 * memmap pages due to alignment because memmap pages for each
4735 * populated region may not be naturally aligned on a page boundary. 4738 * populated region may not be naturally aligned on a page boundary.
4736 * So the (present_pages >> 4) heuristic is a tradeoff for that. 4739 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4737 */ 4740 */
4738 if (spanned_pages > present_pages + (present_pages >> 4) && 4741 if (spanned_pages > present_pages + (present_pages >> 4) &&
4739 IS_ENABLED(CONFIG_SPARSEMEM)) 4742 IS_ENABLED(CONFIG_SPARSEMEM))
4740 pages = present_pages; 4743 pages = present_pages;
4741 4744
4742 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 4745 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4743 } 4746 }
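/*
 * A sketch of the memmap estimate for a very sparse zone, assuming 4 KiB
 * pages and a 64-byte struct page (typical, but configuration dependent);
 * the SPARSEMEM check is dropped and the page counts are invented.
 */
#include <stdio.h>

#define EX_PAGE_SIZE		4096UL
#define EX_STRUCT_PAGE_SIZE	64UL	/* assumed */

int main(void)
{
	unsigned long spanned = 1UL << 20;	/* 1M PFNs spanned */
	unsigned long present = 1UL << 19;	/* but half of them are holes */
	unsigned long pages = spanned;
	unsigned long memmap_pages;

	/* sparse enough that the estimate is based on present pages */
	if (spanned > present + (present >> 4))
		pages = present;

	/* PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT */
	memmap_pages = (pages * EX_STRUCT_PAGE_SIZE + EX_PAGE_SIZE - 1) /
		       EX_PAGE_SIZE;

	/* prints 8192 memmap pages for 524288 present pages */
	printf("%lu memmap pages for %lu present pages\n", memmap_pages, pages);
	return 0;
}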
4744 4747
4745 /* 4748 /*
4746 * Set up the zone data structures: 4749 * Set up the zone data structures:
4747 * - mark all pages reserved 4750 * - mark all pages reserved
4748 * - mark all memory queues empty 4751 * - mark all memory queues empty
4749 * - clear the memory bitmaps 4752 * - clear the memory bitmaps
4750 * 4753 *
4751 * NOTE: pgdat should get zeroed by caller. 4754 * NOTE: pgdat should get zeroed by caller.
4752 */ 4755 */
4753 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4756 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4754 unsigned long node_start_pfn, unsigned long node_end_pfn, 4757 unsigned long node_start_pfn, unsigned long node_end_pfn,
4755 unsigned long *zones_size, unsigned long *zholes_size) 4758 unsigned long *zones_size, unsigned long *zholes_size)
4756 { 4759 {
4757 enum zone_type j; 4760 enum zone_type j;
4758 int nid = pgdat->node_id; 4761 int nid = pgdat->node_id;
4759 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4762 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4760 int ret; 4763 int ret;
4761 4764
4762 pgdat_resize_init(pgdat); 4765 pgdat_resize_init(pgdat);
4763 #ifdef CONFIG_NUMA_BALANCING 4766 #ifdef CONFIG_NUMA_BALANCING
4764 spin_lock_init(&pgdat->numabalancing_migrate_lock); 4767 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4765 pgdat->numabalancing_migrate_nr_pages = 0; 4768 pgdat->numabalancing_migrate_nr_pages = 0;
4766 pgdat->numabalancing_migrate_next_window = jiffies; 4769 pgdat->numabalancing_migrate_next_window = jiffies;
4767 #endif 4770 #endif
4768 init_waitqueue_head(&pgdat->kswapd_wait); 4771 init_waitqueue_head(&pgdat->kswapd_wait);
4769 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4772 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4770 pgdat_page_cgroup_init(pgdat); 4773 pgdat_page_cgroup_init(pgdat);
4771 4774
4772 for (j = 0; j < MAX_NR_ZONES; j++) { 4775 for (j = 0; j < MAX_NR_ZONES; j++) {
4773 struct zone *zone = pgdat->node_zones + j; 4776 struct zone *zone = pgdat->node_zones + j;
4774 unsigned long size, realsize, freesize, memmap_pages; 4777 unsigned long size, realsize, freesize, memmap_pages;
4775 4778
4776 size = zone_spanned_pages_in_node(nid, j, node_start_pfn, 4779 size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
4777 node_end_pfn, zones_size); 4780 node_end_pfn, zones_size);
4778 realsize = freesize = size - zone_absent_pages_in_node(nid, j, 4781 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4779 node_start_pfn, 4782 node_start_pfn,
4780 node_end_pfn, 4783 node_end_pfn,
4781 zholes_size); 4784 zholes_size);
4782 4785
4783 /* 4786 /*
4784 * Adjust freesize so that it accounts for how much memory 4787 * Adjust freesize so that it accounts for how much memory
4785 * is used by this zone for memmap. This affects the watermark 4788 * is used by this zone for memmap. This affects the watermark
4786 * and per-cpu initialisations 4789 * and per-cpu initialisations
4787 */ 4790 */
4788 memmap_pages = calc_memmap_size(size, realsize); 4791 memmap_pages = calc_memmap_size(size, realsize);
4789 if (freesize >= memmap_pages) { 4792 if (freesize >= memmap_pages) {
4790 freesize -= memmap_pages; 4793 freesize -= memmap_pages;
4791 if (memmap_pages) 4794 if (memmap_pages)
4792 printk(KERN_DEBUG 4795 printk(KERN_DEBUG
4793 " %s zone: %lu pages used for memmap\n", 4796 " %s zone: %lu pages used for memmap\n",
4794 zone_names[j], memmap_pages); 4797 zone_names[j], memmap_pages);
4795 } else 4798 } else
4796 printk(KERN_WARNING 4799 printk(KERN_WARNING
4797 " %s zone: %lu pages exceeds freesize %lu\n", 4800 " %s zone: %lu pages exceeds freesize %lu\n",
4798 zone_names[j], memmap_pages, freesize); 4801 zone_names[j], memmap_pages, freesize);
4799 4802
4800 /* Account for reserved pages */ 4803 /* Account for reserved pages */
4801 if (j == 0 && freesize > dma_reserve) { 4804 if (j == 0 && freesize > dma_reserve) {
4802 freesize -= dma_reserve; 4805 freesize -= dma_reserve;
4803 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4806 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4804 zone_names[0], dma_reserve); 4807 zone_names[0], dma_reserve);
4805 } 4808 }
4806 4809
4807 if (!is_highmem_idx(j)) 4810 if (!is_highmem_idx(j))
4808 nr_kernel_pages += freesize; 4811 nr_kernel_pages += freesize;
4809 /* Charge for highmem memmap if there are enough kernel pages */ 4812 /* Charge for highmem memmap if there are enough kernel pages */
4810 else if (nr_kernel_pages > memmap_pages * 2) 4813 else if (nr_kernel_pages > memmap_pages * 2)
4811 nr_kernel_pages -= memmap_pages; 4814 nr_kernel_pages -= memmap_pages;
4812 nr_all_pages += freesize; 4815 nr_all_pages += freesize;
4813 4816
4814 zone->spanned_pages = size; 4817 zone->spanned_pages = size;
4815 zone->present_pages = realsize; 4818 zone->present_pages = realsize;
4816 /* 4819 /*
4817 * Set an approximate value for lowmem here, it will be adjusted 4820 * Set an approximate value for lowmem here, it will be adjusted
4818 * when the bootmem allocator frees pages into the buddy system. 4821 * when the bootmem allocator frees pages into the buddy system.
4819 * And all highmem pages will be managed by the buddy system. 4822 * And all highmem pages will be managed by the buddy system.
4820 */ 4823 */
4821 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; 4824 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4822 #ifdef CONFIG_NUMA 4825 #ifdef CONFIG_NUMA
4823 zone->node = nid; 4826 zone->node = nid;
4824 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) 4827 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4825 / 100; 4828 / 100;
4826 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; 4829 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4827 #endif 4830 #endif
4828 zone->name = zone_names[j]; 4831 zone->name = zone_names[j];
4829 spin_lock_init(&zone->lock); 4832 spin_lock_init(&zone->lock);
4830 spin_lock_init(&zone->lru_lock); 4833 spin_lock_init(&zone->lru_lock);
4831 zone_seqlock_init(zone); 4834 zone_seqlock_init(zone);
4832 zone->zone_pgdat = pgdat; 4835 zone->zone_pgdat = pgdat;
4833 zone_pcp_init(zone); 4836 zone_pcp_init(zone);
4834 4837
4835 /* For bootup, initialized properly in watermark setup */ 4838 /* For bootup, initialized properly in watermark setup */
4836 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); 4839 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
4837 4840
4838 lruvec_init(&zone->lruvec); 4841 lruvec_init(&zone->lruvec);
4839 if (!size) 4842 if (!size)
4840 continue; 4843 continue;
4841 4844
4842 set_pageblock_order(); 4845 set_pageblock_order();
4843 setup_usemap(pgdat, zone, zone_start_pfn, size); 4846 setup_usemap(pgdat, zone, zone_start_pfn, size);
4844 ret = init_currently_empty_zone(zone, zone_start_pfn, 4847 ret = init_currently_empty_zone(zone, zone_start_pfn,
4845 size, MEMMAP_EARLY); 4848 size, MEMMAP_EARLY);
4846 BUG_ON(ret); 4849 BUG_ON(ret);
4847 memmap_init(size, nid, j, zone_start_pfn); 4850 memmap_init(size, nid, j, zone_start_pfn);
4848 zone_start_pfn += size; 4851 zone_start_pfn += size;
4849 } 4852 }
4850 } 4853 }
4851 4854
4852 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4855 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4853 { 4856 {
4854 /* Skip empty nodes */ 4857 /* Skip empty nodes */
4855 if (!pgdat->node_spanned_pages) 4858 if (!pgdat->node_spanned_pages)
4856 return; 4859 return;
4857 4860
4858 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4861 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4859 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4862 /* ia64 gets its own node_mem_map, before this, without bootmem */
4860 if (!pgdat->node_mem_map) { 4863 if (!pgdat->node_mem_map) {
4861 unsigned long size, start, end; 4864 unsigned long size, start, end;
4862 struct page *map; 4865 struct page *map;
4863 4866
4864 /* 4867 /*
4865 * The zone's endpoints aren't required to be MAX_ORDER 4868 * The zone's endpoints aren't required to be MAX_ORDER
4866 * aligned, but the node_mem_map endpoints must be, in order 4869 * aligned, but the node_mem_map endpoints must be, in order
4867 * for the buddy allocator to function correctly. 4870 * for the buddy allocator to function correctly.
4868 */ 4871 */
4869 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4872 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4870 end = pgdat_end_pfn(pgdat); 4873 end = pgdat_end_pfn(pgdat);
4871 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4874 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4872 size = (end - start) * sizeof(struct page); 4875 size = (end - start) * sizeof(struct page);
4873 map = alloc_remap(pgdat->node_id, size); 4876 map = alloc_remap(pgdat->node_id, size);
4874 if (!map) 4877 if (!map)
4875 map = alloc_bootmem_node_nopanic(pgdat, size); 4878 map = alloc_bootmem_node_nopanic(pgdat, size);
4876 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4879 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4877 } 4880 }
4878 #ifndef CONFIG_NEED_MULTIPLE_NODES 4881 #ifndef CONFIG_NEED_MULTIPLE_NODES
4879 /* 4882 /*
4880 * With no DISCONTIG, the global mem_map is just set as node 0's 4883 * With no DISCONTIG, the global mem_map is just set as node 0's
4881 */ 4884 */
4882 if (pgdat == NODE_DATA(0)) { 4885 if (pgdat == NODE_DATA(0)) {
4883 mem_map = NODE_DATA(0)->node_mem_map; 4886 mem_map = NODE_DATA(0)->node_mem_map;
4884 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4887 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4885 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4888 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4886 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4889 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4887 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4890 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4888 } 4891 }
4889 #endif 4892 #endif
4890 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4893 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4891 } 4894 }
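/*
 * A small sketch of the MAX_ORDER alignment of the node_mem_map endpoints,
 * assuming MAX_ORDER_NR_PAGES = 1024 and a 64-byte struct page; the node
 * span is invented for the example.
 */
#include <stdio.h>

#define EX_MAX_ORDER_NR_PAGES	1024UL	/* assumed MAX_ORDER = 11 */

int main(void)
{
	unsigned long node_start_pfn = 0x1234, node_end_pfn = 0x5678;
	unsigned long start, end;

	start = node_start_pfn & ~(EX_MAX_ORDER_NR_PAGES - 1);
	end = (node_end_pfn + EX_MAX_ORDER_NR_PAGES - 1) &
	      ~(EX_MAX_ORDER_NR_PAGES - 1);

	/* prints: mem_map covers PFNs [0x1000, 0x5800), 1179648 bytes */
	printf("mem_map covers PFNs [%#lx, %#lx), %lu bytes\n",
	       start, end, (end - start) * 64UL);
	return 0;
}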
4892 4895
4893 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4896 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4894 unsigned long node_start_pfn, unsigned long *zholes_size) 4897 unsigned long node_start_pfn, unsigned long *zholes_size)
4895 { 4898 {
4896 pg_data_t *pgdat = NODE_DATA(nid); 4899 pg_data_t *pgdat = NODE_DATA(nid);
4897 unsigned long start_pfn = 0; 4900 unsigned long start_pfn = 0;
4898 unsigned long end_pfn = 0; 4901 unsigned long end_pfn = 0;
4899 4902
4900 /* pg_data_t should be reset to zero when it's allocated */ 4903 /* pg_data_t should be reset to zero when it's allocated */
4901 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 4904 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4902 4905
4903 pgdat->node_id = nid; 4906 pgdat->node_id = nid;
4904 pgdat->node_start_pfn = node_start_pfn; 4907 pgdat->node_start_pfn = node_start_pfn;
4905 if (node_state(nid, N_MEMORY)) 4908 if (node_state(nid, N_MEMORY))
4906 init_zone_allows_reclaim(nid); 4909 init_zone_allows_reclaim(nid);
4907 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4910 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4908 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 4911 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4909 #endif 4912 #endif
4910 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 4913 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4911 zones_size, zholes_size); 4914 zones_size, zholes_size);
4912 4915
4913 alloc_node_mem_map(pgdat); 4916 alloc_node_mem_map(pgdat);
4914 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4917 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4915 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4918 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4916 nid, (unsigned long)pgdat, 4919 nid, (unsigned long)pgdat,
4917 (unsigned long)pgdat->node_mem_map); 4920 (unsigned long)pgdat->node_mem_map);
4918 #endif 4921 #endif
4919 4922
4920 free_area_init_core(pgdat, start_pfn, end_pfn, 4923 free_area_init_core(pgdat, start_pfn, end_pfn,
4921 zones_size, zholes_size); 4924 zones_size, zholes_size);
4922 } 4925 }
4923 4926
4924 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4927 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4925 4928
4926 #if MAX_NUMNODES > 1 4929 #if MAX_NUMNODES > 1
4927 /* 4930 /*
4928 * Figure out the number of possible node ids. 4931 * Figure out the number of possible node ids.
4929 */ 4932 */
4930 void __init setup_nr_node_ids(void) 4933 void __init setup_nr_node_ids(void)
4931 { 4934 {
4932 unsigned int node; 4935 unsigned int node;
4933 unsigned int highest = 0; 4936 unsigned int highest = 0;
4934 4937
4935 for_each_node_mask(node, node_possible_map) 4938 for_each_node_mask(node, node_possible_map)
4936 highest = node; 4939 highest = node;
4937 nr_node_ids = highest + 1; 4940 nr_node_ids = highest + 1;
4938 } 4941 }
4939 #endif 4942 #endif
4940 4943
4941 /** 4944 /**
4942 * node_map_pfn_alignment - determine the maximum internode alignment 4945 * node_map_pfn_alignment - determine the maximum internode alignment
4943 * 4946 *
4944 * This function should be called after node map is populated and sorted. 4947 * This function should be called after node map is populated and sorted.
4945 * It calculates the maximum power of two alignment which can distinguish 4948 * It calculates the maximum power of two alignment which can distinguish
4946 * all the nodes. 4949 * all the nodes.
4947 * 4950 *
4948 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4951 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4949 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4952 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4950 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 4953 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4951 * shifted, 1GiB is enough and this function will indicate so. 4954 * shifted, 1GiB is enough and this function will indicate so.
4952 * 4955 *
4953 * This is used to test whether pfn -> nid mapping of the chosen memory 4956 * This is used to test whether pfn -> nid mapping of the chosen memory
4954 * model has fine enough granularity to avoid incorrect mapping for the 4957 * model has fine enough granularity to avoid incorrect mapping for the
4955 * populated node map. 4958 * populated node map.
4956 * 4959 *
4957 * Returns the determined alignment in pfn's. 0 if there is no alignment 4960 * Returns the determined alignment in pfn's. 0 if there is no alignment
4958 * requirement (single node). 4961 * requirement (single node).
4959 */ 4962 */
4960 unsigned long __init node_map_pfn_alignment(void) 4963 unsigned long __init node_map_pfn_alignment(void)
4961 { 4964 {
4962 unsigned long accl_mask = 0, last_end = 0; 4965 unsigned long accl_mask = 0, last_end = 0;
4963 unsigned long start, end, mask; 4966 unsigned long start, end, mask;
4964 int last_nid = -1; 4967 int last_nid = -1;
4965 int i, nid; 4968 int i, nid;
4966 4969
4967 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 4970 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
4968 if (!start || last_nid < 0 || last_nid == nid) { 4971 if (!start || last_nid < 0 || last_nid == nid) {
4969 last_nid = nid; 4972 last_nid = nid;
4970 last_end = end; 4973 last_end = end;
4971 continue; 4974 continue;
4972 } 4975 }
4973 4976
4974 /* 4977 /*
4975 * Start with a mask granular enough to pin-point to the 4978 * Start with a mask granular enough to pin-point to the
4976 * start pfn and tick off bits one-by-one until it becomes 4979 * start pfn and tick off bits one-by-one until it becomes
4977 * too coarse to separate the current node from the last. 4980 * too coarse to separate the current node from the last.
4978 */ 4981 */
4979 mask = ~((1 << __ffs(start)) - 1); 4982 mask = ~((1 << __ffs(start)) - 1);
4980 while (mask && last_end <= (start & (mask << 1))) 4983 while (mask && last_end <= (start & (mask << 1)))
4981 mask <<= 1; 4984 mask <<= 1;
4982 4985
4983 /* accumulate all internode masks */ 4986 /* accumulate all internode masks */
4984 accl_mask |= mask; 4987 accl_mask |= mask;
4985 } 4988 }
4986 4989
4987 /* convert mask to number of pages */ 4990 /* convert mask to number of pages */
4988 return ~accl_mask + 1; 4991 return ~accl_mask + 1;
4989 } 4992 }
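/*
 * The 256MiB case from the comment above, replayed as a standalone sketch:
 * two 1 GiB nodes, both shifted up by 256 MiB, with 4 KiB pages. __ffs() is
 * stood in for by the GCC builtin __builtin_ctzl().
 */
#include <stdio.h>

int main(void)
{
	unsigned long starts[] = { 0x10000, 0x50000 };
	unsigned long ends[]   = { 0x50000, 0x90000 };
	int nids[] = { 0, 1 };
	unsigned long accl_mask = 0, last_end = 0;
	int last_nid = -1, i;

	for (i = 0; i < 2; i++) {
		unsigned long start = starts[i], mask;

		if (!start || last_nid < 0 || last_nid == nids[i]) {
			last_nid = nids[i];
			last_end = ends[i];
			continue;
		}
		mask = ~((1UL << __builtin_ctzl(start)) - 1);
		while (mask && last_end <= (start & (mask << 1)))
			mask <<= 1;
		accl_mask |= mask;
	}
	/* prints 0x10000 PFNs, i.e. 256 MiB */
	printf("alignment = %#lx PFNs\n", ~accl_mask + 1);
	return 0;
}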
4990 4993
4991 /* Find the lowest pfn for a node */ 4994 /* Find the lowest pfn for a node */
4992 static unsigned long __init find_min_pfn_for_node(int nid) 4995 static unsigned long __init find_min_pfn_for_node(int nid)
4993 { 4996 {
4994 unsigned long min_pfn = ULONG_MAX; 4997 unsigned long min_pfn = ULONG_MAX;
4995 unsigned long start_pfn; 4998 unsigned long start_pfn;
4996 int i; 4999 int i;
4997 5000
4998 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 5001 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
4999 min_pfn = min(min_pfn, start_pfn); 5002 min_pfn = min(min_pfn, start_pfn);
5000 5003
5001 if (min_pfn == ULONG_MAX) { 5004 if (min_pfn == ULONG_MAX) {
5002 printk(KERN_WARNING 5005 printk(KERN_WARNING
5003 "Could not find start_pfn for node %d\n", nid); 5006 "Could not find start_pfn for node %d\n", nid);
5004 return 0; 5007 return 0;
5005 } 5008 }
5006 5009
5007 return min_pfn; 5010 return min_pfn;
5008 } 5011 }
5009 5012
5010 /** 5013 /**
5011 * find_min_pfn_with_active_regions - Find the minimum PFN registered 5014 * find_min_pfn_with_active_regions - Find the minimum PFN registered
5012 * 5015 *
5013 * It returns the minimum PFN based on information provided via 5016 * It returns the minimum PFN based on information provided via
5014 * add_active_range(). 5017 * add_active_range().
5015 */ 5018 */
5016 unsigned long __init find_min_pfn_with_active_regions(void) 5019 unsigned long __init find_min_pfn_with_active_regions(void)
5017 { 5020 {
5018 return find_min_pfn_for_node(MAX_NUMNODES); 5021 return find_min_pfn_for_node(MAX_NUMNODES);
5019 } 5022 }
5020 5023
5021 /* 5024 /*
5022 * early_calculate_totalpages() 5025 * early_calculate_totalpages()
5023 * Sum pages in active regions for movable zone. 5026 * Sum pages in active regions for movable zone.
5024 * Populate N_MEMORY for calculating usable_nodes. 5027 * Populate N_MEMORY for calculating usable_nodes.
5025 */ 5028 */
5026 static unsigned long __init early_calculate_totalpages(void) 5029 static unsigned long __init early_calculate_totalpages(void)
5027 { 5030 {
5028 unsigned long totalpages = 0; 5031 unsigned long totalpages = 0;
5029 unsigned long start_pfn, end_pfn; 5032 unsigned long start_pfn, end_pfn;
5030 int i, nid; 5033 int i, nid;
5031 5034
5032 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 5035 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
5033 unsigned long pages = end_pfn - start_pfn; 5036 unsigned long pages = end_pfn - start_pfn;
5034 5037
5035 totalpages += pages; 5038 totalpages += pages;
5036 if (pages) 5039 if (pages)
5037 node_set_state(nid, N_MEMORY); 5040 node_set_state(nid, N_MEMORY);
5038 } 5041 }
5039 return totalpages; 5042 return totalpages;
5040 } 5043 }
5041 5044
5042 /* 5045 /*
5043 * Find the PFN the Movable zone begins in each node. Kernel memory 5046 * Find the PFN the Movable zone begins in each node. Kernel memory
5044 * is spread evenly between nodes as long as the nodes have enough 5047 * is spread evenly between nodes as long as the nodes have enough
5045 * memory. When they don't, some nodes will have more kernelcore than 5048 * memory. When they don't, some nodes will have more kernelcore than
5046 * others 5049 * others
5047 */ 5050 */
5048 static void __init find_zone_movable_pfns_for_nodes(void) 5051 static void __init find_zone_movable_pfns_for_nodes(void)
5049 { 5052 {
5050 int i, nid; 5053 int i, nid;
5051 unsigned long usable_startpfn; 5054 unsigned long usable_startpfn;
5052 unsigned long kernelcore_node, kernelcore_remaining; 5055 unsigned long kernelcore_node, kernelcore_remaining;
5053 /* save the state before borrow the nodemask */ 5056 /* save the state before borrow the nodemask */
5054 nodemask_t saved_node_state = node_states[N_MEMORY]; 5057 nodemask_t saved_node_state = node_states[N_MEMORY];
5055 unsigned long totalpages = early_calculate_totalpages(); 5058 unsigned long totalpages = early_calculate_totalpages();
5056 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5059 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
5057 5060
5058 /* 5061 /*
5059 * If movablecore was specified, calculate the corresponding size 5062 * If movablecore was specified, calculate the corresponding size
5060 * of kernelcore so that memory usable for 5063 * of kernelcore so that memory usable for
5061 * any allocation type is evenly spread. If both kernelcore 5064 * any allocation type is evenly spread. If both kernelcore
5062 * and movablecore are specified, then the value of kernelcore 5065 * and movablecore are specified, then the value of kernelcore
5063 * will be used for required_kernelcore if it's greater than 5066 * will be used for required_kernelcore if it's greater than
5064 * what movablecore would have allowed. 5067 * what movablecore would have allowed.
5065 */ 5068 */
5066 if (required_movablecore) { 5069 if (required_movablecore) {
5067 unsigned long corepages; 5070 unsigned long corepages;
5068 5071
5069 /* 5072 /*
5070 * Round-up so that ZONE_MOVABLE is at least as large as what 5073 * Round-up so that ZONE_MOVABLE is at least as large as what
5071 * was requested by the user 5074 * was requested by the user
5072 */ 5075 */
5073 required_movablecore = 5076 required_movablecore =
5074 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 5077 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
5075 corepages = totalpages - required_movablecore; 5078 corepages = totalpages - required_movablecore;
5076 5079
5077 required_kernelcore = max(required_kernelcore, corepages); 5080 required_kernelcore = max(required_kernelcore, corepages);
5078 } 5081 }
5079 5082
5080 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 5083 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
5081 if (!required_kernelcore) 5084 if (!required_kernelcore)
5082 goto out; 5085 goto out;
5083 5086
5084 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5087 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
5085 find_usable_zone_for_movable(); 5088 find_usable_zone_for_movable();
5086 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5089 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
5087 5090
5088 restart: 5091 restart:
5089 /* Spread kernelcore memory as evenly as possible throughout nodes */ 5092 /* Spread kernelcore memory as evenly as possible throughout nodes */
5090 kernelcore_node = required_kernelcore / usable_nodes; 5093 kernelcore_node = required_kernelcore / usable_nodes;
5091 for_each_node_state(nid, N_MEMORY) { 5094 for_each_node_state(nid, N_MEMORY) {
5092 unsigned long start_pfn, end_pfn; 5095 unsigned long start_pfn, end_pfn;
5093 5096
5094 /* 5097 /*
5095 * Recalculate kernelcore_node if the division per node 5098 * Recalculate kernelcore_node if the division per node
5096 * now exceeds what is necessary to satisfy the requested 5099 * now exceeds what is necessary to satisfy the requested
5097 * amount of memory for the kernel 5100 * amount of memory for the kernel
5098 */ 5101 */
5099 if (required_kernelcore < kernelcore_node) 5102 if (required_kernelcore < kernelcore_node)
5100 kernelcore_node = required_kernelcore / usable_nodes; 5103 kernelcore_node = required_kernelcore / usable_nodes;
5101 5104
5102 /* 5105 /*
5103 * As the map is walked, we track how much memory is usable 5106 * As the map is walked, we track how much memory is usable
5104 * by the kernel using kernelcore_remaining. When it is 5107 * by the kernel using kernelcore_remaining. When it is
5105 * 0, the rest of the node is usable by ZONE_MOVABLE 5108 * 0, the rest of the node is usable by ZONE_MOVABLE
5106 */ 5109 */
5107 kernelcore_remaining = kernelcore_node; 5110 kernelcore_remaining = kernelcore_node;
5108 5111
5109 /* Go through each range of PFNs within this node */ 5112 /* Go through each range of PFNs within this node */
5110 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 5113 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
5111 unsigned long size_pages; 5114 unsigned long size_pages;
5112 5115
5113 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 5116 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
5114 if (start_pfn >= end_pfn) 5117 if (start_pfn >= end_pfn)
5115 continue; 5118 continue;
5116 5119
5117 /* Account for what is only usable for kernelcore */ 5120 /* Account for what is only usable for kernelcore */
5118 if (start_pfn < usable_startpfn) { 5121 if (start_pfn < usable_startpfn) {
5119 unsigned long kernel_pages; 5122 unsigned long kernel_pages;
5120 kernel_pages = min(end_pfn, usable_startpfn) 5123 kernel_pages = min(end_pfn, usable_startpfn)
5121 - start_pfn; 5124 - start_pfn;
5122 5125
5123 kernelcore_remaining -= min(kernel_pages, 5126 kernelcore_remaining -= min(kernel_pages,
5124 kernelcore_remaining); 5127 kernelcore_remaining);
5125 required_kernelcore -= min(kernel_pages, 5128 required_kernelcore -= min(kernel_pages,
5126 required_kernelcore); 5129 required_kernelcore);
5127 5130
5128 /* Continue if range is now fully accounted */ 5131 /* Continue if range is now fully accounted */
5129 if (end_pfn <= usable_startpfn) { 5132 if (end_pfn <= usable_startpfn) {
5130 5133
5131 /* 5134 /*
5132 * Push zone_movable_pfn to the end so 5135 * Push zone_movable_pfn to the end so
5133 * that if we have to rebalance 5136 * that if we have to rebalance
5134 * kernelcore across nodes, we will 5137 * kernelcore across nodes, we will
5135 * not double account here 5138 * not double account here
5136 */ 5139 */
5137 zone_movable_pfn[nid] = end_pfn; 5140 zone_movable_pfn[nid] = end_pfn;
5138 continue; 5141 continue;
5139 } 5142 }
5140 start_pfn = usable_startpfn; 5143 start_pfn = usable_startpfn;
5141 } 5144 }
5142 5145
5143 /* 5146 /*
5144 * The usable PFN range for ZONE_MOVABLE is from 5147 * The usable PFN range for ZONE_MOVABLE is from
5145 * start_pfn->end_pfn. Calculate size_pages as the 5148 * start_pfn->end_pfn. Calculate size_pages as the
5146 * number of pages used as kernelcore 5149 * number of pages used as kernelcore
5147 */ 5150 */
5148 size_pages = end_pfn - start_pfn; 5151 size_pages = end_pfn - start_pfn;
5149 if (size_pages > kernelcore_remaining) 5152 if (size_pages > kernelcore_remaining)
5150 size_pages = kernelcore_remaining; 5153 size_pages = kernelcore_remaining;
5151 zone_movable_pfn[nid] = start_pfn + size_pages; 5154 zone_movable_pfn[nid] = start_pfn + size_pages;
5152 5155
5153 /* 5156 /*
5154 * Some kernelcore has been met, update counts and 5157 * Some kernelcore has been met, update counts and
5155 * break if the kernelcore for this node has been 5158 * break if the kernelcore for this node has been
5156 * satisfied 5159 * satisfied
5157 */ 5160 */
5158 required_kernelcore -= min(required_kernelcore, 5161 required_kernelcore -= min(required_kernelcore,
5159 size_pages); 5162 size_pages);
5160 kernelcore_remaining -= size_pages; 5163 kernelcore_remaining -= size_pages;
5161 if (!kernelcore_remaining) 5164 if (!kernelcore_remaining)
5162 break; 5165 break;
5163 } 5166 }
5164 } 5167 }
5165 5168
5166 /* 5169 /*
5167 * If there is still required_kernelcore, we do another pass with one 5170 * If there is still required_kernelcore, we do another pass with one
5168 * less node in the count. This will push zone_movable_pfn[nid] further 5171 * less node in the count. This will push zone_movable_pfn[nid] further
5169 * along on the nodes that still have memory until kernelcore is 5172 * along on the nodes that still have memory until kernelcore is
5170 * satisfied 5173 * satisfied
5171 */ 5174 */
5172 usable_nodes--; 5175 usable_nodes--;
5173 if (usable_nodes && required_kernelcore > usable_nodes) 5176 if (usable_nodes && required_kernelcore > usable_nodes)
5174 goto restart; 5177 goto restart;
5175 5178
5176 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5179 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5177 for (nid = 0; nid < MAX_NUMNODES; nid++) 5180 for (nid = 0; nid < MAX_NUMNODES; nid++)
5178 zone_movable_pfn[nid] = 5181 zone_movable_pfn[nid] =
5179 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 5182 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
5180 5183
5181 out: 5184 out:
5182 /* restore the node_state */ 5185 /* restore the node_state */
5183 node_states[N_MEMORY] = saved_node_state; 5186 node_states[N_MEMORY] = saved_node_state;
5184 } 5187 }
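/*
 * A drastically simplified sketch of the even spread above: two identical
 * nodes and no holes, so each node keeps required_kernelcore / usable_nodes
 * pages and ZONE_MOVABLE starts right after them. The real code also handles
 * lowmem-only ranges, rebalancing passes and MAX_ORDER rounding. All numbers
 * here are invented.
 */
#include <stdio.h>

int main(void)
{
	unsigned long node_start[] = { 0, 0x100000 };
	unsigned long required_kernelcore = 0x180000;	/* kernelcore= in pages */
	int usable_nodes = 2, nid;
	unsigned long kernelcore_node = required_kernelcore / usable_nodes;

	for (nid = 0; nid < usable_nodes; nid++) {
		unsigned long zone_movable_pfn = node_start[nid] + kernelcore_node;

		/* prints 0xc0000 for node 0 and 0x1c0000 for node 1 */
		printf("node %d: ZONE_MOVABLE starts at PFN %#lx\n",
		       nid, zone_movable_pfn);
	}
	return 0;
}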
5185 5188
5186 /* Any regular or high memory on that node? */ 5189 /* Any regular or high memory on that node? */
5187 static void check_for_memory(pg_data_t *pgdat, int nid) 5190 static void check_for_memory(pg_data_t *pgdat, int nid)
5188 { 5191 {
5189 enum zone_type zone_type; 5192 enum zone_type zone_type;
5190 5193
5191 if (N_MEMORY == N_NORMAL_MEMORY) 5194 if (N_MEMORY == N_NORMAL_MEMORY)
5192 return; 5195 return;
5193 5196
5194 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 5197 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
5195 struct zone *zone = &pgdat->node_zones[zone_type]; 5198 struct zone *zone = &pgdat->node_zones[zone_type];
5196 if (zone->present_pages) { 5199 if (zone->present_pages) {
5197 node_set_state(nid, N_HIGH_MEMORY); 5200 node_set_state(nid, N_HIGH_MEMORY);
5198 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 5201 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
5199 zone_type <= ZONE_NORMAL) 5202 zone_type <= ZONE_NORMAL)
5200 node_set_state(nid, N_NORMAL_MEMORY); 5203 node_set_state(nid, N_NORMAL_MEMORY);
5201 break; 5204 break;
5202 } 5205 }
5203 } 5206 }
5204 } 5207 }
5205 5208
5206 /** 5209 /**
5207 * free_area_init_nodes - Initialise all pg_data_t and zone data 5210 * free_area_init_nodes - Initialise all pg_data_t and zone data
5208 * @max_zone_pfn: an array of max PFNs for each zone 5211 * @max_zone_pfn: an array of max PFNs for each zone
5209 * 5212 *
5210 * This will call free_area_init_node() for each active node in the system. 5213 * This will call free_area_init_node() for each active node in the system.
5211 * Using the page ranges provided by add_active_range(), the size of each 5214 * Using the page ranges provided by add_active_range(), the size of each
5212 * zone in each node and their holes is calculated. If the maximum PFN 5215 * zone in each node and their holes is calculated. If the maximum PFN
5213 * between two adjacent zones match, it is assumed that the zone is empty. 5216 * between two adjacent zones match, it is assumed that the zone is empty.
5214 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 5217 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
5215 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 5218 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
5216 * starts where the previous one ended. For example, ZONE_DMA32 starts 5219 * starts where the previous one ended. For example, ZONE_DMA32 starts
5217 * at arch_max_dma_pfn. 5220 * at arch_max_dma_pfn.
5218 */ 5221 */
5219 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 5222 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5220 { 5223 {
5221 unsigned long start_pfn, end_pfn; 5224 unsigned long start_pfn, end_pfn;
5222 int i, nid; 5225 int i, nid;
5223 5226
5224 /* Record where the zone boundaries are */ 5227 /* Record where the zone boundaries are */
5225 memset(arch_zone_lowest_possible_pfn, 0, 5228 memset(arch_zone_lowest_possible_pfn, 0,
5226 sizeof(arch_zone_lowest_possible_pfn)); 5229 sizeof(arch_zone_lowest_possible_pfn));
5227 memset(arch_zone_highest_possible_pfn, 0, 5230 memset(arch_zone_highest_possible_pfn, 0,
5228 sizeof(arch_zone_highest_possible_pfn)); 5231 sizeof(arch_zone_highest_possible_pfn));
5229 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 5232 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
5230 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 5233 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
5231 for (i = 1; i < MAX_NR_ZONES; i++) { 5234 for (i = 1; i < MAX_NR_ZONES; i++) {
5232 if (i == ZONE_MOVABLE) 5235 if (i == ZONE_MOVABLE)
5233 continue; 5236 continue;
5234 arch_zone_lowest_possible_pfn[i] = 5237 arch_zone_lowest_possible_pfn[i] =
5235 arch_zone_highest_possible_pfn[i-1]; 5238 arch_zone_highest_possible_pfn[i-1];
5236 arch_zone_highest_possible_pfn[i] = 5239 arch_zone_highest_possible_pfn[i] =
5237 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 5240 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
5238 } 5241 }
5239 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 5242 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
5240 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 5243 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
5241 5244
5242 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5245 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
5243 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5246 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
5244 find_zone_movable_pfns_for_nodes(); 5247 find_zone_movable_pfns_for_nodes();
5245 5248
5246 /* Print out the zone ranges */ 5249 /* Print out the zone ranges */
5247 printk("Zone ranges:\n"); 5250 printk("Zone ranges:\n");
5248 for (i = 0; i < MAX_NR_ZONES; i++) { 5251 for (i = 0; i < MAX_NR_ZONES; i++) {
5249 if (i == ZONE_MOVABLE) 5252 if (i == ZONE_MOVABLE)
5250 continue; 5253 continue;
5251 printk(KERN_CONT " %-8s ", zone_names[i]); 5254 printk(KERN_CONT " %-8s ", zone_names[i]);
5252 if (arch_zone_lowest_possible_pfn[i] == 5255 if (arch_zone_lowest_possible_pfn[i] ==
5253 arch_zone_highest_possible_pfn[i]) 5256 arch_zone_highest_possible_pfn[i])
5254 printk(KERN_CONT "empty\n"); 5257 printk(KERN_CONT "empty\n");
5255 else 5258 else
5256 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5259 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
5257 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5260 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5258 (arch_zone_highest_possible_pfn[i] 5261 (arch_zone_highest_possible_pfn[i]
5259 << PAGE_SHIFT) - 1); 5262 << PAGE_SHIFT) - 1);
5260 } 5263 }
5261 5264
5262 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5265 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5263 printk("Movable zone start for each node\n"); 5266 printk("Movable zone start for each node\n");
5264 for (i = 0; i < MAX_NUMNODES; i++) { 5267 for (i = 0; i < MAX_NUMNODES; i++) {
5265 if (zone_movable_pfn[i]) 5268 if (zone_movable_pfn[i])
5266 printk(" Node %d: %#010lx\n", i, 5269 printk(" Node %d: %#010lx\n", i,
5267 zone_movable_pfn[i] << PAGE_SHIFT); 5270 zone_movable_pfn[i] << PAGE_SHIFT);
5268 } 5271 }
5269 5272
5270 /* Print out the early node map */ 5273 /* Print out the early node map */
5271 printk("Early memory node ranges\n"); 5274 printk("Early memory node ranges\n");
5272 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5275 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5273 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5276 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5274 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5277 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5275 5278
5276 /* Initialise every node */ 5279 /* Initialise every node */
5277 mminit_verify_pageflags_layout(); 5280 mminit_verify_pageflags_layout();
5278 setup_nr_node_ids(); 5281 setup_nr_node_ids();
5279 for_each_online_node(nid) { 5282 for_each_online_node(nid) {
5280 pg_data_t *pgdat = NODE_DATA(nid); 5283 pg_data_t *pgdat = NODE_DATA(nid);
5281 free_area_init_node(nid, NULL, 5284 free_area_init_node(nid, NULL,
5282 find_min_pfn_for_node(nid), NULL); 5285 find_min_pfn_for_node(nid), NULL);
5283 5286
5284 /* Any memory on that node */ 5287 /* Any memory on that node */
5285 if (pgdat->node_present_pages) 5288 if (pgdat->node_present_pages)
5286 node_set_state(nid, N_MEMORY); 5289 node_set_state(nid, N_MEMORY);
5287 check_for_memory(pgdat, nid); 5290 check_for_memory(pgdat, nid);
5288 } 5291 }
5289 } 5292 }
5290 5293
5291 static int __init cmdline_parse_core(char *p, unsigned long *core) 5294 static int __init cmdline_parse_core(char *p, unsigned long *core)
5292 { 5295 {
5293 unsigned long long coremem; 5296 unsigned long long coremem;
5294 if (!p) 5297 if (!p)
5295 return -EINVAL; 5298 return -EINVAL;
5296 5299
5297 coremem = memparse(p, &p); 5300 coremem = memparse(p, &p);
5298 *core = coremem >> PAGE_SHIFT; 5301 *core = coremem >> PAGE_SHIFT;
5299 5302
5300 /* Paranoid check that UL is enough for the coremem value */ 5303 /* Paranoid check that UL is enough for the coremem value */
5301 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 5304 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
5302 5305
5303 return 0; 5306 return 0;
5304 } 5307 }
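/*
 * memparse() is a kernel helper; as a rough userspace stand-in, the sketch
 * below parses a size with an optional K/M/G suffix and converts it to pages
 * the same way cmdline_parse_core() does, assuming 4 KiB pages.
 */
#include <stdio.h>
#include <stdlib.h>

#define EX_PAGE_SHIFT 12	/* assumed 4 KiB pages */

static unsigned long long parse_size(const char *p)
{
	char *end;
	unsigned long long v = strtoull(p, &end, 0);

	switch (*end) {
	case 'G': case 'g': v <<= 10;	/* fall through */
	case 'M': case 'm': v <<= 10;	/* fall through */
	case 'K': case 'k': v <<= 10;
	}
	return v;
}

int main(void)
{
	unsigned long long coremem = parse_size("512M");

	/* kernelcore=512M -> 131072 pages with 4 KiB pages */
	printf("kernelcore = %llu pages\n", coremem >> EX_PAGE_SHIFT);
	return 0;
}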
5305 5308
5306 /* 5309 /*
5307 * kernelcore=size sets the amount of memory for use for allocations that 5310 * kernelcore=size sets the amount of memory for use for allocations that
5308 * cannot be reclaimed or migrated. 5311 * cannot be reclaimed or migrated.
5309 */ 5312 */
5310 static int __init cmdline_parse_kernelcore(char *p) 5313 static int __init cmdline_parse_kernelcore(char *p)
5311 { 5314 {
5312 return cmdline_parse_core(p, &required_kernelcore); 5315 return cmdline_parse_core(p, &required_kernelcore);
5313 } 5316 }
5314 5317
5315 /* 5318 /*
5316 * movablecore=size sets the amount of memory for use for allocations that 5319 * movablecore=size sets the amount of memory for use for allocations that
5317 * can be reclaimed or migrated. 5320 * can be reclaimed or migrated.
5318 */ 5321 */
5319 static int __init cmdline_parse_movablecore(char *p) 5322 static int __init cmdline_parse_movablecore(char *p)
5320 { 5323 {
5321 return cmdline_parse_core(p, &required_movablecore); 5324 return cmdline_parse_core(p, &required_movablecore);
5322 } 5325 }
5323 5326
5324 early_param("kernelcore", cmdline_parse_kernelcore); 5327 early_param("kernelcore", cmdline_parse_kernelcore);
5325 early_param("movablecore", cmdline_parse_movablecore); 5328 early_param("movablecore", cmdline_parse_movablecore);
5326 5329
5327 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5330 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5328 5331
5329 void adjust_managed_page_count(struct page *page, long count) 5332 void adjust_managed_page_count(struct page *page, long count)
5330 { 5333 {
5331 spin_lock(&managed_page_count_lock); 5334 spin_lock(&managed_page_count_lock);
5332 page_zone(page)->managed_pages += count; 5335 page_zone(page)->managed_pages += count;
5333 totalram_pages += count; 5336 totalram_pages += count;
5334 #ifdef CONFIG_HIGHMEM 5337 #ifdef CONFIG_HIGHMEM
5335 if (PageHighMem(page)) 5338 if (PageHighMem(page))
5336 totalhigh_pages += count; 5339 totalhigh_pages += count;
5337 #endif 5340 #endif
5338 spin_unlock(&managed_page_count_lock); 5341 spin_unlock(&managed_page_count_lock);
5339 } 5342 }
5340 EXPORT_SYMBOL(adjust_managed_page_count); 5343 EXPORT_SYMBOL(adjust_managed_page_count);
5341 5344
5342 unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 5345 unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
5343 { 5346 {
5344 void *pos; 5347 void *pos;
5345 unsigned long pages = 0; 5348 unsigned long pages = 0;
5346 5349
5347 start = (void *)PAGE_ALIGN((unsigned long)start); 5350 start = (void *)PAGE_ALIGN((unsigned long)start);
5348 end = (void *)((unsigned long)end & PAGE_MASK); 5351 end = (void *)((unsigned long)end & PAGE_MASK);
5349 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 5352 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5350 if ((unsigned int)poison <= 0xFF) 5353 if ((unsigned int)poison <= 0xFF)
5351 memset(pos, poison, PAGE_SIZE); 5354 memset(pos, poison, PAGE_SIZE);
5352 free_reserved_page(virt_to_page(pos)); 5355 free_reserved_page(virt_to_page(pos));
5353 } 5356 }
5354 5357
5355 if (pages && s) 5358 if (pages && s)
5356 pr_info("Freeing %s memory: %ldK (%p - %p)\n", 5359 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
5357 s, pages << (PAGE_SHIFT - 10), start, end); 5360 s, pages << (PAGE_SHIFT - 10), start, end);
5358 5361
5359 return pages; 5362 return pages;
5360 } 5363 }
5361 EXPORT_SYMBOL(free_reserved_area); 5364 EXPORT_SYMBOL(free_reserved_area);
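To make the boundary handling in free_reserved_area() concrete, here is a small userspace sketch (not kernel code) that mirrors its arithmetic: the start address is rounded up to a page boundary, the end is rounded down, and the memset() is skipped for poison values outside 0..0xFF (for example -1). PAGE_SIZE is assumed to be 4096 purely for illustration.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

/* Mirror of the boundary/poison logic in free_reserved_area() above. */
static unsigned long count_reserved_pages(unsigned long start, unsigned long end, int poison)
{
	unsigned long pos, pages = 0;

	start = PAGE_ALIGN(start);	/* a partial first page is not freed */
	end &= PAGE_MASK;		/* a partial last page is not freed  */
	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
		if ((unsigned int)poison <= 0xFF)
			printf("would memset page at %#lx with %#x\n", pos, poison);
		/* the kernel would call free_reserved_page() here */
	}
	return pages;
}

int main(void)
{
	/* Hypothetical [start, end) range that is not page aligned. */
	unsigned long freed = count_reserved_pages(0x100010, 0x104800, -1);

	/* 0x100010 rounds up to 0x101000, 0x104800 rounds down to 0x104000:
	 * three full pages, and poison == -1 skips the memset entirely. */
	printf("pages freed: %lu\n", freed);
	return 0;
}

With the hypothetical range 0x100010..0x104800 only three whole pages fall inside it, and a poison value of -1 disables poisoning.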
5362 5365
5363 #ifdef CONFIG_HIGHMEM 5366 #ifdef CONFIG_HIGHMEM
5364 void free_highmem_page(struct page *page) 5367 void free_highmem_page(struct page *page)
5365 { 5368 {
5366 __free_reserved_page(page); 5369 __free_reserved_page(page);
5367 totalram_pages++; 5370 totalram_pages++;
5368 page_zone(page)->managed_pages++; 5371 page_zone(page)->managed_pages++;
5369 totalhigh_pages++; 5372 totalhigh_pages++;
5370 } 5373 }
5371 #endif 5374 #endif
5372 5375
5373 5376
5374 void __init mem_init_print_info(const char *str) 5377 void __init mem_init_print_info(const char *str)
5375 { 5378 {
5376 unsigned long physpages, codesize, datasize, rosize, bss_size; 5379 unsigned long physpages, codesize, datasize, rosize, bss_size;
5377 unsigned long init_code_size, init_data_size; 5380 unsigned long init_code_size, init_data_size;
5378 5381
5379 physpages = get_num_physpages(); 5382 physpages = get_num_physpages();
5380 codesize = _etext - _stext; 5383 codesize = _etext - _stext;
5381 datasize = _edata - _sdata; 5384 datasize = _edata - _sdata;
5382 rosize = __end_rodata - __start_rodata; 5385 rosize = __end_rodata - __start_rodata;
5383 bss_size = __bss_stop - __bss_start; 5386 bss_size = __bss_stop - __bss_start;
5384 init_data_size = __init_end - __init_begin; 5387 init_data_size = __init_end - __init_begin;
5385 init_code_size = _einittext - _sinittext; 5388 init_code_size = _einittext - _sinittext;
5386 5389
5387 /* 5390 /*
5388 * Detect special cases and adjust section sizes accordingly: 5391 * Detect special cases and adjust section sizes accordingly:
5389 * 1) .init.* may be embedded into .data sections 5392 * 1) .init.* may be embedded into .data sections
5390 * 2) .init.text.* may be out of [__init_begin, __init_end], 5393 * 2) .init.text.* may be out of [__init_begin, __init_end],
5391 * please refer to arch/tile/kernel/vmlinux.lds.S. 5394 * please refer to arch/tile/kernel/vmlinux.lds.S.
5392 * 3) .rodata.* may be embedded into .text or .data sections. 5395 * 3) .rodata.* may be embedded into .text or .data sections.
5393 */ 5396 */
5394 #define adj_init_size(start, end, size, pos, adj) \ 5397 #define adj_init_size(start, end, size, pos, adj) \
5395 do { \ 5398 do { \
5396 if (start <= pos && pos < end && size > adj) \ 5399 if (start <= pos && pos < end && size > adj) \
5397 size -= adj; \ 5400 size -= adj; \
5398 } while (0) 5401 } while (0)
5399 5402
5400 adj_init_size(__init_begin, __init_end, init_data_size, 5403 adj_init_size(__init_begin, __init_end, init_data_size,
5401 _sinittext, init_code_size); 5404 _sinittext, init_code_size);
5402 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 5405 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5403 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 5406 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5404 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 5407 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5405 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 5408 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5406 5409
5407 #undef adj_init_size 5410 #undef adj_init_size
5408 5411
5409 printk("Memory: %luK/%luK available " 5412 printk("Memory: %luK/%luK available "
5410 "(%luK kernel code, %luK rwdata, %luK rodata, " 5413 "(%luK kernel code, %luK rwdata, %luK rodata, "
5411 "%luK init, %luK bss, %luK reserved" 5414 "%luK init, %luK bss, %luK reserved"
5412 #ifdef CONFIG_HIGHMEM 5415 #ifdef CONFIG_HIGHMEM
5413 ", %luK highmem" 5416 ", %luK highmem"
5414 #endif 5417 #endif
5415 "%s%s)\n", 5418 "%s%s)\n",
5416 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), 5419 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5417 codesize >> 10, datasize >> 10, rosize >> 10, 5420 codesize >> 10, datasize >> 10, rosize >> 10,
5418 (init_data_size + init_code_size) >> 10, bss_size >> 10, 5421 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5419 (physpages - totalram_pages) << (PAGE_SHIFT-10), 5422 (physpages - totalram_pages) << (PAGE_SHIFT-10),
5420 #ifdef CONFIG_HIGHMEM 5423 #ifdef CONFIG_HIGHMEM
5421 totalhigh_pages << (PAGE_SHIFT-10), 5424 totalhigh_pages << (PAGE_SHIFT-10),
5422 #endif 5425 #endif
5423 str ? ", " : "", str ? str : ""); 5426 str ? ", " : "", str ? str : "");
5424 } 5427 }
5425 5428
5426 /** 5429 /**
5427 * set_dma_reserve - set the specified number of pages reserved in the first zone 5430 * set_dma_reserve - set the specified number of pages reserved in the first zone
5428 * @new_dma_reserve: The number of pages to mark reserved 5431 * @new_dma_reserve: The number of pages to mark reserved
5429 * 5432 *
5430 * The per-cpu batchsize and zone watermarks are determined by present_pages. 5433 * The per-cpu batchsize and zone watermarks are determined by present_pages.
5431 * In the DMA zone, a significant percentage may be consumed by kernel image 5434 * In the DMA zone, a significant percentage may be consumed by kernel image
5432 * and other unfreeable allocations which can skew the watermarks badly. This 5435 * and other unfreeable allocations which can skew the watermarks badly. This
5433 * function may optionally be used to account for unfreeable pages in the 5436 * function may optionally be used to account for unfreeable pages in the
5434 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 5437 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
5435 * smaller per-cpu batchsize. 5438 * smaller per-cpu batchsize.
5436 */ 5439 */
5437 void __init set_dma_reserve(unsigned long new_dma_reserve) 5440 void __init set_dma_reserve(unsigned long new_dma_reserve)
5438 { 5441 {
5439 dma_reserve = new_dma_reserve; 5442 dma_reserve = new_dma_reserve;
5440 } 5443 }
5441 5444
5442 void __init free_area_init(unsigned long *zones_size) 5445 void __init free_area_init(unsigned long *zones_size)
5443 { 5446 {
5444 free_area_init_node(0, zones_size, 5447 free_area_init_node(0, zones_size,
5445 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 5448 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
5446 } 5449 }
5447 5450
5448 static int page_alloc_cpu_notify(struct notifier_block *self, 5451 static int page_alloc_cpu_notify(struct notifier_block *self,
5449 unsigned long action, void *hcpu) 5452 unsigned long action, void *hcpu)
5450 { 5453 {
5451 int cpu = (unsigned long)hcpu; 5454 int cpu = (unsigned long)hcpu;
5452 5455
5453 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 5456 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
5454 lru_add_drain_cpu(cpu); 5457 lru_add_drain_cpu(cpu);
5455 drain_pages(cpu); 5458 drain_pages(cpu);
5456 5459
5457 /* 5460 /*
5458 * Spill the event counters of the dead processor 5461 * Spill the event counters of the dead processor
5459 * into the current processor's event counters. 5462 * into the current processor's event counters.
5460 * This artificially elevates the count of the current 5463 * This artificially elevates the count of the current
5461 * processor. 5464 * processor.
5462 */ 5465 */
5463 vm_events_fold_cpu(cpu); 5466 vm_events_fold_cpu(cpu);
5464 5467
5465 /* 5468 /*
5466 * Zero the differential counters of the dead processor 5469 * Zero the differential counters of the dead processor
5467 * so that the vm statistics are consistent. 5470 * so that the vm statistics are consistent.
5468 * 5471 *
5469 * This is only okay since the processor is dead and cannot 5472 * This is only okay since the processor is dead and cannot
5470 * race with what we are doing. 5473 * race with what we are doing.
5471 */ 5474 */
5472 cpu_vm_stats_fold(cpu); 5475 cpu_vm_stats_fold(cpu);
5473 } 5476 }
5474 return NOTIFY_OK; 5477 return NOTIFY_OK;
5475 } 5478 }
5476 5479
5477 void __init page_alloc_init(void) 5480 void __init page_alloc_init(void)
5478 { 5481 {
5479 hotcpu_notifier(page_alloc_cpu_notify, 0); 5482 hotcpu_notifier(page_alloc_cpu_notify, 0);
5480 } 5483 }
5481 5484
5482 /* 5485 /*
5483 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 5486 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
5484 * or min_free_kbytes changes. 5487 * or min_free_kbytes changes.
5485 */ 5488 */
5486 static void calculate_totalreserve_pages(void) 5489 static void calculate_totalreserve_pages(void)
5487 { 5490 {
5488 struct pglist_data *pgdat; 5491 struct pglist_data *pgdat;
5489 unsigned long reserve_pages = 0; 5492 unsigned long reserve_pages = 0;
5490 enum zone_type i, j; 5493 enum zone_type i, j;
5491 5494
5492 for_each_online_pgdat(pgdat) { 5495 for_each_online_pgdat(pgdat) {
5493 for (i = 0; i < MAX_NR_ZONES; i++) { 5496 for (i = 0; i < MAX_NR_ZONES; i++) {
5494 struct zone *zone = pgdat->node_zones + i; 5497 struct zone *zone = pgdat->node_zones + i;
5495 unsigned long max = 0; 5498 unsigned long max = 0;
5496 5499
5497 /* Find valid and maximum lowmem_reserve in the zone */ 5500 /* Find valid and maximum lowmem_reserve in the zone */
5498 for (j = i; j < MAX_NR_ZONES; j++) { 5501 for (j = i; j < MAX_NR_ZONES; j++) {
5499 if (zone->lowmem_reserve[j] > max) 5502 if (zone->lowmem_reserve[j] > max)
5500 max = zone->lowmem_reserve[j]; 5503 max = zone->lowmem_reserve[j];
5501 } 5504 }
5502 5505
5503 /* we treat the high watermark as reserved pages. */ 5506 /* we treat the high watermark as reserved pages. */
5504 max += high_wmark_pages(zone); 5507 max += high_wmark_pages(zone);
5505 5508
5506 if (max > zone->managed_pages) 5509 if (max > zone->managed_pages)
5507 max = zone->managed_pages; 5510 max = zone->managed_pages;
5508 reserve_pages += max; 5511 reserve_pages += max;
5509 /* 5512 /*
5510 * Lowmem reserves are not available to 5513 * Lowmem reserves are not available to
5511 * GFP_HIGHUSER page cache allocations and 5514 * GFP_HIGHUSER page cache allocations and
5512 * kswapd tries to balance zones to their high 5515 * kswapd tries to balance zones to their high
5513 * watermark. As a result, neither should be 5516 * watermark. As a result, neither should be
5514 * regarded as dirtyable memory, to prevent a 5517 * regarded as dirtyable memory, to prevent a
5515 * situation where reclaim has to clean pages 5518 * situation where reclaim has to clean pages
5516 * in order to balance the zones. 5519 * in order to balance the zones.
5517 */ 5520 */
5518 zone->dirty_balance_reserve = max; 5521 zone->dirty_balance_reserve = max;
5519 } 5522 }
5520 } 5523 }
5521 dirty_balance_reserve = reserve_pages; 5524 dirty_balance_reserve = reserve_pages;
5522 totalreserve_pages = reserve_pages; 5525 totalreserve_pages = reserve_pages;
5523 } 5526 }
5524 5527
5525 /* 5528 /*
5526 * setup_per_zone_lowmem_reserve - called whenever 5529 * setup_per_zone_lowmem_reserve - called whenever
5527 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 5530 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
5528 * has a correct lowmem reserve value, so that an adequate number of 5531 * has a correct lowmem reserve value, so that an adequate number of
5529 * pages is left in the zone after a successful __alloc_pages(). 5532 * pages is left in the zone after a successful __alloc_pages().
5530 */ 5533 */
5531 static void setup_per_zone_lowmem_reserve(void) 5534 static void setup_per_zone_lowmem_reserve(void)
5532 { 5535 {
5533 struct pglist_data *pgdat; 5536 struct pglist_data *pgdat;
5534 enum zone_type j, idx; 5537 enum zone_type j, idx;
5535 5538
5536 for_each_online_pgdat(pgdat) { 5539 for_each_online_pgdat(pgdat) {
5537 for (j = 0; j < MAX_NR_ZONES; j++) { 5540 for (j = 0; j < MAX_NR_ZONES; j++) {
5538 struct zone *zone = pgdat->node_zones + j; 5541 struct zone *zone = pgdat->node_zones + j;
5539 unsigned long managed_pages = zone->managed_pages; 5542 unsigned long managed_pages = zone->managed_pages;
5540 5543
5541 zone->lowmem_reserve[j] = 0; 5544 zone->lowmem_reserve[j] = 0;
5542 5545
5543 idx = j; 5546 idx = j;
5544 while (idx) { 5547 while (idx) {
5545 struct zone *lower_zone; 5548 struct zone *lower_zone;
5546 5549
5547 idx--; 5550 idx--;
5548 5551
5549 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5552 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5550 sysctl_lowmem_reserve_ratio[idx] = 1; 5553 sysctl_lowmem_reserve_ratio[idx] = 1;
5551 5554
5552 lower_zone = pgdat->node_zones + idx; 5555 lower_zone = pgdat->node_zones + idx;
5553 lower_zone->lowmem_reserve[j] = managed_pages / 5556 lower_zone->lowmem_reserve[j] = managed_pages /
5554 sysctl_lowmem_reserve_ratio[idx]; 5557 sysctl_lowmem_reserve_ratio[idx];
5555 managed_pages += lower_zone->managed_pages; 5558 managed_pages += lower_zone->managed_pages;
5556 } 5559 }
5557 } 5560 }
5558 } 5561 }
5559 5562
5560 /* update totalreserve_pages */ 5563 /* update totalreserve_pages */
5561 calculate_totalreserve_pages(); 5564 calculate_totalreserve_pages();
5562 } 5565 }
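A worked example may help here. The sketch below is standalone userspace C, not kernel code; it runs the same loop over a hypothetical node with three zones and illustrative reserve ratios (256 for DMA, 32 otherwise), showing how each lower zone holds back a slice of the memory that higher-zone allocations could otherwise consume.

#include <stdio.h>

#define NR_ZONES 3

/* Hypothetical per-zone sizes in pages and illustrative reserve ratios;
 * the real values come from the zone layout and sysctl_lowmem_reserve_ratio. */
static const char *zone_name[NR_ZONES] = { "DMA", "Normal", "HighMem" };
static unsigned long managed[NR_ZONES] = { 4000, 200000, 300000 };
static int ratio[NR_ZONES] = { 256, 32, 32 };

static unsigned long lowmem_reserve[NR_ZONES][NR_ZONES];

/* Same walk as setup_per_zone_lowmem_reserve() above. */
static void setup_lowmem_reserve(void)
{
	int j, idx;

	for (j = 0; j < NR_ZONES; j++) {
		unsigned long managed_pages = managed[j];

		lowmem_reserve[j][j] = 0;
		idx = j;
		while (idx) {
			idx--;
			if (ratio[idx] < 1)
				ratio[idx] = 1;
			/* pages zone 'idx' keeps back from allocations that
			 * could otherwise have been satisfied from zone 'j' */
			lowmem_reserve[idx][j] = managed_pages / ratio[idx];
			managed_pages += managed[idx];
		}
	}
}

int main(void)
{
	int i, j;

	setup_lowmem_reserve();
	for (i = 0; i < NR_ZONES; i++) {
		printf("%-8s:", zone_name[i]);
		for (j = 0; j < NR_ZONES; j++)
			printf(" %8lu", lowmem_reserve[i][j]);
		printf("\n");
	}
	return 0;
}

With these numbers the DMA zone reserves 781 pages against Normal allocations and 1953 pages against HighMem allocations, while Normal reserves 9375 pages against HighMem.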
5563 5566
5564 static void __setup_per_zone_wmarks(void) 5567 static void __setup_per_zone_wmarks(void)
5565 { 5568 {
5566 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5569 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5567 unsigned long lowmem_pages = 0; 5570 unsigned long lowmem_pages = 0;
5568 struct zone *zone; 5571 struct zone *zone;
5569 unsigned long flags; 5572 unsigned long flags;
5570 5573
5571 /* Calculate total number of !ZONE_HIGHMEM pages */ 5574 /* Calculate total number of !ZONE_HIGHMEM pages */
5572 for_each_zone(zone) { 5575 for_each_zone(zone) {
5573 if (!is_highmem(zone)) 5576 if (!is_highmem(zone))
5574 lowmem_pages += zone->managed_pages; 5577 lowmem_pages += zone->managed_pages;
5575 } 5578 }
5576 5579
5577 for_each_zone(zone) { 5580 for_each_zone(zone) {
5578 u64 tmp; 5581 u64 tmp;
5579 5582
5580 spin_lock_irqsave(&zone->lock, flags); 5583 spin_lock_irqsave(&zone->lock, flags);
5581 tmp = (u64)pages_min * zone->managed_pages; 5584 tmp = (u64)pages_min * zone->managed_pages;
5582 do_div(tmp, lowmem_pages); 5585 do_div(tmp, lowmem_pages);
5583 if (is_highmem(zone)) { 5586 if (is_highmem(zone)) {
5584 /* 5587 /*
5585 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5588 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5586 * need highmem pages, so cap pages_min to a small 5589 * need highmem pages, so cap pages_min to a small
5587 * value here. 5590 * value here.
5588 * 5591 *
5589 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5592 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5590 * deltas control async page reclaim, and so should 5593 * deltas control async page reclaim, and so should
5591 * not be capped for highmem. 5594 * not be capped for highmem.
5592 */ 5595 */
5593 unsigned long min_pages; 5596 unsigned long min_pages;
5594 5597
5595 min_pages = zone->managed_pages / 1024; 5598 min_pages = zone->managed_pages / 1024;
5596 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 5599 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
5597 zone->watermark[WMARK_MIN] = min_pages; 5600 zone->watermark[WMARK_MIN] = min_pages;
5598 } else { 5601 } else {
5599 /* 5602 /*
5600 * If it's a lowmem zone, reserve a number of pages 5603 * If it's a lowmem zone, reserve a number of pages
5601 * proportionate to the zone's size. 5604 * proportionate to the zone's size.
5602 */ 5605 */
5603 zone->watermark[WMARK_MIN] = tmp; 5606 zone->watermark[WMARK_MIN] = tmp;
5604 } 5607 }
5605 5608
5606 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5609 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5607 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5610 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5608 5611
5609 __mod_zone_page_state(zone, NR_ALLOC_BATCH, 5612 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
5610 high_wmark_pages(zone) - 5613 high_wmark_pages(zone) -
5611 low_wmark_pages(zone) - 5614 low_wmark_pages(zone) -
5612 zone_page_state(zone, NR_ALLOC_BATCH)); 5615 zone_page_state(zone, NR_ALLOC_BATCH));
5613 5616
5614 setup_zone_migrate_reserve(zone); 5617 setup_zone_migrate_reserve(zone);
5615 spin_unlock_irqrestore(&zone->lock, flags); 5618 spin_unlock_irqrestore(&zone->lock, flags);
5616 } 5619 }
5617 5620
5618 /* update totalreserve_pages */ 5621 /* update totalreserve_pages */
5619 calculate_totalreserve_pages(); 5622 calculate_totalreserve_pages();
5620 } 5623 }
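For lowmem zones the arithmetic above splits pages_min across zones in proportion to their size and then derives the low and high marks from the same per-zone share. The standalone sketch below reruns that calculation for two hypothetical zones, assuming 4K pages and min_free_kbytes = 4096; it deliberately omits the highmem clamp and the NR_ALLOC_BATCH update.

#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4K pages for the illustration */

struct zone_demo {
	const char *name;
	unsigned long managed_pages;
	unsigned long wmark_min, wmark_low, wmark_high;
};

int main(void)
{
	/* Hypothetical zone sizes; min_free_kbytes as if autotuned to 4096k. */
	struct zone_demo zones[] = {
		{ "DMA",    4000 },
		{ "Normal", 250000 },
	};
	unsigned long min_free_kbytes = 4096;
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long lowmem_pages = 0;
	unsigned int i;

	for (i = 0; i < 2; i++)
		lowmem_pages += zones[i].managed_pages;

	/* Same proportional split as __setup_per_zone_wmarks() above. */
	for (i = 0; i < 2; i++) {
		unsigned long long tmp;

		tmp = (unsigned long long)pages_min * zones[i].managed_pages;
		tmp /= lowmem_pages;			/* do_div() in the kernel */
		zones[i].wmark_min  = tmp;
		zones[i].wmark_low  = tmp + (tmp >> 2);	/* min + 25% */
		zones[i].wmark_high = tmp + (tmp >> 1);	/* min + 50% */
		printf("%-6s min=%lu low=%lu high=%lu\n", zones[i].name,
		       zones[i].wmark_min, zones[i].wmark_low, zones[i].wmark_high);
	}
	return 0;
}

It prints min=16 low=20 high=24 for the small DMA zone and min=1007 low=1258 high=1510 for the Normal zone.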
5621 5624
5622 /** 5625 /**
5623 * setup_per_zone_wmarks - called when min_free_kbytes changes 5626 * setup_per_zone_wmarks - called when min_free_kbytes changes
5624 * or when memory is hot-{added|removed} 5627 * or when memory is hot-{added|removed}
5625 * 5628 *
5626 * Ensures that the watermark[min,low,high] values for each zone are set 5629 * Ensures that the watermark[min,low,high] values for each zone are set
5627 * correctly with respect to min_free_kbytes. 5630 * correctly with respect to min_free_kbytes.
5628 */ 5631 */
5629 void setup_per_zone_wmarks(void) 5632 void setup_per_zone_wmarks(void)
5630 { 5633 {
5631 mutex_lock(&zonelists_mutex); 5634 mutex_lock(&zonelists_mutex);
5632 __setup_per_zone_wmarks(); 5635 __setup_per_zone_wmarks();
5633 mutex_unlock(&zonelists_mutex); 5636 mutex_unlock(&zonelists_mutex);
5634 } 5637 }
5635 5638
5636 /* 5639 /*
5637 * The inactive anon list should be small enough that the VM never has to 5640 * The inactive anon list should be small enough that the VM never has to
5638 * do too much work, but large enough that each inactive page has a chance 5641 * do too much work, but large enough that each inactive page has a chance
5639 * to be referenced again before it is swapped out. 5642 * to be referenced again before it is swapped out.
5640 * 5643 *
5641 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5644 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5642 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5645 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5643 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5646 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5644 * the anonymous pages are kept on the inactive list. 5647 * the anonymous pages are kept on the inactive list.
5645 * 5648 *
5646 * total target max 5649 * total target max
5647 * memory ratio inactive anon 5650 * memory ratio inactive anon
5648 * ------------------------------------- 5651 * -------------------------------------
5649 * 10MB 1 5MB 5652 * 10MB 1 5MB
5650 * 100MB 1 50MB 5653 * 100MB 1 50MB
5651 * 1GB 3 250MB 5654 * 1GB 3 250MB
5652 * 10GB 10 0.9GB 5655 * 10GB 10 0.9GB
5653 * 100GB 31 3GB 5656 * 100GB 31 3GB
5654 * 1TB 101 10GB 5657 * 1TB 101 10GB
5655 * 10TB 320 32GB 5658 * 10TB 320 32GB
5656 */ 5659 */
5657 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5660 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5658 { 5661 {
5659 unsigned int gb, ratio; 5662 unsigned int gb, ratio;
5660 5663
5661 /* Zone size in gigabytes */ 5664 /* Zone size in gigabytes */
5662 gb = zone->managed_pages >> (30 - PAGE_SHIFT); 5665 gb = zone->managed_pages >> (30 - PAGE_SHIFT);
5663 if (gb) 5666 if (gb)
5664 ratio = int_sqrt(10 * gb); 5667 ratio = int_sqrt(10 * gb);
5665 else 5668 else
5666 ratio = 1; 5669 ratio = 1;
5667 5670
5668 zone->inactive_ratio = ratio; 5671 zone->inactive_ratio = ratio;
5669 } 5672 }
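The table above follows directly from int_sqrt(10 * gb). The standalone check below uses a naive integer square root in place of the kernel's int_sqrt() and assumes 4K pages; the zone sizes are picked to match rows of the table, e.g. a 100GB zone gives int_sqrt(1000) = 31.

#include <stdio.h>

#define PAGE_SHIFT 12	/* 4K pages assumed for the illustration */

/* Simple integer square root standing in for the kernel's int_sqrt(). */
static unsigned long int_sqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	/* Hypothetical zone sizes in GB, matching rows of the table above. */
	unsigned long sizes_gb[] = { 1, 10, 100, 1024, 10240 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes_gb) / sizeof(sizes_gb[0]); i++) {
		unsigned long managed_pages = sizes_gb[i] << (30 - PAGE_SHIFT);
		unsigned long gb = managed_pages >> (30 - PAGE_SHIFT);
		unsigned long ratio = gb ? int_sqrt(10 * gb) : 1;

		printf("%6luGB zone -> inactive_ratio %lu\n", sizes_gb[i], ratio);
	}
	return 0;
}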
5670 5673
5671 static void __meminit setup_per_zone_inactive_ratio(void) 5674 static void __meminit setup_per_zone_inactive_ratio(void)
5672 { 5675 {
5673 struct zone *zone; 5676 struct zone *zone;
5674 5677
5675 for_each_zone(zone) 5678 for_each_zone(zone)
5676 calculate_zone_inactive_ratio(zone); 5679 calculate_zone_inactive_ratio(zone);
5677 } 5680 }
5678 5681
5679 /* 5682 /*
5680 * Initialise min_free_kbytes. 5683 * Initialise min_free_kbytes.
5681 * 5684 *
5682 * For small machines we want it small (128k min). For large machines 5685 * For small machines we want it small (128k min). For large machines
5683 * we want it large (64MB max). But it is not linear, because network 5686 * we want it large (64MB max). But it is not linear, because network
5684 * bandwidth does not increase linearly with machine size. We use 5687 * bandwidth does not increase linearly with machine size. We use
5685 * 5688 *
5686 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5689 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5687 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5690 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5688 * 5691 *
5689 * which yields 5692 * which yields
5690 * 5693 *
5691 * 16MB: 512k 5694 * 16MB: 512k
5692 * 32MB: 724k 5695 * 32MB: 724k
5693 * 64MB: 1024k 5696 * 64MB: 1024k
5694 * 128MB: 1448k 5697 * 128MB: 1448k
5695 * 256MB: 2048k 5698 * 256MB: 2048k
5696 * 512MB: 2896k 5699 * 512MB: 2896k
5697 * 1024MB: 4096k 5700 * 1024MB: 4096k
5698 * 2048MB: 5792k 5701 * 2048MB: 5792k
5699 * 4096MB: 8192k 5702 * 4096MB: 8192k
5700 * 8192MB: 11584k 5703 * 8192MB: 11584k
5701 * 16384MB: 16384k 5704 * 16384MB: 16384k
5702 */ 5705 */
5703 int __meminit init_per_zone_wmark_min(void) 5706 int __meminit init_per_zone_wmark_min(void)
5704 { 5707 {
5705 unsigned long lowmem_kbytes; 5708 unsigned long lowmem_kbytes;
5706 int new_min_free_kbytes; 5709 int new_min_free_kbytes;
5707 5710
5708 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5711 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5709 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5712 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5710 5713
5711 if (new_min_free_kbytes > user_min_free_kbytes) { 5714 if (new_min_free_kbytes > user_min_free_kbytes) {
5712 min_free_kbytes = new_min_free_kbytes; 5715 min_free_kbytes = new_min_free_kbytes;
5713 if (min_free_kbytes < 128) 5716 if (min_free_kbytes < 128)
5714 min_free_kbytes = 128; 5717 min_free_kbytes = 128;
5715 if (min_free_kbytes > 65536) 5718 if (min_free_kbytes > 65536)
5716 min_free_kbytes = 65536; 5719 min_free_kbytes = 65536;
5717 } else { 5720 } else {
5718 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 5721 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
5719 new_min_free_kbytes, user_min_free_kbytes); 5722 new_min_free_kbytes, user_min_free_kbytes);
5720 } 5723 }
5721 setup_per_zone_wmarks(); 5724 setup_per_zone_wmarks();
5722 refresh_zone_stat_thresholds(); 5725 refresh_zone_stat_thresholds();
5723 setup_per_zone_lowmem_reserve(); 5726 setup_per_zone_lowmem_reserve();
5724 setup_per_zone_inactive_ratio(); 5727 setup_per_zone_inactive_ratio();
5725 return 0; 5728 return 0;
5726 } 5729 }
5727 module_init(init_per_zone_wmark_min) 5730 module_init(init_per_zone_wmark_min)
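To check one row of the table against the code above: with roughly 1GB of lowmem, lowmem_kbytes is about 1048576, so new_min_free_kbytes = int_sqrt(1048576 * 16) = int_sqrt(16777216) = 4096k, matching the 1024MB entry. The result is then clamped to the [128k, 65536k] range and only applied when it is larger than any value the user already set via /proc/sys/vm/min_free_kbytes.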
5728 5731
5729 /* 5732 /*
5730 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5733 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5731 * that we can call two helper functions whenever min_free_kbytes 5734 * that we can call two helper functions whenever min_free_kbytes
5732 * changes. 5735 * changes.
5733 */ 5736 */
5734 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5737 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5735 void __user *buffer, size_t *length, loff_t *ppos) 5738 void __user *buffer, size_t *length, loff_t *ppos)
5736 { 5739 {
5737 int rc; 5740 int rc;
5738 5741
5739 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5742 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5740 if (rc) 5743 if (rc)
5741 return rc; 5744 return rc;
5742 5745
5743 if (write) { 5746 if (write) {
5744 user_min_free_kbytes = min_free_kbytes; 5747 user_min_free_kbytes = min_free_kbytes;
5745 setup_per_zone_wmarks(); 5748 setup_per_zone_wmarks();
5746 } 5749 }
5747 return 0; 5750 return 0;
5748 } 5751 }
5749 5752
5750 #ifdef CONFIG_NUMA 5753 #ifdef CONFIG_NUMA
5751 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5754 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5752 void __user *buffer, size_t *length, loff_t *ppos) 5755 void __user *buffer, size_t *length, loff_t *ppos)
5753 { 5756 {
5754 struct zone *zone; 5757 struct zone *zone;
5755 int rc; 5758 int rc;
5756 5759
5757 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5760 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5758 if (rc) 5761 if (rc)
5759 return rc; 5762 return rc;
5760 5763
5761 for_each_zone(zone) 5764 for_each_zone(zone)
5762 zone->min_unmapped_pages = (zone->managed_pages * 5765 zone->min_unmapped_pages = (zone->managed_pages *
5763 sysctl_min_unmapped_ratio) / 100; 5766 sysctl_min_unmapped_ratio) / 100;
5764 return 0; 5767 return 0;
5765 } 5768 }
5766 5769
5767 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5770 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5768 void __user *buffer, size_t *length, loff_t *ppos) 5771 void __user *buffer, size_t *length, loff_t *ppos)
5769 { 5772 {
5770 struct zone *zone; 5773 struct zone *zone;
5771 int rc; 5774 int rc;
5772 5775
5773 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5776 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5774 if (rc) 5777 if (rc)
5775 return rc; 5778 return rc;
5776 5779
5777 for_each_zone(zone) 5780 for_each_zone(zone)
5778 zone->min_slab_pages = (zone->managed_pages * 5781 zone->min_slab_pages = (zone->managed_pages *
5779 sysctl_min_slab_ratio) / 100; 5782 sysctl_min_slab_ratio) / 100;
5780 return 0; 5783 return 0;
5781 } 5784 }
5782 #endif 5785 #endif
5783 5786
5784 /* 5787 /*
5785 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5788 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5786 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5789 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5787 * whenever sysctl_lowmem_reserve_ratio changes. 5790 * whenever sysctl_lowmem_reserve_ratio changes.
5788 * 5791 *
5789 * The reserve ratio has no relation to the minimum watermarks. The 5792 * The reserve ratio has no relation to the minimum watermarks. The
5790 * lowmem reserve ratio is only meaningful as a function of the 5793 * lowmem reserve ratio is only meaningful as a function of the
5791 * boot-time zone sizes. 5794 * boot-time zone sizes.
5792 */ 5795 */
5793 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5796 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5794 void __user *buffer, size_t *length, loff_t *ppos) 5797 void __user *buffer, size_t *length, loff_t *ppos)
5795 { 5798 {
5796 proc_dointvec_minmax(table, write, buffer, length, ppos); 5799 proc_dointvec_minmax(table, write, buffer, length, ppos);
5797 setup_per_zone_lowmem_reserve(); 5800 setup_per_zone_lowmem_reserve();
5798 return 0; 5801 return 0;
5799 } 5802 }
5800 5803
5801 /* 5804 /*
5802 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5805 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5803 * cpu. It is the fraction of total pages in each zone that a hot per-cpu 5806 * cpu. It is the fraction of total pages in each zone that a hot per-cpu
5804 * pagelist can hold before it gets flushed back to the buddy allocator. 5807 * pagelist can hold before it gets flushed back to the buddy allocator.
5805 */ 5808 */
5806 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5809 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5807 void __user *buffer, size_t *length, loff_t *ppos) 5810 void __user *buffer, size_t *length, loff_t *ppos)
5808 { 5811 {
5809 struct zone *zone; 5812 struct zone *zone;
5810 int old_percpu_pagelist_fraction; 5813 int old_percpu_pagelist_fraction;
5811 int ret; 5814 int ret;
5812 5815
5813 mutex_lock(&pcp_batch_high_lock); 5816 mutex_lock(&pcp_batch_high_lock);
5814 old_percpu_pagelist_fraction = percpu_pagelist_fraction; 5817 old_percpu_pagelist_fraction = percpu_pagelist_fraction;
5815 5818
5816 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5819 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5817 if (!write || ret < 0) 5820 if (!write || ret < 0)
5818 goto out; 5821 goto out;
5819 5822
5820 /* Sanity checking to avoid pcp imbalance */ 5823 /* Sanity checking to avoid pcp imbalance */
5821 if (percpu_pagelist_fraction && 5824 if (percpu_pagelist_fraction &&
5822 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { 5825 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
5823 percpu_pagelist_fraction = old_percpu_pagelist_fraction; 5826 percpu_pagelist_fraction = old_percpu_pagelist_fraction;
5824 ret = -EINVAL; 5827 ret = -EINVAL;
5825 goto out; 5828 goto out;
5826 } 5829 }
5827 5830
5828 /* No change? */ 5831 /* No change? */
5829 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) 5832 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
5830 goto out; 5833 goto out;
5831 5834
5832 for_each_populated_zone(zone) { 5835 for_each_populated_zone(zone) {
5833 unsigned int cpu; 5836 unsigned int cpu;
5834 5837
5835 for_each_possible_cpu(cpu) 5838 for_each_possible_cpu(cpu)
5836 pageset_set_high_and_batch(zone, 5839 pageset_set_high_and_batch(zone,
5837 per_cpu_ptr(zone->pageset, cpu)); 5840 per_cpu_ptr(zone->pageset, cpu));
5838 } 5841 }
5839 out: 5842 out:
5840 mutex_unlock(&pcp_batch_high_lock); 5843 mutex_unlock(&pcp_batch_high_lock);
5841 return ret; 5844 return ret;
5842 } 5845 }
5843 5846
5844 int hashdist = HASHDIST_DEFAULT; 5847 int hashdist = HASHDIST_DEFAULT;
5845 5848
5846 #ifdef CONFIG_NUMA 5849 #ifdef CONFIG_NUMA
5847 static int __init set_hashdist(char *str) 5850 static int __init set_hashdist(char *str)
5848 { 5851 {
5849 if (!str) 5852 if (!str)
5850 return 0; 5853 return 0;
5851 hashdist = simple_strtoul(str, &str, 0); 5854 hashdist = simple_strtoul(str, &str, 0);
5852 return 1; 5855 return 1;
5853 } 5856 }
5854 __setup("hashdist=", set_hashdist); 5857 __setup("hashdist=", set_hashdist);
5855 #endif 5858 #endif
5856 5859
5857 /* 5860 /*
5858 * allocate a large system hash table from bootmem 5861 * allocate a large system hash table from bootmem
5859 * - it is assumed that the hash table must contain an exact power-of-2 5862 * - it is assumed that the hash table must contain an exact power-of-2
5860 * quantity of entries 5863 * quantity of entries
5861 * - limit is the number of hash buckets, not the total allocation size 5864 * - limit is the number of hash buckets, not the total allocation size
5862 */ 5865 */
5863 void *__init alloc_large_system_hash(const char *tablename, 5866 void *__init alloc_large_system_hash(const char *tablename,
5864 unsigned long bucketsize, 5867 unsigned long bucketsize,
5865 unsigned long numentries, 5868 unsigned long numentries,
5866 int scale, 5869 int scale,
5867 int flags, 5870 int flags,
5868 unsigned int *_hash_shift, 5871 unsigned int *_hash_shift,
5869 unsigned int *_hash_mask, 5872 unsigned int *_hash_mask,
5870 unsigned long low_limit, 5873 unsigned long low_limit,
5871 unsigned long high_limit) 5874 unsigned long high_limit)
5872 { 5875 {
5873 unsigned long long max = high_limit; 5876 unsigned long long max = high_limit;
5874 unsigned long log2qty, size; 5877 unsigned long log2qty, size;
5875 void *table = NULL; 5878 void *table = NULL;
5876 5879
5877 /* allow the kernel cmdline to have a say */ 5880 /* allow the kernel cmdline to have a say */
5878 if (!numentries) { 5881 if (!numentries) {
5879 /* round applicable memory size up to nearest megabyte */ 5882 /* round applicable memory size up to nearest megabyte */
5880 numentries = nr_kernel_pages; 5883 numentries = nr_kernel_pages;
5881 5884
5882 /* It isn't necessary when PAGE_SIZE >= 1MB */ 5885 /* It isn't necessary when PAGE_SIZE >= 1MB */
5883 if (PAGE_SHIFT < 20) 5886 if (PAGE_SHIFT < 20)
5884 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 5887 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
5885 5888
5886 /* limit to 1 bucket per 2^scale bytes of low memory */ 5889 /* limit to 1 bucket per 2^scale bytes of low memory */
5887 if (scale > PAGE_SHIFT) 5890 if (scale > PAGE_SHIFT)
5888 numentries >>= (scale - PAGE_SHIFT); 5891 numentries >>= (scale - PAGE_SHIFT);
5889 else 5892 else
5890 numentries <<= (PAGE_SHIFT - scale); 5893 numentries <<= (PAGE_SHIFT - scale);
5891 5894
5892 /* Make sure we've got at least a 0-order allocation. */ 5895 /* Make sure we've got at least a 0-order allocation. */
5893 if (unlikely(flags & HASH_SMALL)) { 5896 if (unlikely(flags & HASH_SMALL)) {
5894 /* Makes no sense without HASH_EARLY */ 5897 /* Makes no sense without HASH_EARLY */
5895 WARN_ON(!(flags & HASH_EARLY)); 5898 WARN_ON(!(flags & HASH_EARLY));
5896 if (!(numentries >> *_hash_shift)) { 5899 if (!(numentries >> *_hash_shift)) {
5897 numentries = 1UL << *_hash_shift; 5900 numentries = 1UL << *_hash_shift;
5898 BUG_ON(!numentries); 5901 BUG_ON(!numentries);
5899 } 5902 }
5900 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5903 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5901 numentries = PAGE_SIZE / bucketsize; 5904 numentries = PAGE_SIZE / bucketsize;
5902 } 5905 }
5903 numentries = roundup_pow_of_two(numentries); 5906 numentries = roundup_pow_of_two(numentries);
5904 5907
5905 /* limit allocation size to 1/16 total memory by default */ 5908 /* limit allocation size to 1/16 total memory by default */
5906 if (max == 0) { 5909 if (max == 0) {
5907 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5910 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5908 do_div(max, bucketsize); 5911 do_div(max, bucketsize);
5909 } 5912 }
5910 max = min(max, 0x80000000ULL); 5913 max = min(max, 0x80000000ULL);
5911 5914
5912 if (numentries < low_limit) 5915 if (numentries < low_limit)
5913 numentries = low_limit; 5916 numentries = low_limit;
5914 if (numentries > max) 5917 if (numentries > max)
5915 numentries = max; 5918 numentries = max;
5916 5919
5917 log2qty = ilog2(numentries); 5920 log2qty = ilog2(numentries);
5918 5921
5919 do { 5922 do {
5920 size = bucketsize << log2qty; 5923 size = bucketsize << log2qty;
5921 if (flags & HASH_EARLY) 5924 if (flags & HASH_EARLY)
5922 table = alloc_bootmem_nopanic(size); 5925 table = alloc_bootmem_nopanic(size);
5923 else if (hashdist) 5926 else if (hashdist)
5924 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5927 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5925 else { 5928 else {
5926 /* 5929 /*
5927 * If bucketsize is not a power of two, we may free 5930 * If bucketsize is not a power of two, we may free
5928 * some pages at the end of the hash table, which 5931 * some pages at the end of the hash table, which
5929 * alloc_pages_exact() does automatically. 5932 * alloc_pages_exact() does automatically.
5930 */ 5933 */
5931 if (get_order(size) < MAX_ORDER) { 5934 if (get_order(size) < MAX_ORDER) {
5932 table = alloc_pages_exact(size, GFP_ATOMIC); 5935 table = alloc_pages_exact(size, GFP_ATOMIC);
5933 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5936 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5934 } 5937 }
5935 } 5938 }
5936 } while (!table && size > PAGE_SIZE && --log2qty); 5939 } while (!table && size > PAGE_SIZE && --log2qty);
5937 5940
5938 if (!table) 5941 if (!table)
5939 panic("Failed to allocate %s hash table\n", tablename); 5942 panic("Failed to allocate %s hash table\n", tablename);
5940 5943
5941 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5944 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5942 tablename, 5945 tablename,
5943 (1UL << log2qty), 5946 (1UL << log2qty),
5944 ilog2(size) - PAGE_SHIFT, 5947 ilog2(size) - PAGE_SHIFT,
5945 size); 5948 size);
5946 5949
5947 if (_hash_shift) 5950 if (_hash_shift)
5948 *_hash_shift = log2qty; 5951 *_hash_shift = log2qty;
5949 if (_hash_mask) 5952 if (_hash_mask)
5950 *_hash_mask = (1 << log2qty) - 1; 5953 *_hash_mask = (1 << log2qty) - 1;
5951 5954
5952 return table; 5955 return table;
5953 } 5956 }
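To see how the sizing logic above plays out end to end, the following standalone sketch reruns the arithmetic with made-up inputs (about 8GB worth of kernel pages, 16-byte buckets, scale = 14); the helpers are simplified stand-ins for the kernel's round_up(), roundup_pow_of_two() and ilog2(), and the numbers are purely illustrative.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static unsigned long roundup_pow_of_two(unsigned long x)
{
	unsigned long r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

static unsigned long ilog2(unsigned long x)
{
	unsigned long r = 0;

	while (x >>= 1)
		r++;
	return r;
}

static unsigned long round_up(unsigned long x, unsigned long m)
{
	return ((x + m - 1) / m) * m;
}

int main(void)
{
	/* Hypothetical inputs: ~8GB of kernel pages, 16-byte buckets,
	 * one bucket per 2^14 bytes of low memory (scale = 14). */
	unsigned long nr_kernel_pages = 2000000;
	unsigned long nr_all_pages = 2097152;
	unsigned long bucketsize = 16, scale = 14;
	unsigned long numentries, log2qty, size;
	unsigned long long max;

	numentries = nr_kernel_pages;
	numentries = round_up(numentries, (1 << 20) / PAGE_SIZE);	/* to 1MB */
	numentries >>= (scale - PAGE_SHIFT);	/* 1 bucket per 2^scale bytes */
	numentries = roundup_pow_of_two(numentries);

	/* default cap: 1/16 of memory, expressed in buckets */
	max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
	max /= bucketsize;
	if (numentries > max)
		numentries = max;

	log2qty = ilog2(numentries);
	size = bucketsize << log2qty;
	printf("entries: %lu (%lu bytes, 2^%lu buckets)\n",
	       numentries, size, log2qty);
	return 0;
}

For these inputs it settles on 524288 buckets, i.e. an 8MB table, comfortably below the 1/16-of-memory cap.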
5954 5957
5955 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5958 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5956 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5959 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5957 unsigned long pfn) 5960 unsigned long pfn)
5958 { 5961 {
5959 #ifdef CONFIG_SPARSEMEM 5962 #ifdef CONFIG_SPARSEMEM
5960 return __pfn_to_section(pfn)->pageblock_flags; 5963 return __pfn_to_section(pfn)->pageblock_flags;
5961 #else 5964 #else
5962 return zone->pageblock_flags; 5965 return zone->pageblock_flags;
5963 #endif /* CONFIG_SPARSEMEM */ 5966 #endif /* CONFIG_SPARSEMEM */
5964 } 5967 }
5965 5968
5966 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 5969 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5967 { 5970 {
5968 #ifdef CONFIG_SPARSEMEM 5971 #ifdef CONFIG_SPARSEMEM
5969 pfn &= (PAGES_PER_SECTION-1); 5972 pfn &= (PAGES_PER_SECTION-1);
5970 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5973 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5971 #else 5974 #else
5972 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); 5975 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
5973 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5976 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5974 #endif /* CONFIG_SPARSEMEM */ 5977 #endif /* CONFIG_SPARSEMEM */
5975 } 5978 }
5976 5979
5977 /** 5980 /**
5978 * get_pageblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages 5981 * get_pageblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
5979 * @page: The page within the block of interest 5982 * @page: The page within the block of interest
5980 * @end_bitidx: The last bit of interest 5983 * @end_bitidx: The last bit of interest
5981 * @mask: mask of bits to retrieve 5984 * @mask: mask of bits to retrieve
5982 * returns pageblock_bits flags 5985 * returns pageblock_bits flags
5983 */ 5986 */
5984 unsigned long get_pageblock_flags_mask(struct page *page, 5987 unsigned long get_pageblock_flags_mask(struct page *page,
5985 unsigned long end_bitidx, 5988 unsigned long end_bitidx,
5986 unsigned long mask) 5989 unsigned long mask)
5987 { 5990 {
5988 struct zone *zone; 5991 struct zone *zone;
5989 unsigned long *bitmap; 5992 unsigned long *bitmap;
5990 unsigned long pfn, bitidx, word_bitidx; 5993 unsigned long pfn, bitidx, word_bitidx;
5991 unsigned long word; 5994 unsigned long word;
5992 5995
5993 zone = page_zone(page); 5996 zone = page_zone(page);
5994 pfn = page_to_pfn(page); 5997 pfn = page_to_pfn(page);
5995 bitmap = get_pageblock_bitmap(zone, pfn); 5998 bitmap = get_pageblock_bitmap(zone, pfn);
5996 bitidx = pfn_to_bitidx(zone, pfn); 5999 bitidx = pfn_to_bitidx(zone, pfn);
5997 word_bitidx = bitidx / BITS_PER_LONG; 6000 word_bitidx = bitidx / BITS_PER_LONG;
5998 bitidx &= (BITS_PER_LONG-1); 6001 bitidx &= (BITS_PER_LONG-1);
5999 6002
6000 word = bitmap[word_bitidx]; 6003 word = bitmap[word_bitidx];
6001 bitidx += end_bitidx; 6004 bitidx += end_bitidx;
6002 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; 6005 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
6003 } 6006 }
6004 6007
6005 /** 6008 /**
6006 * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages 6009 * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
6007 * @page: The page within the block of interest 6010 * @page: The page within the block of interest
6008 * @flags: The flags to set 6011 * @flags: The flags to set
6009 * @end_bitidx: The last bit of interest 6012 * @end_bitidx: The last bit of interest
6010 * @mask: mask of bits to set 6013 * @mask: mask of bits to set
6011 */ 6014 */
6012 void set_pageblock_flags_mask(struct page *page, unsigned long flags, 6015 void set_pageblock_flags_mask(struct page *page, unsigned long flags,
6013 unsigned long end_bitidx, 6016 unsigned long end_bitidx,
6014 unsigned long mask) 6017 unsigned long mask)
6015 { 6018 {
6016 struct zone *zone; 6019 struct zone *zone;
6017 unsigned long *bitmap; 6020 unsigned long *bitmap;
6018 unsigned long pfn, bitidx, word_bitidx; 6021 unsigned long pfn, bitidx, word_bitidx;
6019 unsigned long old_word, word; 6022 unsigned long old_word, word;
6020 6023
6021 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 6024 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
6022 6025
6023 zone = page_zone(page); 6026 zone = page_zone(page);
6024 pfn = page_to_pfn(page); 6027 pfn = page_to_pfn(page);
6025 bitmap = get_pageblock_bitmap(zone, pfn); 6028 bitmap = get_pageblock_bitmap(zone, pfn);
6026 bitidx = pfn_to_bitidx(zone, pfn); 6029 bitidx = pfn_to_bitidx(zone, pfn);
6027 word_bitidx = bitidx / BITS_PER_LONG; 6030 word_bitidx = bitidx / BITS_PER_LONG;
6028 bitidx &= (BITS_PER_LONG-1); 6031 bitidx &= (BITS_PER_LONG-1);
6029 6032
6030 VM_BUG_ON(!zone_spans_pfn(zone, pfn)); 6033 VM_BUG_ON(!zone_spans_pfn(zone, pfn));
6031 6034
6032 bitidx += end_bitidx; 6035 bitidx += end_bitidx;
6033 mask <<= (BITS_PER_LONG - bitidx - 1); 6036 mask <<= (BITS_PER_LONG - bitidx - 1);
6034 flags <<= (BITS_PER_LONG - bitidx - 1); 6037 flags <<= (BITS_PER_LONG - bitidx - 1);
6035 6038
6036 word = ACCESS_ONCE(bitmap[word_bitidx]); 6039 word = ACCESS_ONCE(bitmap[word_bitidx]);
6037 for (;;) { 6040 for (;;) {
6038 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 6041 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6039 if (word == old_word) 6042 if (word == old_word)
6040 break; 6043 break;
6041 word = old_word; 6044 word = old_word;
6042 } 6045 }
6043 } 6046 }
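The loop above is the classic lock-free read-modify-write: read the word, attempt a cmpxchg of the masked update, and retry if another CPU changed the word in the meantime. The standalone C11 sketch below demonstrates the same retry pattern; it uses a plain LSB-based field offset for simplicity, whereas the kernel code above anchors the shift at the other end of the word, and the 3-bit field at offset 8 is purely hypothetical.

#include <stdatomic.h>
#include <stdio.h>

/* Same lock-free read-modify-write pattern as set_pageblock_flags_mask()
 * above, expressed with C11 atomics: replace the bits selected by 'mask'
 * with 'flags' without taking a lock, retrying if another thread raced. */
static void set_bits_masked(_Atomic unsigned long *word_p,
			    unsigned long flags, unsigned long mask)
{
	unsigned long word = atomic_load(word_p);

	while (!atomic_compare_exchange_weak(word_p, &word,
					     (word & ~mask) | flags))
		;	/* 'word' was reloaded with the current value; retry */
}

int main(void)
{
	/* One 64-bit word packing several 4-bit pageblock fields (illustrative). */
	_Atomic unsigned long bitmap_word = 0;

	/* Set a hypothetical 3-bit field at bit offset 8 to the value 5. */
	set_bits_masked(&bitmap_word, 5UL << 8, 0x7UL << 8);
	printf("word = %#lx\n", (unsigned long)atomic_load(&bitmap_word));
	return 0;
}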
6044 6047
6045 /* 6048 /*
6046 * This function checks whether the pageblock includes unmovable pages or not. 6049 * This function checks whether the pageblock includes unmovable pages or not.
6047 * If @count is not zero, it is okay to include fewer than @count unmovable pages. 6050 * If @count is not zero, it is okay to include fewer than @count unmovable pages.
6048 * 6051 *
6049 * A PageLRU check without isolation or lru_lock could race, so a 6052 * A PageLRU check without isolation or lru_lock could race, so a
6050 * MIGRATE_MOVABLE block might include unmovable pages. This means you can't 6053 * MIGRATE_MOVABLE block might include unmovable pages. This means you can't
6051 * expect this function to be exact. 6054 * expect this function to be exact.
6052 */ 6055 */
6053 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 6056 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6054 bool skip_hwpoisoned_pages) 6057 bool skip_hwpoisoned_pages)
6055 { 6058 {
6056 unsigned long pfn, iter, found; 6059 unsigned long pfn, iter, found;
6057 int mt; 6060 int mt;
6058 6061
6059 /* 6062 /*
6060 * To avoid noisy results, lru_add_drain_all() should be called first. 6063 * To avoid noisy results, lru_add_drain_all() should be called first.
6061 * A ZONE_MOVABLE zone never contains unmovable pages. 6064 * A ZONE_MOVABLE zone never contains unmovable pages.
6062 */ 6065 */
6063 if (zone_idx(zone) == ZONE_MOVABLE) 6066 if (zone_idx(zone) == ZONE_MOVABLE)
6064 return false; 6067 return false;
6065 mt = get_pageblock_migratetype(page); 6068 mt = get_pageblock_migratetype(page);
6066 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 6069 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
6067 return false; 6070 return false;
6068 6071
6069 pfn = page_to_pfn(page); 6072 pfn = page_to_pfn(page);
6070 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 6073 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
6071 unsigned long check = pfn + iter; 6074 unsigned long check = pfn + iter;
6072 6075
6073 if (!pfn_valid_within(check)) 6076 if (!pfn_valid_within(check))
6074 continue; 6077 continue;
6075 6078
6076 page = pfn_to_page(check); 6079 page = pfn_to_page(check);
6077 6080
6078 /* 6081 /*
6079 * Hugepages are not in LRU lists, but they're movable. 6082 * Hugepages are not in LRU lists, but they're movable.
6080 * We need not scan over tail pages because we don't 6083 * We need not scan over tail pages because we don't
6081 * handle each tail page individually in migration. 6084 * handle each tail page individually in migration.
6082 */ 6085 */
6083 if (PageHuge(page)) { 6086 if (PageHuge(page)) {
6084 iter = round_up(iter + 1, 1<<compound_order(page)) - 1; 6087 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
6085 continue; 6088 continue;
6086 } 6089 }
6087 6090
6088 /* 6091 /*
6089 * We can't use page_count without pinning the page 6092 * We can't use page_count without pinning the page
6090 * because another CPU can free the compound page. 6093 * because another CPU can free the compound page.
6091 * This check already skips compound tails of THP 6094 * This check already skips compound tails of THP
6092 * because their page->_count is zero at all times. 6095 * because their page->_count is zero at all times.
6093 */ 6096 */
6094 if (!atomic_read(&page->_count)) { 6097 if (!atomic_read(&page->_count)) {
6095 if (PageBuddy(page)) 6098 if (PageBuddy(page))
6096 iter += (1 << page_order(page)) - 1; 6099 iter += (1 << page_order(page)) - 1;
6097 continue; 6100 continue;
6098 } 6101 }
6099 6102
6100 /* 6103 /*
6101 * The HWPoisoned page may not be in the buddy system, and 6104 * The HWPoisoned page may not be in the buddy system, and
6102 * its page_count() is not 0. 6105 * its page_count() is not 0.
6103 */ 6106 */
6104 if (skip_hwpoisoned_pages && PageHWPoison(page)) 6107 if (skip_hwpoisoned_pages && PageHWPoison(page))
6105 continue; 6108 continue;
6106 6109
6107 if (!PageLRU(page)) 6110 if (!PageLRU(page))
6108 found++; 6111 found++;
6109 /* 6112 /*
6110 * If there are RECLAIMABLE pages, we need to check them. 6113 * If there are RECLAIMABLE pages, we need to check them.
6111 * But for now, memory offline itself doesn't call shrink_slab() 6114 * But for now, memory offline itself doesn't call shrink_slab()
6112 * and this still needs to be fixed. 6115 * and this still needs to be fixed.
6113 */ 6116 */
6114 /* 6117 /*
6115 * If the page is not RAM, page_count() should be 0. 6118 * If the page is not RAM, page_count() should be 0.
6116 * We don't need further checks. This is a _used_, non-movable page. 6119 * We don't need further checks. This is a _used_, non-movable page.
6117 * 6120 *
6118 * The problematic thing here is PG_reserved pages. PG_reserved 6121 * The problematic thing here is PG_reserved pages. PG_reserved
6119 * is set to both of a memory hole page and a _used_ kernel 6122 * is set to both of a memory hole page and a _used_ kernel
6120 * page at boot. 6123 * page at boot.
6121 */ 6124 */
6122 if (found > count) 6125 if (found > count)
6123 return true; 6126 return true;
6124 } 6127 }
6125 return false; 6128 return false;
6126 } 6129 }
6127 6130
6128 bool is_pageblock_removable_nolock(struct page *page) 6131 bool is_pageblock_removable_nolock(struct page *page)
6129 { 6132 {
6130 struct zone *zone; 6133 struct zone *zone;
6131 unsigned long pfn; 6134 unsigned long pfn;
6132 6135
6133 /* 6136 /*
6134 * We have to be careful here because we are iterating over memory 6137 * We have to be careful here because we are iterating over memory
6135 * sections which are not zone aware so we might end up outside of 6138 * sections which are not zone aware so we might end up outside of
6136 * the zone but still within the section. 6139 * the zone but still within the section.
6137 * We have to take care about the node as well. If the node is offline 6140 * We have to take care about the node as well. If the node is offline
6138 * its NODE_DATA will be NULL - see page_zone. 6141 * its NODE_DATA will be NULL - see page_zone.
6139 */ 6142 */
6140 if (!node_online(page_to_nid(page))) 6143 if (!node_online(page_to_nid(page)))
6141 return false; 6144 return false;
6142 6145
6143 zone = page_zone(page); 6146 zone = page_zone(page);
6144 pfn = page_to_pfn(page); 6147 pfn = page_to_pfn(page);
6145 if (!zone_spans_pfn(zone, pfn)) 6148 if (!zone_spans_pfn(zone, pfn))
6146 return false; 6149 return false;
6147 6150
6148 return !has_unmovable_pages(zone, page, 0, true); 6151 return !has_unmovable_pages(zone, page, 0, true);
6149 } 6152 }
6150 6153
6151 #ifdef CONFIG_CMA 6154 #ifdef CONFIG_CMA
6152 6155
6153 static unsigned long pfn_max_align_down(unsigned long pfn) 6156 static unsigned long pfn_max_align_down(unsigned long pfn)
6154 { 6157 {
6155 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 6158 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
6156 pageblock_nr_pages) - 1); 6159 pageblock_nr_pages) - 1);
6157 } 6160 }
6158 6161
6159 static unsigned long pfn_max_align_up(unsigned long pfn) 6162 static unsigned long pfn_max_align_up(unsigned long pfn)
6160 { 6163 {
6161 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 6164 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
6162 pageblock_nr_pages)); 6165 pageblock_nr_pages));
6163 } 6166 }
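On a common x86-64 configuration MAX_ORDER_NR_PAGES is 1024 and pageblock_nr_pages is 512, so the larger value, 1024 pfns, wins; treat both numbers as illustrative since they depend on the kernel configuration. A minimal sketch of what the two helpers return for an arbitrary pfn:

#include <stdio.h>

int main(void)
{
	/* Illustrative alignment: max(MAX_ORDER_NR_PAGES, pageblock_nr_pages),
	 * taken here as 1024 pfns as on a typical x86-64 build. */
	unsigned long max_align = 1024;
	unsigned long pfn = 0x12345;

	unsigned long down = pfn & ~(max_align - 1);			/* pfn_max_align_down() */
	unsigned long up = (pfn + max_align - 1) & ~(max_align - 1);	/* pfn_max_align_up()   */

	printf("pfn %#lx -> [%#lx, %#lx)\n", pfn, down, up);	/* 0x12000, 0x12400 */
	return 0;
}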
6164 6167
6165 /* [start, end) must belong to a single zone. */ 6168 /* [start, end) must belong to a single zone. */
6166 static int __alloc_contig_migrate_range(struct compact_control *cc, 6169 static int __alloc_contig_migrate_range(struct compact_control *cc,
6167 unsigned long start, unsigned long end) 6170 unsigned long start, unsigned long end)
6168 { 6171 {
6169 /* This function is based on compact_zone() from compaction.c. */ 6172 /* This function is based on compact_zone() from compaction.c. */
6170 unsigned long nr_reclaimed; 6173 unsigned long nr_reclaimed;
6171 unsigned long pfn = start; 6174 unsigned long pfn = start;
6172 unsigned int tries = 0; 6175 unsigned int tries = 0;
6173 int ret = 0; 6176 int ret = 0;
6174 6177
6175 migrate_prep(); 6178 migrate_prep();
6176 6179
6177 while (pfn < end || !list_empty(&cc->migratepages)) { 6180 while (pfn < end || !list_empty(&cc->migratepages)) {
6178 if (fatal_signal_pending(current)) { 6181 if (fatal_signal_pending(current)) {
6179 ret = -EINTR; 6182 ret = -EINTR;
6180 break; 6183 break;
6181 } 6184 }
6182 6185
6183 if (list_empty(&cc->migratepages)) { 6186 if (list_empty(&cc->migratepages)) {
6184 cc->nr_migratepages = 0; 6187 cc->nr_migratepages = 0;
6185 pfn = isolate_migratepages_range(cc->zone, cc, 6188 pfn = isolate_migratepages_range(cc->zone, cc,
6186 pfn, end, true); 6189 pfn, end, true);
6187 if (!pfn) { 6190 if (!pfn) {
6188 ret = -EINTR; 6191 ret = -EINTR;
6189 break; 6192 break;
6190 } 6193 }
6191 tries = 0; 6194 tries = 0;
6192 } else if (++tries == 5) { 6195 } else if (++tries == 5) {
6193 ret = ret < 0 ? ret : -EBUSY; 6196 ret = ret < 0 ? ret : -EBUSY;
6194 break; 6197 break;
6195 } 6198 }
6196 6199
6197 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 6200 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
6198 &cc->migratepages); 6201 &cc->migratepages);
6199 cc->nr_migratepages -= nr_reclaimed; 6202 cc->nr_migratepages -= nr_reclaimed;
6200 6203
6201 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 6204 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
6202 NULL, 0, cc->mode, MR_CMA); 6205 NULL, 0, cc->mode, MR_CMA);
6203 } 6206 }
6204 if (ret < 0) { 6207 if (ret < 0) {
6205 putback_movable_pages(&cc->migratepages); 6208 putback_movable_pages(&cc->migratepages);
6206 return ret; 6209 return ret;
6207 } 6210 }
6208 return 0; 6211 return 0;
6209 } 6212 }
6210 6213
6211 /** 6214 /**
6212 * alloc_contig_range() -- tries to allocate given range of pages 6215 * alloc_contig_range() -- tries to allocate given range of pages
6213 * @start: start PFN to allocate 6216 * @start: start PFN to allocate
6214 * @end: one-past-the-last PFN to allocate 6217 * @end: one-past-the-last PFN to allocate
6215 * @migratetype: migratetype of the underlying pageblocks (either 6218 * @migratetype: migratetype of the underlying pageblocks (either
6216 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 6219 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
6217 * in range must have the same migratetype and it must 6220 * in range must have the same migratetype and it must
6218 * be either of the two. 6221 * be either of the two.
6219 * 6222 *
6220 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 6223 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
6221 * aligned, however it's the caller's responsibility to guarantee that 6224 * aligned, however it's the caller's responsibility to guarantee that
6222 * we are the only thread that changes migrate type of pageblocks the 6225 * we are the only thread that changes migrate type of pageblocks the
6223 * pages fall in. 6226 * pages fall in.
6224 * 6227 *
6225 * The PFN range must belong to a single zone. 6228 * The PFN range must belong to a single zone.
6226 * 6229 *
6227 * Returns zero on success or negative error code. On success all 6230 * Returns zero on success or negative error code. On success all
6228 * pages whose PFN is in [start, end) are allocated for the caller and 6231 * pages whose PFN is in [start, end) are allocated for the caller and
6229 * need to be freed with free_contig_range(). 6232 * need to be freed with free_contig_range().
6230 */ 6233 */
6231 int alloc_contig_range(unsigned long start, unsigned long end, 6234 int alloc_contig_range(unsigned long start, unsigned long end,
6232 unsigned migratetype) 6235 unsigned migratetype)
6233 { 6236 {
6234 unsigned long outer_start, outer_end; 6237 unsigned long outer_start, outer_end;
6235 int ret = 0, order; 6238 int ret = 0, order;
6236 6239
6237 struct compact_control cc = { 6240 struct compact_control cc = {
6238 .nr_migratepages = 0, 6241 .nr_migratepages = 0,
6239 .order = -1, 6242 .order = -1,
6240 .zone = page_zone(pfn_to_page(start)), 6243 .zone = page_zone(pfn_to_page(start)),
6241 .mode = MIGRATE_SYNC, 6244 .mode = MIGRATE_SYNC,
6242 .ignore_skip_hint = true, 6245 .ignore_skip_hint = true,
6243 }; 6246 };
6244 INIT_LIST_HEAD(&cc.migratepages); 6247 INIT_LIST_HEAD(&cc.migratepages);
6245 6248
6246 /* 6249 /*
6247 * What we do here is we mark all pageblocks in range as 6250 * What we do here is we mark all pageblocks in range as
6248 * MIGRATE_ISOLATE. Because pageblock and max order pages may 6251 * MIGRATE_ISOLATE. Because pageblock and max order pages may
6249 * have different sizes, and due to the way the page allocator 6252 * have different sizes, and due to the way the page allocator
6250 * works, we align the range to the biggest of the two sizes so 6253 * works, we align the range to the biggest of the two sizes so
6251 * that the page allocator won't try to merge buddies from 6254 * that the page allocator won't try to merge buddies from
6252 * different pageblocks and change MIGRATE_ISOLATE to some 6255 * different pageblocks and change MIGRATE_ISOLATE to some
6253 * other migration type. 6256 * other migration type.
6254 * 6257 *
6255 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 6258 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
6256 * migrate the pages from an unaligned range (i.e. pages that 6259 * migrate the pages from an unaligned range (i.e. pages that
6257 * we are interested in). This will put all the pages in 6260 * we are interested in). This will put all the pages in
6258 * range back to the page allocator as MIGRATE_ISOLATE. 6261 * range back to the page allocator as MIGRATE_ISOLATE.
6259 * 6262 *
6260 * When this is done, we take the pages in range from the page 6263 * When this is done, we take the pages in range from the page
6261 * allocator, removing them from the buddy system. This way 6264 * allocator, removing them from the buddy system. This way
6262 * the page allocator will never consider using them. 6265 * the page allocator will never consider using them.
6263 * 6266 *
6264 * This lets us mark the pageblocks back as 6267 * This lets us mark the pageblocks back as
6265 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 6268 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
6266 * aligned range but not in the unaligned, original range are 6269 * aligned range but not in the unaligned, original range are
6267 * put back to the page allocator so that the buddy system can use them. 6270 * put back to the page allocator so that the buddy system can use them.
6268 */ 6271 */
6269 6272
6270 ret = start_isolate_page_range(pfn_max_align_down(start), 6273 ret = start_isolate_page_range(pfn_max_align_down(start),
6271 pfn_max_align_up(end), migratetype, 6274 pfn_max_align_up(end), migratetype,
6272 false); 6275 false);
6273 if (ret) 6276 if (ret)
6274 return ret; 6277 return ret;
6275 6278
6276 ret = __alloc_contig_migrate_range(&cc, start, end); 6279 ret = __alloc_contig_migrate_range(&cc, start, end);
6277 if (ret) 6280 if (ret)
6278 goto done; 6281 goto done;
6279 6282
6280 /* 6283 /*
6281 * Pages from [start, end) are within MAX_ORDER_NR_PAGES 6284 * Pages from [start, end) are within MAX_ORDER_NR_PAGES
6282 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 6285 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
6283 * more, all pages in [start, end) are free in page allocator. 6286 * more, all pages in [start, end) are free in page allocator.
6284 * What we are going to do is allocate all pages from 6287 * What we are going to do is allocate all pages from
6285 * [start, end) (that is, remove them from the page allocator). 6288 * [start, end) (that is, remove them from the page allocator).
6286 * 6289 *
6287 * The only problem is that pages at the beginning and at the 6290 * The only problem is that pages at the beginning and at the
6288 * end of the interesting range may not be aligned with pages that 6291 * end of the interesting range may not be aligned with pages that
6289 * the page allocator holds, i.e. they can be part of higher order 6292 * the page allocator holds, i.e. they can be part of higher order
6290 * pages. Because of this, we reserve the bigger range and 6293 * pages. Because of this, we reserve the bigger range and
6291 * once this is done free the pages we are not interested in. 6294 * once this is done free the pages we are not interested in.
6292 * 6295 *
6293 * We don't have to hold zone->lock here because the pages are 6296 * We don't have to hold zone->lock here because the pages are
6294 * isolated and thus won't get removed from the buddy freelists. 6297 * isolated and thus won't get removed from the buddy freelists.
6295 */ 6298 */
6296 6299
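	/*
	 * Drain the per-CPU caches first: lru_add_drain_all() flushes the
	 * per-CPU LRU pagevecs and drain_all_pages() returns pages held on
	 * the per-CPU free lists to the buddy allocator, so every free page
	 * in the range is back on a freelist before we test and grab it.
	 */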
6297 lru_add_drain_all(); 6300 lru_add_drain_all();
6298 drain_all_pages(); 6301 drain_all_pages();
6299 6302
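	/*
	 * 'start' may sit in the middle of a higher-order free page.  Walk
	 * up the orders, aligning outer_start down at each step, until we
	 * land on a page that is actually on a buddy freelist; that gives
	 * the order-aligned start of the free block containing 'start'.
	 */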
6300 order = 0; 6303 order = 0;
6301 outer_start = start; 6304 outer_start = start;
6302 while (!PageBuddy(pfn_to_page(outer_start))) { 6305 while (!PageBuddy(pfn_to_page(outer_start))) {
6303 if (++order >= MAX_ORDER) { 6306 if (++order >= MAX_ORDER) {
6304 ret = -EBUSY; 6307 ret = -EBUSY;
6305 goto done; 6308 goto done;
6306 } 6309 }
6307 outer_start &= ~0UL << order; 6310 outer_start &= ~0UL << order;
6308 } 6311 }
6309 6312
6310 /* Make sure the range is really isolated. */ 6313 /* Make sure the range is really isolated. */
6311 if (test_pages_isolated(outer_start, end, false)) { 6314 if (test_pages_isolated(outer_start, end, false)) {
6312 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 6315 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
6313 outer_start, end); 6316 outer_start, end);
6314 ret = -EBUSY; 6317 ret = -EBUSY;
6315 goto done; 6318 goto done;
6316 } 6319 }
6317 6320
6318 6321
6319 /* Grab isolated pages from freelists. */ 6322 /* Grab isolated pages from freelists. */
6320 outer_end = isolate_freepages_range(&cc, outer_start, end); 6323 outer_end = isolate_freepages_range(&cc, outer_start, end);
6321 if (!outer_end) { 6324 if (!outer_end) {
6322 ret = -EBUSY; 6325 ret = -EBUSY;
6323 goto done; 6326 goto done;
6324 } 6327 }
6325 6328
6326 /* Free head and tail (if any) */ 6329 /* Free head and tail (if any) */
6327 if (start != outer_start) 6330 if (start != outer_start)
6328 free_contig_range(outer_start, start - outer_start); 6331 free_contig_range(outer_start, start - outer_start);
6329 if (end != outer_end) 6332 if (end != outer_end)
6330 free_contig_range(end, outer_end - end); 6333 free_contig_range(end, outer_end - end);
6331 6334
6332 done: 6335 done:
6333 undo_isolate_page_range(pfn_max_align_down(start), 6336 undo_isolate_page_range(pfn_max_align_down(start),
6334 pfn_max_align_up(end), migratetype); 6337 pfn_max_align_up(end), migratetype);
6335 return ret; 6338 return ret;
6336 } 6339 }
6337 6340
6338 void free_contig_range(unsigned long pfn, unsigned nr_pages) 6341 void free_contig_range(unsigned long pfn, unsigned nr_pages)
6339 { 6342 {
6340 unsigned int count = 0; 6343 unsigned int count = 0;
6341 6344
6342 for (; nr_pages--; pfn++) { 6345 for (; nr_pages--; pfn++) {
6343 struct page *page = pfn_to_page(pfn); 6346 struct page *page = pfn_to_page(pfn);
6344 6347
6345 count += page_count(page) != 1; 6348 count += page_count(page) != 1;
6346 __free_page(page); 6349 __free_page(page);
6347 } 6350 }
6348 WARN(count != 0, "%d pages are still in use!\n", count); 6351 WARN(count != 0, "%d pages are still in use!\n", count);
6349 } 6352 }
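/*
 * A minimal sketch of how a hypothetical CMA-style caller might pair
 * alloc_contig_range() with free_contig_range().  The example names are
 * made up; it assumes the PFN range lies in MIGRATE_CMA pageblocks that
 * were set up at init time, and it leaves any retry policy to the caller.
 */
static struct page *example_alloc_contig_pages(unsigned long pfn,
					       unsigned nr_pages)
{
	/* Isolate the range, migrate movable users out, grab the pages. */
	if (alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_CMA))
		return NULL;

	return pfn_to_page(pfn);
}

static void example_free_contig_pages(struct page *page, unsigned nr_pages)
{
	/* Give every page in the range back to the buddy allocator. */
	free_contig_range(page_to_pfn(page), nr_pages);
}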
6350 #endif 6353 #endif
6351 6354
6352 #ifdef CONFIG_MEMORY_HOTPLUG 6355 #ifdef CONFIG_MEMORY_HOTPLUG
6353 /* 6356 /*
6354 * The zone indicated has a new number of managed_pages; batch sizes and percpu 6357 * The zone indicated has a new number of managed_pages; batch sizes and percpu
6355 * page high values need to be recalculated. 6358 * page high values need to be recalculated.
6356 */ 6359 */
6357 void __meminit zone_pcp_update(struct zone *zone) 6360 void __meminit zone_pcp_update(struct zone *zone)
6358 { 6361 {
6359 unsigned cpu; 6362 unsigned cpu;
6360 mutex_lock(&pcp_batch_high_lock); 6363 mutex_lock(&pcp_batch_high_lock);
6361 for_each_possible_cpu(cpu) 6364 for_each_possible_cpu(cpu)
6362 pageset_set_high_and_batch(zone, 6365 pageset_set_high_and_batch(zone,
6363 per_cpu_ptr(zone->pageset, cpu)); 6366 per_cpu_ptr(zone->pageset, cpu));
6364 mutex_unlock(&pcp_batch_high_lock); 6367 mutex_unlock(&pcp_batch_high_lock);
6365 } 6368 }
6366 #endif 6369 #endif
6367 6370
6368 void zone_pcp_reset(struct zone *zone) 6371 void zone_pcp_reset(struct zone *zone)
6369 { 6372 {
6370 unsigned long flags; 6373 unsigned long flags;
6371 int cpu; 6374 int cpu;
6372 struct per_cpu_pageset *pset; 6375 struct per_cpu_pageset *pset;
6373 6376
6374 /* avoid races with drain_pages() */ 6377 /* avoid races with drain_pages() */
6375 local_irq_save(flags); 6378 local_irq_save(flags);
6376 if (zone->pageset != &boot_pageset) { 6379 if (zone->pageset != &boot_pageset) {
6377 for_each_online_cpu(cpu) { 6380 for_each_online_cpu(cpu) {
6378 pset = per_cpu_ptr(zone->pageset, cpu); 6381 pset = per_cpu_ptr(zone->pageset, cpu);
6379 drain_zonestat(zone, pset); 6382 drain_zonestat(zone, pset);
6380 } 6383 }
6381 free_percpu(zone->pageset); 6384 free_percpu(zone->pageset);
6382 zone->pageset = &boot_pageset; 6385 zone->pageset = &boot_pageset;
6383 } 6386 }
6384 local_irq_restore(flags); 6387 local_irq_restore(flags);
6385 } 6388 }
6386 6389
6387 #ifdef CONFIG_MEMORY_HOTREMOVE 6390 #ifdef CONFIG_MEMORY_HOTREMOVE
6388 /* 6391 /*
6389 * All pages in the range must be isolated before calling this. 6392 * All pages in the range must be isolated before calling this.
6390 */ 6393 */
6391 void 6394 void
6392 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 6395 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6393 { 6396 {
6394 struct page *page; 6397 struct page *page;
6395 struct zone *zone; 6398 struct zone *zone;
6396 int order, i; 6399 int order, i;
6397 unsigned long pfn; 6400 unsigned long pfn;
6398 unsigned long flags; 6401 unsigned long flags;
6399 /* find the first valid pfn */ 6402 /* find the first valid pfn */
6400 for (pfn = start_pfn; pfn < end_pfn; pfn++) 6403 for (pfn = start_pfn; pfn < end_pfn; pfn++)
6401 if (pfn_valid(pfn)) 6404 if (pfn_valid(pfn))
6402 break; 6405 break;
6403 if (pfn == end_pfn) 6406 if (pfn == end_pfn)
6404 return; 6407 return;
6405 zone = page_zone(pfn_to_page(pfn)); 6408 zone = page_zone(pfn_to_page(pfn));
6406 spin_lock_irqsave(&zone->lock, flags); 6409 spin_lock_irqsave(&zone->lock, flags);
6407 pfn = start_pfn; 6410 pfn = start_pfn;
6408 while (pfn < end_pfn) { 6411 while (pfn < end_pfn) {
6409 if (!pfn_valid(pfn)) { 6412 if (!pfn_valid(pfn)) {
6410 pfn++; 6413 pfn++;
6411 continue; 6414 continue;
6412 } 6415 }
6413 page = pfn_to_page(pfn); 6416 page = pfn_to_page(pfn);
6414 /* 6417 /*
6415 * The HWPoisoned page may not be in the buddy system, and 6418 * The HWPoisoned page may not be in the buddy system, and
6416 * its page_count() is not 0. 6419 * its page_count() is not 0.
6417 */ 6420 */
6418 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 6421 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6419 pfn++; 6422 pfn++;
6420 SetPageReserved(page); 6423 SetPageReserved(page);
6421 continue; 6424 continue;
6422 } 6425 }
6423 6426
6424 BUG_ON(page_count(page)); 6427 BUG_ON(page_count(page));
6425 BUG_ON(!PageBuddy(page)); 6428 BUG_ON(!PageBuddy(page));
6426 order = page_order(page); 6429 order = page_order(page);
6427 #ifdef CONFIG_DEBUG_VM 6430 #ifdef CONFIG_DEBUG_VM
6428 printk(KERN_INFO "remove from free list %lx %d %lx\n", 6431 printk(KERN_INFO "remove from free list %lx %d %lx\n",
6429 pfn, 1 << order, end_pfn); 6432 pfn, 1 << order, end_pfn);
6430 #endif 6433 #endif
6431 list_del(&page->lru); 6434 list_del(&page->lru);
6432 rmv_page_order(page); 6435 rmv_page_order(page);
6433 zone->free_area[order].nr_free--; 6436 zone->free_area[order].nr_free--;
6434 for (i = 0; i < (1 << order); i++) 6437 for (i = 0; i < (1 << order); i++)
6435 SetPageReserved((page+i)); 6438 SetPageReserved((page+i));
6436 pfn += (1 << order); 6439 pfn += (1 << order);
6437 } 6440 }
6438 spin_unlock_irqrestore(&zone->lock, flags); 6441 spin_unlock_irqrestore(&zone->lock, flags);
6439 } 6442 }
6440 #endif 6443 #endif
6441 6444
6442 #ifdef CONFIG_MEMORY_FAILURE 6445 #ifdef CONFIG_MEMORY_FAILURE
6443 bool is_free_buddy_page(struct page *page) 6446 bool is_free_buddy_page(struct page *page)
6444 { 6447 {
6445 struct zone *zone = page_zone(page); 6448 struct zone *zone = page_zone(page);
6446 unsigned long pfn = page_to_pfn(page); 6449 unsigned long pfn = page_to_pfn(page);
6447 unsigned long flags; 6450 unsigned long flags;
6448 int order; 6451 int order;
6449 6452
6450 spin_lock_irqsave(&zone->lock, flags); 6453 spin_lock_irqsave(&zone->lock, flags);
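	/*
	 * For each order, look at the order-aligned head of the block that
	 * contains this page: if that head is a free buddy page of at
	 * least this order, our page lies inside a free buddy page.
	 */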
6451 for (order = 0; order < MAX_ORDER; order++) { 6454 for (order = 0; order < MAX_ORDER; order++) {
6452 struct page *page_head = page - (pfn & ((1 << order) - 1)); 6455 struct page *page_head = page - (pfn & ((1 << order) - 1));
6453 6456
6454 if (PageBuddy(page_head) && page_order(page_head) >= order) 6457 if (PageBuddy(page_head) && page_order(page_head) >= order)
6455 break; 6458 break;
6456 } 6459 }
6457 spin_unlock_irqrestore(&zone->lock, flags); 6460 spin_unlock_irqrestore(&zone->lock, flags);
6458 6461
6459 return order < MAX_ORDER; 6462 return order < MAX_ORDER;
6460 } 6463 }
6461 #endif 6464 #endif
6462 6465
6463 static const struct trace_print_flags pageflag_names[] = { 6466 static const struct trace_print_flags pageflag_names[] = {
6464 {1UL << PG_locked, "locked" }, 6467 {1UL << PG_locked, "locked" },
6465 {1UL << PG_error, "error" }, 6468 {1UL << PG_error, "error" },
6466 {1UL << PG_referenced, "referenced" }, 6469 {1UL << PG_referenced, "referenced" },
6467 {1UL << PG_uptodate, "uptodate" }, 6470 {1UL << PG_uptodate, "uptodate" },
6468 {1UL << PG_dirty, "dirty" }, 6471 {1UL << PG_dirty, "dirty" },
6469 {1UL << PG_lru, "lru" }, 6472 {1UL << PG_lru, "lru" },
6470 {1UL << PG_active, "active" }, 6473 {1UL << PG_active, "active" },
6471 {1UL << PG_slab, "slab" }, 6474 {1UL << PG_slab, "slab" },
6472 {1UL << PG_owner_priv_1, "owner_priv_1" }, 6475 {1UL << PG_owner_priv_1, "owner_priv_1" },
6473 {1UL << PG_arch_1, "arch_1" }, 6476 {1UL << PG_arch_1, "arch_1" },
6474 {1UL << PG_reserved, "reserved" }, 6477 {1UL << PG_reserved, "reserved" },
6475 {1UL << PG_private, "private" }, 6478 {1UL << PG_private, "private" },
6476 {1UL << PG_private_2, "private_2" }, 6479 {1UL << PG_private_2, "private_2" },
6477 {1UL << PG_writeback, "writeback" }, 6480 {1UL << PG_writeback, "writeback" },
6478 #ifdef CONFIG_PAGEFLAGS_EXTENDED 6481 #ifdef CONFIG_PAGEFLAGS_EXTENDED
6479 {1UL << PG_head, "head" }, 6482 {1UL << PG_head, "head" },
6480 {1UL << PG_tail, "tail" }, 6483 {1UL << PG_tail, "tail" },
6481 #else 6484 #else
6482 {1UL << PG_compound, "compound" }, 6485 {1UL << PG_compound, "compound" },
6483 #endif 6486 #endif
6484 {1UL << PG_swapcache, "swapcache" }, 6487 {1UL << PG_swapcache, "swapcache" },
6485 {1UL << PG_mappedtodisk, "mappedtodisk" }, 6488 {1UL << PG_mappedtodisk, "mappedtodisk" },
6486 {1UL << PG_reclaim, "reclaim" }, 6489 {1UL << PG_reclaim, "reclaim" },
6487 {1UL << PG_swapbacked, "swapbacked" }, 6490 {1UL << PG_swapbacked, "swapbacked" },
6488 {1UL << PG_unevictable, "unevictable" }, 6491 {1UL << PG_unevictable, "unevictable" },
6489 #ifdef CONFIG_MMU 6492 #ifdef CONFIG_MMU
6490 {1UL << PG_mlocked, "mlocked" }, 6493 {1UL << PG_mlocked, "mlocked" },
6491 #endif 6494 #endif
6492 #ifdef CONFIG_ARCH_USES_PG_UNCACHED 6495 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
6493 {1UL << PG_uncached, "uncached" }, 6496 {1UL << PG_uncached, "uncached" },
6494 #endif 6497 #endif
6495 #ifdef CONFIG_MEMORY_FAILURE 6498 #ifdef CONFIG_MEMORY_FAILURE
6496 {1UL << PG_hwpoison, "hwpoison" }, 6499 {1UL << PG_hwpoison, "hwpoison" },
6497 #endif 6500 #endif
6498 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6501 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6499 {1UL << PG_compound_lock, "compound_lock" }, 6502 {1UL << PG_compound_lock, "compound_lock" },
6500 #endif 6503 #endif
6501 }; 6504 };
6502 6505
6503 static void dump_page_flags(unsigned long flags) 6506 static void dump_page_flags(unsigned long flags)
6504 { 6507 {
6505 const char *delim = ""; 6508 const char *delim = "";
6506 unsigned long mask; 6509 unsigned long mask;
6507 int i; 6510 int i;
6508 6511
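	/* Every bit in enum pageflags must have an entry in pageflag_names. */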
6509 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 6512 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6510 6513
6511 printk(KERN_ALERT "page flags: %#lx(", flags); 6514 printk(KERN_ALERT "page flags: %#lx(", flags);
6512 6515
6513 /* remove zone id */ 6516 /* remove zone id */
6514 flags &= (1UL << NR_PAGEFLAGS) - 1; 6517 flags &= (1UL << NR_PAGEFLAGS) - 1;
6515 6518
6516 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { 6519 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6517 6520
6518 mask = pageflag_names[i].mask; 6521 mask = pageflag_names[i].mask;
6519 if ((flags & mask) != mask) 6522 if ((flags & mask) != mask)
6520 continue; 6523 continue;
6521 6524
6522 flags &= ~mask; 6525 flags &= ~mask;
6523 printk("%s%s", delim, pageflag_names[i].name); 6526 printk("%s%s", delim, pageflag_names[i].name);
6524 delim = "|"; 6527 delim = "|";
6525 } 6528 }
6526 6529
6527 /* check for left over flags */ 6530 /* check for left over flags */
6528 if (flags) 6531 if (flags)
6529 printk("%s%#lx", delim, flags); 6532 printk("%s%#lx", delim, flags);
6530 6533
6531 printk(")\n"); 6534 printk(")\n");
6532 } 6535 }
6533 6536
6534 void dump_page(struct page *page) 6537 void dump_page(struct page *page)
6535 { 6538 {
6536 printk(KERN_ALERT 6539 printk(KERN_ALERT
6537 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6540 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",