Commit dc2786f0c19a779395ef69189dd5e7df2573b29b

Authored by Mel Gorman
Committed by Jiri Slaby
1 parent ee1760b2b4

mm: page_alloc: calculate classzone_idx once from the zonelist ref

commit d8846374a85f4290a473a4e2a64c1ba046c4a0e1 upstream.

There is no need to calculate zone_idx(preferred_zone) multiple times
or use the pgdat to figure it out.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
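
To make the intent concrete, the following is a small standalone C sketch of the pattern the patch applies. It is not the kernel diff itself; every name in it is a simplified stand-in for the real mm structures. The idea is to look up the preferred zone from the zonelist once, derive the classzone index from it once, and pass that index down to the helpers, rather than rederiving zone_idx(preferred_zone) (or going through the pgdat) at each call site. The actual hunks, which thread the precomputed index through the allocator's internal helpers, fall beyond this excerpt of the diff.

/* Illustrative only: simplified stand-ins for the kernel's zone/zonelist. */
#include <stdio.h>

struct zone {
	int idx;		/* stand-in for zone_idx(zone) */
	const char *name;
};

struct zonelist {
	struct zone *zones[4];	/* preferred zone first, as in a zonelist ref */
	int nr;
};

static struct zone *first_zone(const struct zonelist *zl)
{
	return zl->nr ? zl->zones[0] : NULL;
}

/* Old pattern: every check rederives the classzone index from the zone. */
static int zone_ok_old(const struct zone *z, const struct zone *preferred)
{
	int classzone_idx = preferred->idx;	/* recomputed on every call */

	return z->idx <= classzone_idx;
}

/* New pattern: the caller computes classzone_idx once and passes it down. */
static int zone_ok_new(const struct zone *z, int classzone_idx)
{
	return z->idx <= classzone_idx;
}

static void allocate(const struct zonelist *zl)
{
	const struct zone *preferred = first_zone(zl);
	int classzone_idx = preferred->idx;	/* calculated exactly once */
	int i;

	for (i = 0; i < zl->nr; i++) {
		const struct zone *z = zl->zones[i];

		/* Both forms agree; the new one avoids the repeated lookup. */
		if (zone_ok_old(z, preferred) != zone_ok_new(z, classzone_idx))
			printf("mismatch for zone %s\n", z->name);
		else if (zone_ok_new(z, classzone_idx))
			printf("zone %s passes the classzone check\n", z->name);
	}
}

int main(void)
{
	struct zone dma = { 0, "DMA" }, normal = { 1, "Normal" };
	struct zonelist zl = { { &normal, &dma }, 2 };

	allocate(&zl);
	return 0;
}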

Showing 1 changed file (mm/page_alloc.c) with 35 additions and 25 deletions; the inline diff of the file follows.

1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/compaction.h> 54 #include <linux/compaction.h>
55 #include <trace/events/kmem.h> 55 #include <trace/events/kmem.h>
56 #include <linux/ftrace_event.h> 56 #include <linux/ftrace_event.h>
57 #include <linux/memcontrol.h> 57 #include <linux/memcontrol.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/mm_inline.h> 59 #include <linux/mm_inline.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/page-debug-flags.h> 61 #include <linux/page-debug-flags.h>
62 #include <linux/hugetlb.h> 62 #include <linux/hugetlb.h>
63 #include <linux/sched/rt.h> 63 #include <linux/sched/rt.h>
64 64
65 #include <asm/sections.h> 65 #include <asm/sections.h>
66 #include <asm/tlbflush.h> 66 #include <asm/tlbflush.h>
67 #include <asm/div64.h> 67 #include <asm/div64.h>
68 #include "internal.h" 68 #include "internal.h"
69 69
70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
71 static DEFINE_MUTEX(pcp_batch_high_lock); 71 static DEFINE_MUTEX(pcp_batch_high_lock);
72 #define MIN_PERCPU_PAGELIST_FRACTION (8) 72 #define MIN_PERCPU_PAGELIST_FRACTION (8)
73 73
74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
75 DEFINE_PER_CPU(int, numa_node); 75 DEFINE_PER_CPU(int, numa_node);
76 EXPORT_PER_CPU_SYMBOL(numa_node); 76 EXPORT_PER_CPU_SYMBOL(numa_node);
77 #endif 77 #endif
78 78
79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
80 /* 80 /*
81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
84 * defined in <linux/topology.h>. 84 * defined in <linux/topology.h>.
85 */ 85 */
86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
87 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 87 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
88 #endif 88 #endif
89 89
90 /* 90 /*
91 * Array of node states. 91 * Array of node states.
92 */ 92 */
93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
94 [N_POSSIBLE] = NODE_MASK_ALL, 94 [N_POSSIBLE] = NODE_MASK_ALL,
95 [N_ONLINE] = { { [0] = 1UL } }, 95 [N_ONLINE] = { { [0] = 1UL } },
96 #ifndef CONFIG_NUMA 96 #ifndef CONFIG_NUMA
97 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 97 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
98 #ifdef CONFIG_HIGHMEM 98 #ifdef CONFIG_HIGHMEM
99 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 99 [N_HIGH_MEMORY] = { { [0] = 1UL } },
100 #endif 100 #endif
101 #ifdef CONFIG_MOVABLE_NODE 101 #ifdef CONFIG_MOVABLE_NODE
102 [N_MEMORY] = { { [0] = 1UL } }, 102 [N_MEMORY] = { { [0] = 1UL } },
103 #endif 103 #endif
104 [N_CPU] = { { [0] = 1UL } }, 104 [N_CPU] = { { [0] = 1UL } },
105 #endif /* NUMA */ 105 #endif /* NUMA */
106 }; 106 };
107 EXPORT_SYMBOL(node_states); 107 EXPORT_SYMBOL(node_states);
108 108
109 /* Protect totalram_pages and zone->managed_pages */ 109 /* Protect totalram_pages and zone->managed_pages */
110 static DEFINE_SPINLOCK(managed_page_count_lock); 110 static DEFINE_SPINLOCK(managed_page_count_lock);
111 111
112 unsigned long totalram_pages __read_mostly; 112 unsigned long totalram_pages __read_mostly;
113 unsigned long totalreserve_pages __read_mostly; 113 unsigned long totalreserve_pages __read_mostly;
114 /* 114 /*
115 * When calculating the number of globally allowed dirty pages, there 115 * When calculating the number of globally allowed dirty pages, there
116 * is a certain number of per-zone reserves that should not be 116 * is a certain number of per-zone reserves that should not be
117 * considered dirtyable memory. This is the sum of those reserves 117 * considered dirtyable memory. This is the sum of those reserves
118 * over all existing zones that contribute dirtyable memory. 118 * over all existing zones that contribute dirtyable memory.
119 */ 119 */
120 unsigned long dirty_balance_reserve __read_mostly; 120 unsigned long dirty_balance_reserve __read_mostly;
121 121
122 int percpu_pagelist_fraction; 122 int percpu_pagelist_fraction;
123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
124 124
125 #ifdef CONFIG_PM_SLEEP 125 #ifdef CONFIG_PM_SLEEP
126 /* 126 /*
127 * The following functions are used by the suspend/hibernate code to temporarily 127 * The following functions are used by the suspend/hibernate code to temporarily
128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
129 * while devices are suspended. To avoid races with the suspend/hibernate code, 129 * while devices are suspended. To avoid races with the suspend/hibernate code,
130 * they should always be called with pm_mutex held (gfp_allowed_mask also should 130 * they should always be called with pm_mutex held (gfp_allowed_mask also should
131 * only be modified with pm_mutex held, unless the suspend/hibernate code is 131 * only be modified with pm_mutex held, unless the suspend/hibernate code is
132 * guaranteed not to run in parallel with that modification). 132 * guaranteed not to run in parallel with that modification).
133 */ 133 */
134 134
135 static gfp_t saved_gfp_mask; 135 static gfp_t saved_gfp_mask;
136 136
137 void pm_restore_gfp_mask(void) 137 void pm_restore_gfp_mask(void)
138 { 138 {
139 WARN_ON(!mutex_is_locked(&pm_mutex)); 139 WARN_ON(!mutex_is_locked(&pm_mutex));
140 if (saved_gfp_mask) { 140 if (saved_gfp_mask) {
141 gfp_allowed_mask = saved_gfp_mask; 141 gfp_allowed_mask = saved_gfp_mask;
142 saved_gfp_mask = 0; 142 saved_gfp_mask = 0;
143 } 143 }
144 } 144 }
145 145
146 void pm_restrict_gfp_mask(void) 146 void pm_restrict_gfp_mask(void)
147 { 147 {
148 WARN_ON(!mutex_is_locked(&pm_mutex)); 148 WARN_ON(!mutex_is_locked(&pm_mutex));
149 WARN_ON(saved_gfp_mask); 149 WARN_ON(saved_gfp_mask);
150 saved_gfp_mask = gfp_allowed_mask; 150 saved_gfp_mask = gfp_allowed_mask;
151 gfp_allowed_mask &= ~GFP_IOFS; 151 gfp_allowed_mask &= ~GFP_IOFS;
152 } 152 }
153 153
154 bool pm_suspended_storage(void) 154 bool pm_suspended_storage(void)
155 { 155 {
156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
157 return false; 157 return false;
158 return true; 158 return true;
159 } 159 }
160 #endif /* CONFIG_PM_SLEEP */ 160 #endif /* CONFIG_PM_SLEEP */
161 161
162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
163 int pageblock_order __read_mostly; 163 int pageblock_order __read_mostly;
164 #endif 164 #endif
165 165
166 static void __free_pages_ok(struct page *page, unsigned int order); 166 static void __free_pages_ok(struct page *page, unsigned int order);
167 167
168 /* 168 /*
169 * results with 256, 32 in the lowmem_reserve sysctl: 169 * results with 256, 32 in the lowmem_reserve sysctl:
170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
171 * 1G machine -> (16M dma, 784M normal, 224M high) 171 * 1G machine -> (16M dma, 784M normal, 224M high)
172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
175 * 175 *
176 * TBD: should special case ZONE_DMA32 machines here - in those we normally 176 * TBD: should special case ZONE_DMA32 machines here - in those we normally
177 * don't need any ZONE_NORMAL reservation 177 * don't need any ZONE_NORMAL reservation
178 */ 178 */
179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
180 #ifdef CONFIG_ZONE_DMA 180 #ifdef CONFIG_ZONE_DMA
181 256, 181 256,
182 #endif 182 #endif
183 #ifdef CONFIG_ZONE_DMA32 183 #ifdef CONFIG_ZONE_DMA32
184 256, 184 256,
185 #endif 185 #endif
186 #ifdef CONFIG_HIGHMEM 186 #ifdef CONFIG_HIGHMEM
187 32, 187 32,
188 #endif 188 #endif
189 32, 189 32,
190 }; 190 };
191 191
192 EXPORT_SYMBOL(totalram_pages); 192 EXPORT_SYMBOL(totalram_pages);
193 193
194 static char * const zone_names[MAX_NR_ZONES] = { 194 static char * const zone_names[MAX_NR_ZONES] = {
195 #ifdef CONFIG_ZONE_DMA 195 #ifdef CONFIG_ZONE_DMA
196 "DMA", 196 "DMA",
197 #endif 197 #endif
198 #ifdef CONFIG_ZONE_DMA32 198 #ifdef CONFIG_ZONE_DMA32
199 "DMA32", 199 "DMA32",
200 #endif 200 #endif
201 "Normal", 201 "Normal",
202 #ifdef CONFIG_HIGHMEM 202 #ifdef CONFIG_HIGHMEM
203 "HighMem", 203 "HighMem",
204 #endif 204 #endif
205 "Movable", 205 "Movable",
206 }; 206 };
207 207
208 int min_free_kbytes = 1024; 208 int min_free_kbytes = 1024;
209 int user_min_free_kbytes; 209 int user_min_free_kbytes;
210 210
211 static unsigned long __meminitdata nr_kernel_pages; 211 static unsigned long __meminitdata nr_kernel_pages;
212 static unsigned long __meminitdata nr_all_pages; 212 static unsigned long __meminitdata nr_all_pages;
213 static unsigned long __meminitdata dma_reserve; 213 static unsigned long __meminitdata dma_reserve;
214 214
215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
218 static unsigned long __initdata required_kernelcore; 218 static unsigned long __initdata required_kernelcore;
219 static unsigned long __initdata required_movablecore; 219 static unsigned long __initdata required_movablecore;
220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
221 221
222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
223 int movable_zone; 223 int movable_zone;
224 EXPORT_SYMBOL(movable_zone); 224 EXPORT_SYMBOL(movable_zone);
225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
226 226
227 #if MAX_NUMNODES > 1 227 #if MAX_NUMNODES > 1
228 int nr_node_ids __read_mostly = MAX_NUMNODES; 228 int nr_node_ids __read_mostly = MAX_NUMNODES;
229 int nr_online_nodes __read_mostly = 1; 229 int nr_online_nodes __read_mostly = 1;
230 EXPORT_SYMBOL(nr_node_ids); 230 EXPORT_SYMBOL(nr_node_ids);
231 EXPORT_SYMBOL(nr_online_nodes); 231 EXPORT_SYMBOL(nr_online_nodes);
232 #endif 232 #endif
233 233
234 int page_group_by_mobility_disabled __read_mostly; 234 int page_group_by_mobility_disabled __read_mostly;
235 235
236 void set_pageblock_migratetype(struct page *page, int migratetype) 236 void set_pageblock_migratetype(struct page *page, int migratetype)
237 { 237 {
238 238
239 if (unlikely(page_group_by_mobility_disabled)) 239 if (unlikely(page_group_by_mobility_disabled))
240 migratetype = MIGRATE_UNMOVABLE; 240 migratetype = MIGRATE_UNMOVABLE;
241 241
242 set_pageblock_flags_group(page, (unsigned long)migratetype, 242 set_pageblock_flags_group(page, (unsigned long)migratetype,
243 PB_migrate, PB_migrate_end); 243 PB_migrate, PB_migrate_end);
244 } 244 }
245 245
246 bool oom_killer_disabled __read_mostly; 246 bool oom_killer_disabled __read_mostly;
247 247
248 #ifdef CONFIG_DEBUG_VM 248 #ifdef CONFIG_DEBUG_VM
249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
250 { 250 {
251 int ret = 0; 251 int ret = 0;
252 unsigned seq; 252 unsigned seq;
253 unsigned long pfn = page_to_pfn(page); 253 unsigned long pfn = page_to_pfn(page);
254 unsigned long sp, start_pfn; 254 unsigned long sp, start_pfn;
255 255
256 do { 256 do {
257 seq = zone_span_seqbegin(zone); 257 seq = zone_span_seqbegin(zone);
258 start_pfn = zone->zone_start_pfn; 258 start_pfn = zone->zone_start_pfn;
259 sp = zone->spanned_pages; 259 sp = zone->spanned_pages;
260 if (!zone_spans_pfn(zone, pfn)) 260 if (!zone_spans_pfn(zone, pfn))
261 ret = 1; 261 ret = 1;
262 } while (zone_span_seqretry(zone, seq)); 262 } while (zone_span_seqretry(zone, seq));
263 263
264 if (ret) 264 if (ret)
265 pr_err("page %lu outside zone [ %lu - %lu ]\n", 265 pr_err("page %lu outside zone [ %lu - %lu ]\n",
266 pfn, start_pfn, start_pfn + sp); 266 pfn, start_pfn, start_pfn + sp);
267 267
268 return ret; 268 return ret;
269 } 269 }
270 270
271 static int page_is_consistent(struct zone *zone, struct page *page) 271 static int page_is_consistent(struct zone *zone, struct page *page)
272 { 272 {
273 if (!pfn_valid_within(page_to_pfn(page))) 273 if (!pfn_valid_within(page_to_pfn(page)))
274 return 0; 274 return 0;
275 if (zone != page_zone(page)) 275 if (zone != page_zone(page))
276 return 0; 276 return 0;
277 277
278 return 1; 278 return 1;
279 } 279 }
280 /* 280 /*
281 * Temporary debugging check for pages not lying within a given zone. 281 * Temporary debugging check for pages not lying within a given zone.
282 */ 282 */
283 static int bad_range(struct zone *zone, struct page *page) 283 static int bad_range(struct zone *zone, struct page *page)
284 { 284 {
285 if (page_outside_zone_boundaries(zone, page)) 285 if (page_outside_zone_boundaries(zone, page))
286 return 1; 286 return 1;
287 if (!page_is_consistent(zone, page)) 287 if (!page_is_consistent(zone, page))
288 return 1; 288 return 1;
289 289
290 return 0; 290 return 0;
291 } 291 }
292 #else 292 #else
293 static inline int bad_range(struct zone *zone, struct page *page) 293 static inline int bad_range(struct zone *zone, struct page *page)
294 { 294 {
295 return 0; 295 return 0;
296 } 296 }
297 #endif 297 #endif
298 298
299 static void bad_page(struct page *page) 299 static void bad_page(struct page *page)
300 { 300 {
301 static unsigned long resume; 301 static unsigned long resume;
302 static unsigned long nr_shown; 302 static unsigned long nr_shown;
303 static unsigned long nr_unshown; 303 static unsigned long nr_unshown;
304 304
305 /* Don't complain about poisoned pages */ 305 /* Don't complain about poisoned pages */
306 if (PageHWPoison(page)) { 306 if (PageHWPoison(page)) {
307 page_mapcount_reset(page); /* remove PageBuddy */ 307 page_mapcount_reset(page); /* remove PageBuddy */
308 return; 308 return;
309 } 309 }
310 310
311 /* 311 /*
312 * Allow a burst of 60 reports, then keep quiet for that minute; 312 * Allow a burst of 60 reports, then keep quiet for that minute;
313 * or allow a steady drip of one report per second. 313 * or allow a steady drip of one report per second.
314 */ 314 */
315 if (nr_shown == 60) { 315 if (nr_shown == 60) {
316 if (time_before(jiffies, resume)) { 316 if (time_before(jiffies, resume)) {
317 nr_unshown++; 317 nr_unshown++;
318 goto out; 318 goto out;
319 } 319 }
320 if (nr_unshown) { 320 if (nr_unshown) {
321 printk(KERN_ALERT 321 printk(KERN_ALERT
322 "BUG: Bad page state: %lu messages suppressed\n", 322 "BUG: Bad page state: %lu messages suppressed\n",
323 nr_unshown); 323 nr_unshown);
324 nr_unshown = 0; 324 nr_unshown = 0;
325 } 325 }
326 nr_shown = 0; 326 nr_shown = 0;
327 } 327 }
328 if (nr_shown++ == 0) 328 if (nr_shown++ == 0)
329 resume = jiffies + 60 * HZ; 329 resume = jiffies + 60 * HZ;
330 330
331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
332 current->comm, page_to_pfn(page)); 332 current->comm, page_to_pfn(page));
333 dump_page(page); 333 dump_page(page);
334 334
335 print_modules(); 335 print_modules();
336 dump_stack(); 336 dump_stack();
337 out: 337 out:
338 /* Leave bad fields for debug, except PageBuddy could make trouble */ 338 /* Leave bad fields for debug, except PageBuddy could make trouble */
339 page_mapcount_reset(page); /* remove PageBuddy */ 339 page_mapcount_reset(page); /* remove PageBuddy */
340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
341 } 341 }
342 342
343 /* 343 /*
344 * Higher-order pages are called "compound pages". They are structured thusly: 344 * Higher-order pages are called "compound pages". They are structured thusly:
345 * 345 *
346 * The first PAGE_SIZE page is called the "head page". 346 * The first PAGE_SIZE page is called the "head page".
347 * 347 *
348 * The remaining PAGE_SIZE pages are called "tail pages". 348 * The remaining PAGE_SIZE pages are called "tail pages".
349 * 349 *
350 * All pages have PG_compound set. All tail pages have their ->first_page 350 * All pages have PG_compound set. All tail pages have their ->first_page
351 * pointing at the head page. 351 * pointing at the head page.
352 * 352 *
353 * The first tail page's ->lru.next holds the address of the compound page's 353 * The first tail page's ->lru.next holds the address of the compound page's
354 * put_page() function. Its ->lru.prev holds the order of allocation. 354 * put_page() function. Its ->lru.prev holds the order of allocation.
355 * This usage means that zero-order pages may not be compound. 355 * This usage means that zero-order pages may not be compound.
356 */ 356 */
357 357
358 static void free_compound_page(struct page *page) 358 static void free_compound_page(struct page *page)
359 { 359 {
360 __free_pages_ok(page, compound_order(page)); 360 __free_pages_ok(page, compound_order(page));
361 } 361 }
362 362
363 void prep_compound_page(struct page *page, unsigned long order) 363 void prep_compound_page(struct page *page, unsigned long order)
364 { 364 {
365 int i; 365 int i;
366 int nr_pages = 1 << order; 366 int nr_pages = 1 << order;
367 367
368 set_compound_page_dtor(page, free_compound_page); 368 set_compound_page_dtor(page, free_compound_page);
369 set_compound_order(page, order); 369 set_compound_order(page, order);
370 __SetPageHead(page); 370 __SetPageHead(page);
371 for (i = 1; i < nr_pages; i++) { 371 for (i = 1; i < nr_pages; i++) {
372 struct page *p = page + i; 372 struct page *p = page + i;
373 set_page_count(p, 0); 373 set_page_count(p, 0);
374 p->first_page = page; 374 p->first_page = page;
375 /* Make sure p->first_page is always valid for PageTail() */ 375 /* Make sure p->first_page is always valid for PageTail() */
376 smp_wmb(); 376 smp_wmb();
377 __SetPageTail(p); 377 __SetPageTail(p);
378 } 378 }
379 } 379 }
380 380
381 /* update __split_huge_page_refcount if you change this function */ 381 /* update __split_huge_page_refcount if you change this function */
382 static int destroy_compound_page(struct page *page, unsigned long order) 382 static int destroy_compound_page(struct page *page, unsigned long order)
383 { 383 {
384 int i; 384 int i;
385 int nr_pages = 1 << order; 385 int nr_pages = 1 << order;
386 int bad = 0; 386 int bad = 0;
387 387
388 if (unlikely(compound_order(page) != order)) { 388 if (unlikely(compound_order(page) != order)) {
389 bad_page(page); 389 bad_page(page);
390 bad++; 390 bad++;
391 } 391 }
392 392
393 __ClearPageHead(page); 393 __ClearPageHead(page);
394 394
395 for (i = 1; i < nr_pages; i++) { 395 for (i = 1; i < nr_pages; i++) {
396 struct page *p = page + i; 396 struct page *p = page + i;
397 397
398 if (unlikely(!PageTail(p) || (p->first_page != page))) { 398 if (unlikely(!PageTail(p) || (p->first_page != page))) {
399 bad_page(page); 399 bad_page(page);
400 bad++; 400 bad++;
401 } 401 }
402 __ClearPageTail(p); 402 __ClearPageTail(p);
403 } 403 }
404 404
405 return bad; 405 return bad;
406 } 406 }
407 407
408 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 408 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
409 { 409 {
410 int i; 410 int i;
411 411
412 /* 412 /*
413 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 413 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
414 * and __GFP_HIGHMEM from hard or soft interrupt context. 414 * and __GFP_HIGHMEM from hard or soft interrupt context.
415 */ 415 */
416 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 416 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
417 for (i = 0; i < (1 << order); i++) 417 for (i = 0; i < (1 << order); i++)
418 clear_highpage(page + i); 418 clear_highpage(page + i);
419 } 419 }
420 420
421 #ifdef CONFIG_DEBUG_PAGEALLOC 421 #ifdef CONFIG_DEBUG_PAGEALLOC
422 unsigned int _debug_guardpage_minorder; 422 unsigned int _debug_guardpage_minorder;
423 423
424 static int __init debug_guardpage_minorder_setup(char *buf) 424 static int __init debug_guardpage_minorder_setup(char *buf)
425 { 425 {
426 unsigned long res; 426 unsigned long res;
427 427
428 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 428 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
429 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 429 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
430 return 0; 430 return 0;
431 } 431 }
432 _debug_guardpage_minorder = res; 432 _debug_guardpage_minorder = res;
433 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 433 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
434 return 0; 434 return 0;
435 } 435 }
436 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 436 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
437 437
438 static inline void set_page_guard_flag(struct page *page) 438 static inline void set_page_guard_flag(struct page *page)
439 { 439 {
440 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 440 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
441 } 441 }
442 442
443 static inline void clear_page_guard_flag(struct page *page) 443 static inline void clear_page_guard_flag(struct page *page)
444 { 444 {
445 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 445 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
446 } 446 }
447 #else 447 #else
448 static inline void set_page_guard_flag(struct page *page) { } 448 static inline void set_page_guard_flag(struct page *page) { }
449 static inline void clear_page_guard_flag(struct page *page) { } 449 static inline void clear_page_guard_flag(struct page *page) { }
450 #endif 450 #endif
451 451
452 static inline void set_page_order(struct page *page, int order) 452 static inline void set_page_order(struct page *page, int order)
453 { 453 {
454 set_page_private(page, order); 454 set_page_private(page, order);
455 __SetPageBuddy(page); 455 __SetPageBuddy(page);
456 } 456 }
457 457
458 static inline void rmv_page_order(struct page *page) 458 static inline void rmv_page_order(struct page *page)
459 { 459 {
460 __ClearPageBuddy(page); 460 __ClearPageBuddy(page);
461 set_page_private(page, 0); 461 set_page_private(page, 0);
462 } 462 }
463 463
464 /* 464 /*
465 * Locate the struct page for both the matching buddy in our 465 * Locate the struct page for both the matching buddy in our
466 * pair (buddy1) and the combined O(n+1) page they form (page). 466 * pair (buddy1) and the combined O(n+1) page they form (page).
467 * 467 *
468 * 1) Any buddy B1 will have an order O twin B2 which satisfies 468 * 1) Any buddy B1 will have an order O twin B2 which satisfies
469 * the following equation: 469 * the following equation:
470 * B2 = B1 ^ (1 << O) 470 * B2 = B1 ^ (1 << O)
471 * For example, if the starting buddy (buddy2) is #8 its order 471 * For example, if the starting buddy (buddy2) is #8 its order
472 * 1 buddy is #10: 472 * 1 buddy is #10:
473 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 473 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
474 * 474 *
475 * 2) Any buddy B will have an order O+1 parent P which 475 * 2) Any buddy B will have an order O+1 parent P which
476 * satisfies the following equation: 476 * satisfies the following equation:
477 * P = B & ~(1 << O) 477 * P = B & ~(1 << O)
478 * 478 *
479 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 479 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
480 */ 480 */
481 static inline unsigned long 481 static inline unsigned long
482 __find_buddy_index(unsigned long page_idx, unsigned int order) 482 __find_buddy_index(unsigned long page_idx, unsigned int order)
483 { 483 {
484 return page_idx ^ (1 << order); 484 return page_idx ^ (1 << order);
485 } 485 }
486 486
487 /* 487 /*
488 * This function checks whether a page is free && is the buddy 488 * This function checks whether a page is free && is the buddy
489 * we can do coalesce a page and its buddy if 489 * we can do coalesce a page and its buddy if
490 * (a) the buddy is not in a hole && 490 * (a) the buddy is not in a hole &&
491 * (b) the buddy is in the buddy system && 491 * (b) the buddy is in the buddy system &&
492 * (c) a page and its buddy have the same order && 492 * (c) a page and its buddy have the same order &&
493 * (d) a page and its buddy are in the same zone. 493 * (d) a page and its buddy are in the same zone.
494 * 494 *
495 * For recording whether a page is in the buddy system, we set ->_mapcount 495 * For recording whether a page is in the buddy system, we set ->_mapcount
496 * PAGE_BUDDY_MAPCOUNT_VALUE. 496 * PAGE_BUDDY_MAPCOUNT_VALUE.
497 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is 497 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
498 * serialized by zone->lock. 498 * serialized by zone->lock.
499 * 499 *
500 * For recording page's order, we use page_private(page). 500 * For recording page's order, we use page_private(page).
501 */ 501 */
502 static inline int page_is_buddy(struct page *page, struct page *buddy, 502 static inline int page_is_buddy(struct page *page, struct page *buddy,
503 int order) 503 int order)
504 { 504 {
505 if (!pfn_valid_within(page_to_pfn(buddy))) 505 if (!pfn_valid_within(page_to_pfn(buddy)))
506 return 0; 506 return 0;
507 507
508 if (page_zone_id(page) != page_zone_id(buddy)) 508 if (page_zone_id(page) != page_zone_id(buddy))
509 return 0; 509 return 0;
510 510
511 if (page_is_guard(buddy) && page_order(buddy) == order) { 511 if (page_is_guard(buddy) && page_order(buddy) == order) {
512 VM_BUG_ON(page_count(buddy) != 0); 512 VM_BUG_ON(page_count(buddy) != 0);
513 return 1; 513 return 1;
514 } 514 }
515 515
516 if (PageBuddy(buddy) && page_order(buddy) == order) { 516 if (PageBuddy(buddy) && page_order(buddy) == order) {
517 VM_BUG_ON(page_count(buddy) != 0); 517 VM_BUG_ON(page_count(buddy) != 0);
518 return 1; 518 return 1;
519 } 519 }
520 return 0; 520 return 0;
521 } 521 }
522 522
523 /* 523 /*
524 * Freeing function for a buddy system allocator. 524 * Freeing function for a buddy system allocator.
525 * 525 *
526 * The concept of a buddy system is to maintain direct-mapped table 526 * The concept of a buddy system is to maintain direct-mapped table
527 * (containing bit values) for memory blocks of various "orders". 527 * (containing bit values) for memory blocks of various "orders".
528 * The bottom level table contains the map for the smallest allocatable 528 * The bottom level table contains the map for the smallest allocatable
529 * units of memory (here, pages), and each level above it describes 529 * units of memory (here, pages), and each level above it describes
530 * pairs of units from the levels below, hence, "buddies". 530 * pairs of units from the levels below, hence, "buddies".
531 * At a high level, all that happens here is marking the table entry 531 * At a high level, all that happens here is marking the table entry
532 * at the bottom level available, and propagating the changes upward 532 * at the bottom level available, and propagating the changes upward
533 * as necessary, plus some accounting needed to play nicely with other 533 * as necessary, plus some accounting needed to play nicely with other
534 * parts of the VM system. 534 * parts of the VM system.
535 * At each level, we keep a list of pages, which are heads of continuous 535 * At each level, we keep a list of pages, which are heads of continuous
536 * free pages of length of (1 << order) and marked with _mapcount 536 * free pages of length of (1 << order) and marked with _mapcount
537 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) 537 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
538 * field. 538 * field.
539 * So when we are allocating or freeing one, we can derive the state of the 539 * So when we are allocating or freeing one, we can derive the state of the
540 * other. That is, if we allocate a small block, and both were 540 * other. That is, if we allocate a small block, and both were
541 * free, the remainder of the region must be split into blocks. 541 * free, the remainder of the region must be split into blocks.
542 * If a block is freed, and its buddy is also free, then this 542 * If a block is freed, and its buddy is also free, then this
543 * triggers coalescing into a block of larger size. 543 * triggers coalescing into a block of larger size.
544 * 544 *
545 * -- nyc 545 * -- nyc
546 */ 546 */
547 547
548 static inline void __free_one_page(struct page *page, 548 static inline void __free_one_page(struct page *page,
549 struct zone *zone, unsigned int order, 549 struct zone *zone, unsigned int order,
550 int migratetype) 550 int migratetype)
551 { 551 {
552 unsigned long page_idx; 552 unsigned long page_idx;
553 unsigned long combined_idx; 553 unsigned long combined_idx;
554 unsigned long uninitialized_var(buddy_idx); 554 unsigned long uninitialized_var(buddy_idx);
555 struct page *buddy; 555 struct page *buddy;
556 556
557 VM_BUG_ON(!zone_is_initialized(zone)); 557 VM_BUG_ON(!zone_is_initialized(zone));
558 558
559 if (unlikely(PageCompound(page))) 559 if (unlikely(PageCompound(page)))
560 if (unlikely(destroy_compound_page(page, order))) 560 if (unlikely(destroy_compound_page(page, order)))
561 return; 561 return;
562 562
563 VM_BUG_ON(migratetype == -1); 563 VM_BUG_ON(migratetype == -1);
564 564
565 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 565 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
566 566
567 VM_BUG_ON(page_idx & ((1 << order) - 1)); 567 VM_BUG_ON(page_idx & ((1 << order) - 1));
568 VM_BUG_ON(bad_range(zone, page)); 568 VM_BUG_ON(bad_range(zone, page));
569 569
570 while (order < MAX_ORDER-1) { 570 while (order < MAX_ORDER-1) {
571 buddy_idx = __find_buddy_index(page_idx, order); 571 buddy_idx = __find_buddy_index(page_idx, order);
572 buddy = page + (buddy_idx - page_idx); 572 buddy = page + (buddy_idx - page_idx);
573 if (!page_is_buddy(page, buddy, order)) 573 if (!page_is_buddy(page, buddy, order))
574 break; 574 break;
575 /* 575 /*
576 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 576 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
577 * merge with it and move up one order. 577 * merge with it and move up one order.
578 */ 578 */
579 if (page_is_guard(buddy)) { 579 if (page_is_guard(buddy)) {
580 clear_page_guard_flag(buddy); 580 clear_page_guard_flag(buddy);
581 set_page_private(page, 0); 581 set_page_private(page, 0);
582 __mod_zone_freepage_state(zone, 1 << order, 582 __mod_zone_freepage_state(zone, 1 << order,
583 migratetype); 583 migratetype);
584 } else { 584 } else {
585 list_del(&buddy->lru); 585 list_del(&buddy->lru);
586 zone->free_area[order].nr_free--; 586 zone->free_area[order].nr_free--;
587 rmv_page_order(buddy); 587 rmv_page_order(buddy);
588 } 588 }
589 combined_idx = buddy_idx & page_idx; 589 combined_idx = buddy_idx & page_idx;
590 page = page + (combined_idx - page_idx); 590 page = page + (combined_idx - page_idx);
591 page_idx = combined_idx; 591 page_idx = combined_idx;
592 order++; 592 order++;
593 } 593 }
594 set_page_order(page, order); 594 set_page_order(page, order);
595 595
596 /* 596 /*
597 * If this is not the largest possible page, check if the buddy 597 * If this is not the largest possible page, check if the buddy
598 * of the next-highest order is free. If it is, it's possible 598 * of the next-highest order is free. If it is, it's possible
599 * that pages are being freed that will coalesce soon. In case, 599 * that pages are being freed that will coalesce soon. In case,
600 * that is happening, add the free page to the tail of the list 600 * that is happening, add the free page to the tail of the list
601 * so it's less likely to be used soon and more likely to be merged 601 * so it's less likely to be used soon and more likely to be merged
602 * as a higher order page 602 * as a higher order page
603 */ 603 */
604 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 604 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
605 struct page *higher_page, *higher_buddy; 605 struct page *higher_page, *higher_buddy;
606 combined_idx = buddy_idx & page_idx; 606 combined_idx = buddy_idx & page_idx;
607 higher_page = page + (combined_idx - page_idx); 607 higher_page = page + (combined_idx - page_idx);
608 buddy_idx = __find_buddy_index(combined_idx, order + 1); 608 buddy_idx = __find_buddy_index(combined_idx, order + 1);
609 higher_buddy = higher_page + (buddy_idx - combined_idx); 609 higher_buddy = higher_page + (buddy_idx - combined_idx);
610 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 610 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
611 list_add_tail(&page->lru, 611 list_add_tail(&page->lru,
612 &zone->free_area[order].free_list[migratetype]); 612 &zone->free_area[order].free_list[migratetype]);
613 goto out; 613 goto out;
614 } 614 }
615 } 615 }
616 616
617 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 617 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
618 out: 618 out:
619 zone->free_area[order].nr_free++; 619 zone->free_area[order].nr_free++;
620 } 620 }
621 621
622 static inline int free_pages_check(struct page *page) 622 static inline int free_pages_check(struct page *page)
623 { 623 {
624 if (unlikely(page_mapcount(page) | 624 if (unlikely(page_mapcount(page) |
625 (page->mapping != NULL) | 625 (page->mapping != NULL) |
626 (atomic_read(&page->_count) != 0) | 626 (atomic_read(&page->_count) != 0) |
627 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 627 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
628 (mem_cgroup_bad_page_check(page)))) { 628 (mem_cgroup_bad_page_check(page)))) {
629 bad_page(page); 629 bad_page(page);
630 return 1; 630 return 1;
631 } 631 }
632 page_nid_reset_last(page); 632 page_nid_reset_last(page);
633 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 633 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
634 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 634 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
635 return 0; 635 return 0;
636 } 636 }
637 637
638 /* 638 /*
639 * Frees a number of pages from the PCP lists 639 * Frees a number of pages from the PCP lists
640 * Assumes all pages on list are in same zone, and of same order. 640 * Assumes all pages on list are in same zone, and of same order.
641 * count is the number of pages to free. 641 * count is the number of pages to free.
642 * 642 *
643 * If the zone was previously in an "all pages pinned" state then look to 643 * If the zone was previously in an "all pages pinned" state then look to
644 * see if this freeing clears that state. 644 * see if this freeing clears that state.
645 * 645 *
646 * And clear the zone's pages_scanned counter, to hold off the "all pages are 646 * And clear the zone's pages_scanned counter, to hold off the "all pages are
647 * pinned" detection logic. 647 * pinned" detection logic.
648 */ 648 */
649 static void free_pcppages_bulk(struct zone *zone, int count, 649 static void free_pcppages_bulk(struct zone *zone, int count,
650 struct per_cpu_pages *pcp) 650 struct per_cpu_pages *pcp)
651 { 651 {
652 int migratetype = 0; 652 int migratetype = 0;
653 int batch_free = 0; 653 int batch_free = 0;
654 int to_free = count; 654 int to_free = count;
655 655
656 spin_lock(&zone->lock); 656 spin_lock(&zone->lock);
657 zone->pages_scanned = 0; 657 zone->pages_scanned = 0;
658 658
659 while (to_free) { 659 while (to_free) {
660 struct page *page; 660 struct page *page;
661 struct list_head *list; 661 struct list_head *list;
662 662
663 /* 663 /*
664 * Remove pages from lists in a round-robin fashion. A 664 * Remove pages from lists in a round-robin fashion. A
665 * batch_free count is maintained that is incremented when an 665 * batch_free count is maintained that is incremented when an
666 * empty list is encountered. This is so more pages are freed 666 * empty list is encountered. This is so more pages are freed
667 * off fuller lists instead of spinning excessively around empty 667 * off fuller lists instead of spinning excessively around empty
668 * lists 668 * lists
669 */ 669 */
670 do { 670 do {
671 batch_free++; 671 batch_free++;
672 if (++migratetype == MIGRATE_PCPTYPES) 672 if (++migratetype == MIGRATE_PCPTYPES)
673 migratetype = 0; 673 migratetype = 0;
674 list = &pcp->lists[migratetype]; 674 list = &pcp->lists[migratetype];
675 } while (list_empty(list)); 675 } while (list_empty(list));
676 676
677 /* This is the only non-empty list. Free them all. */ 677 /* This is the only non-empty list. Free them all. */
678 if (batch_free == MIGRATE_PCPTYPES) 678 if (batch_free == MIGRATE_PCPTYPES)
679 batch_free = to_free; 679 batch_free = to_free;
680 680
681 do { 681 do {
682 int mt; /* migratetype of the to-be-freed page */ 682 int mt; /* migratetype of the to-be-freed page */
683 683
684 page = list_entry(list->prev, struct page, lru); 684 page = list_entry(list->prev, struct page, lru);
685 /* must delete as __free_one_page list manipulates */ 685 /* must delete as __free_one_page list manipulates */
686 list_del(&page->lru); 686 list_del(&page->lru);
687 mt = get_freepage_migratetype(page); 687 mt = get_freepage_migratetype(page);
688 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 688 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
689 __free_one_page(page, zone, 0, mt); 689 __free_one_page(page, zone, 0, mt);
690 trace_mm_page_pcpu_drain(page, 0, mt); 690 trace_mm_page_pcpu_drain(page, 0, mt);
691 if (likely(!is_migrate_isolate_page(page))) { 691 if (likely(!is_migrate_isolate_page(page))) {
692 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 692 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
693 if (is_migrate_cma(mt)) 693 if (is_migrate_cma(mt))
694 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 694 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
695 } 695 }
696 } while (--to_free && --batch_free && !list_empty(list)); 696 } while (--to_free && --batch_free && !list_empty(list));
697 } 697 }
698 spin_unlock(&zone->lock); 698 spin_unlock(&zone->lock);
699 } 699 }
700 700
701 static void free_one_page(struct zone *zone, struct page *page, int order, 701 static void free_one_page(struct zone *zone, struct page *page, int order,
702 int migratetype) 702 int migratetype)
703 { 703 {
704 spin_lock(&zone->lock); 704 spin_lock(&zone->lock);
705 zone->pages_scanned = 0; 705 zone->pages_scanned = 0;
706 706
707 __free_one_page(page, zone, order, migratetype); 707 __free_one_page(page, zone, order, migratetype);
708 if (unlikely(!is_migrate_isolate(migratetype))) 708 if (unlikely(!is_migrate_isolate(migratetype)))
709 __mod_zone_freepage_state(zone, 1 << order, migratetype); 709 __mod_zone_freepage_state(zone, 1 << order, migratetype);
710 spin_unlock(&zone->lock); 710 spin_unlock(&zone->lock);
711 } 711 }
712 712
713 static bool free_pages_prepare(struct page *page, unsigned int order) 713 static bool free_pages_prepare(struct page *page, unsigned int order)
714 { 714 {
715 int i; 715 int i;
716 int bad = 0; 716 int bad = 0;
717 717
718 trace_mm_page_free(page, order); 718 trace_mm_page_free(page, order);
719 kmemcheck_free_shadow(page, order); 719 kmemcheck_free_shadow(page, order);
720 720
721 if (PageAnon(page)) 721 if (PageAnon(page))
722 page->mapping = NULL; 722 page->mapping = NULL;
723 for (i = 0; i < (1 << order); i++) 723 for (i = 0; i < (1 << order); i++)
724 bad += free_pages_check(page + i); 724 bad += free_pages_check(page + i);
725 if (bad) 725 if (bad)
726 return false; 726 return false;
727 727
728 if (!PageHighMem(page)) { 728 if (!PageHighMem(page)) {
729 debug_check_no_locks_freed(page_address(page), 729 debug_check_no_locks_freed(page_address(page),
730 PAGE_SIZE << order); 730 PAGE_SIZE << order);
731 debug_check_no_obj_freed(page_address(page), 731 debug_check_no_obj_freed(page_address(page),
732 PAGE_SIZE << order); 732 PAGE_SIZE << order);
733 } 733 }
734 arch_free_page(page, order); 734 arch_free_page(page, order);
735 kernel_map_pages(page, 1 << order, 0); 735 kernel_map_pages(page, 1 << order, 0);
736 736
737 return true; 737 return true;
738 } 738 }
739 739
740 static void __free_pages_ok(struct page *page, unsigned int order) 740 static void __free_pages_ok(struct page *page, unsigned int order)
741 { 741 {
742 unsigned long flags; 742 unsigned long flags;
743 int migratetype; 743 int migratetype;
744 744
745 if (!free_pages_prepare(page, order)) 745 if (!free_pages_prepare(page, order))
746 return; 746 return;
747 747
748 local_irq_save(flags); 748 local_irq_save(flags);
749 __count_vm_events(PGFREE, 1 << order); 749 __count_vm_events(PGFREE, 1 << order);
750 migratetype = get_pageblock_migratetype(page); 750 migratetype = get_pageblock_migratetype(page);
751 set_freepage_migratetype(page, migratetype); 751 set_freepage_migratetype(page, migratetype);
752 free_one_page(page_zone(page), page, order, migratetype); 752 free_one_page(page_zone(page), page, order, migratetype);
753 local_irq_restore(flags); 753 local_irq_restore(flags);
754 } 754 }
755 755
756 void __init __free_pages_bootmem(struct page *page, unsigned int order) 756 void __init __free_pages_bootmem(struct page *page, unsigned int order)
757 { 757 {
758 unsigned int nr_pages = 1 << order; 758 unsigned int nr_pages = 1 << order;
759 struct page *p = page; 759 struct page *p = page;
760 unsigned int loop; 760 unsigned int loop;
761 761
762 prefetchw(p); 762 prefetchw(p);
763 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 763 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
764 prefetchw(p + 1); 764 prefetchw(p + 1);
765 __ClearPageReserved(p); 765 __ClearPageReserved(p);
766 set_page_count(p, 0); 766 set_page_count(p, 0);
767 } 767 }
768 __ClearPageReserved(p); 768 __ClearPageReserved(p);
769 set_page_count(p, 0); 769 set_page_count(p, 0);
770 770
771 page_zone(page)->managed_pages += nr_pages; 771 page_zone(page)->managed_pages += nr_pages;
772 set_page_refcounted(page); 772 set_page_refcounted(page);
773 __free_pages(page, order); 773 __free_pages(page, order);
774 } 774 }
775 775
776 #ifdef CONFIG_CMA 776 #ifdef CONFIG_CMA
777 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 777 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
778 void __init init_cma_reserved_pageblock(struct page *page) 778 void __init init_cma_reserved_pageblock(struct page *page)
779 { 779 {
780 unsigned i = pageblock_nr_pages; 780 unsigned i = pageblock_nr_pages;
781 struct page *p = page; 781 struct page *p = page;
782 782
783 do { 783 do {
784 __ClearPageReserved(p); 784 __ClearPageReserved(p);
785 set_page_count(p, 0); 785 set_page_count(p, 0);
786 } while (++p, --i); 786 } while (++p, --i);
787 787
788 set_pageblock_migratetype(page, MIGRATE_CMA); 788 set_pageblock_migratetype(page, MIGRATE_CMA);
789 789
790 if (pageblock_order >= MAX_ORDER) { 790 if (pageblock_order >= MAX_ORDER) {
791 i = pageblock_nr_pages; 791 i = pageblock_nr_pages;
792 p = page; 792 p = page;
793 do { 793 do {
794 set_page_refcounted(p); 794 set_page_refcounted(p);
795 __free_pages(p, MAX_ORDER - 1); 795 __free_pages(p, MAX_ORDER - 1);
796 p += MAX_ORDER_NR_PAGES; 796 p += MAX_ORDER_NR_PAGES;
797 } while (i -= MAX_ORDER_NR_PAGES); 797 } while (i -= MAX_ORDER_NR_PAGES);
798 } else { 798 } else {
799 set_page_refcounted(page); 799 set_page_refcounted(page);
800 __free_pages(page, pageblock_order); 800 __free_pages(page, pageblock_order);
801 } 801 }
802 802
803 adjust_managed_page_count(page, pageblock_nr_pages); 803 adjust_managed_page_count(page, pageblock_nr_pages);
804 } 804 }
805 #endif 805 #endif
806 806
807 /* 807 /*
808 * The order of subdivision here is critical for the IO subsystem. 808 * The order of subdivision here is critical for the IO subsystem.
809 * Please do not alter this order without good reasons and regression 809 * Please do not alter this order without good reasons and regression
810 * testing. Specifically, as large blocks of memory are subdivided, 810 * testing. Specifically, as large blocks of memory are subdivided,
811 * the order in which smaller blocks are delivered depends on the order 811 * the order in which smaller blocks are delivered depends on the order
812 * they're subdivided in this function. This is the primary factor 812 * they're subdivided in this function. This is the primary factor
813 * influencing the order in which pages are delivered to the IO 813 * influencing the order in which pages are delivered to the IO
814 * subsystem according to empirical testing, and this is also justified 814 * subsystem according to empirical testing, and this is also justified
815 * by considering the behavior of a buddy system containing a single 815 * by considering the behavior of a buddy system containing a single
816 * large block of memory acted on by a series of small allocations. 816 * large block of memory acted on by a series of small allocations.
817 * This behavior is a critical factor in sglist merging's success. 817 * This behavior is a critical factor in sglist merging's success.
818 * 818 *
819 * -- nyc 819 * -- nyc
820 */ 820 */
821 static inline void expand(struct zone *zone, struct page *page, 821 static inline void expand(struct zone *zone, struct page *page,
822 int low, int high, struct free_area *area, 822 int low, int high, struct free_area *area,
823 int migratetype) 823 int migratetype)
824 { 824 {
825 unsigned long size = 1 << high; 825 unsigned long size = 1 << high;
826 826
827 while (high > low) { 827 while (high > low) {
828 area--; 828 area--;
829 high--; 829 high--;
830 size >>= 1; 830 size >>= 1;
831 VM_BUG_ON(bad_range(zone, &page[size])); 831 VM_BUG_ON(bad_range(zone, &page[size]));
832 832
833 #ifdef CONFIG_DEBUG_PAGEALLOC 833 #ifdef CONFIG_DEBUG_PAGEALLOC
834 if (high < debug_guardpage_minorder()) { 834 if (high < debug_guardpage_minorder()) {
835 /* 835 /*
836 * Mark as guard pages (or page), that will allow to 836 * Mark as guard pages (or page), that will allow to
837 * merge back to allocator when buddy will be freed. 837 * merge back to allocator when buddy will be freed.
838 * Corresponding page table entries will not be touched, 838 * Corresponding page table entries will not be touched,
839 * pages will stay not present in virtual address space 839 * pages will stay not present in virtual address space
840 */ 840 */
841 INIT_LIST_HEAD(&page[size].lru); 841 INIT_LIST_HEAD(&page[size].lru);
842 set_page_guard_flag(&page[size]); 842 set_page_guard_flag(&page[size]);
843 set_page_private(&page[size], high); 843 set_page_private(&page[size], high);
844 /* Guard pages are not available for any usage */ 844 /* Guard pages are not available for any usage */
845 __mod_zone_freepage_state(zone, -(1 << high), 845 __mod_zone_freepage_state(zone, -(1 << high),
846 migratetype); 846 migratetype);
847 continue; 847 continue;
848 } 848 }
849 #endif 849 #endif
850 list_add(&page[size].lru, &area->free_list[migratetype]); 850 list_add(&page[size].lru, &area->free_list[migratetype]);
851 area->nr_free++; 851 area->nr_free++;
852 set_page_order(&page[size], high); 852 set_page_order(&page[size], high);
853 } 853 }
854 } 854 }
855 855
856 /* 856 /*
857 * This page is about to be returned from the page allocator 857 * This page is about to be returned from the page allocator
858 */ 858 */
859 static inline int check_new_page(struct page *page) 859 static inline int check_new_page(struct page *page)
860 { 860 {
861 if (unlikely(page_mapcount(page) | 861 if (unlikely(page_mapcount(page) |
862 (page->mapping != NULL) | 862 (page->mapping != NULL) |
863 (atomic_read(&page->_count) != 0) | 863 (atomic_read(&page->_count) != 0) |
864 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 864 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
865 (mem_cgroup_bad_page_check(page)))) { 865 (mem_cgroup_bad_page_check(page)))) {
866 bad_page(page); 866 bad_page(page);
867 return 1; 867 return 1;
868 } 868 }
869 return 0; 869 return 0;
870 } 870 }
871 871
872 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 872 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
873 { 873 {
874 int i; 874 int i;
875 875
876 for (i = 0; i < (1 << order); i++) { 876 for (i = 0; i < (1 << order); i++) {
877 struct page *p = page + i; 877 struct page *p = page + i;
878 if (unlikely(check_new_page(p))) 878 if (unlikely(check_new_page(p)))
879 return 1; 879 return 1;
880 } 880 }
881 881
882 set_page_private(page, 0); 882 set_page_private(page, 0);
883 set_page_refcounted(page); 883 set_page_refcounted(page);
884 884
885 arch_alloc_page(page, order); 885 arch_alloc_page(page, order);
886 kernel_map_pages(page, 1 << order, 1); 886 kernel_map_pages(page, 1 << order, 1);
887 887
888 if (gfp_flags & __GFP_ZERO) 888 if (gfp_flags & __GFP_ZERO)
889 prep_zero_page(page, order, gfp_flags); 889 prep_zero_page(page, order, gfp_flags);
890 890
891 if (order && (gfp_flags & __GFP_COMP)) 891 if (order && (gfp_flags & __GFP_COMP))
892 prep_compound_page(page, order); 892 prep_compound_page(page, order);
893 893
894 return 0; 894 return 0;
895 } 895 }
896 896
897 /* 897 /*
898 * Go through the free lists for the given migratetype and remove 898 * Go through the free lists for the given migratetype and remove
899 * the smallest available page from the freelists 899 * the smallest available page from the freelists
900 */ 900 */
901 static inline 901 static inline
902 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 902 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
903 int migratetype) 903 int migratetype)
904 { 904 {
905 unsigned int current_order; 905 unsigned int current_order;
906 struct free_area *area; 906 struct free_area *area;
907 struct page *page; 907 struct page *page;
908 908
909 /* Find a page of the appropriate size in the preferred list */ 909 /* Find a page of the appropriate size in the preferred list */
910 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 910 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
911 area = &(zone->free_area[current_order]); 911 area = &(zone->free_area[current_order]);
912 if (list_empty(&area->free_list[migratetype])) 912 if (list_empty(&area->free_list[migratetype]))
913 continue; 913 continue;
914 914
915 page = list_entry(area->free_list[migratetype].next, 915 page = list_entry(area->free_list[migratetype].next,
916 struct page, lru); 916 struct page, lru);
917 list_del(&page->lru); 917 list_del(&page->lru);
918 rmv_page_order(page); 918 rmv_page_order(page);
919 area->nr_free--; 919 area->nr_free--;
920 expand(zone, page, order, current_order, area, migratetype); 920 expand(zone, page, order, current_order, area, migratetype);
921 set_freepage_migratetype(page, migratetype); 921 set_freepage_migratetype(page, migratetype);
922 return page; 922 return page;
923 } 923 }
924 924
925 return NULL; 925 return NULL;
926 } 926 }
927 927
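__rmqueue_smallest() scans orders upward from the request and lets expand() return the unused halves of a larger block to the free lists. A minimal userspace sketch of that search and of the split accounting, with nr_free[] as a hypothetical stand-in for zone->free_area[].nr_free and TOY_MAX_ORDER assuming the default MAX_ORDER of 11:

#include <stdio.h>

#define TOY_MAX_ORDER 11

static unsigned long nr_free[TOY_MAX_ORDER];

static int toy_rmqueue_smallest(unsigned int order)
{
    unsigned int current_order;

    for (current_order = order; current_order < TOY_MAX_ORDER; current_order++) {
        if (!nr_free[current_order])
            continue;

        /* Take one block of current_order... */
        nr_free[current_order]--;

        /* ...and, as expand() does, hand the unused halves back to the
         * free lists one order at a time. */
        while (current_order > order) {
            current_order--;
            nr_free[current_order]++;
        }
        return (int)order;      /* success: block of 2^order pages */
    }
    return -1;                  /* nothing large enough was free */
}

int main(void)
{
    nr_free[5] = 1;             /* only a single order-5 block is free */

    if (toy_rmqueue_smallest(2) == 2)
        printf("order-2 satisfied; leftovers: o2=%lu o3=%lu o4=%lu\n",
               nr_free[2], nr_free[3], nr_free[4]);   /* 1 1 1 */
    return 0;
}
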
928 928
929 /* 929 /*
930 * This array describes the order lists are fallen back to when 930 * This array describes the order lists are fallen back to when
931 * the free lists for the desirable migrate type are depleted 931 * the free lists for the desirable migrate type are depleted
932 */ 932 */
933 static int fallbacks[MIGRATE_TYPES][4] = { 933 static int fallbacks[MIGRATE_TYPES][4] = {
934 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 934 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
935 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 935 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
936 #ifdef CONFIG_CMA 936 #ifdef CONFIG_CMA
937 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 937 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
938 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 938 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
939 #else 939 #else
940 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 940 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
941 #endif 941 #endif
942 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 942 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
943 #ifdef CONFIG_MEMORY_ISOLATION 943 #ifdef CONFIG_MEMORY_ISOLATION
944 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 944 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
945 #endif 945 #endif
946 }; 946 };
947 947
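The table reads row by row: for a given starting migratetype, __rmqueue_fallback() tries the listed types in order and treats MIGRATE_RESERVE as a stop marker. A small sketch of that walk for an unmovable request, using invented TOY_* names and the non-CMA ordering above:

#include <stdio.h>

enum { TOY_UNMOVABLE, TOY_RECLAIMABLE, TOY_MOVABLE, TOY_RESERVE, TOY_TYPES };

static const int toy_fallbacks[TOY_TYPES][4] = {
    [TOY_UNMOVABLE]   = { TOY_RECLAIMABLE, TOY_MOVABLE,   TOY_RESERVE },
    [TOY_RECLAIMABLE] = { TOY_UNMOVABLE,   TOY_MOVABLE,   TOY_RESERVE },
    [TOY_MOVABLE]     = { TOY_RECLAIMABLE, TOY_UNMOVABLE, TOY_RESERVE },
    [TOY_RESERVE]     = { TOY_RESERVE },        /* never used */
};

static const char *name[] = { "UNMOVABLE", "RECLAIMABLE", "MOVABLE", "RESERVE" };

int main(void)
{
    int start = TOY_UNMOVABLE, i;

    /* RESERVE terminates the walk; it is handled separately later. */
    for (i = 0; toy_fallbacks[start][i] != TOY_RESERVE; i++)
        printf("fallback %d for %s: %s\n", i, name[start],
               name[toy_fallbacks[start][i]]);
    return 0;
}
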
948 /* 948 /*
949 * Move the free pages in a range to the free lists of the requested type. 949 * Move the free pages in a range to the free lists of the requested type.
950 * Note that start_page and end_page are not aligned on a pageblock 950 * Note that start_page and end_page are not aligned on a pageblock
951 * boundary. If alignment is required, use move_freepages_block() 951 * boundary. If alignment is required, use move_freepages_block()
952 */ 952 */
953 int move_freepages(struct zone *zone, 953 int move_freepages(struct zone *zone,
954 struct page *start_page, struct page *end_page, 954 struct page *start_page, struct page *end_page,
955 int migratetype) 955 int migratetype)
956 { 956 {
957 struct page *page; 957 struct page *page;
958 unsigned long order; 958 unsigned long order;
959 int pages_moved = 0; 959 int pages_moved = 0;
960 960
961 #ifndef CONFIG_HOLES_IN_ZONE 961 #ifndef CONFIG_HOLES_IN_ZONE
962 /* 962 /*
963 * page_zone is not safe to call in this context when 963 * page_zone is not safe to call in this context when
964 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 964 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
965 * anyway as we check zone boundaries in move_freepages_block(). 965 * anyway as we check zone boundaries in move_freepages_block().
966 * Remove at a later date when no bug reports exist related to 966 * Remove at a later date when no bug reports exist related to
967 * grouping pages by mobility 967 * grouping pages by mobility
968 */ 968 */
969 BUG_ON(page_zone(start_page) != page_zone(end_page)); 969 BUG_ON(page_zone(start_page) != page_zone(end_page));
970 #endif 970 #endif
971 971
972 for (page = start_page; page <= end_page;) { 972 for (page = start_page; page <= end_page;) {
973 /* Make sure we are not inadvertently changing nodes */ 973 /* Make sure we are not inadvertently changing nodes */
974 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 974 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
975 975
976 if (!pfn_valid_within(page_to_pfn(page))) { 976 if (!pfn_valid_within(page_to_pfn(page))) {
977 page++; 977 page++;
978 continue; 978 continue;
979 } 979 }
980 980
981 if (!PageBuddy(page)) { 981 if (!PageBuddy(page)) {
982 page++; 982 page++;
983 continue; 983 continue;
984 } 984 }
985 985
986 order = page_order(page); 986 order = page_order(page);
987 list_move(&page->lru, 987 list_move(&page->lru,
988 &zone->free_area[order].free_list[migratetype]); 988 &zone->free_area[order].free_list[migratetype]);
989 set_freepage_migratetype(page, migratetype); 989 set_freepage_migratetype(page, migratetype);
990 page += 1 << order; 990 page += 1 << order;
991 pages_moved += 1 << order; 991 pages_moved += 1 << order;
992 } 992 }
993 993
994 return pages_moved; 994 return pages_moved;
995 } 995 }
996 996
997 int move_freepages_block(struct zone *zone, struct page *page, 997 int move_freepages_block(struct zone *zone, struct page *page,
998 int migratetype) 998 int migratetype)
999 { 999 {
1000 unsigned long start_pfn, end_pfn; 1000 unsigned long start_pfn, end_pfn;
1001 struct page *start_page, *end_page; 1001 struct page *start_page, *end_page;
1002 1002
1003 start_pfn = page_to_pfn(page); 1003 start_pfn = page_to_pfn(page);
1004 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 1004 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
1005 start_page = pfn_to_page(start_pfn); 1005 start_page = pfn_to_page(start_pfn);
1006 end_page = start_page + pageblock_nr_pages - 1; 1006 end_page = start_page + pageblock_nr_pages - 1;
1007 end_pfn = start_pfn + pageblock_nr_pages - 1; 1007 end_pfn = start_pfn + pageblock_nr_pages - 1;
1008 1008
1009 /* Do not cross zone boundaries */ 1009 /* Do not cross zone boundaries */
1010 if (!zone_spans_pfn(zone, start_pfn)) 1010 if (!zone_spans_pfn(zone, start_pfn))
1011 start_page = page; 1011 start_page = page;
1012 if (!zone_spans_pfn(zone, end_pfn)) 1012 if (!zone_spans_pfn(zone, end_pfn))
1013 return 0; 1013 return 0;
1014 1014
1015 return move_freepages(zone, start_page, end_page, migratetype); 1015 return move_freepages(zone, start_page, end_page, migratetype);
1016 } 1016 }
1017 1017
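The start_pfn computation above is ordinary power-of-two masking: clearing the low bits of a pfn aligns it down to the start of its pageblock, which then spans pageblock_nr_pages frames. A worked example, assuming a pageblock of 512 pages (the 2MB-with-4KB-pages case):

#include <stdio.h>

#define TOY_PAGEBLOCK_NR_PAGES 512UL    /* assumed pageblock_nr_pages */

int main(void)
{
    unsigned long pfn = 1000000UL + 137;    /* arbitrary pfn inside a block */
    unsigned long start_pfn = pfn & ~(TOY_PAGEBLOCK_NR_PAGES - 1);
    unsigned long end_pfn = start_pfn + TOY_PAGEBLOCK_NR_PAGES - 1;

    printf("pfn %lu lives in pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
    return 0;
}
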
1018 static void change_pageblock_range(struct page *pageblock_page, 1018 static void change_pageblock_range(struct page *pageblock_page,
1019 int start_order, int migratetype) 1019 int start_order, int migratetype)
1020 { 1020 {
1021 int nr_pageblocks = 1 << (start_order - pageblock_order); 1021 int nr_pageblocks = 1 << (start_order - pageblock_order);
1022 1022
1023 while (nr_pageblocks--) { 1023 while (nr_pageblocks--) {
1024 set_pageblock_migratetype(pageblock_page, migratetype); 1024 set_pageblock_migratetype(pageblock_page, migratetype);
1025 pageblock_page += pageblock_nr_pages; 1025 pageblock_page += pageblock_nr_pages;
1026 } 1026 }
1027 } 1027 }
1028 1028
1029 /* 1029 /*
1030 * If breaking a large block of pages, move all free pages to the preferred 1030 * If breaking a large block of pages, move all free pages to the preferred
1031 * allocation list. If falling back for a reclaimable kernel allocation, be 1031 * allocation list. If falling back for a reclaimable kernel allocation, be
1032 * more aggressive about taking ownership of free pages. 1032 * more aggressive about taking ownership of free pages.
1033 * 1033 *
1034 * On the other hand, never change migration type of MIGRATE_CMA pageblocks 1034 * On the other hand, never change migration type of MIGRATE_CMA pageblocks
1035 * nor move CMA pages to different free lists. We don't want unmovable pages 1035 * nor move CMA pages to different free lists. We don't want unmovable pages
1036 * to be allocated from MIGRATE_CMA areas. 1036 * to be allocated from MIGRATE_CMA areas.
1037 * 1037 *
1038 * Returns the new migratetype of the pageblock (or the same old migratetype 1038 * Returns the new migratetype of the pageblock (or the same old migratetype
1039 * if it was unchanged). 1039 * if it was unchanged).
1040 */ 1040 */
1041 static int try_to_steal_freepages(struct zone *zone, struct page *page, 1041 static int try_to_steal_freepages(struct zone *zone, struct page *page,
1042 int start_type, int fallback_type) 1042 int start_type, int fallback_type)
1043 { 1043 {
1044 int current_order = page_order(page); 1044 int current_order = page_order(page);
1045 1045
1046 /* 1046 /*
1047 * When borrowing from MIGRATE_CMA, we need to release the excess 1047 * When borrowing from MIGRATE_CMA, we need to release the excess
1048 * buddy pages to CMA itself. We also ensure the freepage_migratetype 1048 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1049 * is set to CMA so it is returned to the correct freelist in case 1049 * is set to CMA so it is returned to the correct freelist in case
1050 * the page ends up being not actually allocated from the pcp lists. 1050 * the page ends up being not actually allocated from the pcp lists.
1051 */ 1051 */
1052 if (is_migrate_cma(fallback_type)) 1052 if (is_migrate_cma(fallback_type))
1053 return fallback_type; 1053 return fallback_type;
1054 1054
1055 /* Take ownership for orders >= pageblock_order */ 1055 /* Take ownership for orders >= pageblock_order */
1056 if (current_order >= pageblock_order) { 1056 if (current_order >= pageblock_order) {
1057 change_pageblock_range(page, current_order, start_type); 1057 change_pageblock_range(page, current_order, start_type);
1058 return start_type; 1058 return start_type;
1059 } 1059 }
1060 1060
1061 if (current_order >= pageblock_order / 2 || 1061 if (current_order >= pageblock_order / 2 ||
1062 start_type == MIGRATE_RECLAIMABLE || 1062 start_type == MIGRATE_RECLAIMABLE ||
1063 page_group_by_mobility_disabled) { 1063 page_group_by_mobility_disabled) {
1064 int pages; 1064 int pages;
1065 1065
1066 pages = move_freepages_block(zone, page, start_type); 1066 pages = move_freepages_block(zone, page, start_type);
1067 1067
1068 /* Claim the whole block if over half of it is free */ 1068 /* Claim the whole block if over half of it is free */
1069 if (pages >= (1 << (pageblock_order-1)) || 1069 if (pages >= (1 << (pageblock_order-1)) ||
1070 page_group_by_mobility_disabled) { 1070 page_group_by_mobility_disabled) {
1071 1071
1072 set_pageblock_migratetype(page, start_type); 1072 set_pageblock_migratetype(page, start_type);
1073 return start_type; 1073 return start_type;
1074 } 1074 }
1075 1075
1076 } 1076 }
1077 1077
1078 return fallback_type; 1078 return fallback_type;
1079 } 1079 }
1080 1080
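Stripped of the CMA and page_group_by_mobility_disabled handling, the decision above boils down to: whole-pageblock buddies always change type, and for buddies of at least half a pageblock (or any reclaimable request) the block is claimed once more than half of its pages have been moved. A compressed sketch of that rule, with the number of moved pages passed in rather than computed, a pageblock order of 9 assumed, and TOY_* names invented for the example:

#include <stdio.h>

#define TOY_PAGEBLOCK_ORDER 9

enum { TOY_UNMOVABLE, TOY_RECLAIMABLE, TOY_MOVABLE };

static int toy_steal_decision(int current_order, int start_type,
                              int fallback_type, int pages_moved)
{
    /* Buddies covering a whole pageblock (or more) always change type. */
    if (current_order >= TOY_PAGEBLOCK_ORDER)
        return start_type;

    if (current_order >= TOY_PAGEBLOCK_ORDER / 2 ||
        start_type == TOY_RECLAIMABLE) {
        /* Claim the block if over half of it was free and got moved. */
        if (pages_moved >= (1 << (TOY_PAGEBLOCK_ORDER - 1)))
            return start_type;
    }
    return fallback_type;
}

int main(void)
{
    /* order-5 buddy, 300 of 512 pages moved: block is claimed (prints 0). */
    printf("%d\n", toy_steal_decision(5, TOY_UNMOVABLE, TOY_MOVABLE, 300));
    /* order-3 buddy: too small to even attempt the move (prints 2). */
    printf("%d\n", toy_steal_decision(3, TOY_UNMOVABLE, TOY_MOVABLE, 0));
    return 0;
}
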
1081 /* Remove an element from the buddy allocator from the fallback list */ 1081 /* Remove an element from the buddy allocator from the fallback list */
1082 static inline struct page * 1082 static inline struct page *
1083 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 1083 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1084 { 1084 {
1085 struct free_area *area; 1085 struct free_area *area;
1086 int current_order; 1086 int current_order;
1087 struct page *page; 1087 struct page *page;
1088 int migratetype, new_type, i; 1088 int migratetype, new_type, i;
1089 1089
1090 /* Find the largest possible block of pages in the other list */ 1090 /* Find the largest possible block of pages in the other list */
1091 for (current_order = MAX_ORDER-1; current_order >= order; 1091 for (current_order = MAX_ORDER-1; current_order >= order;
1092 --current_order) { 1092 --current_order) {
1093 for (i = 0;; i++) { 1093 for (i = 0;; i++) {
1094 migratetype = fallbacks[start_migratetype][i]; 1094 migratetype = fallbacks[start_migratetype][i];
1095 1095
1096 /* MIGRATE_RESERVE handled later if necessary */ 1096 /* MIGRATE_RESERVE handled later if necessary */
1097 if (migratetype == MIGRATE_RESERVE) 1097 if (migratetype == MIGRATE_RESERVE)
1098 break; 1098 break;
1099 1099
1100 area = &(zone->free_area[current_order]); 1100 area = &(zone->free_area[current_order]);
1101 if (list_empty(&area->free_list[migratetype])) 1101 if (list_empty(&area->free_list[migratetype]))
1102 continue; 1102 continue;
1103 1103
1104 page = list_entry(area->free_list[migratetype].next, 1104 page = list_entry(area->free_list[migratetype].next,
1105 struct page, lru); 1105 struct page, lru);
1106 area->nr_free--; 1106 area->nr_free--;
1107 1107
1108 new_type = try_to_steal_freepages(zone, page, 1108 new_type = try_to_steal_freepages(zone, page,
1109 start_migratetype, 1109 start_migratetype,
1110 migratetype); 1110 migratetype);
1111 1111
1112 /* Remove the page from the freelists */ 1112 /* Remove the page from the freelists */
1113 list_del(&page->lru); 1113 list_del(&page->lru);
1114 rmv_page_order(page); 1114 rmv_page_order(page);
1115 1115
1116 expand(zone, page, order, current_order, area, 1116 expand(zone, page, order, current_order, area,
1117 new_type); 1117 new_type);
1118 /* The freepage_migratetype may differ from pageblock's 1118 /* The freepage_migratetype may differ from pageblock's
1119 * migratetype depending on the decisions in 1119 * migratetype depending on the decisions in
1120 * try_to_steal_freepages. This is OK as long as it does 1120 * try_to_steal_freepages. This is OK as long as it does
1121 * not differ for MIGRATE_CMA type. 1121 * not differ for MIGRATE_CMA type.
1122 */ 1122 */
1123 set_freepage_migratetype(page, new_type); 1123 set_freepage_migratetype(page, new_type);
1124 1124
1125 trace_mm_page_alloc_extfrag(page, order, current_order, 1125 trace_mm_page_alloc_extfrag(page, order, current_order,
1126 start_migratetype, migratetype, new_type); 1126 start_migratetype, migratetype, new_type);
1127 1127
1128 return page; 1128 return page;
1129 } 1129 }
1130 } 1130 }
1131 1131
1132 return NULL; 1132 return NULL;
1133 } 1133 }
1134 1134
1135 /* 1135 /*
1136 * Do the hard work of removing an element from the buddy allocator. 1136 * Do the hard work of removing an element from the buddy allocator.
1137 * Call me with the zone->lock already held. 1137 * Call me with the zone->lock already held.
1138 */ 1138 */
1139 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1139 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1140 int migratetype) 1140 int migratetype)
1141 { 1141 {
1142 struct page *page; 1142 struct page *page;
1143 1143
1144 retry_reserve: 1144 retry_reserve:
1145 page = __rmqueue_smallest(zone, order, migratetype); 1145 page = __rmqueue_smallest(zone, order, migratetype);
1146 1146
1147 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1147 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1148 page = __rmqueue_fallback(zone, order, migratetype); 1148 page = __rmqueue_fallback(zone, order, migratetype);
1149 1149
1150 /* 1150 /*
1151 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1151 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1152 * is used because __rmqueue_smallest is an inline function 1152 * is used because __rmqueue_smallest is an inline function
1153 * and we want just one call site 1153 * and we want just one call site
1154 */ 1154 */
1155 if (!page) { 1155 if (!page) {
1156 migratetype = MIGRATE_RESERVE; 1156 migratetype = MIGRATE_RESERVE;
1157 goto retry_reserve; 1157 goto retry_reserve;
1158 } 1158 }
1159 } 1159 }
1160 1160
1161 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1161 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1162 return page; 1162 return page;
1163 } 1163 }
1164 1164
1165 /* 1165 /*
1166 * Obtain a specified number of elements from the buddy allocator, all under 1166 * Obtain a specified number of elements from the buddy allocator, all under
1167 * a single hold of the lock, for efficiency. Add them to the supplied list. 1167 * a single hold of the lock, for efficiency. Add them to the supplied list.
1168 * Returns the number of new pages which were placed at *list. 1168 * Returns the number of new pages which were placed at *list.
1169 */ 1169 */
1170 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1170 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1171 unsigned long count, struct list_head *list, 1171 unsigned long count, struct list_head *list,
1172 int migratetype, int cold) 1172 int migratetype, int cold)
1173 { 1173 {
1174 int i; 1174 int i;
1175 1175
1176 spin_lock(&zone->lock); 1176 spin_lock(&zone->lock);
1177 for (i = 0; i < count; ++i) { 1177 for (i = 0; i < count; ++i) {
1178 struct page *page = __rmqueue(zone, order, migratetype); 1178 struct page *page = __rmqueue(zone, order, migratetype);
1179 if (unlikely(page == NULL)) 1179 if (unlikely(page == NULL))
1180 break; 1180 break;
1181 1181
1182 /* 1182 /*
1183 * Split buddy pages returned by expand() are received here 1183 * Split buddy pages returned by expand() are received here
1184 * in physical page order. The page is added to the caller's 1184 * in physical page order. The page is added to the caller's
1185 * list and the list head then moves forward. From the caller's 1185 * list and the list head then moves forward. From the caller's
1186 * perspective, the linked list is ordered by page number in 1186 * perspective, the linked list is ordered by page number in
1187 * some conditions. This is useful for IO devices that can 1187 * some conditions. This is useful for IO devices that can
1188 * merge IO requests if the physical pages are ordered 1188 * merge IO requests if the physical pages are ordered
1189 * properly. 1189 * properly.
1190 */ 1190 */
1191 if (likely(cold == 0)) 1191 if (likely(cold == 0))
1192 list_add(&page->lru, list); 1192 list_add(&page->lru, list);
1193 else 1193 else
1194 list_add_tail(&page->lru, list); 1194 list_add_tail(&page->lru, list);
1195 list = &page->lru; 1195 list = &page->lru;
1196 if (is_migrate_cma(get_freepage_migratetype(page))) 1196 if (is_migrate_cma(get_freepage_migratetype(page)))
1197 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1197 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1198 -(1 << order)); 1198 -(1 << order));
1199 } 1199 }
1200 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1200 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1201 spin_unlock(&zone->lock); 1201 spin_unlock(&zone->lock);
1202 return i; 1202 return i;
1203 } 1203 }
1204 1204
1205 #ifdef CONFIG_NUMA 1205 #ifdef CONFIG_NUMA
1206 /* 1206 /*
1207 * Called from the vmstat counter updater to drain pagesets of this 1207 * Called from the vmstat counter updater to drain pagesets of this
1208 * currently executing processor on remote nodes after they have 1208 * currently executing processor on remote nodes after they have
1209 * expired. 1209 * expired.
1210 * 1210 *
1211 * Note that this function must be called with the thread pinned to 1211 * Note that this function must be called with the thread pinned to
1212 * a single processor. 1212 * a single processor.
1213 */ 1213 */
1214 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1214 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1215 { 1215 {
1216 unsigned long flags; 1216 unsigned long flags;
1217 int to_drain; 1217 int to_drain;
1218 unsigned long batch; 1218 unsigned long batch;
1219 1219
1220 local_irq_save(flags); 1220 local_irq_save(flags);
1221 batch = ACCESS_ONCE(pcp->batch); 1221 batch = ACCESS_ONCE(pcp->batch);
1222 if (pcp->count >= batch) 1222 if (pcp->count >= batch)
1223 to_drain = batch; 1223 to_drain = batch;
1224 else 1224 else
1225 to_drain = pcp->count; 1225 to_drain = pcp->count;
1226 if (to_drain > 0) { 1226 if (to_drain > 0) {
1227 free_pcppages_bulk(zone, to_drain, pcp); 1227 free_pcppages_bulk(zone, to_drain, pcp);
1228 pcp->count -= to_drain; 1228 pcp->count -= to_drain;
1229 } 1229 }
1230 local_irq_restore(flags); 1230 local_irq_restore(flags);
1231 } 1231 }
1232 #endif 1232 #endif
1233 1233
1234 /* 1234 /*
1235 * Drain pages of the indicated processor. 1235 * Drain pages of the indicated processor.
1236 * 1236 *
1237 * The processor must either be the current processor and the 1237 * The processor must either be the current processor and the
1238 * thread pinned to the current processor or a processor that 1238 * thread pinned to the current processor or a processor that
1239 * is not online. 1239 * is not online.
1240 */ 1240 */
1241 static void drain_pages(unsigned int cpu) 1241 static void drain_pages(unsigned int cpu)
1242 { 1242 {
1243 unsigned long flags; 1243 unsigned long flags;
1244 struct zone *zone; 1244 struct zone *zone;
1245 1245
1246 for_each_populated_zone(zone) { 1246 for_each_populated_zone(zone) {
1247 struct per_cpu_pageset *pset; 1247 struct per_cpu_pageset *pset;
1248 struct per_cpu_pages *pcp; 1248 struct per_cpu_pages *pcp;
1249 1249
1250 local_irq_save(flags); 1250 local_irq_save(flags);
1251 pset = per_cpu_ptr(zone->pageset, cpu); 1251 pset = per_cpu_ptr(zone->pageset, cpu);
1252 1252
1253 pcp = &pset->pcp; 1253 pcp = &pset->pcp;
1254 if (pcp->count) { 1254 if (pcp->count) {
1255 free_pcppages_bulk(zone, pcp->count, pcp); 1255 free_pcppages_bulk(zone, pcp->count, pcp);
1256 pcp->count = 0; 1256 pcp->count = 0;
1257 } 1257 }
1258 local_irq_restore(flags); 1258 local_irq_restore(flags);
1259 } 1259 }
1260 } 1260 }
1261 1261
1262 /* 1262 /*
1263 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1263 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1264 */ 1264 */
1265 void drain_local_pages(void *arg) 1265 void drain_local_pages(void *arg)
1266 { 1266 {
1267 drain_pages(smp_processor_id()); 1267 drain_pages(smp_processor_id());
1268 } 1268 }
1269 1269
1270 /* 1270 /*
1271 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1271 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1272 * 1272 *
1273 * Note that this code is protected against sending an IPI to an offline 1273 * Note that this code is protected against sending an IPI to an offline
1274 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1274 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1275 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1275 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1276 * nothing keeps CPUs from showing up after we populated the cpumask and 1276 * nothing keeps CPUs from showing up after we populated the cpumask and
1277 * before the call to on_each_cpu_mask(). 1277 * before the call to on_each_cpu_mask().
1278 */ 1278 */
1279 void drain_all_pages(void) 1279 void drain_all_pages(void)
1280 { 1280 {
1281 int cpu; 1281 int cpu;
1282 struct per_cpu_pageset *pcp; 1282 struct per_cpu_pageset *pcp;
1283 struct zone *zone; 1283 struct zone *zone;
1284 1284
1285 /* 1285 /*
1286 * Allocate in the BSS so we won't require allocation in 1286 * Allocate in the BSS so we won't require allocation in
1287 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1287 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1288 */ 1288 */
1289 static cpumask_t cpus_with_pcps; 1289 static cpumask_t cpus_with_pcps;
1290 1290
1291 /* 1291 /*
1292 * We don't care about racing with CPU hotplug event 1292 * We don't care about racing with CPU hotplug event
1293 * as offline notification will cause the notified 1293 * as offline notification will cause the notified
1294 * cpu to drain that CPU pcps and on_each_cpu_mask 1294 * cpu to drain that CPU pcps and on_each_cpu_mask
1295 * disables preemption as part of its processing 1295 * disables preemption as part of its processing
1296 */ 1296 */
1297 for_each_online_cpu(cpu) { 1297 for_each_online_cpu(cpu) {
1298 bool has_pcps = false; 1298 bool has_pcps = false;
1299 for_each_populated_zone(zone) { 1299 for_each_populated_zone(zone) {
1300 pcp = per_cpu_ptr(zone->pageset, cpu); 1300 pcp = per_cpu_ptr(zone->pageset, cpu);
1301 if (pcp->pcp.count) { 1301 if (pcp->pcp.count) {
1302 has_pcps = true; 1302 has_pcps = true;
1303 break; 1303 break;
1304 } 1304 }
1305 } 1305 }
1306 if (has_pcps) 1306 if (has_pcps)
1307 cpumask_set_cpu(cpu, &cpus_with_pcps); 1307 cpumask_set_cpu(cpu, &cpus_with_pcps);
1308 else 1308 else
1309 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1309 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1310 } 1310 }
1311 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1311 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1312 } 1312 }
1313 1313
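The point of the cpumask built above is that only CPUs which actually hold per-cpu pages receive the drain IPI; CPUs with empty pcp lists are never interrupted. A toy rendering of that selection with made-up per-cpu counts:

#include <stdio.h>

#define TOY_NR_CPUS 4

int main(void)
{
    int pcp_count[TOY_NR_CPUS] = { 0, 12, 0, 3 };   /* invented counts */
    unsigned long cpus_with_pcps = 0;
    int cpu;

    for (cpu = 0; cpu < TOY_NR_CPUS; cpu++)
        if (pcp_count[cpu])
            cpus_with_pcps |= 1UL << cpu;

    printf("would IPI cpumask 0x%lx\n", cpus_with_pcps);    /* 0xa */
    return 0;
}
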
1314 #ifdef CONFIG_HIBERNATION 1314 #ifdef CONFIG_HIBERNATION
1315 1315
1316 void mark_free_pages(struct zone *zone) 1316 void mark_free_pages(struct zone *zone)
1317 { 1317 {
1318 unsigned long pfn, max_zone_pfn; 1318 unsigned long pfn, max_zone_pfn;
1319 unsigned long flags; 1319 unsigned long flags;
1320 int order, t; 1320 int order, t;
1321 struct list_head *curr; 1321 struct list_head *curr;
1322 1322
1323 if (zone_is_empty(zone)) 1323 if (zone_is_empty(zone))
1324 return; 1324 return;
1325 1325
1326 spin_lock_irqsave(&zone->lock, flags); 1326 spin_lock_irqsave(&zone->lock, flags);
1327 1327
1328 max_zone_pfn = zone_end_pfn(zone); 1328 max_zone_pfn = zone_end_pfn(zone);
1329 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1329 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1330 if (pfn_valid(pfn)) { 1330 if (pfn_valid(pfn)) {
1331 struct page *page = pfn_to_page(pfn); 1331 struct page *page = pfn_to_page(pfn);
1332 1332
1333 if (!swsusp_page_is_forbidden(page)) 1333 if (!swsusp_page_is_forbidden(page))
1334 swsusp_unset_page_free(page); 1334 swsusp_unset_page_free(page);
1335 } 1335 }
1336 1336
1337 for_each_migratetype_order(order, t) { 1337 for_each_migratetype_order(order, t) {
1338 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1338 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1339 unsigned long i; 1339 unsigned long i;
1340 1340
1341 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1341 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1342 for (i = 0; i < (1UL << order); i++) 1342 for (i = 0; i < (1UL << order); i++)
1343 swsusp_set_page_free(pfn_to_page(pfn + i)); 1343 swsusp_set_page_free(pfn_to_page(pfn + i));
1344 } 1344 }
1345 } 1345 }
1346 spin_unlock_irqrestore(&zone->lock, flags); 1346 spin_unlock_irqrestore(&zone->lock, flags);
1347 } 1347 }
1348 #endif /* CONFIG_PM */ 1348 #endif /* CONFIG_PM */
1349 1349
1350 /* 1350 /*
1351 * Free a 0-order page 1351 * Free a 0-order page
1352 * cold == 1 ? free a cold page : free a hot page 1352 * cold == 1 ? free a cold page : free a hot page
1353 */ 1353 */
1354 void free_hot_cold_page(struct page *page, int cold) 1354 void free_hot_cold_page(struct page *page, int cold)
1355 { 1355 {
1356 struct zone *zone = page_zone(page); 1356 struct zone *zone = page_zone(page);
1357 struct per_cpu_pages *pcp; 1357 struct per_cpu_pages *pcp;
1358 unsigned long flags; 1358 unsigned long flags;
1359 int migratetype; 1359 int migratetype;
1360 1360
1361 if (!free_pages_prepare(page, 0)) 1361 if (!free_pages_prepare(page, 0))
1362 return; 1362 return;
1363 1363
1364 migratetype = get_pageblock_migratetype(page); 1364 migratetype = get_pageblock_migratetype(page);
1365 set_freepage_migratetype(page, migratetype); 1365 set_freepage_migratetype(page, migratetype);
1366 local_irq_save(flags); 1366 local_irq_save(flags);
1367 __count_vm_event(PGFREE); 1367 __count_vm_event(PGFREE);
1368 1368
1369 /* 1369 /*
1370 * We only track unmovable, reclaimable and movable on pcp lists. 1370 * We only track unmovable, reclaimable and movable on pcp lists.
1371 * Free ISOLATE pages back to the allocator because they are being 1371 * Free ISOLATE pages back to the allocator because they are being
1372 * offlined but treat RESERVE as movable pages so we can get those 1372 * offlined but treat RESERVE as movable pages so we can get those
1373 * areas back if necessary. Otherwise, we may have to free 1373 * areas back if necessary. Otherwise, we may have to free
1374 * excessively into the page allocator 1374 * excessively into the page allocator
1375 */ 1375 */
1376 if (migratetype >= MIGRATE_PCPTYPES) { 1376 if (migratetype >= MIGRATE_PCPTYPES) {
1377 if (unlikely(is_migrate_isolate(migratetype))) { 1377 if (unlikely(is_migrate_isolate(migratetype))) {
1378 free_one_page(zone, page, 0, migratetype); 1378 free_one_page(zone, page, 0, migratetype);
1379 goto out; 1379 goto out;
1380 } 1380 }
1381 migratetype = MIGRATE_MOVABLE; 1381 migratetype = MIGRATE_MOVABLE;
1382 } 1382 }
1383 1383
1384 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1384 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1385 if (cold) 1385 if (cold)
1386 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1386 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1387 else 1387 else
1388 list_add(&page->lru, &pcp->lists[migratetype]); 1388 list_add(&page->lru, &pcp->lists[migratetype]);
1389 pcp->count++; 1389 pcp->count++;
1390 if (pcp->count >= pcp->high) { 1390 if (pcp->count >= pcp->high) {
1391 unsigned long batch = ACCESS_ONCE(pcp->batch); 1391 unsigned long batch = ACCESS_ONCE(pcp->batch);
1392 free_pcppages_bulk(zone, batch, pcp); 1392 free_pcppages_bulk(zone, batch, pcp);
1393 pcp->count -= batch; 1393 pcp->count -= batch;
1394 } 1394 }
1395 1395
1396 out: 1396 out:
1397 local_irq_restore(flags); 1397 local_irq_restore(flags);
1398 } 1398 }
1399 1399
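The tail of free_hot_cold_page() is simple hysteresis: freed order-0 pages pile up on the per-cpu list until count reaches pcp->high, at which point one batch is handed back to the buddy lists. A userspace sketch with invented high/batch values:

#include <stdio.h>

struct toy_pcp {
    int count;
    int high;
    int batch;
};

static void toy_free_to_pcp(struct toy_pcp *pcp)
{
    pcp->count++;
    if (pcp->count >= pcp->high) {
        /* free_pcppages_bulk() would hand 'batch' pages back to the
         * buddy allocator here. */
        pcp->count -= pcp->batch;
    }
}

int main(void)
{
    struct toy_pcp pcp = { .count = 0, .high = 6, .batch = 3 };
    int i;

    for (i = 0; i < 10; i++) {
        toy_free_to_pcp(&pcp);
        printf("after free %d: count=%d\n", i + 1, pcp.count);
    }
    return 0;
}
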
1400 /* 1400 /*
1401 * Free a list of 0-order pages 1401 * Free a list of 0-order pages
1402 */ 1402 */
1403 void free_hot_cold_page_list(struct list_head *list, int cold) 1403 void free_hot_cold_page_list(struct list_head *list, int cold)
1404 { 1404 {
1405 struct page *page, *next; 1405 struct page *page, *next;
1406 1406
1407 list_for_each_entry_safe(page, next, list, lru) { 1407 list_for_each_entry_safe(page, next, list, lru) {
1408 trace_mm_page_free_batched(page, cold); 1408 trace_mm_page_free_batched(page, cold);
1409 free_hot_cold_page(page, cold); 1409 free_hot_cold_page(page, cold);
1410 } 1410 }
1411 } 1411 }
1412 1412
1413 /* 1413 /*
1414 * split_page takes a non-compound higher-order page, and splits it into 1414 * split_page takes a non-compound higher-order page, and splits it into
1415 * n (1<<order) sub-pages: page[0..n-1] 1415 * n (1<<order) sub-pages: page[0..n-1]
1416 * Each sub-page must be freed individually. 1416 * Each sub-page must be freed individually.
1417 * 1417 *
1418 * Note: this is probably too low level an operation for use in drivers. 1418 * Note: this is probably too low level an operation for use in drivers.
1419 * Please consult with lkml before using this in your driver. 1419 * Please consult with lkml before using this in your driver.
1420 */ 1420 */
1421 void split_page(struct page *page, unsigned int order) 1421 void split_page(struct page *page, unsigned int order)
1422 { 1422 {
1423 int i; 1423 int i;
1424 1424
1425 VM_BUG_ON(PageCompound(page)); 1425 VM_BUG_ON(PageCompound(page));
1426 VM_BUG_ON(!page_count(page)); 1426 VM_BUG_ON(!page_count(page));
1427 1427
1428 #ifdef CONFIG_KMEMCHECK 1428 #ifdef CONFIG_KMEMCHECK
1429 /* 1429 /*
1430 * Split shadow pages too, because free(page[0]) would 1430 * Split shadow pages too, because free(page[0]) would
1431 * otherwise free the whole shadow. 1431 * otherwise free the whole shadow.
1432 */ 1432 */
1433 if (kmemcheck_page_is_tracked(page)) 1433 if (kmemcheck_page_is_tracked(page))
1434 split_page(virt_to_page(page[0].shadow), order); 1434 split_page(virt_to_page(page[0].shadow), order);
1435 #endif 1435 #endif
1436 1436
1437 for (i = 1; i < (1 << order); i++) 1437 for (i = 1; i < (1 << order); i++)
1438 set_page_refcounted(page + i); 1438 set_page_refcounted(page + i);
1439 } 1439 }
1440 EXPORT_SYMBOL_GPL(split_page); 1440 EXPORT_SYMBOL_GPL(split_page);
1441 1441
1442 static int __isolate_free_page(struct page *page, unsigned int order) 1442 static int __isolate_free_page(struct page *page, unsigned int order)
1443 { 1443 {
1444 unsigned long watermark; 1444 unsigned long watermark;
1445 struct zone *zone; 1445 struct zone *zone;
1446 int mt; 1446 int mt;
1447 1447
1448 BUG_ON(!PageBuddy(page)); 1448 BUG_ON(!PageBuddy(page));
1449 1449
1450 zone = page_zone(page); 1450 zone = page_zone(page);
1451 mt = get_pageblock_migratetype(page); 1451 mt = get_pageblock_migratetype(page);
1452 1452
1453 if (!is_migrate_isolate(mt)) { 1453 if (!is_migrate_isolate(mt)) {
1454 /* Obey watermarks as if the page was being allocated */ 1454 /* Obey watermarks as if the page was being allocated */
1455 watermark = low_wmark_pages(zone) + (1 << order); 1455 watermark = low_wmark_pages(zone) + (1 << order);
1456 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1456 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1457 return 0; 1457 return 0;
1458 1458
1459 __mod_zone_freepage_state(zone, -(1UL << order), mt); 1459 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1460 } 1460 }
1461 1461
1462 /* Remove page from free list */ 1462 /* Remove page from free list */
1463 list_del(&page->lru); 1463 list_del(&page->lru);
1464 zone->free_area[order].nr_free--; 1464 zone->free_area[order].nr_free--;
1465 rmv_page_order(page); 1465 rmv_page_order(page);
1466 1466
1467 /* Set the pageblock if the isolated page is at least a pageblock */ 1467 /* Set the pageblock if the isolated page is at least a pageblock */
1468 if (order >= pageblock_order - 1) { 1468 if (order >= pageblock_order - 1) {
1469 struct page *endpage = page + (1 << order) - 1; 1469 struct page *endpage = page + (1 << order) - 1;
1470 for (; page < endpage; page += pageblock_nr_pages) { 1470 for (; page < endpage; page += pageblock_nr_pages) {
1471 int mt = get_pageblock_migratetype(page); 1471 int mt = get_pageblock_migratetype(page);
1472 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) 1472 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
1473 set_pageblock_migratetype(page, 1473 set_pageblock_migratetype(page,
1474 MIGRATE_MOVABLE); 1474 MIGRATE_MOVABLE);
1475 } 1475 }
1476 } 1476 }
1477 1477
1478 return 1UL << order; 1478 return 1UL << order;
1479 } 1479 }
1480 1480
1481 /* 1481 /*
1482 * Similar to split_page except the page is already free. As this is only 1482 * Similar to split_page except the page is already free. As this is only
1483 * being used for migration, the migratetype of the block also changes. 1483 * being used for migration, the migratetype of the block also changes.
1484 * As this is called with interrupts disabled, the caller is responsible 1484 * As this is called with interrupts disabled, the caller is responsible
1485 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1485 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1486 * are enabled. 1486 * are enabled.
1487 * 1487 *
1488 * Note: this is probably too low level an operation for use in drivers. 1488 * Note: this is probably too low level an operation for use in drivers.
1489 * Please consult with lkml before using this in your driver. 1489 * Please consult with lkml before using this in your driver.
1490 */ 1490 */
1491 int split_free_page(struct page *page) 1491 int split_free_page(struct page *page)
1492 { 1492 {
1493 unsigned int order; 1493 unsigned int order;
1494 int nr_pages; 1494 int nr_pages;
1495 1495
1496 order = page_order(page); 1496 order = page_order(page);
1497 1497
1498 nr_pages = __isolate_free_page(page, order); 1498 nr_pages = __isolate_free_page(page, order);
1499 if (!nr_pages) 1499 if (!nr_pages)
1500 return 0; 1500 return 0;
1501 1501
1502 /* Split into individual pages */ 1502 /* Split into individual pages */
1503 set_page_refcounted(page); 1503 set_page_refcounted(page);
1504 split_page(page, order); 1504 split_page(page, order);
1505 return nr_pages; 1505 return nr_pages;
1506 } 1506 }
1507 1507
1508 /* 1508 /*
1509 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1509 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1510 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1510 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1511 * or two. 1511 * or two.
1512 */ 1512 */
1513 static inline 1513 static inline
1514 struct page *buffered_rmqueue(struct zone *preferred_zone, 1514 struct page *buffered_rmqueue(struct zone *preferred_zone,
1515 struct zone *zone, int order, gfp_t gfp_flags, 1515 struct zone *zone, int order, gfp_t gfp_flags,
1516 int migratetype) 1516 int migratetype)
1517 { 1517 {
1518 unsigned long flags; 1518 unsigned long flags;
1519 struct page *page; 1519 struct page *page;
1520 int cold = !!(gfp_flags & __GFP_COLD); 1520 int cold = !!(gfp_flags & __GFP_COLD);
1521 1521
1522 again: 1522 again:
1523 if (likely(order == 0)) { 1523 if (likely(order == 0)) {
1524 struct per_cpu_pages *pcp; 1524 struct per_cpu_pages *pcp;
1525 struct list_head *list; 1525 struct list_head *list;
1526 1526
1527 local_irq_save(flags); 1527 local_irq_save(flags);
1528 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1528 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1529 list = &pcp->lists[migratetype]; 1529 list = &pcp->lists[migratetype];
1530 if (list_empty(list)) { 1530 if (list_empty(list)) {
1531 pcp->count += rmqueue_bulk(zone, 0, 1531 pcp->count += rmqueue_bulk(zone, 0,
1532 pcp->batch, list, 1532 pcp->batch, list,
1533 migratetype, cold); 1533 migratetype, cold);
1534 if (unlikely(list_empty(list))) 1534 if (unlikely(list_empty(list)))
1535 goto failed; 1535 goto failed;
1536 } 1536 }
1537 1537
1538 if (cold) 1538 if (cold)
1539 page = list_entry(list->prev, struct page, lru); 1539 page = list_entry(list->prev, struct page, lru);
1540 else 1540 else
1541 page = list_entry(list->next, struct page, lru); 1541 page = list_entry(list->next, struct page, lru);
1542 1542
1543 list_del(&page->lru); 1543 list_del(&page->lru);
1544 pcp->count--; 1544 pcp->count--;
1545 } else { 1545 } else {
1546 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1546 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1547 /* 1547 /*
1548 * __GFP_NOFAIL is not to be used in new code. 1548 * __GFP_NOFAIL is not to be used in new code.
1549 * 1549 *
1550 * All __GFP_NOFAIL callers should be fixed so that they 1550 * All __GFP_NOFAIL callers should be fixed so that they
1551 * properly detect and handle allocation failures. 1551 * properly detect and handle allocation failures.
1552 * 1552 *
1553 * We most definitely don't want callers attempting to 1553 * We most definitely don't want callers attempting to
1554 * allocate greater than order-1 page units with 1554 * allocate greater than order-1 page units with
1555 * __GFP_NOFAIL. 1555 * __GFP_NOFAIL.
1556 */ 1556 */
1557 WARN_ON_ONCE(order > 1); 1557 WARN_ON_ONCE(order > 1);
1558 } 1558 }
1559 spin_lock_irqsave(&zone->lock, flags); 1559 spin_lock_irqsave(&zone->lock, flags);
1560 page = __rmqueue(zone, order, migratetype); 1560 page = __rmqueue(zone, order, migratetype);
1561 spin_unlock(&zone->lock); 1561 spin_unlock(&zone->lock);
1562 if (!page) 1562 if (!page)
1563 goto failed; 1563 goto failed;
1564 __mod_zone_freepage_state(zone, -(1 << order), 1564 __mod_zone_freepage_state(zone, -(1 << order),
1565 get_freepage_migratetype(page)); 1565 get_freepage_migratetype(page));
1566 } 1566 }
1567 1567
1568 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1568 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1569 1569
1570 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1570 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1571 zone_statistics(preferred_zone, zone, gfp_flags); 1571 zone_statistics(preferred_zone, zone, gfp_flags);
1572 local_irq_restore(flags); 1572 local_irq_restore(flags);
1573 1573
1574 VM_BUG_ON(bad_range(zone, page)); 1574 VM_BUG_ON(bad_range(zone, page));
1575 if (prep_new_page(page, order, gfp_flags)) 1575 if (prep_new_page(page, order, gfp_flags))
1576 goto again; 1576 goto again;
1577 return page; 1577 return page;
1578 1578
1579 failed: 1579 failed:
1580 local_irq_restore(flags); 1580 local_irq_restore(flags);
1581 return NULL; 1581 return NULL;
1582 } 1582 }
1583 1583
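For order-0 the function above never takes zone->lock unless the per-cpu list is empty, in which case it refills a whole batch via rmqueue_bulk() and then serves hot requests from the head and cold (__GFP_COLD) requests from the tail. A toy model of that fast path, using an array of fake page ids in place of the pcp list:

#include <stdio.h>

#define TOY_BATCH 4

static int toy_list[64];
static int toy_count;
static int toy_next_page_id = 100;

static void toy_rmqueue_bulk(void)
{
    int i;

    for (i = 0; i < TOY_BATCH; i++)
        toy_list[toy_count++] = toy_next_page_id++;
}

static int toy_buffered_rmqueue(int cold)
{
    int page;

    if (toy_count == 0)
        toy_rmqueue_bulk();     /* refill from the "buddy allocator" */

    if (cold) {
        page = toy_list[toy_count - 1];     /* take the list tail */
        toy_count--;
    } else {
        int i;

        page = toy_list[0];                 /* take the list head */
        for (i = 1; i < toy_count; i++)
            toy_list[i - 1] = toy_list[i];
        toy_count--;
    }
    return page;
}

int main(void)
{
    printf("hot:  page %d\n", toy_buffered_rmqueue(0));  /* 100 */
    printf("cold: page %d\n", toy_buffered_rmqueue(1));  /* 103 */
    return 0;
}
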
1584 #ifdef CONFIG_FAIL_PAGE_ALLOC 1584 #ifdef CONFIG_FAIL_PAGE_ALLOC
1585 1585
1586 static struct { 1586 static struct {
1587 struct fault_attr attr; 1587 struct fault_attr attr;
1588 1588
1589 u32 ignore_gfp_highmem; 1589 u32 ignore_gfp_highmem;
1590 u32 ignore_gfp_wait; 1590 u32 ignore_gfp_wait;
1591 u32 min_order; 1591 u32 min_order;
1592 } fail_page_alloc = { 1592 } fail_page_alloc = {
1593 .attr = FAULT_ATTR_INITIALIZER, 1593 .attr = FAULT_ATTR_INITIALIZER,
1594 .ignore_gfp_wait = 1, 1594 .ignore_gfp_wait = 1,
1595 .ignore_gfp_highmem = 1, 1595 .ignore_gfp_highmem = 1,
1596 .min_order = 1, 1596 .min_order = 1,
1597 }; 1597 };
1598 1598
1599 static int __init setup_fail_page_alloc(char *str) 1599 static int __init setup_fail_page_alloc(char *str)
1600 { 1600 {
1601 return setup_fault_attr(&fail_page_alloc.attr, str); 1601 return setup_fault_attr(&fail_page_alloc.attr, str);
1602 } 1602 }
1603 __setup("fail_page_alloc=", setup_fail_page_alloc); 1603 __setup("fail_page_alloc=", setup_fail_page_alloc);
1604 1604
1605 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1605 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1606 { 1606 {
1607 if (order < fail_page_alloc.min_order) 1607 if (order < fail_page_alloc.min_order)
1608 return false; 1608 return false;
1609 if (gfp_mask & __GFP_NOFAIL) 1609 if (gfp_mask & __GFP_NOFAIL)
1610 return false; 1610 return false;
1611 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1611 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1612 return false; 1612 return false;
1613 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1613 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1614 return false; 1614 return false;
1615 1615
1616 return should_fail(&fail_page_alloc.attr, 1 << order); 1616 return should_fail(&fail_page_alloc.attr, 1 << order);
1617 } 1617 }
1618 1618
1619 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1619 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1620 1620
1621 static int __init fail_page_alloc_debugfs(void) 1621 static int __init fail_page_alloc_debugfs(void)
1622 { 1622 {
1623 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1623 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1624 struct dentry *dir; 1624 struct dentry *dir;
1625 1625
1626 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1626 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1627 &fail_page_alloc.attr); 1627 &fail_page_alloc.attr);
1628 if (IS_ERR(dir)) 1628 if (IS_ERR(dir))
1629 return PTR_ERR(dir); 1629 return PTR_ERR(dir);
1630 1630
1631 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1631 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1632 &fail_page_alloc.ignore_gfp_wait)) 1632 &fail_page_alloc.ignore_gfp_wait))
1633 goto fail; 1633 goto fail;
1634 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1634 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1635 &fail_page_alloc.ignore_gfp_highmem)) 1635 &fail_page_alloc.ignore_gfp_highmem))
1636 goto fail; 1636 goto fail;
1637 if (!debugfs_create_u32("min-order", mode, dir, 1637 if (!debugfs_create_u32("min-order", mode, dir,
1638 &fail_page_alloc.min_order)) 1638 &fail_page_alloc.min_order))
1639 goto fail; 1639 goto fail;
1640 1640
1641 return 0; 1641 return 0;
1642 fail: 1642 fail:
1643 debugfs_remove_recursive(dir); 1643 debugfs_remove_recursive(dir);
1644 1644
1645 return -ENOMEM; 1645 return -ENOMEM;
1646 } 1646 }
1647 1647
1648 late_initcall(fail_page_alloc_debugfs); 1648 late_initcall(fail_page_alloc_debugfs);
1649 1649
1650 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1650 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1651 1651
1652 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1652 #else /* CONFIG_FAIL_PAGE_ALLOC */
1653 1653
1654 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1654 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1655 { 1655 {
1656 return false; 1656 return false;
1657 } 1657 }
1658 1658
1659 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1659 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1660 1660
1661 /* 1661 /*
1662 * Return true if free pages are above 'mark'. This takes into account the order 1662 * Return true if free pages are above 'mark'. This takes into account the order
1663 * of the allocation. 1663 * of the allocation.
1664 */ 1664 */
1665 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1665 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1666 int classzone_idx, int alloc_flags, long free_pages) 1666 int classzone_idx, int alloc_flags, long free_pages)
1667 { 1667 {
1668 /* free_pages may go negative - that's OK */ 1668 /* free_pages may go negative - that's OK */
1669 long min = mark; 1669 long min = mark;
1670 long lowmem_reserve = z->lowmem_reserve[classzone_idx]; 1670 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1671 int o; 1671 int o;
1672 long free_cma = 0; 1672 long free_cma = 0;
1673 1673
1674 free_pages -= (1 << order) - 1; 1674 free_pages -= (1 << order) - 1;
1675 if (alloc_flags & ALLOC_HIGH) 1675 if (alloc_flags & ALLOC_HIGH)
1676 min -= min / 2; 1676 min -= min / 2;
1677 if (alloc_flags & ALLOC_HARDER) 1677 if (alloc_flags & ALLOC_HARDER)
1678 min -= min / 4; 1678 min -= min / 4;
1679 #ifdef CONFIG_CMA 1679 #ifdef CONFIG_CMA
1680 /* If allocation can't use CMA areas don't use free CMA pages */ 1680 /* If allocation can't use CMA areas don't use free CMA pages */
1681 if (!(alloc_flags & ALLOC_CMA)) 1681 if (!(alloc_flags & ALLOC_CMA))
1682 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); 1682 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
1683 #endif 1683 #endif
1684 1684
1685 if (free_pages - free_cma <= min + lowmem_reserve) 1685 if (free_pages - free_cma <= min + lowmem_reserve)
1686 return false; 1686 return false;
1687 for (o = 0; o < order; o++) { 1687 for (o = 0; o < order; o++) {
1688 /* At the next order, this order's pages become unavailable */ 1688 /* At the next order, this order's pages become unavailable */
1689 free_pages -= z->free_area[o].nr_free << o; 1689 free_pages -= z->free_area[o].nr_free << o;
1690 1690
1691 /* Require fewer higher order pages to be free */ 1691 /* Require fewer higher order pages to be free */
1692 min >>= 1; 1692 min >>= 1;
1693 1693
1694 if (free_pages <= min) 1694 if (free_pages <= min)
1695 return false; 1695 return false;
1696 } 1696 }
1697 return true; 1697 return true;
1698 } 1698 }
1699 1699
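Two things happen in the check above: ALLOC_HIGH and ALLOC_HARDER shave the required minimum down, and the per-order loop discounts free pages that are too small to serve the request while halving the requirement at each step. A userspace rendering of that arithmetic (free_at_order[] is a hypothetical snapshot of free_area[].nr_free, and the CMA adjustment is left out):

#include <stdio.h>
#include <stdbool.h>

#define TOY_ALLOC_HIGH   0x1
#define TOY_ALLOC_HARDER 0x2

static bool toy_watermark_ok(int order, long mark, long lowmem_reserve,
                             int alloc_flags, long free_pages,
                             const long *free_at_order)
{
    long min = mark;
    int o;

    free_pages -= (1 << order) - 1;
    if (alloc_flags & TOY_ALLOC_HIGH)
        min -= min / 2;
    if (alloc_flags & TOY_ALLOC_HARDER)
        min -= min / 4;

    if (free_pages <= min + lowmem_reserve)
        return false;

    for (o = 0; o < order; o++) {
        /* Pages of this order cannot serve the request: discount them
         * and require fewer, but larger, free pages. */
        free_pages -= free_at_order[o] << o;
        min >>= 1;
        if (free_pages <= min)
            return false;
    }
    return true;
}

int main(void)
{
    /* Mostly order-0 pages free: an order-3 request fails even though
     * the raw free count clears the mark. */
    long free_at_order[4] = { 900, 10, 2, 0 };
    long free_pages = 900 + 10 * 2 + 2 * 4;     /* 928 */

    printf("order-0: %d\n",
           toy_watermark_ok(0, 128, 0, 0, free_pages, free_at_order)); /* 1 */
    printf("order-3: %d\n",
           toy_watermark_ok(3, 128, 0, 0, free_pages, free_at_order)); /* 0 */
    return 0;
}
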
1700 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1700 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1701 int classzone_idx, int alloc_flags) 1701 int classzone_idx, int alloc_flags)
1702 { 1702 {
1703 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1703 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1704 zone_page_state(z, NR_FREE_PAGES)); 1704 zone_page_state(z, NR_FREE_PAGES));
1705 } 1705 }
1706 1706
1707 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 1707 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1708 int classzone_idx, int alloc_flags) 1708 int classzone_idx, int alloc_flags)
1709 { 1709 {
1710 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1710 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1711 1711
1712 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1712 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1713 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1713 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1714 1714
1715 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1715 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1716 free_pages); 1716 free_pages);
1717 } 1717 }
1718 1718
1719 #ifdef CONFIG_NUMA 1719 #ifdef CONFIG_NUMA
1720 /* 1720 /*
1721 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1721 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1722 * skip over zones that are not allowed by the cpuset, or that have 1722 * skip over zones that are not allowed by the cpuset, or that have
1723 * been recently (in the last second) found to be nearly full. See further 1723 * been recently (in the last second) found to be nearly full. See further
1724 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1724 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1725 * that have to skip over a lot of full or unallowed zones. 1725 * that have to skip over a lot of full or unallowed zones.
1726 * 1726 *
1727 * If the zonelist cache is present in the passed in zonelist, then 1727 * If the zonelist cache is present in the passed in zonelist, then
1728 * returns a pointer to the allowed node mask (either the current 1728 * returns a pointer to the allowed node mask (either the current
1729 * task's mems_allowed, or node_states[N_MEMORY].) 1729 * task's mems_allowed, or node_states[N_MEMORY].)
1730 * 1730 *
1731 * If the zonelist cache is not available for this zonelist, does 1731 * If the zonelist cache is not available for this zonelist, does
1732 * nothing and returns NULL. 1732 * nothing and returns NULL.
1733 * 1733 *
1734 * If the fullzones BITMAP in the zonelist cache is stale (more than 1734 * If the fullzones BITMAP in the zonelist cache is stale (more than
1735 * a second since last zap'd) then we zap it out (clear its bits.) 1735 * a second since last zap'd) then we zap it out (clear its bits.)
1736 * 1736 *
1737 * We hold off even calling zlc_setup, until after we've checked the 1737 * We hold off even calling zlc_setup, until after we've checked the
1738 * first zone in the zonelist, on the theory that most allocations will 1738 * first zone in the zonelist, on the theory that most allocations will
1739 * be satisfied from that first zone, so best to examine that zone as 1739 * be satisfied from that first zone, so best to examine that zone as
1740 * quickly as we can. 1740 * quickly as we can.
1741 */ 1741 */
1742 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1742 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1743 { 1743 {
1744 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1744 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1745 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1745 nodemask_t *allowednodes; /* zonelist_cache approximation */
1746 1746
1747 zlc = zonelist->zlcache_ptr; 1747 zlc = zonelist->zlcache_ptr;
1748 if (!zlc) 1748 if (!zlc)
1749 return NULL; 1749 return NULL;
1750 1750
1751 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1751 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1752 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1752 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1753 zlc->last_full_zap = jiffies; 1753 zlc->last_full_zap = jiffies;
1754 } 1754 }
1755 1755
1756 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1756 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1757 &cpuset_current_mems_allowed : 1757 &cpuset_current_mems_allowed :
1758 &node_states[N_MEMORY]; 1758 &node_states[N_MEMORY];
1759 return allowednodes; 1759 return allowednodes;
1760 } 1760 }
1761 1761
1762 /* 1762 /*
1763 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1763 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1764 * if it is worth looking at further for free memory: 1764 * if it is worth looking at further for free memory:
1765 * 1) Check that the zone isn't thought to be full (doesn't have its 1765 * 1) Check that the zone isn't thought to be full (doesn't have its
1766 * bit set in the zonelist_cache fullzones BITMAP). 1766 * bit set in the zonelist_cache fullzones BITMAP).
1767 * 2) Check that the zone's node (obtained from the zonelist_cache 1767 * 2) Check that the zone's node (obtained from the zonelist_cache
1768 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1768 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1769 * Return true (non-zero) if zone is worth looking at further, or 1769 * Return true (non-zero) if zone is worth looking at further, or
1770 * else return false (zero) if it is not. 1770 * else return false (zero) if it is not.
1771 * 1771 *
1772 * This check -ignores- the distinction between various watermarks, 1772 * This check -ignores- the distinction between various watermarks,
1773 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1773 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1774 * found to be full for any variation of these watermarks, it will 1774 * found to be full for any variation of these watermarks, it will
1775 * be considered full for up to one second by all requests, unless 1775 * be considered full for up to one second by all requests, unless
1776 * we are so low on memory on all allowed nodes that we are forced 1776 * we are so low on memory on all allowed nodes that we are forced
1777 * into the second scan of the zonelist. 1777 * into the second scan of the zonelist.
1778 * 1778 *
1779 * In the second scan we ignore this zonelist cache and exactly 1779 * In the second scan we ignore this zonelist cache and exactly
1780 * apply the watermarks to all zones, even if it is slower to do so. 1780 * apply the watermarks to all zones, even if it is slower to do so.
1781 * We are low on memory in the second scan, and should leave no stone 1781 * We are low on memory in the second scan, and should leave no stone
1782 * unturned looking for a free page. 1782 * unturned looking for a free page.
1783 */ 1783 */
1784 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1784 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1785 nodemask_t *allowednodes) 1785 nodemask_t *allowednodes)
1786 { 1786 {
1787 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1787 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1788 int i; /* index of *z in zonelist zones */ 1788 int i; /* index of *z in zonelist zones */
1789 int n; /* node that zone *z is on */ 1789 int n; /* node that zone *z is on */
1790 1790
1791 zlc = zonelist->zlcache_ptr; 1791 zlc = zonelist->zlcache_ptr;
1792 if (!zlc) 1792 if (!zlc)
1793 return 1; 1793 return 1;
1794 1794
1795 i = z - zonelist->_zonerefs; 1795 i = z - zonelist->_zonerefs;
1796 n = zlc->z_to_n[i]; 1796 n = zlc->z_to_n[i];
1797 1797
1798 /* This zone is worth trying if it is allowed but not full */ 1798 /* This zone is worth trying if it is allowed but not full */
1799 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1799 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1800 } 1800 }
1801 1801
1802 /* 1802 /*
1803 * Given 'z' scanning a zonelist, set the corresponding bit in 1803 * Given 'z' scanning a zonelist, set the corresponding bit in
1804 * zlc->fullzones, so that subsequent attempts to allocate a page 1804 * zlc->fullzones, so that subsequent attempts to allocate a page
1805 * from that zone don't waste time re-examining it. 1805 * from that zone don't waste time re-examining it.
1806 */ 1806 */
1807 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1807 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1808 { 1808 {
1809 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1809 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1810 int i; /* index of *z in zonelist zones */ 1810 int i; /* index of *z in zonelist zones */
1811 1811
1812 zlc = zonelist->zlcache_ptr; 1812 zlc = zonelist->zlcache_ptr;
1813 if (!zlc) 1813 if (!zlc)
1814 return; 1814 return;
1815 1815
1816 i = z - zonelist->_zonerefs; 1816 i = z - zonelist->_zonerefs;
1817 1817
1818 set_bit(i, zlc->fullzones); 1818 set_bit(i, zlc->fullzones);
1819 } 1819 }
1820 1820
1821 /* 1821 /*
1822 * clear all zones full, called after direct reclaim makes progress so that 1822 * clear all zones full, called after direct reclaim makes progress so that
1823 * a zone that was recently full is not skipped over for up to a second 1823 * a zone that was recently full is not skipped over for up to a second
1824 */ 1824 */
1825 static void zlc_clear_zones_full(struct zonelist *zonelist) 1825 static void zlc_clear_zones_full(struct zonelist *zonelist)
1826 { 1826 {
1827 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1827 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1828 1828
1829 zlc = zonelist->zlcache_ptr; 1829 zlc = zonelist->zlcache_ptr;
1830 if (!zlc) 1830 if (!zlc)
1831 return; 1831 return;
1832 1832
1833 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1833 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1834 } 1834 }
1835 1835
1836 static bool zone_local(struct zone *local_zone, struct zone *zone) 1836 static bool zone_local(struct zone *local_zone, struct zone *zone)
1837 { 1837 {
1838 return local_zone->node == zone->node; 1838 return local_zone->node == zone->node;
1839 } 1839 }
1840 1840
1841 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1841 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1842 { 1842 {
1843 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1843 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1844 } 1844 }
1845 1845
1846 static void __paginginit init_zone_allows_reclaim(int nid) 1846 static void __paginginit init_zone_allows_reclaim(int nid)
1847 { 1847 {
1848 int i; 1848 int i;
1849 1849
1850 for_each_node_state(i, N_MEMORY) 1850 for_each_node_state(i, N_MEMORY)
1851 if (node_distance(nid, i) <= RECLAIM_DISTANCE) 1851 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1852 node_set(i, NODE_DATA(nid)->reclaim_nodes); 1852 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1853 else 1853 else
1854 zone_reclaim_mode = 1; 1854 zone_reclaim_mode = 1;
1855 } 1855 }
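init_zone_allows_reclaim() above builds, per node, the mask of nodes whose distance is within RECLAIM_DISTANCE, and flips zone_reclaim_mode on as soon as any node is farther than that. A rough userspace sketch of the same idea, with a hypothetical SLIT-style distance table and an illustrative RECLAIM_DISTANCE value:

#include <stdio.h>

#define NR_NODES 3
#define RECLAIM_DISTANCE 30     /* illustrative; the kernel default depends on config */

/* Hypothetical node distance table: distance[a][b]. */
static const int distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40 },
	{ 20, 10, 40 },
	{ 40, 40, 10 },
};

int main(void)
{
	int zone_reclaim_mode = 0;

	for (int nid = 0; nid < NR_NODES; nid++) {
		unsigned int reclaim_nodes = 0;  /* stand-in for NODE_DATA(nid)->reclaim_nodes */

		for (int i = 0; i < NR_NODES; i++) {
			if (distance[nid][i] <= RECLAIM_DISTANCE)
				reclaim_nodes |= 1U << i;
			else
				zone_reclaim_mode = 1;   /* some node is "far": enable zone reclaim */
		}
		printf("node %d reclaim_nodes mask: 0x%x\n", nid, reclaim_nodes);
	}
	printf("zone_reclaim_mode: %d\n", zone_reclaim_mode);
	return 0;
}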
1856 1856
1857 #else /* CONFIG_NUMA */ 1857 #else /* CONFIG_NUMA */
1858 1858
1859 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1859 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1860 { 1860 {
1861 return NULL; 1861 return NULL;
1862 } 1862 }
1863 1863
1864 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1864 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1865 nodemask_t *allowednodes) 1865 nodemask_t *allowednodes)
1866 { 1866 {
1867 return 1; 1867 return 1;
1868 } 1868 }
1869 1869
1870 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1870 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1871 { 1871 {
1872 } 1872 }
1873 1873
1874 static void zlc_clear_zones_full(struct zonelist *zonelist) 1874 static void zlc_clear_zones_full(struct zonelist *zonelist)
1875 { 1875 {
1876 } 1876 }
1877 1877
1878 static bool zone_local(struct zone *local_zone, struct zone *zone) 1878 static bool zone_local(struct zone *local_zone, struct zone *zone)
1879 { 1879 {
1880 return true; 1880 return true;
1881 } 1881 }
1882 1882
1883 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1883 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1884 { 1884 {
1885 return true; 1885 return true;
1886 } 1886 }
1887 1887
1888 static inline void init_zone_allows_reclaim(int nid) 1888 static inline void init_zone_allows_reclaim(int nid)
1889 { 1889 {
1890 } 1890 }
1891 #endif /* CONFIG_NUMA */ 1891 #endif /* CONFIG_NUMA */
1892 1892
1893 /* 1893 /*
1894 * get_page_from_freelist goes through the zonelist trying to allocate 1894 * get_page_from_freelist goes through the zonelist trying to allocate
1895 * a page. 1895 * a page.
1896 */ 1896 */
1897 static struct page * 1897 static struct page *
1898 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1898 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1899 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1899 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1900 struct zone *preferred_zone, int migratetype) 1900 struct zone *preferred_zone, int classzone_idx, int migratetype)
1901 { 1901 {
1902 struct zoneref *z; 1902 struct zoneref *z;
1903 struct page *page = NULL; 1903 struct page *page = NULL;
1904 int classzone_idx;
1905 struct zone *zone; 1904 struct zone *zone;
1906 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1905 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1907 int zlc_active = 0; /* set if using zonelist_cache */ 1906 int zlc_active = 0; /* set if using zonelist_cache */
1908 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1907 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1909 1908
1910 classzone_idx = zone_idx(preferred_zone);
1911 zonelist_scan: 1909 zonelist_scan:
1912 /* 1910 /*
1913 * Scan zonelist, looking for a zone with enough free. 1911 * Scan zonelist, looking for a zone with enough free.
1914 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 1912 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
1915 */ 1913 */
1916 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1914 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1917 high_zoneidx, nodemask) { 1915 high_zoneidx, nodemask) {
1918 unsigned long mark; 1916 unsigned long mark;
1919 1917
1920 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1918 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1921 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1919 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1922 continue; 1920 continue;
1923 if (cpusets_enabled() && 1921 if (cpusets_enabled() &&
1924 (alloc_flags & ALLOC_CPUSET) && 1922 (alloc_flags & ALLOC_CPUSET) &&
1925 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1923 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1926 continue; 1924 continue;
1927 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1925 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1928 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) 1926 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
1929 goto try_this_zone; 1927 goto try_this_zone;
1930 /* 1928 /*
1931 * Distribute pages in proportion to the individual 1929 * Distribute pages in proportion to the individual
1932 * zone size to ensure fair page aging. The zone a 1930 * zone size to ensure fair page aging. The zone a
1933 * page was allocated in should have no effect on the 1931 * page was allocated in should have no effect on the
1934 * time the page has in memory before being reclaimed. 1932 * time the page has in memory before being reclaimed.
1935 */ 1933 */
1936 if (alloc_flags & ALLOC_FAIR) { 1934 if (alloc_flags & ALLOC_FAIR) {
1937 if (!zone_local(preferred_zone, zone)) 1935 if (!zone_local(preferred_zone, zone))
1938 continue; 1936 continue;
1939 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) 1937 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1940 continue; 1938 continue;
1941 } 1939 }
1942 /* 1940 /*
1943 * When allocating a page cache page for writing, we 1941 * When allocating a page cache page for writing, we
1944 * want to get it from a zone that is within its dirty 1942 * want to get it from a zone that is within its dirty
1945 * limit, such that no single zone holds more than its 1943 * limit, such that no single zone holds more than its
1946 * proportional share of globally allowed dirty pages. 1944 * proportional share of globally allowed dirty pages.
1947 * The dirty limits take into account the zone's 1945 * The dirty limits take into account the zone's
1948 * lowmem reserves and high watermark so that kswapd 1946 * lowmem reserves and high watermark so that kswapd
1949 * should be able to balance it without having to 1947 * should be able to balance it without having to
1950 * write pages from its LRU list. 1948 * write pages from its LRU list.
1951 * 1949 *
1952 * This may look like it could increase pressure on 1950 * This may look like it could increase pressure on
1953 * lower zones by failing allocations in higher zones 1951 * lower zones by failing allocations in higher zones
1954 * before they are full. But the pages that do spill 1952 * before they are full. But the pages that do spill
1955 * over are limited as the lower zones are protected 1953 * over are limited as the lower zones are protected
1956 * by this very same mechanism. It should not become 1954 * by this very same mechanism. It should not become
1957 * a practical burden to them. 1955 * a practical burden to them.
1958 * 1956 *
1959 * XXX: For now, allow allocations to potentially 1957 * XXX: For now, allow allocations to potentially
1960 * exceed the per-zone dirty limit in the slowpath 1958 * exceed the per-zone dirty limit in the slowpath
1961 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1959 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1962 * which is important when on a NUMA setup the allowed 1960 * which is important when on a NUMA setup the allowed
1963 * zones are together not big enough to reach the 1961 * zones are together not big enough to reach the
1964 * global limit. The proper fix for these situations 1962 * global limit. The proper fix for these situations
1965 * will require awareness of zones in the 1963 * will require awareness of zones in the
1966 * dirty-throttling and the flusher threads. 1964 * dirty-throttling and the flusher threads.
1967 */ 1965 */
1968 if ((alloc_flags & ALLOC_WMARK_LOW) && 1966 if ((alloc_flags & ALLOC_WMARK_LOW) &&
1969 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 1967 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1970 continue; 1968 continue;
1971 1969
1972 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1970 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1973 if (!zone_watermark_ok(zone, order, mark, 1971 if (!zone_watermark_ok(zone, order, mark,
1974 classzone_idx, alloc_flags)) { 1972 classzone_idx, alloc_flags)) {
1975 int ret; 1973 int ret;
1976 1974
1977 if (IS_ENABLED(CONFIG_NUMA) && 1975 if (IS_ENABLED(CONFIG_NUMA) &&
1978 !did_zlc_setup && nr_online_nodes > 1) { 1976 !did_zlc_setup && nr_online_nodes > 1) {
1979 /* 1977 /*
1980 * we do zlc_setup if there are multiple nodes 1978 * we do zlc_setup if there are multiple nodes
1981 * and before considering the first zone allowed 1979 * and before considering the first zone allowed
1982 * by the cpuset. 1980 * by the cpuset.
1983 */ 1981 */
1984 allowednodes = zlc_setup(zonelist, alloc_flags); 1982 allowednodes = zlc_setup(zonelist, alloc_flags);
1985 zlc_active = 1; 1983 zlc_active = 1;
1986 did_zlc_setup = 1; 1984 did_zlc_setup = 1;
1987 } 1985 }
1988 1986
1989 if (zone_reclaim_mode == 0 || 1987 if (zone_reclaim_mode == 0 ||
1990 !zone_allows_reclaim(preferred_zone, zone)) 1988 !zone_allows_reclaim(preferred_zone, zone))
1991 goto this_zone_full; 1989 goto this_zone_full;
1992 1990
1993 /* 1991 /*
1994 * As we may have just activated ZLC, check if the first 1992 * As we may have just activated ZLC, check if the first
1995 * eligible zone has failed zone_reclaim recently. 1993 * eligible zone has failed zone_reclaim recently.
1996 */ 1994 */
1997 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1995 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1998 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1996 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1999 continue; 1997 continue;
2000 1998
2001 ret = zone_reclaim(zone, gfp_mask, order); 1999 ret = zone_reclaim(zone, gfp_mask, order);
2002 switch (ret) { 2000 switch (ret) {
2003 case ZONE_RECLAIM_NOSCAN: 2001 case ZONE_RECLAIM_NOSCAN:
2004 /* did not scan */ 2002 /* did not scan */
2005 continue; 2003 continue;
2006 case ZONE_RECLAIM_FULL: 2004 case ZONE_RECLAIM_FULL:
2007 /* scanned but unreclaimable */ 2005 /* scanned but unreclaimable */
2008 continue; 2006 continue;
2009 default: 2007 default:
2010 /* did we reclaim enough */ 2008 /* did we reclaim enough */
2011 if (zone_watermark_ok(zone, order, mark, 2009 if (zone_watermark_ok(zone, order, mark,
2012 classzone_idx, alloc_flags)) 2010 classzone_idx, alloc_flags))
2013 goto try_this_zone; 2011 goto try_this_zone;
2014 2012
2015 /* 2013 /*
2016 * Failed to reclaim enough to meet watermark. 2014 * Failed to reclaim enough to meet watermark.
2017 * Only mark the zone full if checking the min 2015 * Only mark the zone full if checking the min
2018 * watermark or if we failed to reclaim just 2016 * watermark or if we failed to reclaim just
2019 * 1<<order pages or else the page allocator 2017 * 1<<order pages or else the page allocator
2020 * fastpath will prematurely mark zones full 2018 * fastpath will prematurely mark zones full
2021 * when the watermark is between the low and 2019 * when the watermark is between the low and
2022 * min watermarks. 2020 * min watermarks.
2023 */ 2021 */
2024 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || 2022 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
2025 ret == ZONE_RECLAIM_SOME) 2023 ret == ZONE_RECLAIM_SOME)
2026 goto this_zone_full; 2024 goto this_zone_full;
2027 2025
2028 continue; 2026 continue;
2029 } 2027 }
2030 } 2028 }
2031 2029
2032 try_this_zone: 2030 try_this_zone:
2033 page = buffered_rmqueue(preferred_zone, zone, order, 2031 page = buffered_rmqueue(preferred_zone, zone, order,
2034 gfp_mask, migratetype); 2032 gfp_mask, migratetype);
2035 if (page) 2033 if (page)
2036 break; 2034 break;
2037 this_zone_full: 2035 this_zone_full:
2038 if (IS_ENABLED(CONFIG_NUMA) && zlc_active) 2036 if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
2039 zlc_mark_zone_full(zonelist, z); 2037 zlc_mark_zone_full(zonelist, z);
2040 } 2038 }
2041 2039
2042 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { 2040 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
2043 /* Disable zlc cache for second zonelist scan */ 2041 /* Disable zlc cache for second zonelist scan */
2044 zlc_active = 0; 2042 zlc_active = 0;
2045 goto zonelist_scan; 2043 goto zonelist_scan;
2046 } 2044 }
2047 2045
2048 if (page) 2046 if (page)
2049 /* 2047 /*
2050 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was 2048 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2051 * necessary to allocate the page. The expectation is 2049 * necessary to allocate the page. The expectation is
2052 * that the caller is taking steps that will free more 2050 * that the caller is taking steps that will free more
2053 * memory. The caller should avoid the page being used 2051 * memory. The caller should avoid the page being used
2054 * for !PFMEMALLOC purposes. 2052 * for !PFMEMALLOC purposes.
2055 */ 2053 */
2056 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 2054 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2057 2055
2058 return page; 2056 return page;
2059 } 2057 }
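This hunk is the heart of the patch: get_page_from_freelist() no longer recomputes zone_idx(preferred_zone) on every call; the caller derives classzone_idx once from the zoneref returned by first_zones_zonelist() and passes it down to the zone_watermark_ok() checks. A simplified userspace-only sketch of that calling pattern follows; the structs, helper names and the toy watermark rule are stand-ins, not the kernel API.

#include <stdio.h>

struct zone { int idx; long free_pages; };
struct zoneref { struct zone *zone; int zone_idx; };

/* Stand-in for zonelist_zone_idx(): read the cached index from the zoneref. */
static int zoneref_idx(const struct zoneref *z)
{
	return z->zone_idx;
}

/* Stand-in for zone_watermark_ok(): a toy free-page threshold per classzone. */
static int watermark_ok(const struct zone *zone, long mark, int classzone_idx)
{
	return zone->free_pages > mark + 16 * classzone_idx;    /* toy lowmem reserve */
}

int main(void)
{
	struct zone zones[2] = { { .idx = 0, .free_pages = 70 },
				 { .idx = 1, .free_pages = 300 } };
	struct zoneref preferred = { .zone = &zones[1], .zone_idx = 1 };

	/* Computed once, outside the scan loop, as the patch does. */
	int classzone_idx = zoneref_idx(&preferred);

	for (int i = 1; i >= 0; i--)
		printf("zone %d passes watermark: %d\n", zones[i].idx,
		       watermark_ok(&zones[i], 64, classzone_idx));
	return 0;
}

The payoff in the real code is simply that the repeated zone_idx() computation and the pgdat indirection disappear from the per-attempt path.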
2060 2058
2061 /* 2059 /*
2062 * Large machines with many possible nodes should not always dump per-node 2060 * Large machines with many possible nodes should not always dump per-node
2063 * meminfo in irq context. 2061 * meminfo in irq context.
2064 */ 2062 */
2065 static inline bool should_suppress_show_mem(void) 2063 static inline bool should_suppress_show_mem(void)
2066 { 2064 {
2067 bool ret = false; 2065 bool ret = false;
2068 2066
2069 #if NODES_SHIFT > 8 2067 #if NODES_SHIFT > 8
2070 ret = in_interrupt(); 2068 ret = in_interrupt();
2071 #endif 2069 #endif
2072 return ret; 2070 return ret;
2073 } 2071 }
2074 2072
2075 static DEFINE_RATELIMIT_STATE(nopage_rs, 2073 static DEFINE_RATELIMIT_STATE(nopage_rs,
2076 DEFAULT_RATELIMIT_INTERVAL, 2074 DEFAULT_RATELIMIT_INTERVAL,
2077 DEFAULT_RATELIMIT_BURST); 2075 DEFAULT_RATELIMIT_BURST);
2078 2076
2079 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 2077 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2080 { 2078 {
2081 unsigned int filter = SHOW_MEM_FILTER_NODES; 2079 unsigned int filter = SHOW_MEM_FILTER_NODES;
2082 2080
2083 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 2081 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
2084 debug_guardpage_minorder() > 0) 2082 debug_guardpage_minorder() > 0)
2085 return; 2083 return;
2086 2084
2087 /* 2085 /*
2088 * Walking all memory to count page types is very expensive and should 2086 * Walking all memory to count page types is very expensive and should
2089 * be inhibited in non-blockable contexts. 2087 * be inhibited in non-blockable contexts.
2090 */ 2088 */
2091 if (!(gfp_mask & __GFP_WAIT)) 2089 if (!(gfp_mask & __GFP_WAIT))
2092 filter |= SHOW_MEM_FILTER_PAGE_COUNT; 2090 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2093 2091
2094 /* 2092 /*
2095 * This documents exceptions given to allocations in certain 2093 * This documents exceptions given to allocations in certain
2096 * contexts that are allowed to allocate outside current's set 2094 * contexts that are allowed to allocate outside current's set
2097 * of allowed nodes. 2095 * of allowed nodes.
2098 */ 2096 */
2099 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2097 if (!(gfp_mask & __GFP_NOMEMALLOC))
2100 if (test_thread_flag(TIF_MEMDIE) || 2098 if (test_thread_flag(TIF_MEMDIE) ||
2101 (current->flags & (PF_MEMALLOC | PF_EXITING))) 2099 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2102 filter &= ~SHOW_MEM_FILTER_NODES; 2100 filter &= ~SHOW_MEM_FILTER_NODES;
2103 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 2101 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
2104 filter &= ~SHOW_MEM_FILTER_NODES; 2102 filter &= ~SHOW_MEM_FILTER_NODES;
2105 2103
2106 if (fmt) { 2104 if (fmt) {
2107 struct va_format vaf; 2105 struct va_format vaf;
2108 va_list args; 2106 va_list args;
2109 2107
2110 va_start(args, fmt); 2108 va_start(args, fmt);
2111 2109
2112 vaf.fmt = fmt; 2110 vaf.fmt = fmt;
2113 vaf.va = &args; 2111 vaf.va = &args;
2114 2112
2115 pr_warn("%pV", &vaf); 2113 pr_warn("%pV", &vaf);
2116 2114
2117 va_end(args); 2115 va_end(args);
2118 } 2116 }
2119 2117
2120 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 2118 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
2121 current->comm, order, gfp_mask); 2119 current->comm, order, gfp_mask);
2122 2120
2123 dump_stack(); 2121 dump_stack();
2124 if (!should_suppress_show_mem()) 2122 if (!should_suppress_show_mem())
2125 show_mem(filter); 2123 show_mem(filter);
2126 } 2124 }
2127 2125
2128 static inline int 2126 static inline int
2129 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 2127 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2130 unsigned long did_some_progress, 2128 unsigned long did_some_progress,
2131 unsigned long pages_reclaimed) 2129 unsigned long pages_reclaimed)
2132 { 2130 {
2133 /* Do not loop if specifically requested */ 2131 /* Do not loop if specifically requested */
2134 if (gfp_mask & __GFP_NORETRY) 2132 if (gfp_mask & __GFP_NORETRY)
2135 return 0; 2133 return 0;
2136 2134
2137 /* Always retry if specifically requested */ 2135 /* Always retry if specifically requested */
2138 if (gfp_mask & __GFP_NOFAIL) 2136 if (gfp_mask & __GFP_NOFAIL)
2139 return 1; 2137 return 1;
2140 2138
2141 /* 2139 /*
2142 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 2140 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2143 * making forward progress without invoking OOM. Suspend also disables 2141 * making forward progress without invoking OOM. Suspend also disables
2144 * storage devices so kswapd will not help. Bail if we are suspending. 2142 * storage devices so kswapd will not help. Bail if we are suspending.
2145 */ 2143 */
2146 if (!did_some_progress && pm_suspended_storage()) 2144 if (!did_some_progress && pm_suspended_storage())
2147 return 0; 2145 return 0;
2148 2146
2149 /* 2147 /*
2150 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 2148 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2151 * means __GFP_NOFAIL, but that may not be true in other 2149 * means __GFP_NOFAIL, but that may not be true in other
2152 * implementations. 2150 * implementations.
2153 */ 2151 */
2154 if (order <= PAGE_ALLOC_COSTLY_ORDER) 2152 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2155 return 1; 2153 return 1;
2156 2154
2157 /* 2155 /*
2158 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 2156 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2159 * specified, then we retry until we no longer reclaim any pages 2157 * specified, then we retry until we no longer reclaim any pages
2160 * (above), or we've reclaimed an order of pages at least as 2158 * (above), or we've reclaimed an order of pages at least as
2161 * large as the allocation's order. In both cases, if the 2159 * large as the allocation's order. In both cases, if the
2162 * allocation still fails, we stop retrying. 2160 * allocation still fails, we stop retrying.
2163 */ 2161 */
2164 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2162 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2165 return 1; 2163 return 1;
2166 2164
2167 return 0; 2165 return 0;
2168 } 2166 }
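As a concrete reading of should_alloc_retry(): an order-2 request (at or below PAGE_ALLOC_COSTLY_ORDER, normally 3) keeps retrying unless __GFP_NORETRY is set, while an order-5 request retries only with __GFP_REPEAT and only until 1 << 5 = 32 pages have been reclaimed. A compact userspace restatement of that decision, with stand-in flag bits rather than the real GFP values:

#include <stdio.h>

#define GFP_NORETRY  0x1u
#define GFP_NOFAIL   0x2u
#define GFP_REPEAT   0x4u
#define COSTLY_ORDER 3          /* mirrors PAGE_ALLOC_COSTLY_ORDER */

static int should_retry(unsigned int gfp, unsigned int order,
			unsigned long progress, unsigned long reclaimed,
			int suspended_storage)
{
	if (gfp & GFP_NORETRY)
		return 0;
	if (gfp & GFP_NOFAIL)
		return 1;
	if (!progress && suspended_storage)
		return 0;
	if (order <= COSTLY_ORDER)
		return 1;
	if ((gfp & GFP_REPEAT) && reclaimed < (1UL << order))
		return 1;
	return 0;
}

int main(void)
{
	printf("order 2, no flags:            %d\n", should_retry(0, 2, 1, 4, 0));
	printf("order 5, no flags:            %d\n", should_retry(0, 5, 1, 4, 0));
	printf("order 5, __GFP_REPEAT, 4/32:  %d\n", should_retry(GFP_REPEAT, 5, 1, 4, 0));
	printf("order 5, __GFP_REPEAT, 40/32: %d\n", should_retry(GFP_REPEAT, 5, 1, 40, 0));
	return 0;
}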
2169 2167
2170 static inline struct page * 2168 static inline struct page *
2171 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2169 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2172 struct zonelist *zonelist, enum zone_type high_zoneidx, 2170 struct zonelist *zonelist, enum zone_type high_zoneidx,
2173 nodemask_t *nodemask, struct zone *preferred_zone, 2171 nodemask_t *nodemask, struct zone *preferred_zone,
2174 int migratetype) 2172 int classzone_idx, int migratetype)
2175 { 2173 {
2176 struct page *page; 2174 struct page *page;
2177 2175
2178 /* Acquire the OOM killer lock for the zones in zonelist */ 2176 /* Acquire the OOM killer lock for the zones in zonelist */
2179 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2177 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2180 schedule_timeout_uninterruptible(1); 2178 schedule_timeout_uninterruptible(1);
2181 return NULL; 2179 return NULL;
2182 } 2180 }
2183 2181
2184 /* 2182 /*
2185 * Go through the zonelist yet one more time, keep very high watermark 2183 * Go through the zonelist yet one more time, keep very high watermark
2186 * here, this is only to catch a parallel oom killing, we must fail if 2184 * here, this is only to catch a parallel oom killing, we must fail if
2187 * we're still under heavy pressure. 2185 * we're still under heavy pressure.
2188 */ 2186 */
2189 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2187 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2190 order, zonelist, high_zoneidx, 2188 order, zonelist, high_zoneidx,
2191 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2189 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2192 preferred_zone, migratetype); 2190 preferred_zone, classzone_idx, migratetype);
2193 if (page) 2191 if (page)
2194 goto out; 2192 goto out;
2195 2193
2196 if (!(gfp_mask & __GFP_NOFAIL)) { 2194 if (!(gfp_mask & __GFP_NOFAIL)) {
2197 /* The OOM killer will not help higher order allocs */ 2195 /* The OOM killer will not help higher order allocs */
2198 if (order > PAGE_ALLOC_COSTLY_ORDER) 2196 if (order > PAGE_ALLOC_COSTLY_ORDER)
2199 goto out; 2197 goto out;
2200 /* The OOM killer does not needlessly kill tasks for lowmem */ 2198 /* The OOM killer does not needlessly kill tasks for lowmem */
2201 if (high_zoneidx < ZONE_NORMAL) 2199 if (high_zoneidx < ZONE_NORMAL)
2202 goto out; 2200 goto out;
2203 /* 2201 /*
2204 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2202 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2205 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2203 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2206 * The caller should handle page allocation failure by itself if 2204 * The caller should handle page allocation failure by itself if
2207 * it specifies __GFP_THISNODE. 2205 * it specifies __GFP_THISNODE.
2208 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2206 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2209 */ 2207 */
2210 if (gfp_mask & __GFP_THISNODE) 2208 if (gfp_mask & __GFP_THISNODE)
2211 goto out; 2209 goto out;
2212 } 2210 }
2213 /* Exhausted what can be done so it's blamo time */ 2211 /* Exhausted what can be done so it's blamo time */
2214 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2212 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2215 2213
2216 out: 2214 out:
2217 clear_zonelist_oom(zonelist, gfp_mask); 2215 clear_zonelist_oom(zonelist, gfp_mask);
2218 return page; 2216 return page;
2219 } 2217 }
2220 2218
2221 #ifdef CONFIG_COMPACTION 2219 #ifdef CONFIG_COMPACTION
2222 /* Try memory compaction for high-order allocations before reclaim */ 2220 /* Try memory compaction for high-order allocations before reclaim */
2223 static struct page * 2221 static struct page *
2224 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2222 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2225 struct zonelist *zonelist, enum zone_type high_zoneidx, 2223 struct zonelist *zonelist, enum zone_type high_zoneidx,
2226 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2224 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2227 int migratetype, enum migrate_mode mode, 2225 int classzone_idx, int migratetype, enum migrate_mode mode,
2228 bool *contended_compaction, bool *deferred_compaction, 2226 bool *contended_compaction, bool *deferred_compaction,
2229 unsigned long *did_some_progress) 2227 unsigned long *did_some_progress)
2230 { 2228 {
2231 if (!order) 2229 if (!order)
2232 return NULL; 2230 return NULL;
2233 2231
2234 if (compaction_deferred(preferred_zone, order)) { 2232 if (compaction_deferred(preferred_zone, order)) {
2235 *deferred_compaction = true; 2233 *deferred_compaction = true;
2236 return NULL; 2234 return NULL;
2237 } 2235 }
2238 2236
2239 current->flags |= PF_MEMALLOC; 2237 current->flags |= PF_MEMALLOC;
2240 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2238 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2241 nodemask, mode, 2239 nodemask, mode,
2242 contended_compaction); 2240 contended_compaction);
2243 current->flags &= ~PF_MEMALLOC; 2241 current->flags &= ~PF_MEMALLOC;
2244 2242
2245 if (*did_some_progress != COMPACT_SKIPPED) { 2243 if (*did_some_progress != COMPACT_SKIPPED) {
2246 struct page *page; 2244 struct page *page;
2247 2245
2248 /* Page migration frees to the PCP lists but we want merging */ 2246 /* Page migration frees to the PCP lists but we want merging */
2249 drain_pages(get_cpu()); 2247 drain_pages(get_cpu());
2250 put_cpu(); 2248 put_cpu();
2251 2249
2252 page = get_page_from_freelist(gfp_mask, nodemask, 2250 page = get_page_from_freelist(gfp_mask, nodemask,
2253 order, zonelist, high_zoneidx, 2251 order, zonelist, high_zoneidx,
2254 alloc_flags & ~ALLOC_NO_WATERMARKS, 2252 alloc_flags & ~ALLOC_NO_WATERMARKS,
2255 preferred_zone, migratetype); 2253 preferred_zone, classzone_idx, migratetype);
2256 if (page) { 2254 if (page) {
2257 preferred_zone->compact_blockskip_flush = false; 2255 preferred_zone->compact_blockskip_flush = false;
2258 compaction_defer_reset(preferred_zone, order, true); 2256 compaction_defer_reset(preferred_zone, order, true);
2259 count_vm_event(COMPACTSUCCESS); 2257 count_vm_event(COMPACTSUCCESS);
2260 return page; 2258 return page;
2261 } 2259 }
2262 2260
2263 /* 2261 /*
2264 * It's bad if a compaction run occurs and fails. 2262 * It's bad if a compaction run occurs and fails.
2265 * The most likely reason is that pages exist, 2263 * The most likely reason is that pages exist,
2266 * but not enough to satisfy watermarks. 2264 * but not enough to satisfy watermarks.
2267 */ 2265 */
2268 count_vm_event(COMPACTFAIL); 2266 count_vm_event(COMPACTFAIL);
2269 2267
2270 /* 2268 /*
2271 * As async compaction considers a subset of pageblocks, only 2269 * As async compaction considers a subset of pageblocks, only
2272 * defer if the failure was a sync compaction failure. 2270 * defer if the failure was a sync compaction failure.
2273 */ 2271 */
2274 if (mode != MIGRATE_ASYNC) 2272 if (mode != MIGRATE_ASYNC)
2275 defer_compaction(preferred_zone, order); 2273 defer_compaction(preferred_zone, order);
2276 2274
2277 cond_resched(); 2275 cond_resched();
2278 } 2276 }
2279 2277
2280 return NULL; 2278 return NULL;
2281 } 2279 }
2282 #else 2280 #else
2283 static inline struct page * 2281 static inline struct page *
2284 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2282 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2285 struct zonelist *zonelist, enum zone_type high_zoneidx, 2283 struct zonelist *zonelist, enum zone_type high_zoneidx,
2286 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2284 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2287 int migratetype, enum migrate_mode mode, bool *contended_compaction, 2285 int classzone_idx, int migratetype,
2286 enum migrate_mode mode, bool *contended_compaction,
2288 bool *deferred_compaction, unsigned long *did_some_progress) 2287 bool *deferred_compaction, unsigned long *did_some_progress)
2289 { 2288 {
2290 return NULL; 2289 return NULL;
2291 } 2290 }
2292 #endif /* CONFIG_COMPACTION */ 2291 #endif /* CONFIG_COMPACTION */
2293 2292
2294 /* Perform direct synchronous page reclaim */ 2293 /* Perform direct synchronous page reclaim */
2295 static int 2294 static int
2296 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2295 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2297 nodemask_t *nodemask) 2296 nodemask_t *nodemask)
2298 { 2297 {
2299 struct reclaim_state reclaim_state; 2298 struct reclaim_state reclaim_state;
2300 int progress; 2299 int progress;
2301 2300
2302 cond_resched(); 2301 cond_resched();
2303 2302
2304 /* We now go into synchronous reclaim */ 2303 /* We now go into synchronous reclaim */
2305 cpuset_memory_pressure_bump(); 2304 cpuset_memory_pressure_bump();
2306 current->flags |= PF_MEMALLOC; 2305 current->flags |= PF_MEMALLOC;
2307 lockdep_set_current_reclaim_state(gfp_mask); 2306 lockdep_set_current_reclaim_state(gfp_mask);
2308 reclaim_state.reclaimed_slab = 0; 2307 reclaim_state.reclaimed_slab = 0;
2309 current->reclaim_state = &reclaim_state; 2308 current->reclaim_state = &reclaim_state;
2310 2309
2311 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2310 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2312 2311
2313 current->reclaim_state = NULL; 2312 current->reclaim_state = NULL;
2314 lockdep_clear_current_reclaim_state(); 2313 lockdep_clear_current_reclaim_state();
2315 current->flags &= ~PF_MEMALLOC; 2314 current->flags &= ~PF_MEMALLOC;
2316 2315
2317 cond_resched(); 2316 cond_resched();
2318 2317
2319 return progress; 2318 return progress;
2320 } 2319 }
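The PF_MEMALLOC bracketing in __perform_reclaim() is what keeps direct reclaim from recursing into itself: the flag is set before entering reclaim and cleared on the way out, and any allocation made while it is set avoids entering direct reclaim again. A tiny sketch of that pattern; the flag word and helper names are illustrative only.

#include <stdio.h>

#define PF_MEMALLOC 0x1u

static unsigned int task_flags;   /* stand-in for current->flags */

static long nested_alloc(void)
{
	/* An allocation made while reclaiming sees PF_MEMALLOC and skips direct reclaim. */
	if (task_flags & PF_MEMALLOC) {
		printf("nested allocation: PF_MEMALLOC set, skipping direct reclaim\n");
		return 0;
	}
	return 1;
}

static long do_reclaim(void)
{
	nested_alloc();         /* e.g. reclaim itself needs a small allocation */
	return 42;              /* pretend we reclaimed 42 pages */
}

static long perform_reclaim(void)
{
	long progress;

	task_flags |= PF_MEMALLOC;      /* mark: we are the reclaimer */
	progress = do_reclaim();
	task_flags &= ~PF_MEMALLOC;     /* unconditional clear, mirroring the code above */
	return progress;
}

int main(void)
{
	printf("reclaimed: %ld\n", perform_reclaim());
	return 0;
}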
2321 2320
2322 /* The really slow allocator path where we enter direct reclaim */ 2321 /* The really slow allocator path where we enter direct reclaim */
2323 static inline struct page * 2322 static inline struct page *
2324 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2323 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2325 struct zonelist *zonelist, enum zone_type high_zoneidx, 2324 struct zonelist *zonelist, enum zone_type high_zoneidx,
2326 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2325 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2327 int migratetype, unsigned long *did_some_progress) 2326 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2328 { 2327 {
2329 struct page *page = NULL; 2328 struct page *page = NULL;
2330 bool drained = false; 2329 bool drained = false;
2331 2330
2332 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2331 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2333 nodemask); 2332 nodemask);
2334 if (unlikely(!(*did_some_progress))) 2333 if (unlikely(!(*did_some_progress)))
2335 return NULL; 2334 return NULL;
2336 2335
2337 /* After successful reclaim, reconsider all zones for allocation */ 2336 /* After successful reclaim, reconsider all zones for allocation */
2338 if (IS_ENABLED(CONFIG_NUMA)) 2337 if (IS_ENABLED(CONFIG_NUMA))
2339 zlc_clear_zones_full(zonelist); 2338 zlc_clear_zones_full(zonelist);
2340 2339
2341 retry: 2340 retry:
2342 page = get_page_from_freelist(gfp_mask, nodemask, order, 2341 page = get_page_from_freelist(gfp_mask, nodemask, order,
2343 zonelist, high_zoneidx, 2342 zonelist, high_zoneidx,
2344 alloc_flags & ~ALLOC_NO_WATERMARKS, 2343 alloc_flags & ~ALLOC_NO_WATERMARKS,
2345 preferred_zone, migratetype); 2344 preferred_zone, classzone_idx,
2345 migratetype);
2346 2346
2347 /* 2347 /*
2348 * If an allocation failed after direct reclaim, it could be because 2348 * If an allocation failed after direct reclaim, it could be because
2349 * pages are pinned on the per-cpu lists. Drain them and try again 2349 * pages are pinned on the per-cpu lists. Drain them and try again
2350 */ 2350 */
2351 if (!page && !drained) { 2351 if (!page && !drained) {
2352 drain_all_pages(); 2352 drain_all_pages();
2353 drained = true; 2353 drained = true;
2354 goto retry; 2354 goto retry;
2355 } 2355 }
2356 2356
2357 return page; 2357 return page;
2358 } 2358 }
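__alloc_pages_direct_reclaim() uses a "try once, drain the per-CPU lists, then try exactly once more" shape after reclaim has made progress. A minimal sketch of that control flow with hypothetical allocation and drain stubs:

#include <stdbool.h>
#include <stdio.h>

static int pcp_cached = 3;      /* pages sitting on a per-cpu list (stand-in) */
static int free_pages;          /* pages visible to the allocator */

static bool try_alloc(void)
{
	if (free_pages > 0) {
		free_pages--;
		return true;
	}
	return false;
}

static void drain_all_pages_stub(void)
{
	free_pages += pcp_cached;
	pcp_cached = 0;
}

static bool alloc_after_reclaim(void)
{
	bool drained = false;
	bool ok;

retry:
	ok = try_alloc();
	if (!ok && !drained) {  /* first failure: drain pinned per-cpu pages, retry once */
		drain_all_pages_stub();
		drained = true;
		goto retry;
	}
	return ok;
}

int main(void)
{
	printf("allocation after reclaim succeeded: %d\n", alloc_after_reclaim());
	return 0;
}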
2359 2359
2360 /* 2360 /*
2361 * This is called in the allocator slow-path if the allocation request is of 2361 * This is called in the allocator slow-path if the allocation request is of
2362 * sufficient urgency to ignore watermarks and take other desperate measures 2362 * sufficient urgency to ignore watermarks and take other desperate measures
2363 */ 2363 */
2364 static inline struct page * 2364 static inline struct page *
2365 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2365 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2366 struct zonelist *zonelist, enum zone_type high_zoneidx, 2366 struct zonelist *zonelist, enum zone_type high_zoneidx,
2367 nodemask_t *nodemask, struct zone *preferred_zone, 2367 nodemask_t *nodemask, struct zone *preferred_zone,
2368 int migratetype) 2368 int classzone_idx, int migratetype)
2369 { 2369 {
2370 struct page *page; 2370 struct page *page;
2371 2371
2372 do { 2372 do {
2373 page = get_page_from_freelist(gfp_mask, nodemask, order, 2373 page = get_page_from_freelist(gfp_mask, nodemask, order,
2374 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2374 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2375 preferred_zone, migratetype); 2375 preferred_zone, classzone_idx, migratetype);
2376 2376
2377 if (!page && gfp_mask & __GFP_NOFAIL) 2377 if (!page && gfp_mask & __GFP_NOFAIL)
2378 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2378 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2379 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2379 } while (!page && (gfp_mask & __GFP_NOFAIL));
2380 2380
2381 return page; 2381 return page;
2382 } 2382 }
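__alloc_pages_high_priority() is the __GFP_NOFAIL backstop: it keeps retrying the no-watermark allocation, throttling on congestion between attempts, and only returns NULL when the caller tolerates failure. A small sketch of that retry-with-backoff shape, where a plain sleep stands in for wait_iff_congested():

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static int attempts_left = 3;   /* pretend the third try succeeds */

static bool try_no_watermark_alloc(void)
{
	return --attempts_left == 0;
}

static bool high_priority_alloc(bool nofail)
{
	bool ok;

	do {
		ok = try_no_watermark_alloc();
		if (!ok && nofail)
			usleep(20000);  /* stand-in for wait_iff_congested(..., HZ/50) */
	} while (!ok && nofail);

	return ok;
}

int main(void)
{
	printf("__GFP_NOFAIL-style allocation succeeded: %d\n", high_priority_alloc(true));
	return 0;
}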
2383 2383
2384 static void reset_alloc_batches(struct zonelist *zonelist, 2384 static void reset_alloc_batches(struct zonelist *zonelist,
2385 enum zone_type high_zoneidx, 2385 enum zone_type high_zoneidx,
2386 struct zone *preferred_zone) 2386 struct zone *preferred_zone)
2387 { 2387 {
2388 struct zoneref *z; 2388 struct zoneref *z;
2389 struct zone *zone; 2389 struct zone *zone;
2390 2390
2391 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 2391 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2392 /* 2392 /*
2393 * Only reset the batches of zones that were actually 2393 * Only reset the batches of zones that were actually
2394 * considered in the fairness pass, we don't want to 2394 * considered in the fairness pass, we don't want to
2395 * trash fairness information for zones that are not 2395 * trash fairness information for zones that are not
2396 * actually part of this zonelist's round-robin cycle. 2396 * actually part of this zonelist's round-robin cycle.
2397 */ 2397 */
2398 if (!zone_local(preferred_zone, zone)) 2398 if (!zone_local(preferred_zone, zone))
2399 continue; 2399 continue;
2400 mod_zone_page_state(zone, NR_ALLOC_BATCH, 2400 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2401 high_wmark_pages(zone) - low_wmark_pages(zone) - 2401 high_wmark_pages(zone) - low_wmark_pages(zone) -
2402 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); 2402 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
2403 } 2403 }
2404 } 2404 }
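reset_alloc_batches() simply tops NR_ALLOC_BATCH back up to (high watermark - low watermark): whatever is left in the counter is subtracted out so the new value lands exactly on that target. A one-shot worked example of the arithmetic, with made-up numbers:

#include <stdio.h>

int main(void)
{
	long high_wmark = 1200, low_wmark = 1000;   /* illustrative per-zone watermarks */
	long batch = -37;                           /* NR_ALLOC_BATCH after the fair pass */

	/* delta = target - current, so batch + delta == high - low afterwards. */
	long delta = (high_wmark - low_wmark) - batch;

	batch += delta;
	printf("delta applied: %ld, NR_ALLOC_BATCH now: %ld\n", delta, batch);  /* 237, 200 */
	return 0;
}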
2405 2405
2406 static void wake_all_kswapds(unsigned int order, 2406 static void wake_all_kswapds(unsigned int order,
2407 struct zonelist *zonelist, 2407 struct zonelist *zonelist,
2408 enum zone_type high_zoneidx, 2408 enum zone_type high_zoneidx,
2409 struct zone *preferred_zone) 2409 struct zone *preferred_zone)
2410 { 2410 {
2411 struct zoneref *z; 2411 struct zoneref *z;
2412 struct zone *zone; 2412 struct zone *zone;
2413 2413
2414 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2414 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2415 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2415 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2416 } 2416 }
2417 2417
2418 static inline int 2418 static inline int
2419 gfp_to_alloc_flags(gfp_t gfp_mask) 2419 gfp_to_alloc_flags(gfp_t gfp_mask)
2420 { 2420 {
2421 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2421 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2422 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); 2422 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
2423 2423
2424 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2424 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2425 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2425 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2426 2426
2427 /* 2427 /*
2428 * The caller may dip into page reserves a bit more if the caller 2428 * The caller may dip into page reserves a bit more if the caller
2429 * cannot run direct reclaim, or if the caller has realtime scheduling 2429 * cannot run direct reclaim, or if the caller has realtime scheduling
2430 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2430 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2431 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). 2431 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
2432 */ 2432 */
2433 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2433 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2434 2434
2435 if (atomic) { 2435 if (atomic) {
2436 /* 2436 /*
2437 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 2437 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
2438 * if it can't schedule. 2438 * if it can't schedule.
2439 */ 2439 */
2440 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2440 if (!(gfp_mask & __GFP_NOMEMALLOC))
2441 alloc_flags |= ALLOC_HARDER; 2441 alloc_flags |= ALLOC_HARDER;
2442 /* 2442 /*
2443 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 2443 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
2444 * comment for __cpuset_node_allowed_softwall(). 2444 * comment for __cpuset_node_allowed_softwall().
2445 */ 2445 */
2446 alloc_flags &= ~ALLOC_CPUSET; 2446 alloc_flags &= ~ALLOC_CPUSET;
2447 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2447 } else if (unlikely(rt_task(current)) && !in_interrupt())
2448 alloc_flags |= ALLOC_HARDER; 2448 alloc_flags |= ALLOC_HARDER;
2449 2449
2450 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2450 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2451 if (gfp_mask & __GFP_MEMALLOC) 2451 if (gfp_mask & __GFP_MEMALLOC)
2452 alloc_flags |= ALLOC_NO_WATERMARKS; 2452 alloc_flags |= ALLOC_NO_WATERMARKS;
2453 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 2453 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2454 alloc_flags |= ALLOC_NO_WATERMARKS; 2454 alloc_flags |= ALLOC_NO_WATERMARKS;
2455 else if (!in_interrupt() && 2455 else if (!in_interrupt() &&
2456 ((current->flags & PF_MEMALLOC) || 2456 ((current->flags & PF_MEMALLOC) ||
2457 unlikely(test_thread_flag(TIF_MEMDIE)))) 2457 unlikely(test_thread_flag(TIF_MEMDIE))))
2458 alloc_flags |= ALLOC_NO_WATERMARKS; 2458 alloc_flags |= ALLOC_NO_WATERMARKS;
2459 } 2459 }
2460 #ifdef CONFIG_CMA 2460 #ifdef CONFIG_CMA
2461 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2461 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2462 alloc_flags |= ALLOC_CMA; 2462 alloc_flags |= ALLOC_CMA;
2463 #endif 2463 #endif
2464 return alloc_flags; 2464 return alloc_flags;
2465 } 2465 }
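Read as a mapping, gfp_to_alloc_flags() turns a GFP mask into watermark and behaviour bits; for instance a GFP_ATOMIC-style request (__GFP_HIGH set, __GFP_WAIT clear) ends up with ALLOC_WMARK_MIN, ALLOC_HIGH and ALLOC_HARDER, and drops ALLOC_CPUSET. The stripped-down sketch below mirrors only that core mapping; the bit values are illustrative and differ from the kernel's real ALLOC_* and GFP constants.

#include <stdio.h>

#define ALLOC_WMARK_MIN 0x01u
#define ALLOC_CPUSET    0x02u
#define ALLOC_HIGH      0x04u
#define ALLOC_HARDER    0x08u

#define GFP_WAIT        0x10u
#define GFP_HIGH        0x20u
#define GFP_NOMEMALLOC  0x40u

static unsigned int to_alloc_flags(unsigned int gfp)
{
	unsigned int flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
	int atomic = !(gfp & GFP_WAIT);

	if (gfp & GFP_HIGH)
		flags |= ALLOC_HIGH;
	if (atomic) {
		if (!(gfp & GFP_NOMEMALLOC))
			flags |= ALLOC_HARDER;
		flags &= ~ALLOC_CPUSET;         /* ignore cpuset mems rather than fail */
	}
	return flags;
}

int main(void)
{
	printf("atomic-style request -> alloc_flags 0x%x\n", to_alloc_flags(GFP_HIGH));
	printf("sleepable request    -> alloc_flags 0x%x\n", to_alloc_flags(GFP_WAIT));
	return 0;
}

The omitted cases (realtime tasks, __GFP_MEMALLOC, PF_MEMALLOC and TIF_MEMDIE granting ALLOC_NO_WATERMARKS, and CMA) follow the same pattern of adding or stripping bits based on the caller's context.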
2466 2466
2467 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 2467 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2468 { 2468 {
2469 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2469 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2470 } 2470 }
2471 2471
2472 static inline struct page * 2472 static inline struct page *
2473 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2473 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2474 struct zonelist *zonelist, enum zone_type high_zoneidx, 2474 struct zonelist *zonelist, enum zone_type high_zoneidx,
2475 nodemask_t *nodemask, struct zone *preferred_zone, 2475 nodemask_t *nodemask, struct zone *preferred_zone,
2476 int migratetype) 2476 int classzone_idx, int migratetype)
2477 { 2477 {
2478 const gfp_t wait = gfp_mask & __GFP_WAIT; 2478 const gfp_t wait = gfp_mask & __GFP_WAIT;
2479 struct page *page = NULL; 2479 struct page *page = NULL;
2480 int alloc_flags; 2480 int alloc_flags;
2481 unsigned long pages_reclaimed = 0; 2481 unsigned long pages_reclaimed = 0;
2482 unsigned long did_some_progress; 2482 unsigned long did_some_progress;
2483 enum migrate_mode migration_mode = MIGRATE_ASYNC; 2483 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2484 bool deferred_compaction = false; 2484 bool deferred_compaction = false;
2485 bool contended_compaction = false; 2485 bool contended_compaction = false;
2486 2486
2487 /* 2487 /*
2488 * In the slowpath, we sanity check order to avoid ever trying to 2488 * In the slowpath, we sanity check order to avoid ever trying to
2489 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2489 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2490 * be using allocators in order of preference for an area that is 2490 * be using allocators in order of preference for an area that is
2491 * too large. 2491 * too large.
2492 */ 2492 */
2493 if (order >= MAX_ORDER) { 2493 if (order >= MAX_ORDER) {
2494 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2494 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2495 return NULL; 2495 return NULL;
2496 } 2496 }
2497 2497
2498 /* 2498 /*
2499 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2499 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2500 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2500 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2501 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2501 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
2502 * using a larger set of nodes after it has established that the 2502 * using a larger set of nodes after it has established that the
2503 * allowed per node queues are empty and that nodes are 2503 * allowed per node queues are empty and that nodes are
2504 * over allocated. 2504 * over allocated.
2505 */ 2505 */
2506 if (IS_ENABLED(CONFIG_NUMA) && 2506 if (IS_ENABLED(CONFIG_NUMA) &&
2507 (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2507 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2508 goto nopage; 2508 goto nopage;
2509 2509
2510 restart: 2510 restart:
2511 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2511 if (!(gfp_mask & __GFP_NO_KSWAPD))
2512 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); 2512 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
2513 2513
2514 /* 2514 /*
2515 * OK, we're below the kswapd watermark and have kicked background 2515 * OK, we're below the kswapd watermark and have kicked background
2516 * reclaim. Now things get more complex, so set up alloc_flags according 2516 * reclaim. Now things get more complex, so set up alloc_flags according
2517 * to how we want to proceed. 2517 * to how we want to proceed.
2518 */ 2518 */
2519 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2519 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2520 2520
2521 /* 2521 /*
2522 * Find the true preferred zone if the allocation is unconstrained by 2522 * Find the true preferred zone if the allocation is unconstrained by
2523 * cpusets. 2523 * cpusets.
2524 */ 2524 */
2525 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) 2525 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
2526 first_zones_zonelist(zonelist, high_zoneidx, NULL, 2526 struct zoneref *preferred_zoneref;
2527 &preferred_zone); 2527 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2528 NULL,
2529 &preferred_zone);
2530 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2531 }
2528 2532
2529 rebalance: 2533 rebalance:
2530 /* This is the last chance, in general, before the goto nopage. */ 2534 /* This is the last chance, in general, before the goto nopage. */
2531 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2535 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2532 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2536 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2533 preferred_zone, migratetype); 2537 preferred_zone, classzone_idx, migratetype);
2534 if (page) 2538 if (page)
2535 goto got_pg; 2539 goto got_pg;
2536 2540
2537 /* Allocate without watermarks if the context allows */ 2541 /* Allocate without watermarks if the context allows */
2538 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2542 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2539 /* 2543 /*
2540 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds 2544 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2541 * the allocation is high priority and these types of 2545 * the allocation is high priority and these types of
2542 * allocations are system rather than user oriented 2546 * allocations are system rather than user oriented
2543 */ 2547 */
2544 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2548 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2545 2549
2546 page = __alloc_pages_high_priority(gfp_mask, order, 2550 page = __alloc_pages_high_priority(gfp_mask, order,
2547 zonelist, high_zoneidx, nodemask, 2551 zonelist, high_zoneidx, nodemask,
2548 preferred_zone, migratetype); 2552 preferred_zone, classzone_idx, migratetype);
2549 if (page) { 2553 if (page) {
2550 goto got_pg; 2554 goto got_pg;
2551 } 2555 }
2552 } 2556 }
2553 2557
2554 /* Atomic allocations - we can't balance anything */ 2558 /* Atomic allocations - we can't balance anything */
2555 if (!wait) 2559 if (!wait)
2556 goto nopage; 2560 goto nopage;
2557 2561
2558 /* Avoid recursion of direct reclaim */ 2562 /* Avoid recursion of direct reclaim */
2559 if (current->flags & PF_MEMALLOC) 2563 if (current->flags & PF_MEMALLOC)
2560 goto nopage; 2564 goto nopage;
2561 2565
2562 /* Avoid allocations with no watermarks from looping endlessly */ 2566 /* Avoid allocations with no watermarks from looping endlessly */
2563 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2567 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2564 goto nopage; 2568 goto nopage;
2565 2569
2566 /* 2570 /*
2567 * Try direct compaction. The first pass is asynchronous. Subsequent 2571 * Try direct compaction. The first pass is asynchronous. Subsequent
2568 * attempts after direct reclaim are synchronous 2572 * attempts after direct reclaim are synchronous
2569 */ 2573 */
2570 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2574 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2571 high_zoneidx, nodemask, alloc_flags, 2575 high_zoneidx, nodemask, alloc_flags,
2572 preferred_zone, migratetype, 2576 preferred_zone,
2577 classzone_idx, migratetype,
2573 migration_mode, &contended_compaction, 2578 migration_mode, &contended_compaction,
2574 &deferred_compaction, 2579 &deferred_compaction,
2575 &did_some_progress); 2580 &did_some_progress);
2576 if (page) 2581 if (page)
2577 goto got_pg; 2582 goto got_pg;
2578 migration_mode = MIGRATE_SYNC_LIGHT; 2583 migration_mode = MIGRATE_SYNC_LIGHT;
2579 2584
2580 /* 2585 /*
2581 * If compaction is deferred for high-order allocations, it is because 2586 * If compaction is deferred for high-order allocations, it is because
2582 * sync compaction recently failed. If this is the case and the caller 2587 * sync compaction recently failed. If this is the case and the caller
2583 * requested a movable allocation that does not heavily disrupt the 2588 * requested a movable allocation that does not heavily disrupt the
2584 * system then fail the allocation instead of entering direct reclaim. 2589 * system then fail the allocation instead of entering direct reclaim.
2585 */ 2590 */
2586 if ((deferred_compaction || contended_compaction) && 2591 if ((deferred_compaction || contended_compaction) &&
2587 (gfp_mask & __GFP_NO_KSWAPD)) 2592 (gfp_mask & __GFP_NO_KSWAPD))
2588 goto nopage; 2593 goto nopage;
2589 2594
2590 /* Try direct reclaim and then allocating */ 2595 /* Try direct reclaim and then allocating */
2591 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2596 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2592 zonelist, high_zoneidx, 2597 zonelist, high_zoneidx,
2593 nodemask, 2598 nodemask,
2594 alloc_flags, preferred_zone, 2599 alloc_flags, preferred_zone,
2595 migratetype, &did_some_progress); 2600 classzone_idx, migratetype,
2601 &did_some_progress);
2596 if (page) 2602 if (page)
2597 goto got_pg; 2603 goto got_pg;
2598 2604
2599 /* 2605 /*
2600 * If we failed to make any progress reclaiming, then we are 2606 * If we failed to make any progress reclaiming, then we are
2601 * running out of options and have to consider going OOM 2607 * running out of options and have to consider going OOM
2602 */ 2608 */
2603 if (!did_some_progress) { 2609 if (!did_some_progress) {
2604 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2610 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2605 if (oom_killer_disabled) 2611 if (oom_killer_disabled)
2606 goto nopage; 2612 goto nopage;
2607 /* Coredumps can quickly deplete all memory reserves */ 2613 /* Coredumps can quickly deplete all memory reserves */
2608 if ((current->flags & PF_DUMPCORE) && 2614 if ((current->flags & PF_DUMPCORE) &&
2609 !(gfp_mask & __GFP_NOFAIL)) 2615 !(gfp_mask & __GFP_NOFAIL))
2610 goto nopage; 2616 goto nopage;
2611 page = __alloc_pages_may_oom(gfp_mask, order, 2617 page = __alloc_pages_may_oom(gfp_mask, order,
2612 zonelist, high_zoneidx, 2618 zonelist, high_zoneidx,
2613 nodemask, preferred_zone, 2619 nodemask, preferred_zone,
2614 migratetype); 2620 classzone_idx, migratetype);
2615 if (page) 2621 if (page)
2616 goto got_pg; 2622 goto got_pg;
2617 2623
2618 if (!(gfp_mask & __GFP_NOFAIL)) { 2624 if (!(gfp_mask & __GFP_NOFAIL)) {
2619 /* 2625 /*
2620 * The oom killer is not called for high-order 2626 * The oom killer is not called for high-order
2621 * allocations that may fail, so if no progress 2627 * allocations that may fail, so if no progress
2622 * is being made, there are no other options and 2628 * is being made, there are no other options and
2623 * retrying is unlikely to help. 2629 * retrying is unlikely to help.
2624 */ 2630 */
2625 if (order > PAGE_ALLOC_COSTLY_ORDER) 2631 if (order > PAGE_ALLOC_COSTLY_ORDER)
2626 goto nopage; 2632 goto nopage;
2627 /* 2633 /*
2628 * The oom killer is not called for lowmem 2634 * The oom killer is not called for lowmem
2629 * allocations to prevent needlessly killing 2635 * allocations to prevent needlessly killing
2630 * innocent tasks. 2636 * innocent tasks.
2631 */ 2637 */
2632 if (high_zoneidx < ZONE_NORMAL) 2638 if (high_zoneidx < ZONE_NORMAL)
2633 goto nopage; 2639 goto nopage;
2634 } 2640 }
2635 2641
2636 goto restart; 2642 goto restart;
2637 } 2643 }
2638 } 2644 }
2639 2645
2640 /* Check if we should retry the allocation */ 2646 /* Check if we should retry the allocation */
2641 pages_reclaimed += did_some_progress; 2647 pages_reclaimed += did_some_progress;
2642 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2648 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2643 pages_reclaimed)) { 2649 pages_reclaimed)) {
2644 /* Wait for some write requests to complete then retry */ 2650 /* Wait for some write requests to complete then retry */
2645 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2651 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2646 goto rebalance; 2652 goto rebalance;
2647 } else { 2653 } else {
2648 /* 2654 /*
2649 * High-order allocations do not necessarily loop after 2655 * High-order allocations do not necessarily loop after
2650 * direct reclaim and reclaim/compaction depends on compaction 2656 * direct reclaim and reclaim/compaction depends on compaction
2651 * being called after reclaim so call directly if necessary 2657 * being called after reclaim so call directly if necessary
2652 */ 2658 */
2653 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2659 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2654 high_zoneidx, nodemask, alloc_flags, 2660 high_zoneidx, nodemask, alloc_flags,
2655 preferred_zone, migratetype, 2661 preferred_zone,
2662 classzone_idx, migratetype,
2656 migration_mode, &contended_compaction, 2663 migration_mode, &contended_compaction,
2657 &deferred_compaction, 2664 &deferred_compaction,
2658 &did_some_progress); 2665 &did_some_progress);
2659 if (page) 2666 if (page)
2660 goto got_pg; 2667 goto got_pg;
2661 } 2668 }
2662 2669
2663 nopage: 2670 nopage:
2664 warn_alloc_failed(gfp_mask, order, NULL); 2671 warn_alloc_failed(gfp_mask, order, NULL);
2665 return page; 2672 return page;
2666 got_pg: 2673 got_pg:
2667 if (kmemcheck_enabled) 2674 if (kmemcheck_enabled)
2668 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2675 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2669 2676
2670 return page; 2677 return page;
2671 } 2678 }
2672 2679
2673 /* 2680 /*
2674 * This is the 'heart' of the zoned buddy allocator. 2681 * This is the 'heart' of the zoned buddy allocator.
2675 */ 2682 */
2676 struct page * 2683 struct page *
2677 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2684 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2678 struct zonelist *zonelist, nodemask_t *nodemask) 2685 struct zonelist *zonelist, nodemask_t *nodemask)
2679 { 2686 {
2680 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2687 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2681 struct zone *preferred_zone; 2688 struct zone *preferred_zone;
2689 struct zoneref *preferred_zoneref;
2682 struct page *page = NULL; 2690 struct page *page = NULL;
2683 int migratetype = allocflags_to_migratetype(gfp_mask); 2691 int migratetype = allocflags_to_migratetype(gfp_mask);
2684 unsigned int cpuset_mems_cookie; 2692 unsigned int cpuset_mems_cookie;
2685 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2693 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2686 struct mem_cgroup *memcg = NULL; 2694 struct mem_cgroup *memcg = NULL;
2695 int classzone_idx;
2687 2696
2688 gfp_mask &= gfp_allowed_mask; 2697 gfp_mask &= gfp_allowed_mask;
2689 2698
2690 lockdep_trace_alloc(gfp_mask); 2699 lockdep_trace_alloc(gfp_mask);
2691 2700
2692 might_sleep_if(gfp_mask & __GFP_WAIT); 2701 might_sleep_if(gfp_mask & __GFP_WAIT);
2693 2702
2694 if (should_fail_alloc_page(gfp_mask, order)) 2703 if (should_fail_alloc_page(gfp_mask, order))
2695 return NULL; 2704 return NULL;
2696 2705
2697 /* 2706 /*
2698 * Check the zones suitable for the gfp_mask contain at least one 2707 * Check the zones suitable for the gfp_mask contain at least one
2699 * valid zone. It's possible to have an empty zonelist as a result 2708 * valid zone. It's possible to have an empty zonelist as a result
2700 * of GFP_THISNODE and a memoryless node 2709 * of GFP_THISNODE and a memoryless node
2701 */ 2710 */
2702 if (unlikely(!zonelist->_zonerefs->zone)) 2711 if (unlikely(!zonelist->_zonerefs->zone))
2703 return NULL; 2712 return NULL;
2704 2713
2705 /* 2714 /*
2706 * Will only have any effect when __GFP_KMEMCG is set. This is 2715 * Will only have any effect when __GFP_KMEMCG is set. This is
2707 * verified in the (always inline) callee 2716 * verified in the (always inline) callee
2708 */ 2717 */
2709 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) 2718 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2710 return NULL; 2719 return NULL;
2711 2720
2712 retry_cpuset: 2721 retry_cpuset:
2713 cpuset_mems_cookie = read_mems_allowed_begin(); 2722 cpuset_mems_cookie = read_mems_allowed_begin();
2714 2723
2715 /* The preferred zone is used for statistics later */ 2724 /* The preferred zone is used for statistics later */
2716 first_zones_zonelist(zonelist, high_zoneidx, 2725 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2717 nodemask ? : &cpuset_current_mems_allowed, 2726 nodemask ? : &cpuset_current_mems_allowed,
2718 &preferred_zone); 2727 &preferred_zone);
2719 if (!preferred_zone) 2728 if (!preferred_zone)
2720 goto out; 2729 goto out;
2730 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2721 2731
2722 #ifdef CONFIG_CMA 2732 #ifdef CONFIG_CMA
2723 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2733 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2724 alloc_flags |= ALLOC_CMA; 2734 alloc_flags |= ALLOC_CMA;
2725 #endif 2735 #endif
2726 retry: 2736 retry:
2727 /* First allocation attempt */ 2737 /* First allocation attempt */
2728 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2738 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2729 zonelist, high_zoneidx, alloc_flags, 2739 zonelist, high_zoneidx, alloc_flags,
2730 preferred_zone, migratetype); 2740 preferred_zone, classzone_idx, migratetype);
2731 if (unlikely(!page)) { 2741 if (unlikely(!page)) {
2732 /* 2742 /*
2733 * The first pass makes sure allocations are spread 2743 * The first pass makes sure allocations are spread
2734 * fairly within the local node. However, the local 2744 * fairly within the local node. However, the local
2735 * node might have free pages left after the fairness 2745 * node might have free pages left after the fairness
2736 * batches are exhausted, and remote zones haven't 2746 * batches are exhausted, and remote zones haven't
2737 * even been considered yet. Try once more without 2747 * even been considered yet. Try once more without
2738 * fairness, and include remote zones now, before 2748 * fairness, and include remote zones now, before
2739 * entering the slowpath and waking kswapd: prefer 2749 * entering the slowpath and waking kswapd: prefer
2740 * spilling to a remote zone over swapping locally. 2750 * spilling to a remote zone over swapping locally.
2741 */ 2751 */
2742 if (alloc_flags & ALLOC_FAIR) { 2752 if (alloc_flags & ALLOC_FAIR) {
2743 reset_alloc_batches(zonelist, high_zoneidx, 2753 reset_alloc_batches(zonelist, high_zoneidx,
2744 preferred_zone); 2754 preferred_zone);
2745 alloc_flags &= ~ALLOC_FAIR; 2755 alloc_flags &= ~ALLOC_FAIR;
2746 goto retry; 2756 goto retry;
2747 } 2757 }
2748 /* 2758 /*
2749 * Runtime PM, block IO and its error handling path 2759 * Runtime PM, block IO and its error handling path
2750 * can deadlock because I/O on the device might not 2760 * can deadlock because I/O on the device might not
2751 * complete. 2761 * complete.
2752 */ 2762 */
2753 gfp_mask = memalloc_noio_flags(gfp_mask); 2763 gfp_mask = memalloc_noio_flags(gfp_mask);
2754 page = __alloc_pages_slowpath(gfp_mask, order, 2764 page = __alloc_pages_slowpath(gfp_mask, order,
2755 zonelist, high_zoneidx, nodemask, 2765 zonelist, high_zoneidx, nodemask,
2756 preferred_zone, migratetype); 2766 preferred_zone, classzone_idx, migratetype);
2757 } 2767 }
2758 2768
2759 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2769 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2760 2770
2761 out: 2771 out:
2762 /* 2772 /*
2763 * When updating a task's mems_allowed, it is possible to race with 2773 * When updating a task's mems_allowed, it is possible to race with
2764 * parallel threads in such a way that an allocation can fail while 2774 * parallel threads in such a way that an allocation can fail while
2765 * the mask is being updated. If a page allocation is about to fail, 2775 * the mask is being updated. If a page allocation is about to fail,
2766 * check if the cpuset changed during allocation and if so, retry. 2776 * check if the cpuset changed during allocation and if so, retry.
2767 */ 2777 */
2768 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2778 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2769 goto retry_cpuset; 2779 goto retry_cpuset;
2770 2780
2771 memcg_kmem_commit_charge(page, memcg, order); 2781 memcg_kmem_commit_charge(page, memcg, order);
2772 2782
2773 return page; 2783 return page;
2774 } 2784 }
2775 EXPORT_SYMBOL(__alloc_pages_nodemask); 2785 EXPORT_SYMBOL(__alloc_pages_nodemask);
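The hunks above carry the point of this patch: the classzone index is computed once from the zoneref returned by first_zones_zonelist() and passed down to get_page_from_freelist() and __alloc_pages_slowpath(), instead of each path recomputing zone_idx(preferred_zone). A minimal sketch of the pattern, using only the helpers already visible in this function:

	struct zoneref *preferred_zoneref;
	struct zone *preferred_zone;
	int classzone_idx;

	preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
						 nodemask, &preferred_zone);
	classzone_idx = zonelist_zone_idx(preferred_zoneref);
	/* classzone_idx is then handed unchanged to the fast and slow paths */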
2776 2786
2777 /* 2787 /*
2778 * Common helper functions. 2788 * Common helper functions.
2779 */ 2789 */
2780 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 2790 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2781 { 2791 {
2782 struct page *page; 2792 struct page *page;
2783 2793
2784 /* 2794 /*
2785 * __get_free_pages() returns a virtual address, which cannot represent 2795 * __get_free_pages() returns a virtual address, which cannot represent
2786 * a highmem page 2796 * a highmem page
2787 */ 2797 */
2788 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 2798 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2789 2799
2790 page = alloc_pages(gfp_mask, order); 2800 page = alloc_pages(gfp_mask, order);
2791 if (!page) 2801 if (!page)
2792 return 0; 2802 return 0;
2793 return (unsigned long) page_address(page); 2803 return (unsigned long) page_address(page);
2794 } 2804 }
2795 EXPORT_SYMBOL(__get_free_pages); 2805 EXPORT_SYMBOL(__get_free_pages);
2796 2806
2797 unsigned long get_zeroed_page(gfp_t gfp_mask) 2807 unsigned long get_zeroed_page(gfp_t gfp_mask)
2798 { 2808 {
2799 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 2809 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2800 } 2810 }
2801 EXPORT_SYMBOL(get_zeroed_page); 2811 EXPORT_SYMBOL(get_zeroed_page);
2802 2812
2803 void __free_pages(struct page *page, unsigned int order) 2813 void __free_pages(struct page *page, unsigned int order)
2804 { 2814 {
2805 if (put_page_testzero(page)) { 2815 if (put_page_testzero(page)) {
2806 if (order == 0) 2816 if (order == 0)
2807 free_hot_cold_page(page, 0); 2817 free_hot_cold_page(page, 0);
2808 else 2818 else
2809 __free_pages_ok(page, order); 2819 __free_pages_ok(page, order);
2810 } 2820 }
2811 } 2821 }
2812 2822
2813 EXPORT_SYMBOL(__free_pages); 2823 EXPORT_SYMBOL(__free_pages);
2814 2824
2815 void free_pages(unsigned long addr, unsigned int order) 2825 void free_pages(unsigned long addr, unsigned int order)
2816 { 2826 {
2817 if (addr != 0) { 2827 if (addr != 0) {
2818 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2828 VM_BUG_ON(!virt_addr_valid((void *)addr));
2819 __free_pages(virt_to_page((void *)addr), order); 2829 __free_pages(virt_to_page((void *)addr), order);
2820 } 2830 }
2821 } 2831 }
2822 2832
2823 EXPORT_SYMBOL(free_pages); 2833 EXPORT_SYMBOL(free_pages);
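As a usage sketch (hypothetical caller, kernel context assumed), a single zeroed lowmem page can be obtained and released with the helpers above:

	unsigned long addr = get_zeroed_page(GFP_KERNEL);

	if (addr) {
		/* ... use the zero-filled, page-sized buffer at addr ... */
		free_pages(addr, 0);	/* order 0: one page */
	}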
2824 2834
2825 /* 2835 /*
2826 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free 2836 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
2827 * pages allocated with __GFP_KMEMCG. 2837 * pages allocated with __GFP_KMEMCG.
2828 * 2838 *
2829 * Those pages are accounted to a particular memcg, embedded in the 2839 * Those pages are accounted to a particular memcg, embedded in the
2830 * corresponding page_cgroup. To avoid taking a hit in the allocator searching 2840 * corresponding page_cgroup. To avoid taking a hit in the allocator searching
2831 * for that information only to find that it is NULL for users who have no 2841 * for that information only to find that it is NULL for users who have no
2832 * interest in it whatsoever, we provide these functions. 2842 * interest in it whatsoever, we provide these functions.
2833 * 2843 *
2834 * The caller knows better which flags it relies on. 2844 * The caller knows better which flags it relies on.
2835 */ 2845 */
2836 void __free_memcg_kmem_pages(struct page *page, unsigned int order) 2846 void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2837 { 2847 {
2838 memcg_kmem_uncharge_pages(page, order); 2848 memcg_kmem_uncharge_pages(page, order);
2839 __free_pages(page, order); 2849 __free_pages(page, order);
2840 } 2850 }
2841 2851
2842 void free_memcg_kmem_pages(unsigned long addr, unsigned int order) 2852 void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2843 { 2853 {
2844 if (addr != 0) { 2854 if (addr != 0) {
2845 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2855 VM_BUG_ON(!virt_addr_valid((void *)addr));
2846 __free_memcg_kmem_pages(virt_to_page((void *)addr), order); 2856 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2847 } 2857 }
2848 } 2858 }
2849 2859
2850 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2860 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2851 { 2861 {
2852 if (addr) { 2862 if (addr) {
2853 unsigned long alloc_end = addr + (PAGE_SIZE << order); 2863 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2854 unsigned long used = addr + PAGE_ALIGN(size); 2864 unsigned long used = addr + PAGE_ALIGN(size);
2855 2865
2856 split_page(virt_to_page((void *)addr), order); 2866 split_page(virt_to_page((void *)addr), order);
2857 while (used < alloc_end) { 2867 while (used < alloc_end) {
2858 free_page(used); 2868 free_page(used);
2859 used += PAGE_SIZE; 2869 used += PAGE_SIZE;
2860 } 2870 }
2861 } 2871 }
2862 return (void *)addr; 2872 return (void *)addr;
2863 } 2873 }
2864 2874
2865 /** 2875 /**
2866 * alloc_pages_exact - allocate an exact number of physically-contiguous pages. 2876 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
2867 * @size: the number of bytes to allocate 2877 * @size: the number of bytes to allocate
2868 * @gfp_mask: GFP flags for the allocation 2878 * @gfp_mask: GFP flags for the allocation
2869 * 2879 *
2870 * This function is similar to alloc_pages(), except that it allocates the 2880 * This function is similar to alloc_pages(), except that it allocates the
2871 * minimum number of pages to satisfy the request. alloc_pages() can only 2881 * minimum number of pages to satisfy the request. alloc_pages() can only
2872 * allocate memory in power-of-two numbers of pages. 2882 * allocate memory in power-of-two numbers of pages.
2873 * 2883 *
2874 * This function is also limited by MAX_ORDER. 2884 * This function is also limited by MAX_ORDER.
2875 * 2885 *
2876 * Memory allocated by this function must be released by free_pages_exact(). 2886 * Memory allocated by this function must be released by free_pages_exact().
2877 */ 2887 */
2878 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 2888 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2879 { 2889 {
2880 unsigned int order = get_order(size); 2890 unsigned int order = get_order(size);
2881 unsigned long addr; 2891 unsigned long addr;
2882 2892
2883 addr = __get_free_pages(gfp_mask, order); 2893 addr = __get_free_pages(gfp_mask, order);
2884 return make_alloc_exact(addr, order, size); 2894 return make_alloc_exact(addr, order, size);
2885 } 2895 }
2886 EXPORT_SYMBOL(alloc_pages_exact); 2896 EXPORT_SYMBOL(alloc_pages_exact);
2887 2897
2888 /** 2898 /**
2889 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 2899 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2890 * pages on a node. 2900 * pages on a node.
2891 * @nid: the preferred node ID where memory should be allocated 2901 * @nid: the preferred node ID where memory should be allocated
2892 * @size: the number of bytes to allocate 2902 * @size: the number of bytes to allocate
2893 * @gfp_mask: GFP flags for the allocation 2903 * @gfp_mask: GFP flags for the allocation
2894 * 2904 *
2895 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 2905 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2896 * back. 2906 * back.
2897 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2907 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2898 * but is not exact. 2908 * but is not exact.
2899 */ 2909 */
2900 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2910 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2901 { 2911 {
2902 unsigned order = get_order(size); 2912 unsigned order = get_order(size);
2903 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2913 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2904 if (!p) 2914 if (!p)
2905 return NULL; 2915 return NULL;
2906 return make_alloc_exact((unsigned long)page_address(p), order, size); 2916 return make_alloc_exact((unsigned long)page_address(p), order, size);
2907 } 2917 }
2908 EXPORT_SYMBOL(alloc_pages_exact_nid); 2918 EXPORT_SYMBOL(alloc_pages_exact_nid);
2909 2919
2910 /** 2920 /**
2911 * free_pages_exact - release memory allocated via alloc_pages_exact() 2921 * free_pages_exact - release memory allocated via alloc_pages_exact()
2912 * @virt: the value returned by alloc_pages_exact. 2922 * @virt: the value returned by alloc_pages_exact.
2913 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2923 * @size: size of allocation, same value as passed to alloc_pages_exact().
2914 * 2924 *
2915 * Release the memory allocated by a previous call to alloc_pages_exact. 2925 * Release the memory allocated by a previous call to alloc_pages_exact.
2916 */ 2926 */
2917 void free_pages_exact(void *virt, size_t size) 2927 void free_pages_exact(void *virt, size_t size)
2918 { 2928 {
2919 unsigned long addr = (unsigned long)virt; 2929 unsigned long addr = (unsigned long)virt;
2920 unsigned long end = addr + PAGE_ALIGN(size); 2930 unsigned long end = addr + PAGE_ALIGN(size);
2921 2931
2922 while (addr < end) { 2932 while (addr < end) {
2923 free_page(addr); 2933 free_page(addr);
2924 addr += PAGE_SIZE; 2934 addr += PAGE_SIZE;
2925 } 2935 }
2926 } 2936 }
2927 EXPORT_SYMBOL(free_pages_exact); 2937 EXPORT_SYMBOL(free_pages_exact);
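A usage sketch with hypothetical sizes: alloc_pages_exact() rounds the request up to whole pages, make_alloc_exact() returns the tail pages beyond PAGE_ALIGN(size) to the allocator, and the buffer is later released with free_pages_exact() using the same size:

	size_t size = 5 * PAGE_SIZE + 100;	/* not a power-of-two number of pages */
	void *buf = alloc_pages_exact(size, GFP_KERNEL);

	if (buf) {
		/* ... PAGE_ALIGN(size) bytes of physically contiguous memory ... */
		free_pages_exact(buf, size);
	}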
2928 2938
2929 /** 2939 /**
2930 * nr_free_zone_pages - count number of pages beyond high watermark 2940 * nr_free_zone_pages - count number of pages beyond high watermark
2931 * @offset: The zone index of the highest zone 2941 * @offset: The zone index of the highest zone
2932 * 2942 *
2933 * nr_free_zone_pages() counts the number of pages which are beyond the 2943 * nr_free_zone_pages() counts the number of pages which are beyond the
2934 * high watermark within all zones at or below a given zone index. For each 2944 * high watermark within all zones at or below a given zone index. For each
2935 * zone, the number of pages is calculated as: 2945 * zone, the number of pages is calculated as:
2936 * managed_pages - high_pages 2946 * managed_pages - high_pages
2937 */ 2947 */
2938 static unsigned long nr_free_zone_pages(int offset) 2948 static unsigned long nr_free_zone_pages(int offset)
2939 { 2949 {
2940 struct zoneref *z; 2950 struct zoneref *z;
2941 struct zone *zone; 2951 struct zone *zone;
2942 2952
2943 /* Just pick one node, since fallback list is circular */ 2953 /* Just pick one node, since fallback list is circular */
2944 unsigned long sum = 0; 2954 unsigned long sum = 0;
2945 2955
2946 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2956 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2947 2957
2948 for_each_zone_zonelist(zone, z, zonelist, offset) { 2958 for_each_zone_zonelist(zone, z, zonelist, offset) {
2949 unsigned long size = zone->managed_pages; 2959 unsigned long size = zone->managed_pages;
2950 unsigned long high = high_wmark_pages(zone); 2960 unsigned long high = high_wmark_pages(zone);
2951 if (size > high) 2961 if (size > high)
2952 sum += size - high; 2962 sum += size - high;
2953 } 2963 }
2954 2964
2955 return sum; 2965 return sum;
2956 } 2966 }
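A worked illustration with hypothetical numbers: for a zonelist containing ZONE_DMA (managed_pages = 4096, high watermark = 128) and ZONE_NORMAL (managed_pages = 262144, high watermark = 4096),

	nr_free_zone_pages(ZONE_NORMAL) = (4096 - 128) + (262144 - 4096)
	                                = 262016 pages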
2957 2967
2958 /** 2968 /**
2959 * nr_free_buffer_pages - count number of pages beyond high watermark 2969 * nr_free_buffer_pages - count number of pages beyond high watermark
2960 * 2970 *
2961 * nr_free_buffer_pages() counts the number of pages which are beyond the high 2971 * nr_free_buffer_pages() counts the number of pages which are beyond the high
2962 * watermark within ZONE_DMA and ZONE_NORMAL. 2972 * watermark within ZONE_DMA and ZONE_NORMAL.
2963 */ 2973 */
2964 unsigned long nr_free_buffer_pages(void) 2974 unsigned long nr_free_buffer_pages(void)
2965 { 2975 {
2966 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2976 return nr_free_zone_pages(gfp_zone(GFP_USER));
2967 } 2977 }
2968 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2978 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2969 2979
2970 /** 2980 /**
2971 * nr_free_pagecache_pages - count number of pages beyond high watermark 2981 * nr_free_pagecache_pages - count number of pages beyond high watermark
2972 * 2982 *
2973 * nr_free_pagecache_pages() counts the number of pages which are beyond the 2983 * nr_free_pagecache_pages() counts the number of pages which are beyond the
2974 * high watermark within all zones. 2984 * high watermark within all zones.
2975 */ 2985 */
2976 unsigned long nr_free_pagecache_pages(void) 2986 unsigned long nr_free_pagecache_pages(void)
2977 { 2987 {
2978 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 2988 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2979 } 2989 }
2980 2990
2981 static inline void show_node(struct zone *zone) 2991 static inline void show_node(struct zone *zone)
2982 { 2992 {
2983 if (IS_ENABLED(CONFIG_NUMA)) 2993 if (IS_ENABLED(CONFIG_NUMA))
2984 printk("Node %d ", zone_to_nid(zone)); 2994 printk("Node %d ", zone_to_nid(zone));
2985 } 2995 }
2986 2996
2987 void si_meminfo(struct sysinfo *val) 2997 void si_meminfo(struct sysinfo *val)
2988 { 2998 {
2989 val->totalram = totalram_pages; 2999 val->totalram = totalram_pages;
2990 val->sharedram = 0; 3000 val->sharedram = 0;
2991 val->freeram = global_page_state(NR_FREE_PAGES); 3001 val->freeram = global_page_state(NR_FREE_PAGES);
2992 val->bufferram = nr_blockdev_pages(); 3002 val->bufferram = nr_blockdev_pages();
2993 val->totalhigh = totalhigh_pages; 3003 val->totalhigh = totalhigh_pages;
2994 val->freehigh = nr_free_highpages(); 3004 val->freehigh = nr_free_highpages();
2995 val->mem_unit = PAGE_SIZE; 3005 val->mem_unit = PAGE_SIZE;
2996 } 3006 }
2997 3007
2998 EXPORT_SYMBOL(si_meminfo); 3008 EXPORT_SYMBOL(si_meminfo);
2999 3009
3000 #ifdef CONFIG_NUMA 3010 #ifdef CONFIG_NUMA
3001 void si_meminfo_node(struct sysinfo *val, int nid) 3011 void si_meminfo_node(struct sysinfo *val, int nid)
3002 { 3012 {
3003 int zone_type; /* needs to be signed */ 3013 int zone_type; /* needs to be signed */
3004 unsigned long managed_pages = 0; 3014 unsigned long managed_pages = 0;
3005 pg_data_t *pgdat = NODE_DATA(nid); 3015 pg_data_t *pgdat = NODE_DATA(nid);
3006 3016
3007 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 3017 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
3008 managed_pages += pgdat->node_zones[zone_type].managed_pages; 3018 managed_pages += pgdat->node_zones[zone_type].managed_pages;
3009 val->totalram = managed_pages; 3019 val->totalram = managed_pages;
3010 val->freeram = node_page_state(nid, NR_FREE_PAGES); 3020 val->freeram = node_page_state(nid, NR_FREE_PAGES);
3011 #ifdef CONFIG_HIGHMEM 3021 #ifdef CONFIG_HIGHMEM
3012 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 3022 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
3013 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 3023 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
3014 NR_FREE_PAGES); 3024 NR_FREE_PAGES);
3015 #else 3025 #else
3016 val->totalhigh = 0; 3026 val->totalhigh = 0;
3017 val->freehigh = 0; 3027 val->freehigh = 0;
3018 #endif 3028 #endif
3019 val->mem_unit = PAGE_SIZE; 3029 val->mem_unit = PAGE_SIZE;
3020 } 3030 }
3021 #endif 3031 #endif
3022 3032
3023 /* 3033 /*
3024 * Determine whether the node should be displayed or not, depending on whether 3034 * Determine whether the node should be displayed or not, depending on whether
3025 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 3035 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
3026 */ 3036 */
3027 bool skip_free_areas_node(unsigned int flags, int nid) 3037 bool skip_free_areas_node(unsigned int flags, int nid)
3028 { 3038 {
3029 bool ret = false; 3039 bool ret = false;
3030 unsigned int cpuset_mems_cookie; 3040 unsigned int cpuset_mems_cookie;
3031 3041
3032 if (!(flags & SHOW_MEM_FILTER_NODES)) 3042 if (!(flags & SHOW_MEM_FILTER_NODES))
3033 goto out; 3043 goto out;
3034 3044
3035 do { 3045 do {
3036 cpuset_mems_cookie = read_mems_allowed_begin(); 3046 cpuset_mems_cookie = read_mems_allowed_begin();
3037 ret = !node_isset(nid, cpuset_current_mems_allowed); 3047 ret = !node_isset(nid, cpuset_current_mems_allowed);
3038 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 3048 } while (read_mems_allowed_retry(cpuset_mems_cookie));
3039 out: 3049 out:
3040 return ret; 3050 return ret;
3041 } 3051 }
3042 3052
3043 #define K(x) ((x) << (PAGE_SHIFT-10)) 3053 #define K(x) ((x) << (PAGE_SHIFT-10))
3044 3054
3045 static void show_migration_types(unsigned char type) 3055 static void show_migration_types(unsigned char type)
3046 { 3056 {
3047 static const char types[MIGRATE_TYPES] = { 3057 static const char types[MIGRATE_TYPES] = {
3048 [MIGRATE_UNMOVABLE] = 'U', 3058 [MIGRATE_UNMOVABLE] = 'U',
3049 [MIGRATE_RECLAIMABLE] = 'E', 3059 [MIGRATE_RECLAIMABLE] = 'E',
3050 [MIGRATE_MOVABLE] = 'M', 3060 [MIGRATE_MOVABLE] = 'M',
3051 [MIGRATE_RESERVE] = 'R', 3061 [MIGRATE_RESERVE] = 'R',
3052 #ifdef CONFIG_CMA 3062 #ifdef CONFIG_CMA
3053 [MIGRATE_CMA] = 'C', 3063 [MIGRATE_CMA] = 'C',
3054 #endif 3064 #endif
3055 #ifdef CONFIG_MEMORY_ISOLATION 3065 #ifdef CONFIG_MEMORY_ISOLATION
3056 [MIGRATE_ISOLATE] = 'I', 3066 [MIGRATE_ISOLATE] = 'I',
3057 #endif 3067 #endif
3058 }; 3068 };
3059 char tmp[MIGRATE_TYPES + 1]; 3069 char tmp[MIGRATE_TYPES + 1];
3060 char *p = tmp; 3070 char *p = tmp;
3061 int i; 3071 int i;
3062 3072
3063 for (i = 0; i < MIGRATE_TYPES; i++) { 3073 for (i = 0; i < MIGRATE_TYPES; i++) {
3064 if (type & (1 << i)) 3074 if (type & (1 << i))
3065 *p++ = types[i]; 3075 *p++ = types[i];
3066 } 3076 }
3067 3077
3068 *p = '\0'; 3078 *p = '\0';
3069 printk("(%s) ", tmp); 3079 printk("(%s) ", tmp);
3070 } 3080 }
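For example, assuming MIGRATE_UNMOVABLE precedes MIGRATE_MOVABLE in the migratetype enum (as the table above suggests), a bitmask with both of those bits set is printed as "(UM) ":

	unsigned char type = (1 << MIGRATE_UNMOVABLE) | (1 << MIGRATE_MOVABLE);

	show_migration_types(type);	/* prints "(UM) " */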
3071 3081
3072 /* 3082 /*
3073 * Show free area list (used inside shift_scroll-lock stuff) 3083 * Show free area list (used inside shift_scroll-lock stuff)
3074 * We also calculate the percentage fragmentation. We do this by counting the 3084 * We also calculate the percentage fragmentation. We do this by counting the
3075 * memory on each free list with the exception of the first item on the list. 3085 * memory on each free list with the exception of the first item on the list.
3076 * Suppresses nodes that are not allowed by current's cpuset if 3086 * Suppresses nodes that are not allowed by current's cpuset if
3077 * SHOW_MEM_FILTER_NODES is passed. 3087 * SHOW_MEM_FILTER_NODES is passed.
3078 */ 3088 */
3079 void show_free_areas(unsigned int filter) 3089 void show_free_areas(unsigned int filter)
3080 { 3090 {
3081 int cpu; 3091 int cpu;
3082 struct zone *zone; 3092 struct zone *zone;
3083 3093
3084 for_each_populated_zone(zone) { 3094 for_each_populated_zone(zone) {
3085 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3095 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3086 continue; 3096 continue;
3087 show_node(zone); 3097 show_node(zone);
3088 printk("%s per-cpu:\n", zone->name); 3098 printk("%s per-cpu:\n", zone->name);
3089 3099
3090 for_each_online_cpu(cpu) { 3100 for_each_online_cpu(cpu) {
3091 struct per_cpu_pageset *pageset; 3101 struct per_cpu_pageset *pageset;
3092 3102
3093 pageset = per_cpu_ptr(zone->pageset, cpu); 3103 pageset = per_cpu_ptr(zone->pageset, cpu);
3094 3104
3095 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 3105 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
3096 cpu, pageset->pcp.high, 3106 cpu, pageset->pcp.high,
3097 pageset->pcp.batch, pageset->pcp.count); 3107 pageset->pcp.batch, pageset->pcp.count);
3098 } 3108 }
3099 } 3109 }
3100 3110
3101 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 3111 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
3102 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 3112 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
3103 " unevictable:%lu" 3113 " unevictable:%lu"
3104 " dirty:%lu writeback:%lu unstable:%lu\n" 3114 " dirty:%lu writeback:%lu unstable:%lu\n"
3105 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 3115 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
3106 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 3116 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
3107 " free_cma:%lu\n", 3117 " free_cma:%lu\n",
3108 global_page_state(NR_ACTIVE_ANON), 3118 global_page_state(NR_ACTIVE_ANON),
3109 global_page_state(NR_INACTIVE_ANON), 3119 global_page_state(NR_INACTIVE_ANON),
3110 global_page_state(NR_ISOLATED_ANON), 3120 global_page_state(NR_ISOLATED_ANON),
3111 global_page_state(NR_ACTIVE_FILE), 3121 global_page_state(NR_ACTIVE_FILE),
3112 global_page_state(NR_INACTIVE_FILE), 3122 global_page_state(NR_INACTIVE_FILE),
3113 global_page_state(NR_ISOLATED_FILE), 3123 global_page_state(NR_ISOLATED_FILE),
3114 global_page_state(NR_UNEVICTABLE), 3124 global_page_state(NR_UNEVICTABLE),
3115 global_page_state(NR_FILE_DIRTY), 3125 global_page_state(NR_FILE_DIRTY),
3116 global_page_state(NR_WRITEBACK), 3126 global_page_state(NR_WRITEBACK),
3117 global_page_state(NR_UNSTABLE_NFS), 3127 global_page_state(NR_UNSTABLE_NFS),
3118 global_page_state(NR_FREE_PAGES), 3128 global_page_state(NR_FREE_PAGES),
3119 global_page_state(NR_SLAB_RECLAIMABLE), 3129 global_page_state(NR_SLAB_RECLAIMABLE),
3120 global_page_state(NR_SLAB_UNRECLAIMABLE), 3130 global_page_state(NR_SLAB_UNRECLAIMABLE),
3121 global_page_state(NR_FILE_MAPPED), 3131 global_page_state(NR_FILE_MAPPED),
3122 global_page_state(NR_SHMEM), 3132 global_page_state(NR_SHMEM),
3123 global_page_state(NR_PAGETABLE), 3133 global_page_state(NR_PAGETABLE),
3124 global_page_state(NR_BOUNCE), 3134 global_page_state(NR_BOUNCE),
3125 global_page_state(NR_FREE_CMA_PAGES)); 3135 global_page_state(NR_FREE_CMA_PAGES));
3126 3136
3127 for_each_populated_zone(zone) { 3137 for_each_populated_zone(zone) {
3128 int i; 3138 int i;
3129 3139
3130 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3140 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3131 continue; 3141 continue;
3132 show_node(zone); 3142 show_node(zone);
3133 printk("%s" 3143 printk("%s"
3134 " free:%lukB" 3144 " free:%lukB"
3135 " min:%lukB" 3145 " min:%lukB"
3136 " low:%lukB" 3146 " low:%lukB"
3137 " high:%lukB" 3147 " high:%lukB"
3138 " active_anon:%lukB" 3148 " active_anon:%lukB"
3139 " inactive_anon:%lukB" 3149 " inactive_anon:%lukB"
3140 " active_file:%lukB" 3150 " active_file:%lukB"
3141 " inactive_file:%lukB" 3151 " inactive_file:%lukB"
3142 " unevictable:%lukB" 3152 " unevictable:%lukB"
3143 " isolated(anon):%lukB" 3153 " isolated(anon):%lukB"
3144 " isolated(file):%lukB" 3154 " isolated(file):%lukB"
3145 " present:%lukB" 3155 " present:%lukB"
3146 " managed:%lukB" 3156 " managed:%lukB"
3147 " mlocked:%lukB" 3157 " mlocked:%lukB"
3148 " dirty:%lukB" 3158 " dirty:%lukB"
3149 " writeback:%lukB" 3159 " writeback:%lukB"
3150 " mapped:%lukB" 3160 " mapped:%lukB"
3151 " shmem:%lukB" 3161 " shmem:%lukB"
3152 " slab_reclaimable:%lukB" 3162 " slab_reclaimable:%lukB"
3153 " slab_unreclaimable:%lukB" 3163 " slab_unreclaimable:%lukB"
3154 " kernel_stack:%lukB" 3164 " kernel_stack:%lukB"
3155 " pagetables:%lukB" 3165 " pagetables:%lukB"
3156 " unstable:%lukB" 3166 " unstable:%lukB"
3157 " bounce:%lukB" 3167 " bounce:%lukB"
3158 " free_cma:%lukB" 3168 " free_cma:%lukB"
3159 " writeback_tmp:%lukB" 3169 " writeback_tmp:%lukB"
3160 " pages_scanned:%lu" 3170 " pages_scanned:%lu"
3161 " all_unreclaimable? %s" 3171 " all_unreclaimable? %s"
3162 "\n", 3172 "\n",
3163 zone->name, 3173 zone->name,
3164 K(zone_page_state(zone, NR_FREE_PAGES)), 3174 K(zone_page_state(zone, NR_FREE_PAGES)),
3165 K(min_wmark_pages(zone)), 3175 K(min_wmark_pages(zone)),
3166 K(low_wmark_pages(zone)), 3176 K(low_wmark_pages(zone)),
3167 K(high_wmark_pages(zone)), 3177 K(high_wmark_pages(zone)),
3168 K(zone_page_state(zone, NR_ACTIVE_ANON)), 3178 K(zone_page_state(zone, NR_ACTIVE_ANON)),
3169 K(zone_page_state(zone, NR_INACTIVE_ANON)), 3179 K(zone_page_state(zone, NR_INACTIVE_ANON)),
3170 K(zone_page_state(zone, NR_ACTIVE_FILE)), 3180 K(zone_page_state(zone, NR_ACTIVE_FILE)),
3171 K(zone_page_state(zone, NR_INACTIVE_FILE)), 3181 K(zone_page_state(zone, NR_INACTIVE_FILE)),
3172 K(zone_page_state(zone, NR_UNEVICTABLE)), 3182 K(zone_page_state(zone, NR_UNEVICTABLE)),
3173 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3183 K(zone_page_state(zone, NR_ISOLATED_ANON)),
3174 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3184 K(zone_page_state(zone, NR_ISOLATED_FILE)),
3175 K(zone->present_pages), 3185 K(zone->present_pages),
3176 K(zone->managed_pages), 3186 K(zone->managed_pages),
3177 K(zone_page_state(zone, NR_MLOCK)), 3187 K(zone_page_state(zone, NR_MLOCK)),
3178 K(zone_page_state(zone, NR_FILE_DIRTY)), 3188 K(zone_page_state(zone, NR_FILE_DIRTY)),
3179 K(zone_page_state(zone, NR_WRITEBACK)), 3189 K(zone_page_state(zone, NR_WRITEBACK)),
3180 K(zone_page_state(zone, NR_FILE_MAPPED)), 3190 K(zone_page_state(zone, NR_FILE_MAPPED)),
3181 K(zone_page_state(zone, NR_SHMEM)), 3191 K(zone_page_state(zone, NR_SHMEM)),
3182 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 3192 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
3183 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 3193 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
3184 zone_page_state(zone, NR_KERNEL_STACK) * 3194 zone_page_state(zone, NR_KERNEL_STACK) *
3185 THREAD_SIZE / 1024, 3195 THREAD_SIZE / 1024,
3186 K(zone_page_state(zone, NR_PAGETABLE)), 3196 K(zone_page_state(zone, NR_PAGETABLE)),
3187 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 3197 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
3188 K(zone_page_state(zone, NR_BOUNCE)), 3198 K(zone_page_state(zone, NR_BOUNCE)),
3189 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3199 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3190 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3200 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3191 zone->pages_scanned, 3201 zone->pages_scanned,
3192 (!zone_reclaimable(zone) ? "yes" : "no") 3202 (!zone_reclaimable(zone) ? "yes" : "no")
3193 ); 3203 );
3194 printk("lowmem_reserve[]:"); 3204 printk("lowmem_reserve[]:");
3195 for (i = 0; i < MAX_NR_ZONES; i++) 3205 for (i = 0; i < MAX_NR_ZONES; i++)
3196 printk(" %lu", zone->lowmem_reserve[i]); 3206 printk(" %lu", zone->lowmem_reserve[i]);
3197 printk("\n"); 3207 printk("\n");
3198 } 3208 }
3199 3209
3200 for_each_populated_zone(zone) { 3210 for_each_populated_zone(zone) {
3201 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3211 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3202 unsigned char types[MAX_ORDER]; 3212 unsigned char types[MAX_ORDER];
3203 3213
3204 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3214 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3205 continue; 3215 continue;
3206 show_node(zone); 3216 show_node(zone);
3207 printk("%s: ", zone->name); 3217 printk("%s: ", zone->name);
3208 3218
3209 spin_lock_irqsave(&zone->lock, flags); 3219 spin_lock_irqsave(&zone->lock, flags);
3210 for (order = 0; order < MAX_ORDER; order++) { 3220 for (order = 0; order < MAX_ORDER; order++) {
3211 struct free_area *area = &zone->free_area[order]; 3221 struct free_area *area = &zone->free_area[order];
3212 int type; 3222 int type;
3213 3223
3214 nr[order] = area->nr_free; 3224 nr[order] = area->nr_free;
3215 total += nr[order] << order; 3225 total += nr[order] << order;
3216 3226
3217 types[order] = 0; 3227 types[order] = 0;
3218 for (type = 0; type < MIGRATE_TYPES; type++) { 3228 for (type = 0; type < MIGRATE_TYPES; type++) {
3219 if (!list_empty(&area->free_list[type])) 3229 if (!list_empty(&area->free_list[type]))
3220 types[order] |= 1 << type; 3230 types[order] |= 1 << type;
3221 } 3231 }
3222 } 3232 }
3223 spin_unlock_irqrestore(&zone->lock, flags); 3233 spin_unlock_irqrestore(&zone->lock, flags);
3224 for (order = 0; order < MAX_ORDER; order++) { 3234 for (order = 0; order < MAX_ORDER; order++) {
3225 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3235 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3226 if (nr[order]) 3236 if (nr[order])
3227 show_migration_types(types[order]); 3237 show_migration_types(types[order]);
3228 } 3238 }
3229 printk("= %lukB\n", K(total)); 3239 printk("= %lukB\n", K(total));
3230 } 3240 }
3231 3241
3232 hugetlb_show_meminfo(); 3242 hugetlb_show_meminfo();
3233 3243
3234 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 3244 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
3235 3245
3236 show_swap_cache_info(); 3246 show_swap_cache_info();
3237 } 3247 }
3238 3248
3239 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 3249 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3240 { 3250 {
3241 zoneref->zone = zone; 3251 zoneref->zone = zone;
3242 zoneref->zone_idx = zone_idx(zone); 3252 zoneref->zone_idx = zone_idx(zone);
3243 } 3253 }
3244 3254
3245 /* 3255 /*
3246 * Builds allocation fallback zone lists. 3256 * Builds allocation fallback zone lists.
3247 * 3257 *
3248 * Add all populated zones of a node to the zonelist. 3258 * Add all populated zones of a node to the zonelist.
3249 */ 3259 */
3250 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 3260 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3251 int nr_zones) 3261 int nr_zones)
3252 { 3262 {
3253 struct zone *zone; 3263 struct zone *zone;
3254 enum zone_type zone_type = MAX_NR_ZONES; 3264 enum zone_type zone_type = MAX_NR_ZONES;
3255 3265
3256 do { 3266 do {
3257 zone_type--; 3267 zone_type--;
3258 zone = pgdat->node_zones + zone_type; 3268 zone = pgdat->node_zones + zone_type;
3259 if (populated_zone(zone)) { 3269 if (populated_zone(zone)) {
3260 zoneref_set_zone(zone, 3270 zoneref_set_zone(zone,
3261 &zonelist->_zonerefs[nr_zones++]); 3271 &zonelist->_zonerefs[nr_zones++]);
3262 check_highest_zone(zone_type); 3272 check_highest_zone(zone_type);
3263 } 3273 }
3264 } while (zone_type); 3274 } while (zone_type);
3265 3275
3266 return nr_zones; 3276 return nr_zones;
3267 } 3277 }
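Because the loop above walks zone_type from MAX_NR_ZONES - 1 down to 0 and records only populated zones, the zonerefs end up ordered highest zone first. A hypothetical node with only ZONE_DMA and ZONE_NORMAL populated contributes:

	/*
	 * _zonerefs[nr_zones + 0] -> node_zones[ZONE_NORMAL]	(added first)
	 * _zonerefs[nr_zones + 1] -> node_zones[ZONE_DMA]
	 * so allocations prefer Normal and fall back to DMA last.
	 */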
3268 3278
3269 3279
3270 /* 3280 /*
3271 * zonelist_order: 3281 * zonelist_order:
3272 * 0 = automatic detection of better ordering. 3282 * 0 = automatic detection of better ordering.
3273 * 1 = order by ([node] distance, -zonetype) 3283 * 1 = order by ([node] distance, -zonetype)
3274 * 2 = order by (-zonetype, [node] distance) 3284 * 2 = order by (-zonetype, [node] distance)
3275 * 3285 *
3276 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 3286 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
3277 * the same zonelist. So only NUMA can configure this param. 3287 * the same zonelist. So only NUMA can configure this param.
3278 */ 3288 */
3279 #define ZONELIST_ORDER_DEFAULT 0 3289 #define ZONELIST_ORDER_DEFAULT 0
3280 #define ZONELIST_ORDER_NODE 1 3290 #define ZONELIST_ORDER_NODE 1
3281 #define ZONELIST_ORDER_ZONE 2 3291 #define ZONELIST_ORDER_ZONE 2
3282 3292
3283 /* zonelist order in the kernel. 3293 /* zonelist order in the kernel.
3284 * set_zonelist_order() will set this to NODE or ZONE. 3294 * set_zonelist_order() will set this to NODE or ZONE.
3285 */ 3295 */
3286 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 3296 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
3287 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 3297 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
3288 3298
3289 3299
3290 #ifdef CONFIG_NUMA 3300 #ifdef CONFIG_NUMA
3291 /* The value the user specified ... changed by config */ 3301 /* The value the user specified ... changed by config */
3292 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3302 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3293 /* string for sysctl */ 3303 /* string for sysctl */
3294 #define NUMA_ZONELIST_ORDER_LEN 16 3304 #define NUMA_ZONELIST_ORDER_LEN 16
3295 char numa_zonelist_order[16] = "default"; 3305 char numa_zonelist_order[16] = "default";
3296 3306
3297 /* 3307 /*
3298 * interface to configure zonelist ordering. 3308 * interface to configure zonelist ordering.
3299 * command line option "numa_zonelist_order" 3309 * command line option "numa_zonelist_order"
3300 * = "[dD]efault" - default, automatic configuration. 3310 * = "[dD]efault" - default, automatic configuration.
3301 * = "[nN]ode" - order by node locality, then by zone within node 3311 * = "[nN]ode" - order by node locality, then by zone within node
3302 * = "[zZ]one" - order by zone, then by locality within zone 3312 * = "[zZ]one" - order by zone, then by locality within zone
3303 */ 3313 */
3304 3314
3305 static int __parse_numa_zonelist_order(char *s) 3315 static int __parse_numa_zonelist_order(char *s)
3306 { 3316 {
3307 if (*s == 'd' || *s == 'D') { 3317 if (*s == 'd' || *s == 'D') {
3308 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3318 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3309 } else if (*s == 'n' || *s == 'N') { 3319 } else if (*s == 'n' || *s == 'N') {
3310 user_zonelist_order = ZONELIST_ORDER_NODE; 3320 user_zonelist_order = ZONELIST_ORDER_NODE;
3311 } else if (*s == 'z' || *s == 'Z') { 3321 } else if (*s == 'z' || *s == 'Z') {
3312 user_zonelist_order = ZONELIST_ORDER_ZONE; 3322 user_zonelist_order = ZONELIST_ORDER_ZONE;
3313 } else { 3323 } else {
3314 printk(KERN_WARNING 3324 printk(KERN_WARNING
3315 "Ignoring invalid numa_zonelist_order value: " 3325 "Ignoring invalid numa_zonelist_order value: "
3316 "%s\n", s); 3326 "%s\n", s);
3317 return -EINVAL; 3327 return -EINVAL;
3318 } 3328 }
3319 return 0; 3329 return 0;
3320 } 3330 }
3321 3331
3322 static __init int setup_numa_zonelist_order(char *s) 3332 static __init int setup_numa_zonelist_order(char *s)
3323 { 3333 {
3324 int ret; 3334 int ret;
3325 3335
3326 if (!s) 3336 if (!s)
3327 return 0; 3337 return 0;
3328 3338
3329 ret = __parse_numa_zonelist_order(s); 3339 ret = __parse_numa_zonelist_order(s);
3330 if (ret == 0) 3340 if (ret == 0)
3331 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 3341 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3332 3342
3333 return ret; 3343 return ret;
3334 } 3344 }
3335 early_param("numa_zonelist_order", setup_numa_zonelist_order); 3345 early_param("numa_zonelist_order", setup_numa_zonelist_order);
3336 3346
3337 /* 3347 /*
3338 * sysctl handler for numa_zonelist_order 3348 * sysctl handler for numa_zonelist_order
3339 */ 3349 */
3340 int numa_zonelist_order_handler(ctl_table *table, int write, 3350 int numa_zonelist_order_handler(ctl_table *table, int write,
3341 void __user *buffer, size_t *length, 3351 void __user *buffer, size_t *length,
3342 loff_t *ppos) 3352 loff_t *ppos)
3343 { 3353 {
3344 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 3354 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3345 int ret; 3355 int ret;
3346 static DEFINE_MUTEX(zl_order_mutex); 3356 static DEFINE_MUTEX(zl_order_mutex);
3347 3357
3348 mutex_lock(&zl_order_mutex); 3358 mutex_lock(&zl_order_mutex);
3349 if (write) { 3359 if (write) {
3350 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { 3360 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
3351 ret = -EINVAL; 3361 ret = -EINVAL;
3352 goto out; 3362 goto out;
3353 } 3363 }
3354 strcpy(saved_string, (char *)table->data); 3364 strcpy(saved_string, (char *)table->data);
3355 } 3365 }
3356 ret = proc_dostring(table, write, buffer, length, ppos); 3366 ret = proc_dostring(table, write, buffer, length, ppos);
3357 if (ret) 3367 if (ret)
3358 goto out; 3368 goto out;
3359 if (write) { 3369 if (write) {
3360 int oldval = user_zonelist_order; 3370 int oldval = user_zonelist_order;
3361 3371
3362 ret = __parse_numa_zonelist_order((char *)table->data); 3372 ret = __parse_numa_zonelist_order((char *)table->data);
3363 if (ret) { 3373 if (ret) {
3364 /* 3374 /*
3365 * bogus value. restore saved string 3375 * bogus value. restore saved string
3366 */ 3376 */
3367 strncpy((char *)table->data, saved_string, 3377 strncpy((char *)table->data, saved_string,
3368 NUMA_ZONELIST_ORDER_LEN); 3378 NUMA_ZONELIST_ORDER_LEN);
3369 user_zonelist_order = oldval; 3379 user_zonelist_order = oldval;
3370 } else if (oldval != user_zonelist_order) { 3380 } else if (oldval != user_zonelist_order) {
3371 mutex_lock(&zonelists_mutex); 3381 mutex_lock(&zonelists_mutex);
3372 build_all_zonelists(NULL, NULL); 3382 build_all_zonelists(NULL, NULL);
3373 mutex_unlock(&zonelists_mutex); 3383 mutex_unlock(&zonelists_mutex);
3374 } 3384 }
3375 } 3385 }
3376 out: 3386 out:
3377 mutex_unlock(&zl_order_mutex); 3387 mutex_unlock(&zl_order_mutex);
3378 return ret; 3388 return ret;
3379 } 3389 }
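Both interfaces take the same strings parsed by __parse_numa_zonelist_order() above. An example configuration (the sysctl path is the usual vm location and is stated here as an assumption):

	/*
	 * boot:    numa_zonelist_order=zone
	 * runtime: echo node > /proc/sys/vm/numa_zonelist_order
	 */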
3380 3390
3381 3391
3382 #define MAX_NODE_LOAD (nr_online_nodes) 3392 #define MAX_NODE_LOAD (nr_online_nodes)
3383 static int node_load[MAX_NUMNODES]; 3393 static int node_load[MAX_NUMNODES];
3384 3394
3385 /** 3395 /**
3386 * find_next_best_node - find the next node that should appear in a given node's fallback list 3396 * find_next_best_node - find the next node that should appear in a given node's fallback list
3387 * @node: node whose fallback list we're appending 3397 * @node: node whose fallback list we're appending
3388 * @used_node_mask: nodemask_t of already used nodes 3398 * @used_node_mask: nodemask_t of already used nodes
3389 * 3399 *
3390 * We use a number of factors to determine which is the next node that should 3400 * We use a number of factors to determine which is the next node that should
3391 * appear on a given node's fallback list. The node should not have appeared 3401 * appear on a given node's fallback list. The node should not have appeared
3392 * already in @node's fallback list, and it should be the next closest node 3402 * already in @node's fallback list, and it should be the next closest node
3393 * according to the distance array (which contains arbitrary distance values 3403 * according to the distance array (which contains arbitrary distance values
3394 * from each node to each node in the system), and should also prefer nodes 3404 * from each node to each node in the system), and should also prefer nodes
3395 * with no CPUs, since presumably they'll have very little allocation pressure 3405 * with no CPUs, since presumably they'll have very little allocation pressure
3396 * on them otherwise. 3406 * on them otherwise.
3397 * It returns -1 if no node is found. 3407 * It returns -1 if no node is found.
3398 */ 3408 */
3399 static int find_next_best_node(int node, nodemask_t *used_node_mask) 3409 static int find_next_best_node(int node, nodemask_t *used_node_mask)
3400 { 3410 {
3401 int n, val; 3411 int n, val;
3402 int min_val = INT_MAX; 3412 int min_val = INT_MAX;
3403 int best_node = NUMA_NO_NODE; 3413 int best_node = NUMA_NO_NODE;
3404 const struct cpumask *tmp = cpumask_of_node(0); 3414 const struct cpumask *tmp = cpumask_of_node(0);
3405 3415
3406 /* Use the local node if we haven't already */ 3416 /* Use the local node if we haven't already */
3407 if (!node_isset(node, *used_node_mask)) { 3417 if (!node_isset(node, *used_node_mask)) {
3408 node_set(node, *used_node_mask); 3418 node_set(node, *used_node_mask);
3409 return node; 3419 return node;
3410 } 3420 }
3411 3421
3412 for_each_node_state(n, N_MEMORY) { 3422 for_each_node_state(n, N_MEMORY) {
3413 3423
3414 /* Don't want a node to appear more than once */ 3424 /* Don't want a node to appear more than once */
3415 if (node_isset(n, *used_node_mask)) 3425 if (node_isset(n, *used_node_mask))
3416 continue; 3426 continue;
3417 3427
3418 /* Use the distance array to find the distance */ 3428 /* Use the distance array to find the distance */
3419 val = node_distance(node, n); 3429 val = node_distance(node, n);
3420 3430
3421 /* Penalize nodes under us ("prefer the next node") */ 3431 /* Penalize nodes under us ("prefer the next node") */
3422 val += (n < node); 3432 val += (n < node);
3423 3433
3424 /* Give preference to headless and unused nodes */ 3434 /* Give preference to headless and unused nodes */
3425 tmp = cpumask_of_node(n); 3435 tmp = cpumask_of_node(n);
3426 if (!cpumask_empty(tmp)) 3436 if (!cpumask_empty(tmp))
3427 val += PENALTY_FOR_NODE_WITH_CPUS; 3437 val += PENALTY_FOR_NODE_WITH_CPUS;
3428 3438
3429 /* Slight preference for less loaded node */ 3439 /* Slight preference for less loaded node */
3430 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 3440 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3431 val += node_load[n]; 3441 val += node_load[n];
3432 3442
3433 if (val < min_val) { 3443 if (val < min_val) {
3434 min_val = val; 3444 min_val = val;
3435 best_node = n; 3445 best_node = n;
3436 } 3446 }
3437 } 3447 }
3438 3448
3439 if (best_node >= 0) 3449 if (best_node >= 0)
3440 node_set(best_node, *used_node_mask); 3450 node_set(best_node, *used_node_mask);
3441 3451
3442 return best_node; 3452 return best_node;
3443 } 3453 }
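Putting the scoring above together with hypothetical values: for a candidate n with node_distance(node, n) = 20, n > node (no "prefer the next node" penalty), CPUs present (PENALTY_FOR_NODE_WITH_CPUS taken as 1 here), MAX_NODE_LOAD * MAX_NUMNODES = 64 and node_load[n] = 3,

	val = (20 + 0 + 1) * 64 + 3 = 1347

The candidate with the smallest val wins and is set in *used_node_mask.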
3444 3454
3445 3455
3446 /* 3456 /*
3447 * Build zonelists ordered by node and zones within node. 3457 * Build zonelists ordered by node and zones within node.
3448 * This results in maximum locality--normal zone overflows into local 3458 * This results in maximum locality--normal zone overflows into local
3449 * DMA zone, if any--but risks exhausting DMA zone. 3459 * DMA zone, if any--but risks exhausting DMA zone.
3450 */ 3460 */
3451 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 3461 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3452 { 3462 {
3453 int j; 3463 int j;
3454 struct zonelist *zonelist; 3464 struct zonelist *zonelist;
3455 3465
3456 zonelist = &pgdat->node_zonelists[0]; 3466 zonelist = &pgdat->node_zonelists[0];
3457 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3467 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3458 ; 3468 ;
3459 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3469 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3460 zonelist->_zonerefs[j].zone = NULL; 3470 zonelist->_zonerefs[j].zone = NULL;
3461 zonelist->_zonerefs[j].zone_idx = 0; 3471 zonelist->_zonerefs[j].zone_idx = 0;
3462 } 3472 }
3463 3473
3464 /* 3474 /*
3465 * Build gfp_thisnode zonelists 3475 * Build gfp_thisnode zonelists
3466 */ 3476 */
3467 static void build_thisnode_zonelists(pg_data_t *pgdat) 3477 static void build_thisnode_zonelists(pg_data_t *pgdat)
3468 { 3478 {
3469 int j; 3479 int j;
3470 struct zonelist *zonelist; 3480 struct zonelist *zonelist;
3471 3481
3472 zonelist = &pgdat->node_zonelists[1]; 3482 zonelist = &pgdat->node_zonelists[1];
3473 j = build_zonelists_node(pgdat, zonelist, 0); 3483 j = build_zonelists_node(pgdat, zonelist, 0);
3474 zonelist->_zonerefs[j].zone = NULL; 3484 zonelist->_zonerefs[j].zone = NULL;
3475 zonelist->_zonerefs[j].zone_idx = 0; 3485 zonelist->_zonerefs[j].zone_idx = 0;
3476 } 3486 }
3477 3487
3478 /* 3488 /*
3479 * Build zonelists ordered by zone and nodes within zones. 3489 * Build zonelists ordered by zone and nodes within zones.
3480 * This results in conserving DMA zone[s] until all Normal memory is 3490 * This results in conserving DMA zone[s] until all Normal memory is
3481 * exhausted, but results in overflowing to remote node while memory 3491 * exhausted, but results in overflowing to remote node while memory
3482 * may still exist in local DMA zone. 3492 * may still exist in local DMA zone.
3483 */ 3493 */
3484 static int node_order[MAX_NUMNODES]; 3494 static int node_order[MAX_NUMNODES];
3485 3495
3486 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 3496 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3487 { 3497 {
3488 int pos, j, node; 3498 int pos, j, node;
3489 int zone_type; /* needs to be signed */ 3499 int zone_type; /* needs to be signed */
3490 struct zone *z; 3500 struct zone *z;
3491 struct zonelist *zonelist; 3501 struct zonelist *zonelist;
3492 3502
3493 zonelist = &pgdat->node_zonelists[0]; 3503 zonelist = &pgdat->node_zonelists[0];
3494 pos = 0; 3504 pos = 0;
3495 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 3505 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3496 for (j = 0; j < nr_nodes; j++) { 3506 for (j = 0; j < nr_nodes; j++) {
3497 node = node_order[j]; 3507 node = node_order[j];
3498 z = &NODE_DATA(node)->node_zones[zone_type]; 3508 z = &NODE_DATA(node)->node_zones[zone_type];
3499 if (populated_zone(z)) { 3509 if (populated_zone(z)) {
3500 zoneref_set_zone(z, 3510 zoneref_set_zone(z,
3501 &zonelist->_zonerefs[pos++]); 3511 &zonelist->_zonerefs[pos++]);
3502 check_highest_zone(zone_type); 3512 check_highest_zone(zone_type);
3503 } 3513 }
3504 } 3514 }
3505 } 3515 }
3506 zonelist->_zonerefs[pos].zone = NULL; 3516 zonelist->_zonerefs[pos].zone = NULL;
3507 zonelist->_zonerefs[pos].zone_idx = 0; 3517 zonelist->_zonerefs[pos].zone_idx = 0;
3508 } 3518 }
3509 3519
3510 static int default_zonelist_order(void) 3520 static int default_zonelist_order(void)
3511 { 3521 {
3512 int nid, zone_type; 3522 int nid, zone_type;
3513 unsigned long low_kmem_size, total_size; 3523 unsigned long low_kmem_size, total_size;
3514 struct zone *z; 3524 struct zone *z;
3515 int average_size; 3525 int average_size;
3516 /* 3526 /*
3517 * ZONE_DMA and ZONE_DMA32 can be a very small area in the system. 3527 * ZONE_DMA and ZONE_DMA32 can be a very small area in the system.
3518 * If they are really small and used heavily, the system can fall 3528 * If they are really small and used heavily, the system can fall
3519 * into OOM very easily. 3529 * into OOM very easily.
3520 * This function detects ZONE_DMA/DMA32 size and configures zone order. 3530 * This function detects ZONE_DMA/DMA32 size and configures zone order.
3521 */ 3531 */
3522 /* Is there ZONE_NORMAL? (e.g. ppc has only a DMA zone.) */ 3532 /* Is there ZONE_NORMAL? (e.g. ppc has only a DMA zone.) */
3523 low_kmem_size = 0; 3533 low_kmem_size = 0;
3524 total_size = 0; 3534 total_size = 0;
3525 for_each_online_node(nid) { 3535 for_each_online_node(nid) {
3526 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3536 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3527 z = &NODE_DATA(nid)->node_zones[zone_type]; 3537 z = &NODE_DATA(nid)->node_zones[zone_type];
3528 if (populated_zone(z)) { 3538 if (populated_zone(z)) {
3529 if (zone_type < ZONE_NORMAL) 3539 if (zone_type < ZONE_NORMAL)
3530 low_kmem_size += z->managed_pages; 3540 low_kmem_size += z->managed_pages;
3531 total_size += z->managed_pages; 3541 total_size += z->managed_pages;
3532 } else if (zone_type == ZONE_NORMAL) { 3542 } else if (zone_type == ZONE_NORMAL) {
3533 /* 3543 /*
3534 * If any node has only lowmem, then node order 3544 * If any node has only lowmem, then node order
3535 * is preferred to allow kernel allocations 3545 * is preferred to allow kernel allocations
3536 * locally; otherwise, they can easily infringe 3546 * locally; otherwise, they can easily infringe
3537 * on other nodes when there is an abundance of 3547 * on other nodes when there is an abundance of
3538 * lowmem available to allocate from. 3548 * lowmem available to allocate from.
3539 */ 3549 */
3540 return ZONELIST_ORDER_NODE; 3550 return ZONELIST_ORDER_NODE;
3541 } 3551 }
3542 } 3552 }
3543 } 3553 }
3544 if (!low_kmem_size || /* there is no DMA area. */ 3554 if (!low_kmem_size || /* there is no DMA area. */
3545 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 3555 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3546 return ZONELIST_ORDER_NODE; 3556 return ZONELIST_ORDER_NODE;
3547 /* 3557 /*
3548 * look into each node's config. 3558 * look into each node's config.
3549 * If there is a node whose DMA/DMA32 memory makes up a very big share of 3559 * If there is a node whose DMA/DMA32 memory makes up a very big share of
3550 * its local memory, NODE_ORDER may be suitable. 3560 * its local memory, NODE_ORDER may be suitable.
3551 */ 3561 */
3552 average_size = total_size / 3562 average_size = total_size /
3553 (nodes_weight(node_states[N_MEMORY]) + 1); 3563 (nodes_weight(node_states[N_MEMORY]) + 1);
3554 for_each_online_node(nid) { 3564 for_each_online_node(nid) {
3555 low_kmem_size = 0; 3565 low_kmem_size = 0;
3556 total_size = 0; 3566 total_size = 0;
3557 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3567 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3558 z = &NODE_DATA(nid)->node_zones[zone_type]; 3568 z = &NODE_DATA(nid)->node_zones[zone_type];
3559 if (populated_zone(z)) { 3569 if (populated_zone(z)) {
3560 if (zone_type < ZONE_NORMAL) 3570 if (zone_type < ZONE_NORMAL)
3561 low_kmem_size += z->present_pages; 3571 low_kmem_size += z->present_pages;
3562 total_size += z->present_pages; 3572 total_size += z->present_pages;
3563 } 3573 }
3564 } 3574 }
3565 if (low_kmem_size && 3575 if (low_kmem_size &&
3566 total_size > average_size && /* ignore small node */ 3576 total_size > average_size && /* ignore small node */
3567 low_kmem_size > total_size * 70/100) 3577 low_kmem_size > total_size * 70/100)
3568 return ZONELIST_ORDER_NODE; 3578 return ZONELIST_ORDER_NODE;
3569 } 3579 }
3570 return ZONELIST_ORDER_ZONE; 3580 return ZONELIST_ORDER_ZONE;
3571 } 3581 }
3572 3582
3573 static void set_zonelist_order(void) 3583 static void set_zonelist_order(void)
3574 { 3584 {
3575 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 3585 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3576 current_zonelist_order = default_zonelist_order(); 3586 current_zonelist_order = default_zonelist_order();
3577 else 3587 else
3578 current_zonelist_order = user_zonelist_order; 3588 current_zonelist_order = user_zonelist_order;
3579 } 3589 }
3580 3590
3581 static void build_zonelists(pg_data_t *pgdat) 3591 static void build_zonelists(pg_data_t *pgdat)
3582 { 3592 {
3583 int j, node, load; 3593 int j, node, load;
3584 enum zone_type i; 3594 enum zone_type i;
3585 nodemask_t used_mask; 3595 nodemask_t used_mask;
3586 int local_node, prev_node; 3596 int local_node, prev_node;
3587 struct zonelist *zonelist; 3597 struct zonelist *zonelist;
3588 int order = current_zonelist_order; 3598 int order = current_zonelist_order;
3589 3599
3590 /* initialize zonelists */ 3600 /* initialize zonelists */
3591 for (i = 0; i < MAX_ZONELISTS; i++) { 3601 for (i = 0; i < MAX_ZONELISTS; i++) {
3592 zonelist = pgdat->node_zonelists + i; 3602 zonelist = pgdat->node_zonelists + i;
3593 zonelist->_zonerefs[0].zone = NULL; 3603 zonelist->_zonerefs[0].zone = NULL;
3594 zonelist->_zonerefs[0].zone_idx = 0; 3604 zonelist->_zonerefs[0].zone_idx = 0;
3595 } 3605 }
3596 3606
3597 /* NUMA-aware ordering of nodes */ 3607 /* NUMA-aware ordering of nodes */
3598 local_node = pgdat->node_id; 3608 local_node = pgdat->node_id;
3599 load = nr_online_nodes; 3609 load = nr_online_nodes;
3600 prev_node = local_node; 3610 prev_node = local_node;
3601 nodes_clear(used_mask); 3611 nodes_clear(used_mask);
3602 3612
3603 memset(node_order, 0, sizeof(node_order)); 3613 memset(node_order, 0, sizeof(node_order));
3604 j = 0; 3614 j = 0;
3605 3615
3606 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3616 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3607 /* 3617 /*
3608 * We don't want to pressure a particular node. 3618 * We don't want to pressure a particular node.
3609 * So add a penalty to the first node in the same 3619 * So add a penalty to the first node in the same
3610 * distance group to make it round-robin. 3620 * distance group to make it round-robin.
3611 */ 3621 */
3612 if (node_distance(local_node, node) != 3622 if (node_distance(local_node, node) !=
3613 node_distance(local_node, prev_node)) 3623 node_distance(local_node, prev_node))
3614 node_load[node] = load; 3624 node_load[node] = load;
3615 3625
3616 prev_node = node; 3626 prev_node = node;
3617 load--; 3627 load--;
3618 if (order == ZONELIST_ORDER_NODE) 3628 if (order == ZONELIST_ORDER_NODE)
3619 build_zonelists_in_node_order(pgdat, node); 3629 build_zonelists_in_node_order(pgdat, node);
3620 else 3630 else
3621 node_order[j++] = node; /* remember order */ 3631 node_order[j++] = node; /* remember order */
3622 } 3632 }
3623 3633
3624 if (order == ZONELIST_ORDER_ZONE) { 3634 if (order == ZONELIST_ORDER_ZONE) {
3625 /* calculate node order -- i.e., DMA last! */ 3635 /* calculate node order -- i.e., DMA last! */
3626 build_zonelists_in_zone_order(pgdat, j); 3636 build_zonelists_in_zone_order(pgdat, j);
3627 } 3637 }
3628 3638
3629 build_thisnode_zonelists(pgdat); 3639 build_thisnode_zonelists(pgdat);
3630 } 3640 }
3631 3641
3632 /* Construct the zonelist performance cache - see further mmzone.h */ 3642 /* Construct the zonelist performance cache - see further mmzone.h */
3633 static void build_zonelist_cache(pg_data_t *pgdat) 3643 static void build_zonelist_cache(pg_data_t *pgdat)
3634 { 3644 {
3635 struct zonelist *zonelist; 3645 struct zonelist *zonelist;
3636 struct zonelist_cache *zlc; 3646 struct zonelist_cache *zlc;
3637 struct zoneref *z; 3647 struct zoneref *z;
3638 3648
3639 zonelist = &pgdat->node_zonelists[0]; 3649 zonelist = &pgdat->node_zonelists[0];
3640 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 3650 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3641 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 3651 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3642 for (z = zonelist->_zonerefs; z->zone; z++) 3652 for (z = zonelist->_zonerefs; z->zone; z++)
3643 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 3653 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3644 } 3654 }
3645 3655
3646 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3656 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3647 /* 3657 /*
3648 * Return node id of node used for "local" allocations. 3658 * Return node id of node used for "local" allocations.
3649 * I.e., first node id of first zone in arg node's generic zonelist. 3659 * I.e., first node id of first zone in arg node's generic zonelist.
3650 * Used for initializing percpu 'numa_mem', which is used primarily 3660 * Used for initializing percpu 'numa_mem', which is used primarily
3651 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3661 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3652 */ 3662 */
3653 int local_memory_node(int node) 3663 int local_memory_node(int node)
3654 { 3664 {
3655 struct zone *zone; 3665 struct zone *zone;
3656 3666
3657 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3667 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3658 gfp_zone(GFP_KERNEL), 3668 gfp_zone(GFP_KERNEL),
3659 NULL, 3669 NULL,
3660 &zone); 3670 &zone);
3661 return zone->node; 3671 return zone->node;
3662 } 3672 }
3663 #endif 3673 #endif
3664 3674
3665 #else /* CONFIG_NUMA */ 3675 #else /* CONFIG_NUMA */
3666 3676
3667 static void set_zonelist_order(void) 3677 static void set_zonelist_order(void)
3668 { 3678 {
3669 current_zonelist_order = ZONELIST_ORDER_ZONE; 3679 current_zonelist_order = ZONELIST_ORDER_ZONE;
3670 } 3680 }
3671 3681
3672 static void build_zonelists(pg_data_t *pgdat) 3682 static void build_zonelists(pg_data_t *pgdat)
3673 { 3683 {
3674 int node, local_node; 3684 int node, local_node;
3675 enum zone_type j; 3685 enum zone_type j;
3676 struct zonelist *zonelist; 3686 struct zonelist *zonelist;
3677 3687
3678 local_node = pgdat->node_id; 3688 local_node = pgdat->node_id;
3679 3689
3680 zonelist = &pgdat->node_zonelists[0]; 3690 zonelist = &pgdat->node_zonelists[0];
3681 j = build_zonelists_node(pgdat, zonelist, 0); 3691 j = build_zonelists_node(pgdat, zonelist, 0);
3682 3692
3683 /* 3693 /*
3684 * Now we build the zonelist so that it contains the zones 3694 * Now we build the zonelist so that it contains the zones
3685 * of all the other nodes. 3695 * of all the other nodes.
3686 * We don't want to pressure a particular node, so when 3696 * We don't want to pressure a particular node, so when
3687 * building the zones for node N, we make sure that the 3697 * building the zones for node N, we make sure that the
3688 * zones coming right after the local ones are those from 3698 * zones coming right after the local ones are those from
3689 * node N+1 (modulo N) 3699 * node N+1 (modulo N)
3690 */ 3700 */
3691 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3701 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3692 if (!node_online(node)) 3702 if (!node_online(node))
3693 continue; 3703 continue;
3694 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3704 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3695 } 3705 }
3696 for (node = 0; node < local_node; node++) { 3706 for (node = 0; node < local_node; node++) {
3697 if (!node_online(node)) 3707 if (!node_online(node))
3698 continue; 3708 continue;
3699 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3709 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3700 } 3710 }
3701 3711
3702 zonelist->_zonerefs[j].zone = NULL; 3712 zonelist->_zonerefs[j].zone = NULL;
3703 zonelist->_zonerefs[j].zone_idx = 0; 3713 zonelist->_zonerefs[j].zone_idx = 0;
3704 } 3714 }
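/*
 * Illustrative sketch (editor's addition, not part of this commit): the
 * two loops above visit the remote nodes in the order local_node + 1,
 * ..., MAX_NUMNODES - 1, then 0, ..., local_node - 1, so with contiguous
 * online nodes the zonelist effectively follows (local_node + k) % nr_nodes.
 * A minimal user-space demonstration, assuming 4 online nodes and local
 * node 2 (both values are made up for the example):
 */
#include <stdio.h>

int main(void)
{
	const int nr_nodes = 4, local_node = 2;
	int node;

	printf("zonelist node order:");
	printf(" %d", local_node);		/* the local node's zones come first */
	for (node = local_node + 1; node < nr_nodes; node++)
		printf(" %d", node);		/* then the nodes after it */
	for (node = 0; node < local_node; node++)
		printf(" %d", node);		/* then wrap around to the start */
	printf("\n");				/* prints: 2 3 0 1 */
	return 0;
}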
3705 3715
3706 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3716 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3707 static void build_zonelist_cache(pg_data_t *pgdat) 3717 static void build_zonelist_cache(pg_data_t *pgdat)
3708 { 3718 {
3709 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3719 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3710 } 3720 }
3711 3721
3712 #endif /* CONFIG_NUMA */ 3722 #endif /* CONFIG_NUMA */
3713 3723
3714 /* 3724 /*
3715 * Boot pageset table. One per cpu which is going to be used for all 3725 * Boot pageset table. One per cpu which is going to be used for all
3716 * zones and all nodes. The parameters will be set in such a way 3726 * zones and all nodes. The parameters will be set in such a way
3717 * that an item put on a list will immediately be handed over to 3727 * that an item put on a list will immediately be handed over to
3718 * the buddy list. This is safe since pageset manipulation is done 3728 * the buddy list. This is safe since pageset manipulation is done
3719 * with interrupts disabled. 3729 * with interrupts disabled.
3720 * 3730 *
3721 * The boot_pagesets must be kept even after bootup is complete for 3731 * The boot_pagesets must be kept even after bootup is complete for
3722 * unused processors and/or zones. They do play a role for bootstrapping 3732 * unused processors and/or zones. They do play a role for bootstrapping
3723 * hotplugged processors. 3733 * hotplugged processors.
3724 * 3734 *
3725 * zoneinfo_show() and maybe other functions do 3735 * zoneinfo_show() and maybe other functions do
3726 * not check if the processor is online before following the pageset pointer. 3736 * not check if the processor is online before following the pageset pointer.
3727 * Other parts of the kernel may not check if the zone is available. 3737 * Other parts of the kernel may not check if the zone is available.
3728 */ 3738 */
3729 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3739 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3730 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3740 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3731 static void setup_zone_pageset(struct zone *zone); 3741 static void setup_zone_pageset(struct zone *zone);
3732 3742
3733 /* 3743 /*
3734 * Global mutex to protect against size modification of zonelists 3744 * Global mutex to protect against size modification of zonelists
3735 * as well as to serialize pageset setup for the new populated zone. 3745 * as well as to serialize pageset setup for the new populated zone.
3736 */ 3746 */
3737 DEFINE_MUTEX(zonelists_mutex); 3747 DEFINE_MUTEX(zonelists_mutex);
3738 3748
3739 /* return values int ....just for stop_machine() */ 3749 /* return values int ....just for stop_machine() */
3740 static int __build_all_zonelists(void *data) 3750 static int __build_all_zonelists(void *data)
3741 { 3751 {
3742 int nid; 3752 int nid;
3743 int cpu; 3753 int cpu;
3744 pg_data_t *self = data; 3754 pg_data_t *self = data;
3745 3755
3746 #ifdef CONFIG_NUMA 3756 #ifdef CONFIG_NUMA
3747 memset(node_load, 0, sizeof(node_load)); 3757 memset(node_load, 0, sizeof(node_load));
3748 #endif 3758 #endif
3749 3759
3750 if (self && !node_online(self->node_id)) { 3760 if (self && !node_online(self->node_id)) {
3751 build_zonelists(self); 3761 build_zonelists(self);
3752 build_zonelist_cache(self); 3762 build_zonelist_cache(self);
3753 } 3763 }
3754 3764
3755 for_each_online_node(nid) { 3765 for_each_online_node(nid) {
3756 pg_data_t *pgdat = NODE_DATA(nid); 3766 pg_data_t *pgdat = NODE_DATA(nid);
3757 3767
3758 build_zonelists(pgdat); 3768 build_zonelists(pgdat);
3759 build_zonelist_cache(pgdat); 3769 build_zonelist_cache(pgdat);
3760 } 3770 }
3761 3771
3762 /* 3772 /*
3763 * Initialize the boot_pagesets that are going to be used 3773 * Initialize the boot_pagesets that are going to be used
3764 * for bootstrapping processors. The real pagesets for 3774 * for bootstrapping processors. The real pagesets for
3765 * each zone will be allocated later when the per cpu 3775 * each zone will be allocated later when the per cpu
3766 * allocator is available. 3776 * allocator is available.
3767 * 3777 *
3768 * boot_pagesets are used also for bootstrapping offline 3778 * boot_pagesets are used also for bootstrapping offline
3769 * cpus if the system is already booted because the pagesets 3779 * cpus if the system is already booted because the pagesets
3770 * are needed to initialize allocators on a specific cpu too. 3780 * are needed to initialize allocators on a specific cpu too.
3771 * F.e. the percpu allocator needs the page allocator which 3781 * F.e. the percpu allocator needs the page allocator which
3772 * needs the percpu allocator in order to allocate its pagesets 3782 * needs the percpu allocator in order to allocate its pagesets
3773 * (a chicken-egg dilemma). 3783 * (a chicken-egg dilemma).
3774 */ 3784 */
3775 for_each_possible_cpu(cpu) { 3785 for_each_possible_cpu(cpu) {
3776 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3786 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3777 3787
3778 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3788 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3779 /* 3789 /*
3780 * We now know the "local memory node" for each node-- 3790 * We now know the "local memory node" for each node--
3781 * i.e., the node of the first zone in the generic zonelist. 3791 * i.e., the node of the first zone in the generic zonelist.
3782 * Set up numa_mem percpu variable for on-line cpus. During 3792 * Set up numa_mem percpu variable for on-line cpus. During
3783 * boot, only the boot cpu should be on-line; we'll init the 3793 * boot, only the boot cpu should be on-line; we'll init the
3784 * secondary cpus' numa_mem as they come on-line. During 3794 * secondary cpus' numa_mem as they come on-line. During
3785 * node/memory hotplug, we'll fixup all on-line cpus. 3795 * node/memory hotplug, we'll fixup all on-line cpus.
3786 */ 3796 */
3787 if (cpu_online(cpu)) 3797 if (cpu_online(cpu))
3788 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3798 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3789 #endif 3799 #endif
3790 } 3800 }
3791 3801
3792 return 0; 3802 return 0;
3793 } 3803 }
3794 3804
3795 /* 3805 /*
3796 * Called with zonelists_mutex held always 3806 * Called with zonelists_mutex held always
3797 * unless system_state == SYSTEM_BOOTING. 3807 * unless system_state == SYSTEM_BOOTING.
3798 */ 3808 */
3799 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3809 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3800 { 3810 {
3801 set_zonelist_order(); 3811 set_zonelist_order();
3802 3812
3803 if (system_state == SYSTEM_BOOTING) { 3813 if (system_state == SYSTEM_BOOTING) {
3804 __build_all_zonelists(NULL); 3814 __build_all_zonelists(NULL);
3805 mminit_verify_zonelist(); 3815 mminit_verify_zonelist();
3806 cpuset_init_current_mems_allowed(); 3816 cpuset_init_current_mems_allowed();
3807 } else { 3817 } else {
3808 #ifdef CONFIG_MEMORY_HOTPLUG 3818 #ifdef CONFIG_MEMORY_HOTPLUG
3809 if (zone) 3819 if (zone)
3810 setup_zone_pageset(zone); 3820 setup_zone_pageset(zone);
3811 #endif 3821 #endif
3812 /* we have to stop all cpus to guarantee there is no user 3822 /* we have to stop all cpus to guarantee there is no user
3813 of zonelist */ 3823 of zonelist */
3814 stop_machine(__build_all_zonelists, pgdat, NULL); 3824 stop_machine(__build_all_zonelists, pgdat, NULL);
3815 /* cpuset refresh routine should be here */ 3825 /* cpuset refresh routine should be here */
3816 } 3826 }
3817 vm_total_pages = nr_free_pagecache_pages(); 3827 vm_total_pages = nr_free_pagecache_pages();
3818 /* 3828 /*
3819 * Disable grouping by mobility if the number of pages in the 3829 * Disable grouping by mobility if the number of pages in the
3820 * system is too low to allow the mechanism to work. It would be 3830 * system is too low to allow the mechanism to work. It would be
3821 * more accurate, but expensive to check per-zone. This check is 3831 * more accurate, but expensive to check per-zone. This check is
3822 * made on memory-hotadd so a system can start with mobility 3832 * made on memory-hotadd so a system can start with mobility
3823 * disabled and enable it later 3833 * disabled and enable it later
3824 */ 3834 */
3825 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3835 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3826 page_group_by_mobility_disabled = 1; 3836 page_group_by_mobility_disabled = 1;
3827 else 3837 else
3828 page_group_by_mobility_disabled = 0; 3838 page_group_by_mobility_disabled = 0;
3829 3839
3830 printk("Built %i zonelists in %s order, mobility grouping %s. " 3840 printk("Built %i zonelists in %s order, mobility grouping %s. "
3831 "Total pages: %ld\n", 3841 "Total pages: %ld\n",
3832 nr_online_nodes, 3842 nr_online_nodes,
3833 zonelist_order_name[current_zonelist_order], 3843 zonelist_order_name[current_zonelist_order],
3834 page_group_by_mobility_disabled ? "off" : "on", 3844 page_group_by_mobility_disabled ? "off" : "on",
3835 vm_total_pages); 3845 vm_total_pages);
3836 #ifdef CONFIG_NUMA 3846 #ifdef CONFIG_NUMA
3837 printk("Policy zone: %s\n", zone_names[policy_zone]); 3847 printk("Policy zone: %s\n", zone_names[policy_zone]);
3838 #endif 3848 #endif
3839 } 3849 }
3840 3850
3841 /* 3851 /*
3842 * Helper functions to size the waitqueue hash table. 3852 * Helper functions to size the waitqueue hash table.
3843 * Essentially these want to choose hash table sizes sufficiently 3853 * Essentially these want to choose hash table sizes sufficiently
3844 * large so that collisions trying to wait on pages are rare. 3854 * large so that collisions trying to wait on pages are rare.
3845 * But in fact, the number of active page waitqueues on typical 3855 * But in fact, the number of active page waitqueues on typical
3846 * systems is ridiculously low, less than 200. So this is 3856 * systems is ridiculously low, less than 200. So this is
3847 * conservative, even though it seems large. 3857 * conservative, even though it seems large.
3848 * 3858 *
3849 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3859 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3850 * waitqueues, i.e. the size of the waitq table given the number of pages. 3860 * waitqueues, i.e. the size of the waitq table given the number of pages.
3851 */ 3861 */
3852 #define PAGES_PER_WAITQUEUE 256 3862 #define PAGES_PER_WAITQUEUE 256
3853 3863
3854 #ifndef CONFIG_MEMORY_HOTPLUG 3864 #ifndef CONFIG_MEMORY_HOTPLUG
3855 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3865 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3856 { 3866 {
3857 unsigned long size = 1; 3867 unsigned long size = 1;
3858 3868
3859 pages /= PAGES_PER_WAITQUEUE; 3869 pages /= PAGES_PER_WAITQUEUE;
3860 3870
3861 while (size < pages) 3871 while (size < pages)
3862 size <<= 1; 3872 size <<= 1;
3863 3873
3864 /* 3874 /*
3865 * Once we have dozens or even hundreds of threads sleeping 3875 * Once we have dozens or even hundreds of threads sleeping
3866 * on IO we've got bigger problems than wait queue collision. 3876 * on IO we've got bigger problems than wait queue collision.
3867 * Limit the size of the wait table to a reasonable size. 3877 * Limit the size of the wait table to a reasonable size.
3868 */ 3878 */
3869 size = min(size, 4096UL); 3879 size = min(size, 4096UL);
3870 3880
3871 return max(size, 4UL); 3881 return max(size, 4UL);
3872 } 3882 }
3873 #else 3883 #else
3874 /* 3884 /*
3875 * A zone's size might be changed by hot-add, so it is not possible to determine 3885 * A zone's size might be changed by hot-add, so it is not possible to determine
3876 * a suitable size for its wait_table. So we use the maximum size now. 3886 * a suitable size for its wait_table. So we use the maximum size now.
3877 * 3887 *
3878 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3888 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3879 * 3889 *
3880 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3890 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3881 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3891 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3882 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3892 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3883 * 3893 *
3884 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 3894 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
3885 * or more by the traditional way. (See above). It equals: 3895 * or more by the traditional way. (See above). It equals:
3886 * 3896 *
3887 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3897 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3888 * ia64(16K page size) : = ( 8G + 4M)byte. 3898 * ia64(16K page size) : = ( 8G + 4M)byte.
3889 * powerpc (64K page size) : = (32G +16M)byte. 3899 * powerpc (64K page size) : = (32G +16M)byte.
3890 */ 3900 */
3891 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3901 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3892 { 3902 {
3893 return 4096UL; 3903 return 4096UL;
3894 } 3904 }
3895 #endif 3905 #endif
3896 3906
3897 /* 3907 /*
3898 * This is an integer logarithm so that shifts can be used later 3908 * This is an integer logarithm so that shifts can be used later
3899 * to extract the more random high bits from the multiplicative 3909 * to extract the more random high bits from the multiplicative
3900 * hash function before the remainder is taken. 3910 * hash function before the remainder is taken.
3901 */ 3911 */
3902 static inline unsigned long wait_table_bits(unsigned long size) 3912 static inline unsigned long wait_table_bits(unsigned long size)
3903 { 3913 {
3904 return ffz(~size); 3914 return ffz(~size);
3905 } 3915 }
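/*
 * Illustrative sketch (editor's addition, not part of this commit): a
 * user-space replay of the sizing logic above for the
 * !CONFIG_MEMORY_HOTPLUG variant, to show the numbers it produces.
 * hash_nr_entries() is a local stand-in, not a kernel API, and the page
 * counts are made up. ffz(~size) is the kernel's "find first zero bit";
 * for a power-of-two size it equals log2(size), so __builtin_ctzl() is
 * used here as an equivalent.
 */
#include <stdio.h>

static unsigned long hash_nr_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= 256;			/* PAGES_PER_WAITQUEUE */
	while (size < pages)
		size <<= 1;		/* round up to a power of two */
	if (size > 4096)
		size = 4096;		/* upper clamp */
	if (size < 4)
		size = 4;		/* lower clamp */
	return size;
}

int main(void)
{
	unsigned long pages[] = { 500, 100000, 1000000 };
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long size = hash_nr_entries(pages[i]);

		/* prints 4/2, 512/9 and 4096/12 respectively */
		printf("%lu pages -> %lu entries, %d bits\n",
		       pages[i], size, __builtin_ctzl(size));
	}
	return 0;
}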
3906 3916
3907 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3917 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3908 3918
3909 /* 3919 /*
3910 * Check if a pageblock contains reserved pages 3920 * Check if a pageblock contains reserved pages
3911 */ 3921 */
3912 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3922 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3913 { 3923 {
3914 unsigned long pfn; 3924 unsigned long pfn;
3915 3925
3916 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3926 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3917 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3927 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3918 return 1; 3928 return 1;
3919 } 3929 }
3920 return 0; 3930 return 0;
3921 } 3931 }
3922 3932
3923 /* 3933 /*
3924 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3934 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3925 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3935 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3926 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3936 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3927 * higher will lead to a bigger reserve which will get freed as contiguous 3937 * higher will lead to a bigger reserve which will get freed as contiguous
3928 * blocks as reclaim kicks in 3938 * blocks as reclaim kicks in
3929 */ 3939 */
3930 static void setup_zone_migrate_reserve(struct zone *zone) 3940 static void setup_zone_migrate_reserve(struct zone *zone)
3931 { 3941 {
3932 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3942 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3933 struct page *page; 3943 struct page *page;
3934 unsigned long block_migratetype; 3944 unsigned long block_migratetype;
3935 int reserve; 3945 int reserve;
3936 int old_reserve; 3946 int old_reserve;
3937 3947
3938 /* 3948 /*
3939 * Get the start pfn, end pfn and the number of blocks to reserve 3949 * Get the start pfn, end pfn and the number of blocks to reserve
3940 * We have to be careful to be aligned to pageblock_nr_pages to 3950 * We have to be careful to be aligned to pageblock_nr_pages to
3941 * make sure that we always check pfn_valid for the first page in 3951 * make sure that we always check pfn_valid for the first page in
3942 * the block. 3952 * the block.
3943 */ 3953 */
3944 start_pfn = zone->zone_start_pfn; 3954 start_pfn = zone->zone_start_pfn;
3945 end_pfn = zone_end_pfn(zone); 3955 end_pfn = zone_end_pfn(zone);
3946 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3956 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3947 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3957 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3948 pageblock_order; 3958 pageblock_order;
3949 3959
3950 /* 3960 /*
3951 * Reserve blocks are generally in place to help high-order atomic 3961 * Reserve blocks are generally in place to help high-order atomic
3952 * allocations that are short-lived. A min_free_kbytes value that 3962 * allocations that are short-lived. A min_free_kbytes value that
3953 * would result in more than 2 reserve blocks for atomic allocations 3963 * would result in more than 2 reserve blocks for atomic allocations
3954 * is assumed to be in place to help anti-fragmentation for the 3964 * is assumed to be in place to help anti-fragmentation for the
3955 * future allocation of hugepages at runtime. 3965 * future allocation of hugepages at runtime.
3956 */ 3966 */
3957 reserve = min(2, reserve); 3967 reserve = min(2, reserve);
3958 old_reserve = zone->nr_migrate_reserve_block; 3968 old_reserve = zone->nr_migrate_reserve_block;
3959 3969
3960 /* On memory hot-add, we almost always need to do nothing */ 3970 /* On memory hot-add, we almost always need to do nothing */
3961 if (reserve == old_reserve) 3971 if (reserve == old_reserve)
3962 return; 3972 return;
3963 zone->nr_migrate_reserve_block = reserve; 3973 zone->nr_migrate_reserve_block = reserve;
3964 3974
3965 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3975 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3966 if (!pfn_valid(pfn)) 3976 if (!pfn_valid(pfn))
3967 continue; 3977 continue;
3968 page = pfn_to_page(pfn); 3978 page = pfn_to_page(pfn);
3969 3979
3970 /* Watch out for overlapping nodes */ 3980 /* Watch out for overlapping nodes */
3971 if (page_to_nid(page) != zone_to_nid(zone)) 3981 if (page_to_nid(page) != zone_to_nid(zone))
3972 continue; 3982 continue;
3973 3983
3974 block_migratetype = get_pageblock_migratetype(page); 3984 block_migratetype = get_pageblock_migratetype(page);
3975 3985
3976 /* Only test what is necessary when the reserves are not met */ 3986 /* Only test what is necessary when the reserves are not met */
3977 if (reserve > 0) { 3987 if (reserve > 0) {
3978 /* 3988 /*
3979 * Blocks with reserved pages will never free, skip 3989 * Blocks with reserved pages will never free, skip
3980 * them. 3990 * them.
3981 */ 3991 */
3982 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 3992 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3983 if (pageblock_is_reserved(pfn, block_end_pfn)) 3993 if (pageblock_is_reserved(pfn, block_end_pfn))
3984 continue; 3994 continue;
3985 3995
3986 /* If this block is reserved, account for it */ 3996 /* If this block is reserved, account for it */
3987 if (block_migratetype == MIGRATE_RESERVE) { 3997 if (block_migratetype == MIGRATE_RESERVE) {
3988 reserve--; 3998 reserve--;
3989 continue; 3999 continue;
3990 } 4000 }
3991 4001
3992 /* Suitable for reserving if this block is movable */ 4002 /* Suitable for reserving if this block is movable */
3993 if (block_migratetype == MIGRATE_MOVABLE) { 4003 if (block_migratetype == MIGRATE_MOVABLE) {
3994 set_pageblock_migratetype(page, 4004 set_pageblock_migratetype(page,
3995 MIGRATE_RESERVE); 4005 MIGRATE_RESERVE);
3996 move_freepages_block(zone, page, 4006 move_freepages_block(zone, page,
3997 MIGRATE_RESERVE); 4007 MIGRATE_RESERVE);
3998 reserve--; 4008 reserve--;
3999 continue; 4009 continue;
4000 } 4010 }
4001 } else if (!old_reserve) { 4011 } else if (!old_reserve) {
4002 /* 4012 /*
4003 * At boot time we don't need to scan the whole zone 4013 * At boot time we don't need to scan the whole zone
4004 * for turning off MIGRATE_RESERVE. 4014 * for turning off MIGRATE_RESERVE.
4005 */ 4015 */
4006 break; 4016 break;
4007 } 4017 }
4008 4018
4009 /* 4019 /*
4010 * If the reserve is met and this is a previous reserved block, 4020 * If the reserve is met and this is a previous reserved block,
4011 * take it back 4021 * take it back
4012 */ 4022 */
4013 if (block_migratetype == MIGRATE_RESERVE) { 4023 if (block_migratetype == MIGRATE_RESERVE) {
4014 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4024 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4015 move_freepages_block(zone, page, MIGRATE_MOVABLE); 4025 move_freepages_block(zone, page, MIGRATE_MOVABLE);
4016 } 4026 }
4017 } 4027 }
4018 } 4028 }
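/*
 * Illustrative sketch (editor's addition, not part of this commit): how
 * the number of MIGRATE_RESERVE pageblocks above is derived. It assumes
 * typical x86 values of pageblock_order = 9, i.e. 2 MiB pageblocks of
 * 512 4-KiB pages, and made-up watermarks; roundup_ul() is a local
 * stand-in for the kernel's roundup(). The cap of 2 mirrors
 * "reserve = min(2, reserve)" in setup_zone_migrate_reserve().
 */
#include <stdio.h>

#define PAGEBLOCK_ORDER		9
#define PAGEBLOCK_NR_PAGES	(1UL << PAGEBLOCK_ORDER)

static unsigned long roundup_ul(unsigned long x, unsigned long y)
{
	return ((x + y - 1) / y) * y;	/* round x up to a multiple of y */
}

int main(void)
{
	unsigned long min_wmark[] = { 300, 1000, 5000 };
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long blocks =
			roundup_ul(min_wmark[i], PAGEBLOCK_NR_PAGES) >> PAGEBLOCK_ORDER;

		if (blocks > 2)
			blocks = 2;	/* reserve = min(2, reserve) */
		/* prints 1, 2 and 2 reserve blocks respectively */
		printf("min_wmark=%lu pages -> %lu reserve block(s)\n",
		       min_wmark[i], blocks);
	}
	return 0;
}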
4019 4029
4020 /* 4030 /*
4021 * Initially all pages are reserved - free ones are freed 4031 * Initially all pages are reserved - free ones are freed
4022 * up by free_all_bootmem() once the early boot process is 4032 * up by free_all_bootmem() once the early boot process is
4023 * done. Non-atomic initialization, single-pass. 4033 * done. Non-atomic initialization, single-pass.
4024 */ 4034 */
4025 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 4035 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4026 unsigned long start_pfn, enum memmap_context context) 4036 unsigned long start_pfn, enum memmap_context context)
4027 { 4037 {
4028 struct page *page; 4038 struct page *page;
4029 unsigned long end_pfn = start_pfn + size; 4039 unsigned long end_pfn = start_pfn + size;
4030 unsigned long pfn; 4040 unsigned long pfn;
4031 struct zone *z; 4041 struct zone *z;
4032 4042
4033 if (highest_memmap_pfn < end_pfn - 1) 4043 if (highest_memmap_pfn < end_pfn - 1)
4034 highest_memmap_pfn = end_pfn - 1; 4044 highest_memmap_pfn = end_pfn - 1;
4035 4045
4036 z = &NODE_DATA(nid)->node_zones[zone]; 4046 z = &NODE_DATA(nid)->node_zones[zone];
4037 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 4047 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
4038 /* 4048 /*
4039 * There can be holes in boot-time mem_map[]s 4049 * There can be holes in boot-time mem_map[]s
4040 * handed to this function. They do not 4050 * handed to this function. They do not
4041 * exist on hotplugged memory. 4051 * exist on hotplugged memory.
4042 */ 4052 */
4043 if (context == MEMMAP_EARLY) { 4053 if (context == MEMMAP_EARLY) {
4044 if (!early_pfn_valid(pfn)) 4054 if (!early_pfn_valid(pfn))
4045 continue; 4055 continue;
4046 if (!early_pfn_in_nid(pfn, nid)) 4056 if (!early_pfn_in_nid(pfn, nid))
4047 continue; 4057 continue;
4048 } 4058 }
4049 page = pfn_to_page(pfn); 4059 page = pfn_to_page(pfn);
4050 set_page_links(page, zone, nid, pfn); 4060 set_page_links(page, zone, nid, pfn);
4051 mminit_verify_page_links(page, zone, nid, pfn); 4061 mminit_verify_page_links(page, zone, nid, pfn);
4052 init_page_count(page); 4062 init_page_count(page);
4053 page_mapcount_reset(page); 4063 page_mapcount_reset(page);
4054 page_nid_reset_last(page); 4064 page_nid_reset_last(page);
4055 SetPageReserved(page); 4065 SetPageReserved(page);
4056 /* 4066 /*
4057 * Mark the block movable so that blocks are reserved for 4067 * Mark the block movable so that blocks are reserved for
4058 * movable at startup. This will force kernel allocations 4068 * movable at startup. This will force kernel allocations
4059 * to reserve their blocks rather than leaking throughout 4069 * to reserve their blocks rather than leaking throughout
4060 * the address space during boot when many long-lived 4070 * the address space during boot when many long-lived
4061 * kernel allocations are made. Later some blocks near 4071 * kernel allocations are made. Later some blocks near
4062 * the start are marked MIGRATE_RESERVE by 4072 * the start are marked MIGRATE_RESERVE by
4063 * setup_zone_migrate_reserve() 4073 * setup_zone_migrate_reserve()
4064 * 4074 *
4065 * The bitmap is created for the zone's valid pfn range, but the memmap 4075 * The bitmap is created for the zone's valid pfn range, but the memmap
4066 * can be created for invalid pages (for alignment), so 4076 * can be created for invalid pages (for alignment), so
4067 * check here that set_pageblock_migratetype() is not called against a 4077 * check here that set_pageblock_migratetype() is not called against a
4068 * pfn outside the zone. 4078 * pfn outside the zone.
4069 */ 4079 */
4070 if ((z->zone_start_pfn <= pfn) 4080 if ((z->zone_start_pfn <= pfn)
4071 && (pfn < zone_end_pfn(z)) 4081 && (pfn < zone_end_pfn(z))
4072 && !(pfn & (pageblock_nr_pages - 1))) 4082 && !(pfn & (pageblock_nr_pages - 1)))
4073 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4083 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4074 4084
4075 INIT_LIST_HEAD(&page->lru); 4085 INIT_LIST_HEAD(&page->lru);
4076 #ifdef WANT_PAGE_VIRTUAL 4086 #ifdef WANT_PAGE_VIRTUAL
4077 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 4087 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
4078 if (!is_highmem_idx(zone)) 4088 if (!is_highmem_idx(zone))
4079 set_page_address(page, __va(pfn << PAGE_SHIFT)); 4089 set_page_address(page, __va(pfn << PAGE_SHIFT));
4080 #endif 4090 #endif
4081 } 4091 }
4082 } 4092 }
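/*
 * Illustrative sketch (editor's addition, not part of this commit): the
 * "!(pfn & (pageblock_nr_pages - 1))" test above is a power-of-two
 * alignment check, so set_pageblock_migratetype() runs only once per
 * pageblock. The 512-page pageblock size and the pfn range below are
 * assumptions for the example.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long pageblock_nr_pages = 512;
	unsigned long pfn;

	for (pfn = 1024; pfn < 1024 + 1536; pfn++) {
		if (!(pfn & (pageblock_nr_pages - 1)))
			/* prints 1024, 1536 and 2048: one hit per pageblock */
			printf("pageblock starts at pfn %lu\n", pfn);
	}
	return 0;
}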
4083 4093
4084 static void __meminit zone_init_free_lists(struct zone *zone) 4094 static void __meminit zone_init_free_lists(struct zone *zone)
4085 { 4095 {
4086 int order, t; 4096 int order, t;
4087 for_each_migratetype_order(order, t) { 4097 for_each_migratetype_order(order, t) {
4088 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 4098 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
4089 zone->free_area[order].nr_free = 0; 4099 zone->free_area[order].nr_free = 0;
4090 } 4100 }
4091 } 4101 }
4092 4102
4093 #ifndef __HAVE_ARCH_MEMMAP_INIT 4103 #ifndef __HAVE_ARCH_MEMMAP_INIT
4094 #define memmap_init(size, nid, zone, start_pfn) \ 4104 #define memmap_init(size, nid, zone, start_pfn) \
4095 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 4105 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
4096 #endif 4106 #endif
4097 4107
4098 static int zone_batchsize(struct zone *zone) 4108 static int zone_batchsize(struct zone *zone)
4099 { 4109 {
4100 #ifdef CONFIG_MMU 4110 #ifdef CONFIG_MMU
4101 int batch; 4111 int batch;
4102 4112
4103 /* 4113 /*
4104 * The per-cpu-pages pools are set to around 1000th of the 4114 * The per-cpu-pages pools are set to around 1000th of the
4105 * size of the zone. But no more than 1/2 of a meg. 4115 * size of the zone. But no more than 1/2 of a meg.
4106 * 4116 *
4107 * OK, so we don't know how big the cache is. So guess. 4117 * OK, so we don't know how big the cache is. So guess.
4108 */ 4118 */
4109 batch = zone->managed_pages / 1024; 4119 batch = zone->managed_pages / 1024;
4110 if (batch * PAGE_SIZE > 512 * 1024) 4120 if (batch * PAGE_SIZE > 512 * 1024)
4111 batch = (512 * 1024) / PAGE_SIZE; 4121 batch = (512 * 1024) / PAGE_SIZE;
4112 batch /= 4; /* We effectively *= 4 below */ 4122 batch /= 4; /* We effectively *= 4 below */
4113 if (batch < 1) 4123 if (batch < 1)
4114 batch = 1; 4124 batch = 1;
4115 4125
4116 /* 4126 /*
4117 * Clamp the batch to a 2^n - 1 value. Having a power 4127 * Clamp the batch to a 2^n - 1 value. Having a power
4118 * of 2 value was found to be more likely to have 4128 * of 2 value was found to be more likely to have
4119 * suboptimal cache aliasing properties in some cases. 4129 * suboptimal cache aliasing properties in some cases.
4120 * 4130 *
4121 * For example if 2 tasks are alternately allocating 4131 * For example if 2 tasks are alternately allocating
4122 * batches of pages, one task can end up with a lot 4132 * batches of pages, one task can end up with a lot
4123 * of pages of one half of the possible page colors 4133 * of pages of one half of the possible page colors
4124 * and the other with pages of the other colors. 4134 * and the other with pages of the other colors.
4125 */ 4135 */
4126 batch = rounddown_pow_of_two(batch + batch/2) - 1; 4136 batch = rounddown_pow_of_two(batch + batch/2) - 1;
4127 4137
4128 return batch; 4138 return batch;
4129 4139
4130 #else 4140 #else
4131 /* The deferral and batching of frees should be suppressed under NOMMU 4141 /* The deferral and batching of frees should be suppressed under NOMMU
4132 * conditions. 4142 * conditions.
4133 * 4143 *
4134 * The problem is that NOMMU needs to be able to allocate large chunks 4144 * The problem is that NOMMU needs to be able to allocate large chunks
4135 * of contiguous memory as there's no hardware page translation to 4145 * of contiguous memory as there's no hardware page translation to
4136 * assemble apparent contiguous memory from discontiguous pages. 4146 * assemble apparent contiguous memory from discontiguous pages.
4137 * 4147 *
4138 * Queueing large contiguous runs of pages for batching, however, 4148 * Queueing large contiguous runs of pages for batching, however,
4139 * causes the pages to actually be freed in smaller chunks. As there 4149 * causes the pages to actually be freed in smaller chunks. As there
4140 * can be a significant delay between the individual batches being 4150 * can be a significant delay between the individual batches being
4141 * recycled, this leads to the once large chunks of space being 4151 * recycled, this leads to the once large chunks of space being
4142 * fragmented and becoming unavailable for high-order allocations. 4152 * fragmented and becoming unavailable for high-order allocations.
4143 */ 4153 */
4144 return 0; 4154 return 0;
4145 #endif 4155 #endif
4146 } 4156 }
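/*
 * Illustrative sketch (editor's addition, not part of this commit): the
 * batch value the CONFIG_MMU path above computes for a zone with 1 GiB
 * of managed memory, assuming 4 KiB pages (262144 managed pages).
 * rounddown_pow_of_two_ul() is a local stand-in for the kernel's
 * rounddown_pow_of_two() for the values used here.
 */
#include <stdio.h>

static unsigned long rounddown_pow_of_two_ul(unsigned long n)
{
	unsigned long p = 1;

	while (p * 2 <= n)
		p *= 2;			/* largest power of two <= n */
	return p;
}

int main(void)
{
	const unsigned long page_size = 4096;
	unsigned long managed_pages = 262144;	/* 1 GiB of 4 KiB pages */
	unsigned long batch;

	batch = managed_pages / 1024;		/* ~1/1000th of the zone: 256 */
	if (batch * page_size > 512 * 1024)
		batch = (512 * 1024) / page_size; /* cap at 512 KiB worth: 128 */
	batch /= 4;				/* 32 */
	if (batch < 1)
		batch = 1;
	batch = rounddown_pow_of_two_ul(batch + batch / 2) - 1; /* 2^n - 1: 31 */

	printf("batch = %lu\n", batch);		/* prints 31 */
	return 0;
}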
4147 4157
4148 /* 4158 /*
4149 * pcp->high and pcp->batch values are related and dependent on one another: 4159 * pcp->high and pcp->batch values are related and dependent on one another:
4150 * ->batch must never be higher than ->high. 4160 * ->batch must never be higher than ->high.
4151 * The following function updates them in a safe manner without read side 4161 * The following function updates them in a safe manner without read side
4152 * locking. 4162 * locking.
4153 * 4163 *
4154 * Any new users of pcp->batch and pcp->high should ensure they can cope with 4164 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4155 * those fields changing asynchronously (according to the above rule). 4165 * those fields changing asynchronously (according to the above rule).
4156 * 4166 *
4157 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function 4167 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
4158 * outside of boot time (or some other assurance that no concurrent updaters 4168 * outside of boot time (or some other assurance that no concurrent updaters
4159 * exist). 4169 * exist).
4160 */ 4170 */
4161 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 4171 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4162 unsigned long batch) 4172 unsigned long batch)
4163 { 4173 {
4164 /* start with a fail safe value for batch */ 4174 /* start with a fail safe value for batch */
4165 pcp->batch = 1; 4175 pcp->batch = 1;
4166 smp_wmb(); 4176 smp_wmb();
4167 4177
4168 /* Update high, then batch, in order */ 4178 /* Update high, then batch, in order */
4169 pcp->high = high; 4179 pcp->high = high;
4170 smp_wmb(); 4180 smp_wmb();
4171 4181
4172 pcp->batch = batch; 4182 pcp->batch = batch;
4173 } 4183 }
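/*
 * Illustrative sketch (editor's addition, not part of this commit): why
 * the store order above is safe for lockless readers. Between the writes
 * a reader may observe (old high, batch 1), (new high, batch 1) or
 * (new high, new batch), but never batch > high, provided the caller
 * passes a batch no larger than the new high and high is never set below
 * 1. The sketch replays the three stores and checks that invariant after
 * each one; it is single-threaded, so the smp_wmb() barriers are omitted,
 * and the concrete values are assumptions taken from the other examples.
 */
#include <assert.h>
#include <stdio.h>

struct pcp { unsigned long high, batch; };

static void check(const struct pcp *p)
{
	assert(p->batch <= p->high);	/* the invariant readers rely on */
}

int main(void)
{
	struct pcp p = { .high = 6 * 31, .batch = 31 };	/* old values */
	unsigned long new_high = 1000, new_batch = 96;

	p.batch = 1;		check(&p);	/* fail-safe batch first */
	p.high = new_high;	check(&p);	/* then the new high */
	p.batch = new_batch;	check(&p);	/* finally the new batch */

	printf("batch <= high held across the update\n");
	return 0;
}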
4174 4184
4175 /* a companion to pageset_set_high() */ 4185 /* a companion to pageset_set_high() */
4176 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) 4186 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4177 { 4187 {
4178 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); 4188 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4179 } 4189 }
4180 4190
4181 static void pageset_init(struct per_cpu_pageset *p) 4191 static void pageset_init(struct per_cpu_pageset *p)
4182 { 4192 {
4183 struct per_cpu_pages *pcp; 4193 struct per_cpu_pages *pcp;
4184 int migratetype; 4194 int migratetype;
4185 4195
4186 memset(p, 0, sizeof(*p)); 4196 memset(p, 0, sizeof(*p));
4187 4197
4188 pcp = &p->pcp; 4198 pcp = &p->pcp;
4189 pcp->count = 0; 4199 pcp->count = 0;
4190 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4200 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4191 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4201 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4192 } 4202 }
4193 4203
4194 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4204 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4195 { 4205 {
4196 pageset_init(p); 4206 pageset_init(p);
4197 pageset_set_batch(p, batch); 4207 pageset_set_batch(p, batch);
4198 } 4208 }
4199 4209
4200 /* 4210 /*
4201 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist 4211 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
4202 * to the value high for the pageset p. 4212 * to the value high for the pageset p.
4203 */ 4213 */
4204 static void pageset_set_high(struct per_cpu_pageset *p, 4214 static void pageset_set_high(struct per_cpu_pageset *p,
4205 unsigned long high) 4215 unsigned long high)
4206 { 4216 {
4207 unsigned long batch = max(1UL, high / 4); 4217 unsigned long batch = max(1UL, high / 4);
4208 if ((high / 4) > (PAGE_SHIFT * 8)) 4218 if ((high / 4) > (PAGE_SHIFT * 8))
4209 batch = PAGE_SHIFT * 8; 4219 batch = PAGE_SHIFT * 8;
4210 4220
4211 pageset_update(&p->pcp, high, batch); 4221 pageset_update(&p->pcp, high, batch);
4212 } 4222 }
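/*
 * Illustrative sketch (editor's addition, not part of this commit): the
 * batch chosen by pageset_set_high() above, assuming PAGE_SHIFT is 12
 * (4 KiB pages) so the cap is 12 * 8 = 96; the high values are made up.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long page_shift = 12;
	unsigned long highs[] = { 2, 200, 1000 };
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long high = highs[i];
		unsigned long batch = high / 4 > 1 ? high / 4 : 1;

		if (high / 4 > page_shift * 8)
			batch = page_shift * 8;	/* cap at PAGE_SHIFT * 8 */
		/* prints batch 1, 50 and 96 respectively */
		printf("high=%lu -> batch=%lu\n", high, batch);
	}
	return 0;
}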
4213 4223
4214 static void pageset_set_high_and_batch(struct zone *zone, 4224 static void pageset_set_high_and_batch(struct zone *zone,
4215 struct per_cpu_pageset *pcp) 4225 struct per_cpu_pageset *pcp)
4216 { 4226 {
4217 if (percpu_pagelist_fraction) 4227 if (percpu_pagelist_fraction)
4218 pageset_set_high(pcp, 4228 pageset_set_high(pcp,
4219 (zone->managed_pages / 4229 (zone->managed_pages /
4220 percpu_pagelist_fraction)); 4230 percpu_pagelist_fraction));
4221 else 4231 else
4222 pageset_set_batch(pcp, zone_batchsize(zone)); 4232 pageset_set_batch(pcp, zone_batchsize(zone));
4223 } 4233 }
4224 4234
4225 static void __meminit zone_pageset_init(struct zone *zone, int cpu) 4235 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4226 { 4236 {
4227 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4237 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4228 4238
4229 pageset_init(pcp); 4239 pageset_init(pcp);
4230 pageset_set_high_and_batch(zone, pcp); 4240 pageset_set_high_and_batch(zone, pcp);
4231 } 4241 }
4232 4242
4233 static void __meminit setup_zone_pageset(struct zone *zone) 4243 static void __meminit setup_zone_pageset(struct zone *zone)
4234 { 4244 {
4235 int cpu; 4245 int cpu;
4236 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4246 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4237 for_each_possible_cpu(cpu) 4247 for_each_possible_cpu(cpu)
4238 zone_pageset_init(zone, cpu); 4248 zone_pageset_init(zone, cpu);
4239 } 4249 }
4240 4250
4241 /* 4251 /*
4242 * Allocate per cpu pagesets and initialize them. 4252 * Allocate per cpu pagesets and initialize them.
4243 * Before this call only boot pagesets were available. 4253 * Before this call only boot pagesets were available.
4244 */ 4254 */
4245 void __init setup_per_cpu_pageset(void) 4255 void __init setup_per_cpu_pageset(void)
4246 { 4256 {
4247 struct zone *zone; 4257 struct zone *zone;
4248 4258
4249 for_each_populated_zone(zone) 4259 for_each_populated_zone(zone)
4250 setup_zone_pageset(zone); 4260 setup_zone_pageset(zone);
4251 } 4261 }
4252 4262
4253 static noinline __init_refok 4263 static noinline __init_refok
4254 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4264 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4255 { 4265 {
4256 int i; 4266 int i;
4257 struct pglist_data *pgdat = zone->zone_pgdat; 4267 struct pglist_data *pgdat = zone->zone_pgdat;
4258 size_t alloc_size; 4268 size_t alloc_size;
4259 4269
4260 /* 4270 /*
4261 * The per-page waitqueue mechanism uses hashed waitqueues 4271 * The per-page waitqueue mechanism uses hashed waitqueues
4262 * per zone. 4272 * per zone.
4263 */ 4273 */
4264 zone->wait_table_hash_nr_entries = 4274 zone->wait_table_hash_nr_entries =
4265 wait_table_hash_nr_entries(zone_size_pages); 4275 wait_table_hash_nr_entries(zone_size_pages);
4266 zone->wait_table_bits = 4276 zone->wait_table_bits =
4267 wait_table_bits(zone->wait_table_hash_nr_entries); 4277 wait_table_bits(zone->wait_table_hash_nr_entries);
4268 alloc_size = zone->wait_table_hash_nr_entries 4278 alloc_size = zone->wait_table_hash_nr_entries
4269 * sizeof(wait_queue_head_t); 4279 * sizeof(wait_queue_head_t);
4270 4280
4271 if (!slab_is_available()) { 4281 if (!slab_is_available()) {
4272 zone->wait_table = (wait_queue_head_t *) 4282 zone->wait_table = (wait_queue_head_t *)
4273 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4283 alloc_bootmem_node_nopanic(pgdat, alloc_size);
4274 } else { 4284 } else {
4275 /* 4285 /*
4276 * This case means that a zone whose size was 0 gets new memory 4286 * This case means that a zone whose size was 0 gets new memory
4277 * via memory hot-add. 4287 * via memory hot-add.
4278 * But it may be the case that a new node was hot-added. In 4288 * But it may be the case that a new node was hot-added. In
4279 * this case vmalloc() will not be able to use this new node's 4289 * this case vmalloc() will not be able to use this new node's
4280 * memory - this wait_table must be initialized to use this new 4290 * memory - this wait_table must be initialized to use this new
4281 * node itself as well. 4291 * node itself as well.
4282 * To use this new node's memory, further consideration will be 4292 * To use this new node's memory, further consideration will be
4283 * necessary. 4293 * necessary.
4284 */ 4294 */
4285 zone->wait_table = vmalloc(alloc_size); 4295 zone->wait_table = vmalloc(alloc_size);
4286 } 4296 }
4287 if (!zone->wait_table) 4297 if (!zone->wait_table)
4288 return -ENOMEM; 4298 return -ENOMEM;
4289 4299
4290 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4300 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4291 init_waitqueue_head(zone->wait_table + i); 4301 init_waitqueue_head(zone->wait_table + i);
4292 4302
4293 return 0; 4303 return 0;
4294 } 4304 }
4295 4305
4296 static __meminit void zone_pcp_init(struct zone *zone) 4306 static __meminit void zone_pcp_init(struct zone *zone)
4297 { 4307 {
4298 /* 4308 /*
4299 * per cpu subsystem is not up at this point. The following code 4309 * per cpu subsystem is not up at this point. The following code
4300 * relies on the ability of the linker to provide the 4310 * relies on the ability of the linker to provide the
4301 * offset of a (static) per cpu variable into the per cpu area. 4311 * offset of a (static) per cpu variable into the per cpu area.
4302 */ 4312 */
4303 zone->pageset = &boot_pageset; 4313 zone->pageset = &boot_pageset;
4304 4314
4305 if (zone->present_pages) 4315 if (zone->present_pages)
4306 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4316 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4307 zone->name, zone->present_pages, 4317 zone->name, zone->present_pages,
4308 zone_batchsize(zone)); 4318 zone_batchsize(zone));
4309 } 4319 }
4310 4320
4311 int __meminit init_currently_empty_zone(struct zone *zone, 4321 int __meminit init_currently_empty_zone(struct zone *zone,
4312 unsigned long zone_start_pfn, 4322 unsigned long zone_start_pfn,
4313 unsigned long size, 4323 unsigned long size,
4314 enum memmap_context context) 4324 enum memmap_context context)
4315 { 4325 {
4316 struct pglist_data *pgdat = zone->zone_pgdat; 4326 struct pglist_data *pgdat = zone->zone_pgdat;
4317 int ret; 4327 int ret;
4318 ret = zone_wait_table_init(zone, size); 4328 ret = zone_wait_table_init(zone, size);
4319 if (ret) 4329 if (ret)
4320 return ret; 4330 return ret;
4321 pgdat->nr_zones = zone_idx(zone) + 1; 4331 pgdat->nr_zones = zone_idx(zone) + 1;
4322 4332
4323 zone->zone_start_pfn = zone_start_pfn; 4333 zone->zone_start_pfn = zone_start_pfn;
4324 4334
4325 mminit_dprintk(MMINIT_TRACE, "memmap_init", 4335 mminit_dprintk(MMINIT_TRACE, "memmap_init",
4326 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 4336 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
4327 pgdat->node_id, 4337 pgdat->node_id,
4328 (unsigned long)zone_idx(zone), 4338 (unsigned long)zone_idx(zone),
4329 zone_start_pfn, (zone_start_pfn + size)); 4339 zone_start_pfn, (zone_start_pfn + size));
4330 4340
4331 zone_init_free_lists(zone); 4341 zone_init_free_lists(zone);
4332 4342
4333 return 0; 4343 return 0;
4334 } 4344 }
4335 4345
4336 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4346 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4337 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4347 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4338 /* 4348 /*
4339 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4349 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
4340 * Architectures may implement their own version but if add_active_range() 4350 * Architectures may implement their own version but if add_active_range()
4341 * was used and there are no special requirements, this is a convenient 4351 * was used and there are no special requirements, this is a convenient
4342 * alternative 4352 * alternative
4343 */ 4353 */
4344 int __meminit __early_pfn_to_nid(unsigned long pfn) 4354 int __meminit __early_pfn_to_nid(unsigned long pfn)
4345 { 4355 {
4346 unsigned long start_pfn, end_pfn; 4356 unsigned long start_pfn, end_pfn;
4347 int nid; 4357 int nid;
4348 /* 4358 /*
4349 * NOTE: The following SMP-unsafe globals are only used early in boot 4359 * NOTE: The following SMP-unsafe globals are only used early in boot
4350 * when the kernel is running single-threaded. 4360 * when the kernel is running single-threaded.
4351 */ 4361 */
4352 static unsigned long __meminitdata last_start_pfn, last_end_pfn; 4362 static unsigned long __meminitdata last_start_pfn, last_end_pfn;
4353 static int __meminitdata last_nid; 4363 static int __meminitdata last_nid;
4354 4364
4355 if (last_start_pfn <= pfn && pfn < last_end_pfn) 4365 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4356 return last_nid; 4366 return last_nid;
4357 4367
4358 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 4368 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
4359 if (nid != -1) { 4369 if (nid != -1) {
4360 last_start_pfn = start_pfn; 4370 last_start_pfn = start_pfn;
4361 last_end_pfn = end_pfn; 4371 last_end_pfn = end_pfn;
4362 last_nid = nid; 4372 last_nid = nid;
4363 } 4373 }
4364 4374
4365 return nid; 4375 return nid;
4366 } 4376 }
4367 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4377 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
4368 4378
4369 int __meminit early_pfn_to_nid(unsigned long pfn) 4379 int __meminit early_pfn_to_nid(unsigned long pfn)
4370 { 4380 {
4371 int nid; 4381 int nid;
4372 4382
4373 nid = __early_pfn_to_nid(pfn); 4383 nid = __early_pfn_to_nid(pfn);
4374 if (nid >= 0) 4384 if (nid >= 0)
4375 return nid; 4385 return nid;
4376 /* just returns 0 */ 4386 /* just returns 0 */
4377 return 0; 4387 return 0;
4378 } 4388 }
4379 4389
4380 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4390 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4381 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4391 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4382 { 4392 {
4383 int nid; 4393 int nid;
4384 4394
4385 nid = __early_pfn_to_nid(pfn); 4395 nid = __early_pfn_to_nid(pfn);
4386 if (nid >= 0 && nid != node) 4396 if (nid >= 0 && nid != node)
4387 return false; 4397 return false;
4388 return true; 4398 return true;
4389 } 4399 }
4390 #endif 4400 #endif
4391 4401
4392 /** 4402 /**
4393 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4403 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4394 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4404 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4395 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4405 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4396 * 4406 *
4397 * If an architecture guarantees that all ranges registered with 4407 * If an architecture guarantees that all ranges registered with
4398 * add_active_ranges() contain no holes and may be freed, 4408 * add_active_ranges() contain no holes and may be freed,
4399 * this function may be used instead of calling free_bootmem() manually. 4409 * this function may be used instead of calling free_bootmem() manually.
4400 */ 4410 */
4401 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4411 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4402 { 4412 {
4403 unsigned long start_pfn, end_pfn; 4413 unsigned long start_pfn, end_pfn;
4404 int i, this_nid; 4414 int i, this_nid;
4405 4415
4406 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4416 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4407 start_pfn = min(start_pfn, max_low_pfn); 4417 start_pfn = min(start_pfn, max_low_pfn);
4408 end_pfn = min(end_pfn, max_low_pfn); 4418 end_pfn = min(end_pfn, max_low_pfn);
4409 4419
4410 if (start_pfn < end_pfn) 4420 if (start_pfn < end_pfn)
4411 free_bootmem_node(NODE_DATA(this_nid), 4421 free_bootmem_node(NODE_DATA(this_nid),
4412 PFN_PHYS(start_pfn), 4422 PFN_PHYS(start_pfn),
4413 (end_pfn - start_pfn) << PAGE_SHIFT); 4423 (end_pfn - start_pfn) << PAGE_SHIFT);
4414 } 4424 }
4415 } 4425 }
4416 4426
4417 /** 4427 /**
4418 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4428 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4419 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4429 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4420 * 4430 *
4421 * If an architecture guarantees that all ranges registered with 4431 * If an architecture guarantees that all ranges registered with
4422 * add_active_ranges() contain no holes and may be freed, this 4432 * add_active_ranges() contain no holes and may be freed, this
4423 * function may be used instead of calling memory_present() manually. 4433 * function may be used instead of calling memory_present() manually.
4424 */ 4434 */
4425 void __init sparse_memory_present_with_active_regions(int nid) 4435 void __init sparse_memory_present_with_active_regions(int nid)
4426 { 4436 {
4427 unsigned long start_pfn, end_pfn; 4437 unsigned long start_pfn, end_pfn;
4428 int i, this_nid; 4438 int i, this_nid;
4429 4439
4430 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4440 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4431 memory_present(this_nid, start_pfn, end_pfn); 4441 memory_present(this_nid, start_pfn, end_pfn);
4432 } 4442 }
4433 4443
4434 /** 4444 /**
4435 * get_pfn_range_for_nid - Return the start and end page frames for a node 4445 * get_pfn_range_for_nid - Return the start and end page frames for a node
4436 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4446 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4437 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4447 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4438 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4448 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4439 * 4449 *
4440 * It returns the start and end page frame of a node based on information 4450 * It returns the start and end page frame of a node based on information
4441 * provided by an arch calling add_active_range(). If called for a node 4451 * provided by an arch calling add_active_range(). If called for a node
4442 * with no available memory, a warning is printed and the start and end 4452 * with no available memory, a warning is printed and the start and end
4443 * PFNs will be 0. 4453 * PFNs will be 0.
4444 */ 4454 */
4445 void __meminit get_pfn_range_for_nid(unsigned int nid, 4455 void __meminit get_pfn_range_for_nid(unsigned int nid,
4446 unsigned long *start_pfn, unsigned long *end_pfn) 4456 unsigned long *start_pfn, unsigned long *end_pfn)
4447 { 4457 {
4448 unsigned long this_start_pfn, this_end_pfn; 4458 unsigned long this_start_pfn, this_end_pfn;
4449 int i; 4459 int i;
4450 4460
4451 *start_pfn = -1UL; 4461 *start_pfn = -1UL;
4452 *end_pfn = 0; 4462 *end_pfn = 0;
4453 4463
4454 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4464 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4455 *start_pfn = min(*start_pfn, this_start_pfn); 4465 *start_pfn = min(*start_pfn, this_start_pfn);
4456 *end_pfn = max(*end_pfn, this_end_pfn); 4466 *end_pfn = max(*end_pfn, this_end_pfn);
4457 } 4467 }
4458 4468
4459 if (*start_pfn == -1UL) 4469 if (*start_pfn == -1UL)
4460 *start_pfn = 0; 4470 *start_pfn = 0;
4461 } 4471 }
4462 4472
4463 /* 4473 /*
4464 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4474 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4465 * assumption is made that zones within a node are ordered in monotonically 4475 * assumption is made that zones within a node are ordered in monotonically
4466 * increasing memory addresses so that the "highest" populated zone is used 4476 * increasing memory addresses so that the "highest" populated zone is used
4467 */ 4477 */
4468 static void __init find_usable_zone_for_movable(void) 4478 static void __init find_usable_zone_for_movable(void)
4469 { 4479 {
4470 int zone_index; 4480 int zone_index;
4471 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4481 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4472 if (zone_index == ZONE_MOVABLE) 4482 if (zone_index == ZONE_MOVABLE)
4473 continue; 4483 continue;
4474 4484
4475 if (arch_zone_highest_possible_pfn[zone_index] > 4485 if (arch_zone_highest_possible_pfn[zone_index] >
4476 arch_zone_lowest_possible_pfn[zone_index]) 4486 arch_zone_lowest_possible_pfn[zone_index])
4477 break; 4487 break;
4478 } 4488 }
4479 4489
4480 VM_BUG_ON(zone_index == -1); 4490 VM_BUG_ON(zone_index == -1);
4481 movable_zone = zone_index; 4491 movable_zone = zone_index;
4482 } 4492 }
4483 4493
4484 /* 4494 /*
4485 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4495 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4486 * because it is sized independent of architecture. Unlike the other zones, 4496 * because it is sized independent of architecture. Unlike the other zones,
4487 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4497 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4488 * in each node depending on the size of each node and how evenly kernelcore 4498 * in each node depending on the size of each node and how evenly kernelcore
4489 * is distributed. This helper function adjusts the zone ranges 4499 * is distributed. This helper function adjusts the zone ranges
4490 * provided by the architecture for a given node by using the end of the 4500 * provided by the architecture for a given node by using the end of the
4491 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4501 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4492 * zones within a node are in order of monotonically increasing memory addresses 4502 * zones within a node are in order of monotonically increasing memory addresses
4493 */ 4503 */
4494 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4504 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4495 unsigned long zone_type, 4505 unsigned long zone_type,
4496 unsigned long node_start_pfn, 4506 unsigned long node_start_pfn,
4497 unsigned long node_end_pfn, 4507 unsigned long node_end_pfn,
4498 unsigned long *zone_start_pfn, 4508 unsigned long *zone_start_pfn,
4499 unsigned long *zone_end_pfn) 4509 unsigned long *zone_end_pfn)
4500 { 4510 {
4501 /* Only adjust if ZONE_MOVABLE is on this node */ 4511 /* Only adjust if ZONE_MOVABLE is on this node */
4502 if (zone_movable_pfn[nid]) { 4512 if (zone_movable_pfn[nid]) {
4503 /* Size ZONE_MOVABLE */ 4513 /* Size ZONE_MOVABLE */
4504 if (zone_type == ZONE_MOVABLE) { 4514 if (zone_type == ZONE_MOVABLE) {
4505 *zone_start_pfn = zone_movable_pfn[nid]; 4515 *zone_start_pfn = zone_movable_pfn[nid];
4506 *zone_end_pfn = min(node_end_pfn, 4516 *zone_end_pfn = min(node_end_pfn,
4507 arch_zone_highest_possible_pfn[movable_zone]); 4517 arch_zone_highest_possible_pfn[movable_zone]);
4508 4518
4509 /* Adjust for ZONE_MOVABLE starting within this range */ 4519 /* Adjust for ZONE_MOVABLE starting within this range */
4510 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4520 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4511 *zone_end_pfn > zone_movable_pfn[nid]) { 4521 *zone_end_pfn > zone_movable_pfn[nid]) {
4512 *zone_end_pfn = zone_movable_pfn[nid]; 4522 *zone_end_pfn = zone_movable_pfn[nid];
4513 4523
4514 /* Check if this whole range is within ZONE_MOVABLE */ 4524 /* Check if this whole range is within ZONE_MOVABLE */
4515 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4525 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4516 *zone_start_pfn = *zone_end_pfn; 4526 *zone_start_pfn = *zone_end_pfn;
4517 } 4527 }
4518 } 4528 }
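
The three branches above carve ZONE_MOVABLE out of the top of a node: the movable zone itself runs from zone_movable_pfn[nid] up to the end of the highest usable zone, a kernel zone that straddles that boundary is truncated at it, and a kernel zone lying entirely above it collapses to empty. A stand-alone sketch of the same cases with the per-node and arch arrays replaced by plain parameters; every PFN in main() is made up:

#include <stdio.h>

/* same three cases as above, with the arrays replaced by parameters */
static void adjust_for_movable(unsigned long movable_start, int is_movable_zone,
                               unsigned long node_end_pfn,
                               unsigned long movable_zone_limit,
                               unsigned long *zone_start_pfn,
                               unsigned long *zone_end_pfn)
{
        if (!movable_start)
                return;                         /* no ZONE_MOVABLE on this node */

        if (is_movable_zone) {                  /* size ZONE_MOVABLE itself */
                *zone_start_pfn = movable_start;
                *zone_end_pfn = node_end_pfn < movable_zone_limit ?
                                node_end_pfn : movable_zone_limit;
        } else if (*zone_start_pfn < movable_start &&
                   *zone_end_pfn > movable_start) {
                *zone_end_pfn = movable_start;  /* truncate a straddling zone */
        } else if (*zone_start_pfn >= movable_start) {
                *zone_start_pfn = *zone_end_pfn; /* zone lies wholly in movable */
        }
}

int main(void)
{
        /* ZONE_NORMAL 0x100000-0x400000 on a node whose movable area begins
         * at 0x300000; the normal zone gets truncated at that boundary */
        unsigned long start = 0x100000, end = 0x400000;

        adjust_for_movable(0x300000, 0, 0x400000, 0x400000, &start, &end);
        printf("ZONE_NORMAL now spans pfn %#lx-%#lx\n", start, end);
        return 0;
}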
4519 4529
4520 /* 4530 /*
4521 * Return the number of pages a zone spans in a node, including holes 4531 * Return the number of pages a zone spans in a node, including holes
4522 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4532 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4523 */ 4533 */
4524 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4534 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4525 unsigned long zone_type, 4535 unsigned long zone_type,
4526 unsigned long node_start_pfn, 4536 unsigned long node_start_pfn,
4527 unsigned long node_end_pfn, 4537 unsigned long node_end_pfn,
4528 unsigned long *ignored) 4538 unsigned long *ignored)
4529 { 4539 {
4530 unsigned long zone_start_pfn, zone_end_pfn; 4540 unsigned long zone_start_pfn, zone_end_pfn;
4531 4541
4532 /* Get the start and end of the zone */ 4542 /* Get the start and end of the zone */
4533 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4543 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4534 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4544 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4535 adjust_zone_range_for_zone_movable(nid, zone_type, 4545 adjust_zone_range_for_zone_movable(nid, zone_type,
4536 node_start_pfn, node_end_pfn, 4546 node_start_pfn, node_end_pfn,
4537 &zone_start_pfn, &zone_end_pfn); 4547 &zone_start_pfn, &zone_end_pfn);
4538 4548
4539 /* Check that this node has pages within the zone's required range */ 4549 /* Check that this node has pages within the zone's required range */
4540 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4550 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4541 return 0; 4551 return 0;
4542 4552
4543 /* Move the zone boundaries inside the node if necessary */ 4553 /* Move the zone boundaries inside the node if necessary */
4544 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4554 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4545 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4555 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4546 4556
4547 /* Return the spanned pages */ 4557 /* Return the spanned pages */
4548 return zone_end_pfn - zone_start_pfn; 4558 return zone_end_pfn - zone_start_pfn;
4549 } 4559 }
4550 4560
4551 /* 4561 /*
4552 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4562 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4553 * then all holes in the requested range will be accounted for. 4563 * then all holes in the requested range will be accounted for.
4554 */ 4564 */
4555 unsigned long __meminit __absent_pages_in_range(int nid, 4565 unsigned long __meminit __absent_pages_in_range(int nid,
4556 unsigned long range_start_pfn, 4566 unsigned long range_start_pfn,
4557 unsigned long range_end_pfn) 4567 unsigned long range_end_pfn)
4558 { 4568 {
4559 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4569 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4560 unsigned long start_pfn, end_pfn; 4570 unsigned long start_pfn, end_pfn;
4561 int i; 4571 int i;
4562 4572
4563 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4573 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4564 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4574 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4565 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4575 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4566 nr_absent -= end_pfn - start_pfn; 4576 nr_absent -= end_pfn - start_pfn;
4567 } 4577 }
4568 return nr_absent; 4578 return nr_absent;
4569 } 4579 }
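
The hole count above starts from the full span and subtracts each registered memory range after clamping it into [range_start_pfn, range_end_pfn); whatever is left over is holes. A stand-alone sketch with two present ranges and a gap between them (the ranges are invented for the example):

#include <stdio.h>

#define CLAMP(v, lo, hi)  ((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))

struct pfn_range { unsigned long start, end; };

/* span minus the clamped present ranges == pages in holes */
static unsigned long absent_pages(const struct pfn_range *r, int n,
                                  unsigned long range_start, unsigned long range_end)
{
        unsigned long nr_absent = range_end - range_start;
        int i;

        for (i = 0; i < n; i++) {
                unsigned long s = CLAMP(r[i].start, range_start, range_end);
                unsigned long e = CLAMP(r[i].end, range_start, range_end);

                nr_absent -= e - s;
        }
        return nr_absent;
}

int main(void)
{
        /* present: 0x0-0x9f000 and 0x100000-0x200000; hole of 0x61000 pages */
        struct pfn_range present[] = { { 0x0, 0x9f000 }, { 0x100000, 0x200000 } };

        printf("absent pages: %#lx\n", absent_pages(present, 2, 0x0, 0x200000));
        return 0;
}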
4570 4580
4571 /** 4581 /**
4572 * absent_pages_in_range - Return number of page frames in holes within a range 4582 * absent_pages_in_range - Return number of page frames in holes within a range
4573 * @start_pfn: The start PFN to start searching for holes 4583 * @start_pfn: The start PFN to start searching for holes
4574 * @end_pfn: The end PFN to stop searching for holes 4584 * @end_pfn: The end PFN to stop searching for holes
4575 * 4585 *
4576 * It returns the number of page frames in memory holes within a range. 4586 * It returns the number of page frames in memory holes within a range.
4577 */ 4587 */
4578 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4588 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4579 unsigned long end_pfn) 4589 unsigned long end_pfn)
4580 { 4590 {
4581 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4591 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4582 } 4592 }
4583 4593
4584 /* Return the number of page frames in holes in a zone on a node */ 4594 /* Return the number of page frames in holes in a zone on a node */
4585 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4595 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4586 unsigned long zone_type, 4596 unsigned long zone_type,
4587 unsigned long node_start_pfn, 4597 unsigned long node_start_pfn,
4588 unsigned long node_end_pfn, 4598 unsigned long node_end_pfn,
4589 unsigned long *ignored) 4599 unsigned long *ignored)
4590 { 4600 {
4591 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4601 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4592 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4602 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4593 unsigned long zone_start_pfn, zone_end_pfn; 4603 unsigned long zone_start_pfn, zone_end_pfn;
4594 4604
4595 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4605 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4596 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4606 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4597 4607
4598 adjust_zone_range_for_zone_movable(nid, zone_type, 4608 adjust_zone_range_for_zone_movable(nid, zone_type,
4599 node_start_pfn, node_end_pfn, 4609 node_start_pfn, node_end_pfn,
4600 &zone_start_pfn, &zone_end_pfn); 4610 &zone_start_pfn, &zone_end_pfn);
4601 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4611 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4602 } 4612 }
4603 4613
4604 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4614 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4605 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4615 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4606 unsigned long zone_type, 4616 unsigned long zone_type,
4607 unsigned long node_start_pfn, 4617 unsigned long node_start_pfn,
4608 unsigned long node_end_pfn, 4618 unsigned long node_end_pfn,
4609 unsigned long *zones_size) 4619 unsigned long *zones_size)
4610 { 4620 {
4611 return zones_size[zone_type]; 4621 return zones_size[zone_type];
4612 } 4622 }
4613 4623
4614 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4624 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4615 unsigned long zone_type, 4625 unsigned long zone_type,
4616 unsigned long node_start_pfn, 4626 unsigned long node_start_pfn,
4617 unsigned long node_end_pfn, 4627 unsigned long node_end_pfn,
4618 unsigned long *zholes_size) 4628 unsigned long *zholes_size)
4619 { 4629 {
4620 if (!zholes_size) 4630 if (!zholes_size)
4621 return 0; 4631 return 0;
4622 4632
4623 return zholes_size[zone_type]; 4633 return zholes_size[zone_type];
4624 } 4634 }
4625 4635
4626 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4636 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4627 4637
4628 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4638 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4629 unsigned long node_start_pfn, 4639 unsigned long node_start_pfn,
4630 unsigned long node_end_pfn, 4640 unsigned long node_end_pfn,
4631 unsigned long *zones_size, 4641 unsigned long *zones_size,
4632 unsigned long *zholes_size) 4642 unsigned long *zholes_size)
4633 { 4643 {
4634 unsigned long realtotalpages, totalpages = 0; 4644 unsigned long realtotalpages, totalpages = 0;
4635 enum zone_type i; 4645 enum zone_type i;
4636 4646
4637 for (i = 0; i < MAX_NR_ZONES; i++) 4647 for (i = 0; i < MAX_NR_ZONES; i++)
4638 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4648 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4639 node_start_pfn, 4649 node_start_pfn,
4640 node_end_pfn, 4650 node_end_pfn,
4641 zones_size); 4651 zones_size);
4642 pgdat->node_spanned_pages = totalpages; 4652 pgdat->node_spanned_pages = totalpages;
4643 4653
4644 realtotalpages = totalpages; 4654 realtotalpages = totalpages;
4645 for (i = 0; i < MAX_NR_ZONES; i++) 4655 for (i = 0; i < MAX_NR_ZONES; i++)
4646 realtotalpages -= 4656 realtotalpages -=
4647 zone_absent_pages_in_node(pgdat->node_id, i, 4657 zone_absent_pages_in_node(pgdat->node_id, i,
4648 node_start_pfn, node_end_pfn, 4658 node_start_pfn, node_end_pfn,
4649 zholes_size); 4659 zholes_size);
4650 pgdat->node_present_pages = realtotalpages; 4660 pgdat->node_present_pages = realtotalpages;
4651 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4661 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4652 realtotalpages); 4662 realtotalpages);
4653 } 4663 }
4654 4664
4655 #ifndef CONFIG_SPARSEMEM 4665 #ifndef CONFIG_SPARSEMEM
4656 /* 4666 /*
4657 * Calculate the size of the zone->blockflags rounded to an unsigned long 4667 * Calculate the size of the zone->blockflags rounded to an unsigned long
4658 * Start by making sure zonesize is a multiple of pageblock_order by rounding 4668 * Start by making sure zonesize is a multiple of pageblock_order by rounding
4659 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 4669 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
4660 * round what is now in bits to nearest long in bits, then return it in 4670 * round what is now in bits to nearest long in bits, then return it in
4661 * bytes. 4671 * bytes.
4662 */ 4672 */
4663 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 4673 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4664 { 4674 {
4665 unsigned long usemapsize; 4675 unsigned long usemapsize;
4666 4676
4667 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 4677 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4668 usemapsize = roundup(zonesize, pageblock_nr_pages); 4678 usemapsize = roundup(zonesize, pageblock_nr_pages);
4669 usemapsize = usemapsize >> pageblock_order; 4679 usemapsize = usemapsize >> pageblock_order;
4670 usemapsize *= NR_PAGEBLOCK_BITS; 4680 usemapsize *= NR_PAGEBLOCK_BITS;
4671 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4681 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4672 4682
4673 return usemapsize / 8; 4683 return usemapsize / 8;
4674 } 4684 }
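
The comment spells out the arithmetic: pad the zone out to whole pageblocks, allow NR_PAGEBLOCK_BITS per pageblock, round the bit count up to a whole unsigned long and convert to bytes. A stand-alone worked example, assuming 4KiB pages, pageblock_order == 9 (512-page pageblocks) and NR_PAGEBLOCK_BITS == 4; these are typical values, not taken from this file:

#include <stdio.h>

#define PAGEBLOCK_ORDER         9                       /* assumed */
#define PAGEBLOCK_NR_PAGES      (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS       4                       /* assumed */
#define ROUNDUP(x, y)           ((((x) + (y) - 1) / (y)) * (y))

/* same steps as usemap_size() above */
static unsigned long usemap_bytes(unsigned long zone_start_pfn, unsigned long zonesize)
{
        unsigned long bits;

        zonesize += zone_start_pfn & (PAGEBLOCK_NR_PAGES - 1);  /* unaligned start */
        bits = ROUNDUP(zonesize, PAGEBLOCK_NR_PAGES);           /* whole pageblocks */
        bits >>= PAGEBLOCK_ORDER;                               /* -> pageblock count */
        bits *= NR_PAGEBLOCK_BITS;                              /* -> bits */
        bits = ROUNDUP(bits, 8 * sizeof(unsigned long));        /* -> whole longs */

        return bits / 8;                                        /* -> bytes */
}

int main(void)
{
        /* a 1GiB zone (0x40000 4KiB pages) starting on a pageblock boundary */
        printf("usemap: %lu bytes\n", usemap_bytes(0x100000, 0x40000));
        return 0;
}

Under those assumptions a 1GiB zone needs 512 pageblocks * 4 bits = 2048 bits, i.e. 256 bytes of pageblock_flags.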
4675 4685
4676 static void __init setup_usemap(struct pglist_data *pgdat, 4686 static void __init setup_usemap(struct pglist_data *pgdat,
4677 struct zone *zone, 4687 struct zone *zone,
4678 unsigned long zone_start_pfn, 4688 unsigned long zone_start_pfn,
4679 unsigned long zonesize) 4689 unsigned long zonesize)
4680 { 4690 {
4681 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4691 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4682 zone->pageblock_flags = NULL; 4692 zone->pageblock_flags = NULL;
4683 if (usemapsize) 4693 if (usemapsize)
4684 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4694 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4685 usemapsize); 4695 usemapsize);
4686 } 4696 }
4687 #else 4697 #else
4688 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4698 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4689 unsigned long zone_start_pfn, unsigned long zonesize) {} 4699 unsigned long zone_start_pfn, unsigned long zonesize) {}
4690 #endif /* CONFIG_SPARSEMEM */ 4700 #endif /* CONFIG_SPARSEMEM */
4691 4701
4692 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4702 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4693 4703
4694 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4704 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4695 void __paginginit set_pageblock_order(void) 4705 void __paginginit set_pageblock_order(void)
4696 { 4706 {
4697 unsigned int order; 4707 unsigned int order;
4698 4708
4699 /* Check that pageblock_nr_pages has not already been setup */ 4709 /* Check that pageblock_nr_pages has not already been setup */
4700 if (pageblock_order) 4710 if (pageblock_order)
4701 return; 4711 return;
4702 4712
4703 if (HPAGE_SHIFT > PAGE_SHIFT) 4713 if (HPAGE_SHIFT > PAGE_SHIFT)
4704 order = HUGETLB_PAGE_ORDER; 4714 order = HUGETLB_PAGE_ORDER;
4705 else 4715 else
4706 order = MAX_ORDER - 1; 4716 order = MAX_ORDER - 1;
4707 4717
4708 /* 4718 /*
4709 * Assume the largest contiguous order of interest is a huge page. 4719 * Assume the largest contiguous order of interest is a huge page.
4710 * This value may be variable depending on boot parameters on IA64 and 4720 * This value may be variable depending on boot parameters on IA64 and
4711 * powerpc. 4721 * powerpc.
4712 */ 4722 */
4713 pageblock_order = order; 4723 pageblock_order = order;
4714 } 4724 }
4715 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4725 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4716 4726
4717 /* 4727 /*
4718 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4728 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4719 * is unused as pageblock_order is set at compile-time. See 4729 * is unused as pageblock_order is set at compile-time. See
4720 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4730 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4721 * the kernel config 4731 * the kernel config
4722 */ 4732 */
4723 void __paginginit set_pageblock_order(void) 4733 void __paginginit set_pageblock_order(void)
4724 { 4734 {
4725 } 4735 }
4726 4736
4727 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4737 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4728 4738
4729 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 4739 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4730 unsigned long present_pages) 4740 unsigned long present_pages)
4731 { 4741 {
4732 unsigned long pages = spanned_pages; 4742 unsigned long pages = spanned_pages;
4733 4743
4734 /* 4744 /*
4735 * Provide a more accurate estimation if there are holes within 4745 * Provide a more accurate estimation if there are holes within
4736 * the zone and SPARSEMEM is in use. If there are holes within the 4746 * the zone and SPARSEMEM is in use. If there are holes within the
4737 * zone, each populated memory region may cost us one or two extra 4747 * zone, each populated memory region may cost us one or two extra
4738 * memmap pages due to alignment because memmap pages for each 4748 * memmap pages due to alignment because memmap pages for each
4739 * populated region may not be naturally aligned on page boundaries. 4749 * populated region may not be naturally aligned on page boundaries.
4740 * So the (present_pages >> 4) heuristic is a tradeoff for that. 4750 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4741 */ 4751 */
4742 if (spanned_pages > present_pages + (present_pages >> 4) && 4752 if (spanned_pages > present_pages + (present_pages >> 4) &&
4743 IS_ENABLED(CONFIG_SPARSEMEM)) 4753 IS_ENABLED(CONFIG_SPARSEMEM))
4744 pages = present_pages; 4754 pages = present_pages;
4745 4755
4746 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 4756 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4747 } 4757 }
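
With SPARSEMEM, a zone that is mostly holes would otherwise be charged for memmap it never allocates, so once the holes exceed roughly 1/16th of present memory the estimate falls back to present_pages. A stand-alone worked example; sizeof(struct page) is assumed to be 64 bytes and the zone sizes are invented:

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define STRUCT_PAGE_SZ  64UL            /* assumed sizeof(struct page) */

/* same heuristic as calc_memmap_size(), SPARSEMEM assumed enabled */
static unsigned long memmap_pages(unsigned long spanned, unsigned long present)
{
        unsigned long pages = spanned;

        if (spanned > present + (present >> 4))
                pages = present;        /* holes > ~1/16th: charge only present pages */

        return (pages * STRUCT_PAGE_SZ + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
        /* 4GiB spanned, 1GiB of it holes: memmap is charged for 3GiB only */
        unsigned long spanned = 0x100000, present = 0xC0000;

        printf("memmap: %lu pages\n", memmap_pages(spanned, present));
        return 0;
}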
4748 4758
4749 /* 4759 /*
4750 * Set up the zone data structures: 4760 * Set up the zone data structures:
4751 * - mark all pages reserved 4761 * - mark all pages reserved
4752 * - mark all memory queues empty 4762 * - mark all memory queues empty
4753 * - clear the memory bitmaps 4763 * - clear the memory bitmaps
4754 * 4764 *
4755 * NOTE: pgdat should get zeroed by caller. 4765 * NOTE: pgdat should get zeroed by caller.
4756 */ 4766 */
4757 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4767 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4758 unsigned long node_start_pfn, unsigned long node_end_pfn, 4768 unsigned long node_start_pfn, unsigned long node_end_pfn,
4759 unsigned long *zones_size, unsigned long *zholes_size) 4769 unsigned long *zones_size, unsigned long *zholes_size)
4760 { 4770 {
4761 enum zone_type j; 4771 enum zone_type j;
4762 int nid = pgdat->node_id; 4772 int nid = pgdat->node_id;
4763 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4773 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4764 int ret; 4774 int ret;
4765 4775
4766 pgdat_resize_init(pgdat); 4776 pgdat_resize_init(pgdat);
4767 #ifdef CONFIG_NUMA_BALANCING 4777 #ifdef CONFIG_NUMA_BALANCING
4768 spin_lock_init(&pgdat->numabalancing_migrate_lock); 4778 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4769 pgdat->numabalancing_migrate_nr_pages = 0; 4779 pgdat->numabalancing_migrate_nr_pages = 0;
4770 pgdat->numabalancing_migrate_next_window = jiffies; 4780 pgdat->numabalancing_migrate_next_window = jiffies;
4771 #endif 4781 #endif
4772 init_waitqueue_head(&pgdat->kswapd_wait); 4782 init_waitqueue_head(&pgdat->kswapd_wait);
4773 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4783 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4774 pgdat_page_cgroup_init(pgdat); 4784 pgdat_page_cgroup_init(pgdat);
4775 4785
4776 for (j = 0; j < MAX_NR_ZONES; j++) { 4786 for (j = 0; j < MAX_NR_ZONES; j++) {
4777 struct zone *zone = pgdat->node_zones + j; 4787 struct zone *zone = pgdat->node_zones + j;
4778 unsigned long size, realsize, freesize, memmap_pages; 4788 unsigned long size, realsize, freesize, memmap_pages;
4779 4789
4780 size = zone_spanned_pages_in_node(nid, j, node_start_pfn, 4790 size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
4781 node_end_pfn, zones_size); 4791 node_end_pfn, zones_size);
4782 realsize = freesize = size - zone_absent_pages_in_node(nid, j, 4792 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4783 node_start_pfn, 4793 node_start_pfn,
4784 node_end_pfn, 4794 node_end_pfn,
4785 zholes_size); 4795 zholes_size);
4786 4796
4787 /* 4797 /*
4788 * Adjust freesize so that it accounts for how much memory 4798 * Adjust freesize so that it accounts for how much memory
4789 * is used by this zone for memmap. This affects the watermark 4799 * is used by this zone for memmap. This affects the watermark
4790 * and per-cpu initialisations 4800 * and per-cpu initialisations
4791 */ 4801 */
4792 memmap_pages = calc_memmap_size(size, realsize); 4802 memmap_pages = calc_memmap_size(size, realsize);
4793 if (freesize >= memmap_pages) { 4803 if (freesize >= memmap_pages) {
4794 freesize -= memmap_pages; 4804 freesize -= memmap_pages;
4795 if (memmap_pages) 4805 if (memmap_pages)
4796 printk(KERN_DEBUG 4806 printk(KERN_DEBUG
4797 " %s zone: %lu pages used for memmap\n", 4807 " %s zone: %lu pages used for memmap\n",
4798 zone_names[j], memmap_pages); 4808 zone_names[j], memmap_pages);
4799 } else 4809 } else
4800 printk(KERN_WARNING 4810 printk(KERN_WARNING
4801 " %s zone: %lu pages exceeds freesize %lu\n", 4811 " %s zone: %lu pages exceeds freesize %lu\n",
4802 zone_names[j], memmap_pages, freesize); 4812 zone_names[j], memmap_pages, freesize);
4803 4813
4804 /* Account for reserved pages */ 4814 /* Account for reserved pages */
4805 if (j == 0 && freesize > dma_reserve) { 4815 if (j == 0 && freesize > dma_reserve) {
4806 freesize -= dma_reserve; 4816 freesize -= dma_reserve;
4807 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4817 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4808 zone_names[0], dma_reserve); 4818 zone_names[0], dma_reserve);
4809 } 4819 }
4810 4820
4811 if (!is_highmem_idx(j)) 4821 if (!is_highmem_idx(j))
4812 nr_kernel_pages += freesize; 4822 nr_kernel_pages += freesize;
4813 /* Charge for highmem memmap if there are enough kernel pages */ 4823 /* Charge for highmem memmap if there are enough kernel pages */
4814 else if (nr_kernel_pages > memmap_pages * 2) 4824 else if (nr_kernel_pages > memmap_pages * 2)
4815 nr_kernel_pages -= memmap_pages; 4825 nr_kernel_pages -= memmap_pages;
4816 nr_all_pages += freesize; 4826 nr_all_pages += freesize;
4817 4827
4818 zone->spanned_pages = size; 4828 zone->spanned_pages = size;
4819 zone->present_pages = realsize; 4829 zone->present_pages = realsize;
4820 /* 4830 /*
4821 * Set an approximate value for lowmem here, it will be adjusted 4831 * Set an approximate value for lowmem here, it will be adjusted
4822 * when the bootmem allocator frees pages into the buddy system. 4832 * when the bootmem allocator frees pages into the buddy system.
4823 * And all highmem pages will be managed by the buddy system. 4833 * And all highmem pages will be managed by the buddy system.
4824 */ 4834 */
4825 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; 4835 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4826 #ifdef CONFIG_NUMA 4836 #ifdef CONFIG_NUMA
4827 zone->node = nid; 4837 zone->node = nid;
4828 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) 4838 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4829 / 100; 4839 / 100;
4830 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; 4840 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4831 #endif 4841 #endif
4832 zone->name = zone_names[j]; 4842 zone->name = zone_names[j];
4833 spin_lock_init(&zone->lock); 4843 spin_lock_init(&zone->lock);
4834 spin_lock_init(&zone->lru_lock); 4844 spin_lock_init(&zone->lru_lock);
4835 zone_seqlock_init(zone); 4845 zone_seqlock_init(zone);
4836 zone->zone_pgdat = pgdat; 4846 zone->zone_pgdat = pgdat;
4837 zone_pcp_init(zone); 4847 zone_pcp_init(zone);
4838 4848
4839 /* For bootup, initialized properly in watermark setup */ 4849 /* For bootup, initialized properly in watermark setup */
4840 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); 4850 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
4841 4851
4842 lruvec_init(&zone->lruvec); 4852 lruvec_init(&zone->lruvec);
4843 if (!size) 4853 if (!size)
4844 continue; 4854 continue;
4845 4855
4846 set_pageblock_order(); 4856 set_pageblock_order();
4847 setup_usemap(pgdat, zone, zone_start_pfn, size); 4857 setup_usemap(pgdat, zone, zone_start_pfn, size);
4848 ret = init_currently_empty_zone(zone, zone_start_pfn, 4858 ret = init_currently_empty_zone(zone, zone_start_pfn,
4849 size, MEMMAP_EARLY); 4859 size, MEMMAP_EARLY);
4850 BUG_ON(ret); 4860 BUG_ON(ret);
4851 memmap_init(size, nid, j, zone_start_pfn); 4861 memmap_init(size, nid, j, zone_start_pfn);
4852 zone_start_pfn += size; 4862 zone_start_pfn += size;
4853 } 4863 }
4854 } 4864 }
4855 4865
4856 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4866 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4857 { 4867 {
4858 /* Skip empty nodes */ 4868 /* Skip empty nodes */
4859 if (!pgdat->node_spanned_pages) 4869 if (!pgdat->node_spanned_pages)
4860 return; 4870 return;
4861 4871
4862 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4872 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4863 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4873 /* ia64 gets its own node_mem_map, before this, without bootmem */
4864 if (!pgdat->node_mem_map) { 4874 if (!pgdat->node_mem_map) {
4865 unsigned long size, start, end; 4875 unsigned long size, start, end;
4866 struct page *map; 4876 struct page *map;
4867 4877
4868 /* 4878 /*
4869 * The zone's endpoints aren't required to be MAX_ORDER 4879 * The zone's endpoints aren't required to be MAX_ORDER
4870 * aligned but the node_mem_map endpoints must be in order 4880 * aligned but the node_mem_map endpoints must be in order
4871 * for the buddy allocator to function correctly. 4881 * for the buddy allocator to function correctly.
4872 */ 4882 */
4873 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4883 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4874 end = pgdat_end_pfn(pgdat); 4884 end = pgdat_end_pfn(pgdat);
4875 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4885 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4876 size = (end - start) * sizeof(struct page); 4886 size = (end - start) * sizeof(struct page);
4877 map = alloc_remap(pgdat->node_id, size); 4887 map = alloc_remap(pgdat->node_id, size);
4878 if (!map) 4888 if (!map)
4879 map = alloc_bootmem_node_nopanic(pgdat, size); 4889 map = alloc_bootmem_node_nopanic(pgdat, size);
4880 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4890 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4881 } 4891 }
4882 #ifndef CONFIG_NEED_MULTIPLE_NODES 4892 #ifndef CONFIG_NEED_MULTIPLE_NODES
4883 /* 4893 /*
4884 * With no DISCONTIG, the global mem_map is just set as node 0's 4894 * With no DISCONTIG, the global mem_map is just set as node 0's
4885 */ 4895 */
4886 if (pgdat == NODE_DATA(0)) { 4896 if (pgdat == NODE_DATA(0)) {
4887 mem_map = NODE_DATA(0)->node_mem_map; 4897 mem_map = NODE_DATA(0)->node_mem_map;
4888 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4898 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4889 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4899 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4890 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4900 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4891 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4901 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4892 } 4902 }
4893 #endif 4903 #endif
4894 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4904 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4895 } 4905 }
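
node_mem_map has to cover whole MAX_ORDER blocks even when the node starts or ends mid-block, so the start is rounded down, the end rounded up, and node_mem_map is then pointed at the entry for the node's real first pfn. A stand-alone worked example; MAX_ORDER_NR_PAGES == 1024 and a 64-byte struct page are assumed, and the node span is made up:

#include <stdio.h>

#define MAX_ORDER_NR_PAGES      (1UL << 10)     /* assumed MAX_ORDER - 1 == 10 */
#define STRUCT_PAGE_SZ          64UL            /* assumed sizeof(struct page) */
#define ALIGN_UP(x, a)          (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        /* made-up node: pfns 0x10234..0x20000 */
        unsigned long node_start_pfn = 0x10234, node_end_pfn = 0x20000;
        unsigned long start = node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
        unsigned long end = ALIGN_UP(node_end_pfn, MAX_ORDER_NR_PAGES);
        unsigned long size = (end - start) * STRUCT_PAGE_SZ;

        printf("map covers pfn %#lx-%#lx, %lu bytes; node_mem_map offset %lu entries\n",
               start, end, size, node_start_pfn - start);
        return 0;
}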
4896 4906
4897 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4907 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4898 unsigned long node_start_pfn, unsigned long *zholes_size) 4908 unsigned long node_start_pfn, unsigned long *zholes_size)
4899 { 4909 {
4900 pg_data_t *pgdat = NODE_DATA(nid); 4910 pg_data_t *pgdat = NODE_DATA(nid);
4901 unsigned long start_pfn = 0; 4911 unsigned long start_pfn = 0;
4902 unsigned long end_pfn = 0; 4912 unsigned long end_pfn = 0;
4903 4913
4904 /* pg_data_t should be reset to zero when it's allocated */ 4914 /* pg_data_t should be reset to zero when it's allocated */
4905 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 4915 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4906 4916
4907 pgdat->node_id = nid; 4917 pgdat->node_id = nid;
4908 pgdat->node_start_pfn = node_start_pfn; 4918 pgdat->node_start_pfn = node_start_pfn;
4909 if (node_state(nid, N_MEMORY)) 4919 if (node_state(nid, N_MEMORY))
4910 init_zone_allows_reclaim(nid); 4920 init_zone_allows_reclaim(nid);
4911 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4921 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4912 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 4922 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4913 #endif 4923 #endif
4914 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 4924 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4915 zones_size, zholes_size); 4925 zones_size, zholes_size);
4916 4926
4917 alloc_node_mem_map(pgdat); 4927 alloc_node_mem_map(pgdat);
4918 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4928 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4919 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4929 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4920 nid, (unsigned long)pgdat, 4930 nid, (unsigned long)pgdat,
4921 (unsigned long)pgdat->node_mem_map); 4931 (unsigned long)pgdat->node_mem_map);
4922 #endif 4932 #endif
4923 4933
4924 free_area_init_core(pgdat, start_pfn, end_pfn, 4934 free_area_init_core(pgdat, start_pfn, end_pfn,
4925 zones_size, zholes_size); 4935 zones_size, zholes_size);
4926 } 4936 }
4927 4937
4928 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4938 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4929 4939
4930 #if MAX_NUMNODES > 1 4940 #if MAX_NUMNODES > 1
4931 /* 4941 /*
4932 * Figure out the number of possible node ids. 4942 * Figure out the number of possible node ids.
4933 */ 4943 */
4934 void __init setup_nr_node_ids(void) 4944 void __init setup_nr_node_ids(void)
4935 { 4945 {
4936 unsigned int node; 4946 unsigned int node;
4937 unsigned int highest = 0; 4947 unsigned int highest = 0;
4938 4948
4939 for_each_node_mask(node, node_possible_map) 4949 for_each_node_mask(node, node_possible_map)
4940 highest = node; 4950 highest = node;
4941 nr_node_ids = highest + 1; 4951 nr_node_ids = highest + 1;
4942 } 4952 }
4943 #endif 4953 #endif
4944 4954
4945 /** 4955 /**
4946 * node_map_pfn_alignment - determine the maximum internode alignment 4956 * node_map_pfn_alignment - determine the maximum internode alignment
4947 * 4957 *
4948 * This function should be called after node map is populated and sorted. 4958 * This function should be called after node map is populated and sorted.
4949 * It calculates the maximum power of two alignment which can distinguish 4959 * It calculates the maximum power of two alignment which can distinguish
4950 * all the nodes. 4960 * all the nodes.
4951 * 4961 *
4952 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4962 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4953 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4963 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4954 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 4964 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4955 * shifted, 1GiB is enough and this function will indicate so. 4965 * shifted, 1GiB is enough and this function will indicate so.
4956 * 4966 *
4957 * This is used to test whether pfn -> nid mapping of the chosen memory 4967 * This is used to test whether pfn -> nid mapping of the chosen memory
4958 * model has fine enough granularity to avoid incorrect mapping for the 4968 * model has fine enough granularity to avoid incorrect mapping for the
4959 * populated node map. 4969 * populated node map.
4960 * 4970 *
4961 * Returns the determined alignment in pfn's. 0 if there is no alignment 4971 * Returns the determined alignment in pfn's. 0 if there is no alignment
4962 * requirement (single node). 4972 * requirement (single node).
4963 */ 4973 */
4964 unsigned long __init node_map_pfn_alignment(void) 4974 unsigned long __init node_map_pfn_alignment(void)
4965 { 4975 {
4966 unsigned long accl_mask = 0, last_end = 0; 4976 unsigned long accl_mask = 0, last_end = 0;
4967 unsigned long start, end, mask; 4977 unsigned long start, end, mask;
4968 int last_nid = -1; 4978 int last_nid = -1;
4969 int i, nid; 4979 int i, nid;
4970 4980
4971 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 4981 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
4972 if (!start || last_nid < 0 || last_nid == nid) { 4982 if (!start || last_nid < 0 || last_nid == nid) {
4973 last_nid = nid; 4983 last_nid = nid;
4974 last_end = end; 4984 last_end = end;
4975 continue; 4985 continue;
4976 } 4986 }
4977 4987
4978 /* 4988 /*
4979 * Start with a mask granular enough to pin-point to the 4989 * Start with a mask granular enough to pin-point to the
4980 * start pfn and tick off bits one-by-one until it becomes 4990 * start pfn and tick off bits one-by-one until it becomes
4981 * too coarse to separate the current node from the last. 4991 * too coarse to separate the current node from the last.
4982 */ 4992 */
4983 mask = ~((1 << __ffs(start)) - 1); 4993 mask = ~((1 << __ffs(start)) - 1);
4984 while (mask && last_end <= (start & (mask << 1))) 4994 while (mask && last_end <= (start & (mask << 1)))
4985 mask <<= 1; 4995 mask <<= 1;
4986 4996
4987 /* accumulate all internode masks */ 4997 /* accumulate all internode masks */
4988 accl_mask |= mask; 4998 accl_mask |= mask;
4989 } 4999 }
4990 5000
4991 /* convert mask to number of pages */ 5001 /* convert mask to number of pages */
4992 return ~accl_mask + 1; 5002 return ~accl_mask + 1;
4993 } 5003 }
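
A stand-alone sketch of the same walk, with userspace stand-ins for for_each_mem_pfn_range() and __ffs() (and 1UL used for the shift); the node layout in main() is invented. With 4KiB pages, two 1GiB nodes aligned to 1GiB give an alignment of 0x40000 pfns, i.e. 1GiB, matching the example in the comment above:

#include <stdio.h>

struct pfn_range { unsigned long start, end; int nid; };

/* mirrors node_map_pfn_alignment() above */
static unsigned long pfn_alignment(const struct pfn_range *r, int n)
{
        unsigned long accl_mask = 0, last_end = 0, mask;
        int last_nid = -1, i;

        for (i = 0; i < n; i++) {
                unsigned long start = r[i].start;

                if (!start || last_nid < 0 || last_nid == r[i].nid) {
                        last_nid = r[i].nid;
                        last_end = r[i].end;
                        continue;
                }

                /* finest mask that pin-points start, then coarsen while it
                 * still separates this node from the previous one */
                mask = ~((1UL << __builtin_ctzl(start)) - 1);
                while (mask && last_end <= (start & (mask << 1)))
                        mask <<= 1;

                accl_mask |= mask;
        }

        return ~accl_mask + 1;          /* mask -> number of pages */
}

int main(void)
{
        /* two 1GiB nodes, 1GiB aligned (4KiB pages: 1GiB == 0x40000 pfns) */
        struct pfn_range map[] = {
                { 0x00000, 0x40000, 0 },
                { 0x40000, 0x80000, 1 },
        };

        printf("internode alignment: %#lx pfns\n", pfn_alignment(map, 2));
        return 0;
}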
4994 5004
4995 /* Find the lowest pfn for a node */ 5005 /* Find the lowest pfn for a node */
4996 static unsigned long __init find_min_pfn_for_node(int nid) 5006 static unsigned long __init find_min_pfn_for_node(int nid)
4997 { 5007 {
4998 unsigned long min_pfn = ULONG_MAX; 5008 unsigned long min_pfn = ULONG_MAX;
4999 unsigned long start_pfn; 5009 unsigned long start_pfn;
5000 int i; 5010 int i;
5001 5011
5002 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 5012 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
5003 min_pfn = min(min_pfn, start_pfn); 5013 min_pfn = min(min_pfn, start_pfn);
5004 5014
5005 if (min_pfn == ULONG_MAX) { 5015 if (min_pfn == ULONG_MAX) {
5006 printk(KERN_WARNING 5016 printk(KERN_WARNING
5007 "Could not find start_pfn for node %d\n", nid); 5017 "Could not find start_pfn for node %d\n", nid);
5008 return 0; 5018 return 0;
5009 } 5019 }
5010 5020
5011 return min_pfn; 5021 return min_pfn;
5012 } 5022 }
5013 5023
5014 /** 5024 /**
5015 * find_min_pfn_with_active_regions - Find the minimum PFN registered 5025 * find_min_pfn_with_active_regions - Find the minimum PFN registered
5016 * 5026 *
5017 * It returns the minimum PFN based on information provided via 5027 * It returns the minimum PFN based on information provided via
5018 * add_active_range(). 5028 * add_active_range().
5019 */ 5029 */
5020 unsigned long __init find_min_pfn_with_active_regions(void) 5030 unsigned long __init find_min_pfn_with_active_regions(void)
5021 { 5031 {
5022 return find_min_pfn_for_node(MAX_NUMNODES); 5032 return find_min_pfn_for_node(MAX_NUMNODES);
5023 } 5033 }
5024 5034
5025 /* 5035 /*
5026 * early_calculate_totalpages() 5036 * early_calculate_totalpages()
5027 * Sum pages in active regions for movable zone. 5037 * Sum pages in active regions for movable zone.
5028 * Populate N_MEMORY for calculating usable_nodes. 5038 * Populate N_MEMORY for calculating usable_nodes.
5029 */ 5039 */
5030 static unsigned long __init early_calculate_totalpages(void) 5040 static unsigned long __init early_calculate_totalpages(void)
5031 { 5041 {
5032 unsigned long totalpages = 0; 5042 unsigned long totalpages = 0;
5033 unsigned long start_pfn, end_pfn; 5043 unsigned long start_pfn, end_pfn;
5034 int i, nid; 5044 int i, nid;
5035 5045
5036 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 5046 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
5037 unsigned long pages = end_pfn - start_pfn; 5047 unsigned long pages = end_pfn - start_pfn;
5038 5048
5039 totalpages += pages; 5049 totalpages += pages;
5040 if (pages) 5050 if (pages)
5041 node_set_state(nid, N_MEMORY); 5051 node_set_state(nid, N_MEMORY);
5042 } 5052 }
5043 return totalpages; 5053 return totalpages;
5044 } 5054 }
5045 5055
5046 /* 5056 /*
5047 * Find the PFN the Movable zone begins in each node. Kernel memory 5057 * Find the PFN the Movable zone begins in each node. Kernel memory
5048 * is spread evenly between nodes as long as the nodes have enough 5058 * is spread evenly between nodes as long as the nodes have enough
5049 * memory. When they don't, some nodes will have more kernelcore than 5059 * memory. When they don't, some nodes will have more kernelcore than
5050 * others 5060 * others
5051 */ 5061 */
5052 static void __init find_zone_movable_pfns_for_nodes(void) 5062 static void __init find_zone_movable_pfns_for_nodes(void)
5053 { 5063 {
5054 int i, nid; 5064 int i, nid;
5055 unsigned long usable_startpfn; 5065 unsigned long usable_startpfn;
5056 unsigned long kernelcore_node, kernelcore_remaining; 5066 unsigned long kernelcore_node, kernelcore_remaining;
5057 /* save the state before borrow the nodemask */ 5067 /* save the state before borrow the nodemask */
5058 nodemask_t saved_node_state = node_states[N_MEMORY]; 5068 nodemask_t saved_node_state = node_states[N_MEMORY];
5059 unsigned long totalpages = early_calculate_totalpages(); 5069 unsigned long totalpages = early_calculate_totalpages();
5060 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5070 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
5061 5071
5062 /* 5072 /*
5063 * If movablecore was specified, calculate what size of 5073 * If movablecore was specified, calculate what size of
5064 * kernelcore that corresponds so that memory usable for 5074 * kernelcore that corresponds so that memory usable for
5065 * any allocation type is evenly spread. If both kernelcore 5075 * any allocation type is evenly spread. If both kernelcore
5066 * and movablecore are specified, then the value of kernelcore 5076 * and movablecore are specified, then the value of kernelcore
5067 * will be used for required_kernelcore if it's greater than 5077 * will be used for required_kernelcore if it's greater than
5068 * what movablecore would have allowed. 5078 * what movablecore would have allowed.
5069 */ 5079 */
5070 if (required_movablecore) { 5080 if (required_movablecore) {
5071 unsigned long corepages; 5081 unsigned long corepages;
5072 5082
5073 /* 5083 /*
5074 * Round-up so that ZONE_MOVABLE is at least as large as what 5084 * Round-up so that ZONE_MOVABLE is at least as large as what
5075 * was requested by the user 5085 * was requested by the user
5076 */ 5086 */
5077 required_movablecore = 5087 required_movablecore =
5078 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 5088 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
5079 corepages = totalpages - required_movablecore; 5089 corepages = totalpages - required_movablecore;
5080 5090
5081 required_kernelcore = max(required_kernelcore, corepages); 5091 required_kernelcore = max(required_kernelcore, corepages);
5082 } 5092 }
5083 5093
5084 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 5094 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
5085 if (!required_kernelcore) 5095 if (!required_kernelcore)
5086 goto out; 5096 goto out;
5087 5097
5088 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5098 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
5089 find_usable_zone_for_movable(); 5099 find_usable_zone_for_movable();
5090 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5100 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
5091 5101
5092 restart: 5102 restart:
5093 /* Spread kernelcore memory as evenly as possible throughout nodes */ 5103 /* Spread kernelcore memory as evenly as possible throughout nodes */
5094 kernelcore_node = required_kernelcore / usable_nodes; 5104 kernelcore_node = required_kernelcore / usable_nodes;
5095 for_each_node_state(nid, N_MEMORY) { 5105 for_each_node_state(nid, N_MEMORY) {
5096 unsigned long start_pfn, end_pfn; 5106 unsigned long start_pfn, end_pfn;
5097 5107
5098 /* 5108 /*
5099 * Recalculate kernelcore_node if the division per node 5109 * Recalculate kernelcore_node if the division per node
5100 * now exceeds what is necessary to satisfy the requested 5110 * now exceeds what is necessary to satisfy the requested
5101 * amount of memory for the kernel 5111 * amount of memory for the kernel
5102 */ 5112 */
5103 if (required_kernelcore < kernelcore_node) 5113 if (required_kernelcore < kernelcore_node)
5104 kernelcore_node = required_kernelcore / usable_nodes; 5114 kernelcore_node = required_kernelcore / usable_nodes;
5105 5115
5106 /* 5116 /*
5107 * As the map is walked, we track how much memory is usable 5117 * As the map is walked, we track how much memory is usable
5108 * by the kernel using kernelcore_remaining. When it is 5118 * by the kernel using kernelcore_remaining. When it is
5109 * 0, the rest of the node is usable by ZONE_MOVABLE 5119 * 0, the rest of the node is usable by ZONE_MOVABLE
5110 */ 5120 */
5111 kernelcore_remaining = kernelcore_node; 5121 kernelcore_remaining = kernelcore_node;
5112 5122
5113 /* Go through each range of PFNs within this node */ 5123 /* Go through each range of PFNs within this node */
5114 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 5124 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
5115 unsigned long size_pages; 5125 unsigned long size_pages;
5116 5126
5117 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 5127 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
5118 if (start_pfn >= end_pfn) 5128 if (start_pfn >= end_pfn)
5119 continue; 5129 continue;
5120 5130
5121 /* Account for what is only usable for kernelcore */ 5131 /* Account for what is only usable for kernelcore */
5122 if (start_pfn < usable_startpfn) { 5132 if (start_pfn < usable_startpfn) {
5123 unsigned long kernel_pages; 5133 unsigned long kernel_pages;
5124 kernel_pages = min(end_pfn, usable_startpfn) 5134 kernel_pages = min(end_pfn, usable_startpfn)
5125 - start_pfn; 5135 - start_pfn;
5126 5136
5127 kernelcore_remaining -= min(kernel_pages, 5137 kernelcore_remaining -= min(kernel_pages,
5128 kernelcore_remaining); 5138 kernelcore_remaining);
5129 required_kernelcore -= min(kernel_pages, 5139 required_kernelcore -= min(kernel_pages,
5130 required_kernelcore); 5140 required_kernelcore);
5131 5141
5132 /* Continue if range is now fully accounted */ 5142 /* Continue if range is now fully accounted */
5133 if (end_pfn <= usable_startpfn) { 5143 if (end_pfn <= usable_startpfn) {
5134 5144
5135 /* 5145 /*
5136 * Push zone_movable_pfn to the end so 5146 * Push zone_movable_pfn to the end so
5137 * that if we have to rebalance 5147 * that if we have to rebalance
5138 * kernelcore across nodes, we will 5148 * kernelcore across nodes, we will
5139 * not double account here 5149 * not double account here
5140 */ 5150 */
5141 zone_movable_pfn[nid] = end_pfn; 5151 zone_movable_pfn[nid] = end_pfn;
5142 continue; 5152 continue;
5143 } 5153 }
5144 start_pfn = usable_startpfn; 5154 start_pfn = usable_startpfn;
5145 } 5155 }
5146 5156
5147 /* 5157 /*
5148 * The usable PFN range for ZONE_MOVABLE is from 5158 * The usable PFN range for ZONE_MOVABLE is from
5149 * start_pfn->end_pfn. Calculate size_pages as the 5159 * start_pfn->end_pfn. Calculate size_pages as the
5150 * number of pages used as kernelcore 5160 * number of pages used as kernelcore
5151 */ 5161 */
5152 size_pages = end_pfn - start_pfn; 5162 size_pages = end_pfn - start_pfn;
5153 if (size_pages > kernelcore_remaining) 5163 if (size_pages > kernelcore_remaining)
5154 size_pages = kernelcore_remaining; 5164 size_pages = kernelcore_remaining;
5155 zone_movable_pfn[nid] = start_pfn + size_pages; 5165 zone_movable_pfn[nid] = start_pfn + size_pages;
5156 5166
5157 /* 5167 /*
5158 * Some kernelcore has been met, update counts and 5168 * Some kernelcore has been met, update counts and
5159 * break if the kernelcore for this node has been 5169 * break if the kernelcore for this node has been
5160 * satisfied 5170 * satisfied
5161 */ 5171 */
5162 required_kernelcore -= min(required_kernelcore, 5172 required_kernelcore -= min(required_kernelcore,
5163 size_pages); 5173 size_pages);
5164 kernelcore_remaining -= size_pages; 5174 kernelcore_remaining -= size_pages;
5165 if (!kernelcore_remaining) 5175 if (!kernelcore_remaining)
5166 break; 5176 break;
5167 } 5177 }
5168 } 5178 }
5169 5179
5170 /* 5180 /*
5171 * If there is still required_kernelcore, we do another pass with one 5181 * If there is still required_kernelcore, we do another pass with one
5172 * less node in the count. This will push zone_movable_pfn[nid] further 5182 * less node in the count. This will push zone_movable_pfn[nid] further
5173 * along on the nodes that still have memory until kernelcore is 5183 * along on the nodes that still have memory until kernelcore is
5174 * satisfied 5184 * satisfied
5175 */ 5185 */
5176 usable_nodes--; 5186 usable_nodes--;
5177 if (usable_nodes && required_kernelcore > usable_nodes) 5187 if (usable_nodes && required_kernelcore > usable_nodes)
5178 goto restart; 5188 goto restart;
5179 5189
5180 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5190 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5181 for (nid = 0; nid < MAX_NUMNODES; nid++) 5191 for (nid = 0; nid < MAX_NUMNODES; nid++)
5182 zone_movable_pfn[nid] = 5192 zone_movable_pfn[nid] =
5183 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 5193 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
5184 5194
5185 out: 5195 out:
5186 /* restore the node_state */ 5196 /* restore the node_state */
5187 node_states[N_MEMORY] = saved_node_state; 5197 node_states[N_MEMORY] = saved_node_state;
5188 } 5198 }
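
The heart of the function is the restart loop: each pass hands every node with memory an equal share of the remaining kernelcore, and when some nodes are too small to absorb their share the pass is repeated with one fewer node in the divisor until the request is met. A deliberately simplified stand-alone sketch of just that redistribution; it ignores the usable_startpfn cutoff and the per-range walk, and the node sizes and kernelcore request are made up:

#include <stdio.h>

#define NR_NODES 3

int main(void)
{
        /* made-up node sizes (pages) and a kernelcore= request of 0x90000 pages */
        unsigned long node_pages[NR_NODES] = { 0x20000, 0x80000, 0x80000 };
        unsigned long kernel_pages[NR_NODES] = { 0 };
        unsigned long required = 0x90000, share;
        int usable_nodes = NR_NODES, nid;

restart:
        /* spread the remaining request evenly over the nodes still counted */
        share = required / usable_nodes;
        for (nid = 0; nid < NR_NODES; nid++) {
                unsigned long take = node_pages[nid] - kernel_pages[nid];

                if (required < share)
                        share = required / usable_nodes;
                if (take > share)
                        take = share;
                kernel_pages[nid] += take;
                required -= take;
        }
        usable_nodes--;
        if (usable_nodes && required > (unsigned long)usable_nodes)
                goto restart;

        for (nid = 0; nid < NR_NODES; nid++)
                printf("node %d: kernelcore %#lx, ZONE_MOVABLE %#lx pages\n", nid,
                       kernel_pages[nid], node_pages[nid] - kernel_pages[nid]);
        return 0;
}

In this example the small node is consumed entirely by kernelcore and the shortfall is made up on the two larger nodes in a second pass, which is exactly what the restart path above achieves by pushing zone_movable_pfn[nid] further along.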
5189 5199
5190 /* Any regular or high memory on that node ? */ 5200 /* Any regular or high memory on that node ? */
5191 static void check_for_memory(pg_data_t *pgdat, int nid) 5201 static void check_for_memory(pg_data_t *pgdat, int nid)
5192 { 5202 {
5193 enum zone_type zone_type; 5203 enum zone_type zone_type;
5194 5204
5195 if (N_MEMORY == N_NORMAL_MEMORY) 5205 if (N_MEMORY == N_NORMAL_MEMORY)
5196 return; 5206 return;
5197 5207
5198 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 5208 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
5199 struct zone *zone = &pgdat->node_zones[zone_type]; 5209 struct zone *zone = &pgdat->node_zones[zone_type];
5200 if (zone->present_pages) { 5210 if (zone->present_pages) {
5201 node_set_state(nid, N_HIGH_MEMORY); 5211 node_set_state(nid, N_HIGH_MEMORY);
5202 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 5212 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
5203 zone_type <= ZONE_NORMAL) 5213 zone_type <= ZONE_NORMAL)
5204 node_set_state(nid, N_NORMAL_MEMORY); 5214 node_set_state(nid, N_NORMAL_MEMORY);
5205 break; 5215 break;
5206 } 5216 }
5207 } 5217 }
5208 } 5218 }
5209 5219
5210 /** 5220 /**
5211 * free_area_init_nodes - Initialise all pg_data_t and zone data 5221 * free_area_init_nodes - Initialise all pg_data_t and zone data
5212 * @max_zone_pfn: an array of max PFNs for each zone 5222 * @max_zone_pfn: an array of max PFNs for each zone
5213 * 5223 *
5214 * This will call free_area_init_node() for each active node in the system. 5224 * This will call free_area_init_node() for each active node in the system.
5215 * Using the page ranges provided by add_active_range(), the size of each 5225 * Using the page ranges provided by add_active_range(), the size of each
5216 * zone in each node and their holes is calculated. If the maximum PFN 5226 * zone in each node and their holes is calculated. If the maximum PFN
5217 * between two adjacent zones matches, it is assumed that the zone is empty. 5227 * between two adjacent zones matches, it is assumed that the zone is empty.
5218 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 5228 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
5219 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 5229 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
5220 * starts where the previous one ended. For example, ZONE_DMA32 starts 5230 * starts where the previous one ended. For example, ZONE_DMA32 starts
5221 * at arch_max_dma_pfn. 5231 * at arch_max_dma_pfn.
5222 */ 5232 */
5223 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 5233 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5224 { 5234 {
5225 unsigned long start_pfn, end_pfn; 5235 unsigned long start_pfn, end_pfn;
5226 int i, nid; 5236 int i, nid;
5227 5237
5228 /* Record where the zone boundaries are */ 5238 /* Record where the zone boundaries are */
5229 memset(arch_zone_lowest_possible_pfn, 0, 5239 memset(arch_zone_lowest_possible_pfn, 0,
5230 sizeof(arch_zone_lowest_possible_pfn)); 5240 sizeof(arch_zone_lowest_possible_pfn));
5231 memset(arch_zone_highest_possible_pfn, 0, 5241 memset(arch_zone_highest_possible_pfn, 0,
5232 sizeof(arch_zone_highest_possible_pfn)); 5242 sizeof(arch_zone_highest_possible_pfn));
5233 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 5243 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
5234 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 5244 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
5235 for (i = 1; i < MAX_NR_ZONES; i++) { 5245 for (i = 1; i < MAX_NR_ZONES; i++) {
5236 if (i == ZONE_MOVABLE) 5246 if (i == ZONE_MOVABLE)
5237 continue; 5247 continue;
5238 arch_zone_lowest_possible_pfn[i] = 5248 arch_zone_lowest_possible_pfn[i] =
5239 arch_zone_highest_possible_pfn[i-1]; 5249 arch_zone_highest_possible_pfn[i-1];
5240 arch_zone_highest_possible_pfn[i] = 5250 arch_zone_highest_possible_pfn[i] =
5241 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 5251 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
5242 } 5252 }
5243 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 5253 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
5244 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 5254 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
5245 5255
5246 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5256 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
5247 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5257 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
5248 find_zone_movable_pfns_for_nodes(); 5258 find_zone_movable_pfns_for_nodes();
5249 5259
5250 /* Print out the zone ranges */ 5260 /* Print out the zone ranges */
5251 printk("Zone ranges:\n"); 5261 printk("Zone ranges:\n");
5252 for (i = 0; i < MAX_NR_ZONES; i++) { 5262 for (i = 0; i < MAX_NR_ZONES; i++) {
5253 if (i == ZONE_MOVABLE) 5263 if (i == ZONE_MOVABLE)
5254 continue; 5264 continue;
5255 printk(KERN_CONT " %-8s ", zone_names[i]); 5265 printk(KERN_CONT " %-8s ", zone_names[i]);
5256 if (arch_zone_lowest_possible_pfn[i] == 5266 if (arch_zone_lowest_possible_pfn[i] ==
5257 arch_zone_highest_possible_pfn[i]) 5267 arch_zone_highest_possible_pfn[i])
5258 printk(KERN_CONT "empty\n"); 5268 printk(KERN_CONT "empty\n");
5259 else 5269 else
5260 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5270 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
5261 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5271 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5262 (arch_zone_highest_possible_pfn[i] 5272 (arch_zone_highest_possible_pfn[i]
5263 << PAGE_SHIFT) - 1); 5273 << PAGE_SHIFT) - 1);
5264 } 5274 }
5265 5275
5266 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5276 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5267 printk("Movable zone start for each node\n"); 5277 printk("Movable zone start for each node\n");
5268 for (i = 0; i < MAX_NUMNODES; i++) { 5278 for (i = 0; i < MAX_NUMNODES; i++) {
5269 if (zone_movable_pfn[i]) 5279 if (zone_movable_pfn[i])
5270 printk(" Node %d: %#010lx\n", i, 5280 printk(" Node %d: %#010lx\n", i,
5271 zone_movable_pfn[i] << PAGE_SHIFT); 5281 zone_movable_pfn[i] << PAGE_SHIFT);
5272 } 5282 }
5273 5283
5274 /* Print out the early node map */ 5284 /* Print out the early node map */
5275 printk("Early memory node ranges\n"); 5285 printk("Early memory node ranges\n");
5276 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5286 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5277 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5287 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5278 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5288 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5279 5289
5280 /* Initialise every node */ 5290 /* Initialise every node */
5281 mminit_verify_pageflags_layout(); 5291 mminit_verify_pageflags_layout();
5282 setup_nr_node_ids(); 5292 setup_nr_node_ids();
5283 for_each_online_node(nid) { 5293 for_each_online_node(nid) {
5284 pg_data_t *pgdat = NODE_DATA(nid); 5294 pg_data_t *pgdat = NODE_DATA(nid);
5285 free_area_init_node(nid, NULL, 5295 free_area_init_node(nid, NULL,
5286 find_min_pfn_for_node(nid), NULL); 5296 find_min_pfn_for_node(nid), NULL);
5287 5297
5288 /* Any memory on that node */ 5298 /* Any memory on that node */
5289 if (pgdat->node_present_pages) 5299 if (pgdat->node_present_pages)
5290 node_set_state(nid, N_MEMORY); 5300 node_set_state(nid, N_MEMORY);
5291 check_for_memory(pgdat, nid); 5301 check_for_memory(pgdat, nid);
5292 } 5302 }
5293 } 5303 }
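
The first half of the function derives each kernel zone's possible PFN range from max_zone_pfn[]: zone 0 starts at the lowest registered PFN, every later zone starts where the previous one ends, and ZONE_MOVABLE is left at 0/0 because it is sized per node above. A stand-alone sketch with invented max_zone_pfn values (the 16MiB/4GiB/17GiB split is only an illustration):

#include <stdio.h>

enum { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_MOVABLE, MAX_NR_ZONES };

int main(void)
{
        /* made-up max_zone_pfn values for a ~17GiB box with 4KiB pages */
        unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0x1000, 0x100000, 0x440000, 0 };
        unsigned long lo[MAX_NR_ZONES] = { 0 }, hi[MAX_NR_ZONES] = { 0 };
        unsigned long min_pfn = 0x10;   /* stand-in for find_min_pfn_with_active_regions() */
        const char *name[] = { "DMA", "DMA32", "Normal", "Movable" };
        int i;

        lo[0] = min_pfn;
        hi[0] = max_zone_pfn[0];
        for (i = 1; i < MAX_NR_ZONES; i++) {
                if (i == ZONE_MOVABLE)
                        continue;
                lo[i] = hi[i - 1];              /* zone starts where the previous ended */
                hi[i] = max_zone_pfn[i] > lo[i] ? max_zone_pfn[i] : lo[i];
        }
        lo[ZONE_MOVABLE] = hi[ZONE_MOVABLE] = 0;  /* sized per node, not here */

        for (i = 0; i < MAX_NR_ZONES; i++)
                printf("%-8s [pfn %#lx-%#lx]\n", name[i], lo[i], hi[i]);
        return 0;
}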
5294 5304
5295 static int __init cmdline_parse_core(char *p, unsigned long *core) 5305 static int __init cmdline_parse_core(char *p, unsigned long *core)
5296 { 5306 {
5297 unsigned long long coremem; 5307 unsigned long long coremem;
5298 if (!p) 5308 if (!p)
5299 return -EINVAL; 5309 return -EINVAL;
5300 5310
5301 coremem = memparse(p, &p); 5311 coremem = memparse(p, &p);
5302 *core = coremem >> PAGE_SHIFT; 5312 *core = coremem >> PAGE_SHIFT;
5303 5313
5304 /* Paranoid check that UL is enough for the coremem value */ 5314 /* Paranoid check that UL is enough for the coremem value */
5305 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 5315 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
5306 5316
5307 return 0; 5317 return 0;
5308 } 5318 }
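
cmdline_parse_core() relies on memparse() for the usual K/M/G suffixes and then converts bytes to pages. A stand-alone userspace approximation; parse_size() is a local stand-in rather than the kernel helper, and PAGE_SHIFT is assumed to be 12:

#include <stdio.h>
#include <stdlib.h>

/* userspace stand-in for memparse(): number with optional K/M/G suffix */
static unsigned long long parse_size(const char *s)
{
        char *end;
        unsigned long long v = strtoull(s, &end, 0);

        switch (*end) {
        case 'G': case 'g': v <<= 10;   /* fall through */
        case 'M': case 'm': v <<= 10;   /* fall through */
        case 'K': case 'k': v <<= 10;
        }
        return v;
}

int main(void)
{
        const char *arg = "512M";                       /* e.g. kernelcore=512M */
        unsigned long pages = parse_size(arg) >> 12;    /* PAGE_SHIFT assumed 12 */

        printf("kernelcore=%s -> %lu pages\n", arg, pages);
        return 0;
}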
5309 5319
5310 /* 5320 /*
5311 * kernelcore=size sets the amount of memory for use for allocations that 5321 * kernelcore=size sets the amount of memory for use for allocations that
5312 * cannot be reclaimed or migrated. 5322 * cannot be reclaimed or migrated.
5313 */ 5323 */
5314 static int __init cmdline_parse_kernelcore(char *p) 5324 static int __init cmdline_parse_kernelcore(char *p)
5315 { 5325 {
5316 return cmdline_parse_core(p, &required_kernelcore); 5326 return cmdline_parse_core(p, &required_kernelcore);
5317 } 5327 }
5318 5328
5319 /* 5329 /*
5320 * movablecore=size sets the amount of memory for use for allocations that 5330 * movablecore=size sets the amount of memory for use for allocations that
5321 * can be reclaimed or migrated. 5331 * can be reclaimed or migrated.
5322 */ 5332 */
5323 static int __init cmdline_parse_movablecore(char *p) 5333 static int __init cmdline_parse_movablecore(char *p)
5324 { 5334 {
5325 return cmdline_parse_core(p, &required_movablecore); 5335 return cmdline_parse_core(p, &required_movablecore);
5326 } 5336 }
5327 5337
5328 early_param("kernelcore", cmdline_parse_kernelcore); 5338 early_param("kernelcore", cmdline_parse_kernelcore);
5329 early_param("movablecore", cmdline_parse_movablecore); 5339 early_param("movablecore", cmdline_parse_movablecore);
5330 5340
5331 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5341 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5332 5342
5333 void adjust_managed_page_count(struct page *page, long count) 5343 void adjust_managed_page_count(struct page *page, long count)
5334 { 5344 {
5335 spin_lock(&managed_page_count_lock); 5345 spin_lock(&managed_page_count_lock);
5336 page_zone(page)->managed_pages += count; 5346 page_zone(page)->managed_pages += count;
5337 totalram_pages += count; 5347 totalram_pages += count;
5338 #ifdef CONFIG_HIGHMEM 5348 #ifdef CONFIG_HIGHMEM
5339 if (PageHighMem(page)) 5349 if (PageHighMem(page))
5340 totalhigh_pages += count; 5350 totalhigh_pages += count;
5341 #endif 5351 #endif
5342 spin_unlock(&managed_page_count_lock); 5352 spin_unlock(&managed_page_count_lock);
5343 } 5353 }
5344 EXPORT_SYMBOL(adjust_managed_page_count); 5354 EXPORT_SYMBOL(adjust_managed_page_count);
5345 5355
5346 unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 5356 unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
5347 { 5357 {
5348 void *pos; 5358 void *pos;
5349 unsigned long pages = 0; 5359 unsigned long pages = 0;
5350 5360
5351 start = (void *)PAGE_ALIGN((unsigned long)start); 5361 start = (void *)PAGE_ALIGN((unsigned long)start);
5352 end = (void *)((unsigned long)end & PAGE_MASK); 5362 end = (void *)((unsigned long)end & PAGE_MASK);
5353 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 5363 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5354 if ((unsigned int)poison <= 0xFF) 5364 if ((unsigned int)poison <= 0xFF)
5355 memset(pos, poison, PAGE_SIZE); 5365 memset(pos, poison, PAGE_SIZE);
5356 free_reserved_page(virt_to_page(pos)); 5366 free_reserved_page(virt_to_page(pos));
5357 } 5367 }
5358 5368
5359 if (pages && s) 5369 if (pages && s)
5360 pr_info("Freeing %s memory: %ldK (%p - %p)\n", 5370 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
5361 s, pages << (PAGE_SHIFT - 10), start, end); 5371 s, pages << (PAGE_SHIFT - 10), start, end);
5362 5372
5363 return pages; 5373 return pages;
5364 } 5374 }
5365 EXPORT_SYMBOL(free_reserved_area); 5375 EXPORT_SYMBOL(free_reserved_area);
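free_reserved_area() only hands back whole pages: the start is rounded up with PAGE_ALIGN() and the end rounded down with PAGE_MASK, so partial pages at either edge stay reserved. A small sketch of just that rounding, assuming 4K pages and an invented, unaligned region:

#include <stdio.h>

#define PAGE_SIZE 4096UL /* assumption: 4K pages */
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
    /* Hypothetical reserved region that is not page aligned. */
    unsigned long start = 0x100000 + 100;   /* 1 MiB + 100 bytes */
    unsigned long end   = 0x180000 - 40;    /* 1.5 MiB - 40 bytes */

    unsigned long first = PAGE_ALIGN(start);  /* round start up   */
    unsigned long last  = end & PAGE_MASK;    /* round end down   */
    unsigned long pages = (last - first) / PAGE_SIZE;

    printf("freeing %lu whole pages: [%#lx, %#lx)\n", pages, first, last);
    return 0;
}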
5366 5376
5367 #ifdef CONFIG_HIGHMEM 5377 #ifdef CONFIG_HIGHMEM
5368 void free_highmem_page(struct page *page) 5378 void free_highmem_page(struct page *page)
5369 { 5379 {
5370 __free_reserved_page(page); 5380 __free_reserved_page(page);
5371 totalram_pages++; 5381 totalram_pages++;
5372 page_zone(page)->managed_pages++; 5382 page_zone(page)->managed_pages++;
5373 totalhigh_pages++; 5383 totalhigh_pages++;
5374 } 5384 }
5375 #endif 5385 #endif
5376 5386
5377 5387
5378 void __init mem_init_print_info(const char *str) 5388 void __init mem_init_print_info(const char *str)
5379 { 5389 {
5380 unsigned long physpages, codesize, datasize, rosize, bss_size; 5390 unsigned long physpages, codesize, datasize, rosize, bss_size;
5381 unsigned long init_code_size, init_data_size; 5391 unsigned long init_code_size, init_data_size;
5382 5392
5383 physpages = get_num_physpages(); 5393 physpages = get_num_physpages();
5384 codesize = _etext - _stext; 5394 codesize = _etext - _stext;
5385 datasize = _edata - _sdata; 5395 datasize = _edata - _sdata;
5386 rosize = __end_rodata - __start_rodata; 5396 rosize = __end_rodata - __start_rodata;
5387 bss_size = __bss_stop - __bss_start; 5397 bss_size = __bss_stop - __bss_start;
5388 init_data_size = __init_end - __init_begin; 5398 init_data_size = __init_end - __init_begin;
5389 init_code_size = _einittext - _sinittext; 5399 init_code_size = _einittext - _sinittext;
5390 5400
5391 /* 5401 /*
5392 * Detect special cases and adjust section sizes accordingly: 5402 * Detect special cases and adjust section sizes accordingly:
5393 * 1) .init.* may be embedded into .data sections 5403 * 1) .init.* may be embedded into .data sections
5394 * 2) .init.text.* may be out of [__init_begin, __init_end], 5404 * 2) .init.text.* may be out of [__init_begin, __init_end],
5395 * please refer to arch/tile/kernel/vmlinux.lds.S. 5405 * please refer to arch/tile/kernel/vmlinux.lds.S.
5396 * 3) .rodata.* may be embedded into .text or .data sections. 5406 * 3) .rodata.* may be embedded into .text or .data sections.
5397 */ 5407 */
5398 #define adj_init_size(start, end, size, pos, adj) \ 5408 #define adj_init_size(start, end, size, pos, adj) \
5399 do { \ 5409 do { \
5400 if (start <= pos && pos < end && size > adj) \ 5410 if (start <= pos && pos < end && size > adj) \
5401 size -= adj; \ 5411 size -= adj; \
5402 } while (0) 5412 } while (0)
5403 5413
5404 adj_init_size(__init_begin, __init_end, init_data_size, 5414 adj_init_size(__init_begin, __init_end, init_data_size,
5405 _sinittext, init_code_size); 5415 _sinittext, init_code_size);
5406 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 5416 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5407 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 5417 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5408 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 5418 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5409 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 5419 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5410 5420
5411 #undef adj_init_size 5421 #undef adj_init_size
5412 5422
5413 printk("Memory: %luK/%luK available " 5423 printk("Memory: %luK/%luK available "
5414 "(%luK kernel code, %luK rwdata, %luK rodata, " 5424 "(%luK kernel code, %luK rwdata, %luK rodata, "
5415 "%luK init, %luK bss, %luK reserved" 5425 "%luK init, %luK bss, %luK reserved"
5416 #ifdef CONFIG_HIGHMEM 5426 #ifdef CONFIG_HIGHMEM
5417 ", %luK highmem" 5427 ", %luK highmem"
5418 #endif 5428 #endif
5419 "%s%s)\n", 5429 "%s%s)\n",
5420 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), 5430 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5421 codesize >> 10, datasize >> 10, rosize >> 10, 5431 codesize >> 10, datasize >> 10, rosize >> 10,
5422 (init_data_size + init_code_size) >> 10, bss_size >> 10, 5432 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5423 (physpages - totalram_pages) << (PAGE_SHIFT-10), 5433 (physpages - totalram_pages) << (PAGE_SHIFT-10),
5424 #ifdef CONFIG_HIGHMEM 5434 #ifdef CONFIG_HIGHMEM
5425 totalhigh_pages << (PAGE_SHIFT-10), 5435 totalhigh_pages << (PAGE_SHIFT-10),
5426 #endif 5436 #endif
5427 str ? ", " : "", str ? str : ""); 5437 str ? ", " : "", str ? str : "");
5428 } 5438 }
5429 5439
5430 /** 5440 /**
5431 * set_dma_reserve - set the specified number of pages reserved in the first zone 5441 * set_dma_reserve - set the specified number of pages reserved in the first zone
5432 * @new_dma_reserve: The number of pages to mark reserved 5442 * @new_dma_reserve: The number of pages to mark reserved
5433 * 5443 *
5434 * The per-cpu batchsize and zone watermarks are determined by present_pages. 5444 * The per-cpu batchsize and zone watermarks are determined by present_pages.
5435 * In the DMA zone, a significant percentage may be consumed by kernel image 5445 * In the DMA zone, a significant percentage may be consumed by kernel image
5436 * and other unfreeable allocations which can skew the watermarks badly. This 5446 * and other unfreeable allocations which can skew the watermarks badly. This
5437 * function may optionally be used to account for unfreeable pages in the 5447 * function may optionally be used to account for unfreeable pages in the
5438 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 5448 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
5439 * smaller per-cpu batchsize. 5449 * smaller per-cpu batchsize.
5440 */ 5450 */
5441 void __init set_dma_reserve(unsigned long new_dma_reserve) 5451 void __init set_dma_reserve(unsigned long new_dma_reserve)
5442 { 5452 {
5443 dma_reserve = new_dma_reserve; 5453 dma_reserve = new_dma_reserve;
5444 } 5454 }
5445 5455
5446 void __init free_area_init(unsigned long *zones_size) 5456 void __init free_area_init(unsigned long *zones_size)
5447 { 5457 {
5448 free_area_init_node(0, zones_size, 5458 free_area_init_node(0, zones_size,
5449 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 5459 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
5450 } 5460 }
5451 5461
5452 static int page_alloc_cpu_notify(struct notifier_block *self, 5462 static int page_alloc_cpu_notify(struct notifier_block *self,
5453 unsigned long action, void *hcpu) 5463 unsigned long action, void *hcpu)
5454 { 5464 {
5455 int cpu = (unsigned long)hcpu; 5465 int cpu = (unsigned long)hcpu;
5456 5466
5457 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 5467 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
5458 lru_add_drain_cpu(cpu); 5468 lru_add_drain_cpu(cpu);
5459 drain_pages(cpu); 5469 drain_pages(cpu);
5460 5470
5461 /* 5471 /*
5462 * Spill the event counters of the dead processor 5472 * Spill the event counters of the dead processor
5463 * into the current processor's event counters. 5473
5464 * This artificially elevates the count of the current 5474 * This artificially elevates the count of the current
5465 * processor. 5475 * processor.
5466 */ 5476 */
5467 vm_events_fold_cpu(cpu); 5477 vm_events_fold_cpu(cpu);
5468 5478
5469 /* 5479 /*
5470 * Zero the differential counters of the dead processor 5480 * Zero the differential counters of the dead processor
5471 * so that the vm statistics are consistent. 5481 * so that the vm statistics are consistent.
5472 * 5482 *
5473 * This is only okay since the processor is dead and cannot 5483 * This is only okay since the processor is dead and cannot
5474 * race with what we are doing. 5484 * race with what we are doing.
5475 */ 5485 */
5476 cpu_vm_stats_fold(cpu); 5486 cpu_vm_stats_fold(cpu);
5477 } 5487 }
5478 return NOTIFY_OK; 5488 return NOTIFY_OK;
5479 } 5489 }
5480 5490
5481 void __init page_alloc_init(void) 5491 void __init page_alloc_init(void)
5482 { 5492 {
5483 hotcpu_notifier(page_alloc_cpu_notify, 0); 5493 hotcpu_notifier(page_alloc_cpu_notify, 0);
5484 } 5494 }
5485 5495
5486 /* 5496 /*
5487 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 5497
5488 * or min_free_kbytes changes. 5498 * or min_free_kbytes changes.
5489 */ 5499 */
5490 static void calculate_totalreserve_pages(void) 5500 static void calculate_totalreserve_pages(void)
5491 { 5501 {
5492 struct pglist_data *pgdat; 5502 struct pglist_data *pgdat;
5493 unsigned long reserve_pages = 0; 5503 unsigned long reserve_pages = 0;
5494 enum zone_type i, j; 5504 enum zone_type i, j;
5495 5505
5496 for_each_online_pgdat(pgdat) { 5506 for_each_online_pgdat(pgdat) {
5497 for (i = 0; i < MAX_NR_ZONES; i++) { 5507 for (i = 0; i < MAX_NR_ZONES; i++) {
5498 struct zone *zone = pgdat->node_zones + i; 5508 struct zone *zone = pgdat->node_zones + i;
5499 unsigned long max = 0; 5509 unsigned long max = 0;
5500 5510
5501 /* Find valid and maximum lowmem_reserve in the zone */ 5511 /* Find valid and maximum lowmem_reserve in the zone */
5502 for (j = i; j < MAX_NR_ZONES; j++) { 5512 for (j = i; j < MAX_NR_ZONES; j++) {
5503 if (zone->lowmem_reserve[j] > max) 5513 if (zone->lowmem_reserve[j] > max)
5504 max = zone->lowmem_reserve[j]; 5514 max = zone->lowmem_reserve[j];
5505 } 5515 }
5506 5516
5507 /* we treat the high watermark as reserved pages. */ 5517 /* we treat the high watermark as reserved pages. */
5508 max += high_wmark_pages(zone); 5518 max += high_wmark_pages(zone);
5509 5519
5510 if (max > zone->managed_pages) 5520 if (max > zone->managed_pages)
5511 max = zone->managed_pages; 5521 max = zone->managed_pages;
5512 reserve_pages += max; 5522 reserve_pages += max;
5513 /* 5523 /*
5514 * Lowmem reserves are not available to 5524 * Lowmem reserves are not available to
5515 * GFP_HIGHUSER page cache allocations and 5525 * GFP_HIGHUSER page cache allocations and
5516 * kswapd tries to balance zones to their high 5526 * kswapd tries to balance zones to their high
5517 * watermark. As a result, neither should be 5527 * watermark. As a result, neither should be
5518 * regarded as dirtyable memory, to prevent a 5528 * regarded as dirtyable memory, to prevent a
5519 * situation where reclaim has to clean pages 5529 * situation where reclaim has to clean pages
5520 * in order to balance the zones. 5530 * in order to balance the zones.
5521 */ 5531 */
5522 zone->dirty_balance_reserve = max; 5532 zone->dirty_balance_reserve = max;
5523 } 5533 }
5524 } 5534 }
5525 dirty_balance_reserve = reserve_pages; 5535 dirty_balance_reserve = reserve_pages;
5526 totalreserve_pages = reserve_pages; 5536 totalreserve_pages = reserve_pages;
5527 } 5537 }
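Each zone's contribution to totalreserve_pages is the largest lowmem_reserve[] entry at or above its own index, plus its high watermark, capped at the zone's managed pages. A toy reimplementation of that arithmetic with made-up numbers for a two-zone (DMA/NORMAL) node:

#include <stdio.h>

#define MAX_NR_ZONES 2

struct toy_zone {
    const char *name;
    unsigned long managed_pages;
    unsigned long high_wmark;
    unsigned long lowmem_reserve[MAX_NR_ZONES];
};

int main(void)
{
    /* Hypothetical values, roughly a 16MB DMA zone and a 2GB NORMAL zone. */
    struct toy_zone zones[MAX_NR_ZONES] = {
        { "DMA",    4096,    128, { 0, 1984 } },
        { "NORMAL", 507904, 12288, { 0, 0 } },
    };
    unsigned long reserve_pages = 0;

    for (int i = 0; i < MAX_NR_ZONES; i++) {
        unsigned long max = 0;

        /* Largest lowmem_reserve at or above this zone's index. */
        for (int j = i; j < MAX_NR_ZONES; j++)
            if (zones[i].lowmem_reserve[j] > max)
                max = zones[i].lowmem_reserve[j];

        max += zones[i].high_wmark;           /* high watermark counts too  */
        if (max > zones[i].managed_pages)     /* never more than the zone has */
            max = zones[i].managed_pages;

        printf("%s reserves %lu pages\n", zones[i].name, max);
        reserve_pages += max;
    }
    printf("totalreserve_pages = %lu\n", reserve_pages);
    return 0;
}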
5528 5538
5529 /* 5539 /*
5530 * setup_per_zone_lowmem_reserve - called whenever 5540 * setup_per_zone_lowmem_reserve - called whenever
5531 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 5541
5532 * has a correct pages reserved value, so an adequate number of 5542 * has a correct pages reserved value, so an adequate number of
5533 * pages are left in the zone after a successful __alloc_pages(). 5543 * pages are left in the zone after a successful __alloc_pages().
5534 */ 5544 */
5535 static void setup_per_zone_lowmem_reserve(void) 5545 static void setup_per_zone_lowmem_reserve(void)
5536 { 5546 {
5537 struct pglist_data *pgdat; 5547 struct pglist_data *pgdat;
5538 enum zone_type j, idx; 5548 enum zone_type j, idx;
5539 5549
5540 for_each_online_pgdat(pgdat) { 5550 for_each_online_pgdat(pgdat) {
5541 for (j = 0; j < MAX_NR_ZONES; j++) { 5551 for (j = 0; j < MAX_NR_ZONES; j++) {
5542 struct zone *zone = pgdat->node_zones + j; 5552 struct zone *zone = pgdat->node_zones + j;
5543 unsigned long managed_pages = zone->managed_pages; 5553 unsigned long managed_pages = zone->managed_pages;
5544 5554
5545 zone->lowmem_reserve[j] = 0; 5555 zone->lowmem_reserve[j] = 0;
5546 5556
5547 idx = j; 5557 idx = j;
5548 while (idx) { 5558 while (idx) {
5549 struct zone *lower_zone; 5559 struct zone *lower_zone;
5550 5560
5551 idx--; 5561 idx--;
5552 5562
5553 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5563 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5554 sysctl_lowmem_reserve_ratio[idx] = 1; 5564 sysctl_lowmem_reserve_ratio[idx] = 1;
5555 5565
5556 lower_zone = pgdat->node_zones + idx; 5566 lower_zone = pgdat->node_zones + idx;
5557 lower_zone->lowmem_reserve[j] = managed_pages / 5567 lower_zone->lowmem_reserve[j] = managed_pages /
5558 sysctl_lowmem_reserve_ratio[idx]; 5568 sysctl_lowmem_reserve_ratio[idx];
5559 managed_pages += lower_zone->managed_pages; 5569 managed_pages += lower_zone->managed_pages;
5560 } 5570 }
5561 } 5571 }
5562 } 5572 }
5563 5573
5564 /* update totalreserve_pages */ 5574 /* update totalreserve_pages */
5565 calculate_totalreserve_pages(); 5575 calculate_totalreserve_pages();
5566 } 5576 }
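setup_per_zone_lowmem_reserve() walks downward from every zone and gives each lower zone a reserve equal to the managed pages accumulated so far divided by that lower zone's sysctl_lowmem_reserve_ratio entry. A standalone sketch of just that loop; the zone sizes are invented and the ratios are merely chosen to resemble typical defaults:

#include <stdio.h>

#define MAX_NR_ZONES 3

int main(void)
{
    const char *names[MAX_NR_ZONES] = { "DMA", "NORMAL", "HIGHMEM" };
    /* Hypothetical managed page counts per zone. */
    unsigned long managed[MAX_NR_ZONES] = { 4000, 200000, 800000 };
    /* Smaller ratio => larger reserve kept in the lower zone. */
    int ratio[MAX_NR_ZONES] = { 256, 32, 32 };
    unsigned long reserve[MAX_NR_ZONES][MAX_NR_ZONES] = { { 0 } };

    for (int j = 0; j < MAX_NR_ZONES; j++) {
        unsigned long pages = managed[j];
        int idx = j;

        while (idx) {
            idx--;
            /* Lower zone idx keeps pages/ratio[idx] out of reach of
             * allocations that could have used zone j instead. */
            reserve[idx][j] = pages / ratio[idx];
            pages += managed[idx];
        }
    }

    for (int idx = 0; idx < MAX_NR_ZONES; idx++)
        for (int j = 0; j < MAX_NR_ZONES; j++)
            if (reserve[idx][j])
                printf("%s->lowmem_reserve[%s] = %lu pages\n",
                       names[idx], names[j], reserve[idx][j]);
    return 0;
}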
5567 5577
5568 static void __setup_per_zone_wmarks(void) 5578 static void __setup_per_zone_wmarks(void)
5569 { 5579 {
5570 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5580 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5571 unsigned long lowmem_pages = 0; 5581 unsigned long lowmem_pages = 0;
5572 struct zone *zone; 5582 struct zone *zone;
5573 unsigned long flags; 5583 unsigned long flags;
5574 5584
5575 /* Calculate total number of !ZONE_HIGHMEM pages */ 5585 /* Calculate total number of !ZONE_HIGHMEM pages */
5576 for_each_zone(zone) { 5586 for_each_zone(zone) {
5577 if (!is_highmem(zone)) 5587 if (!is_highmem(zone))
5578 lowmem_pages += zone->managed_pages; 5588 lowmem_pages += zone->managed_pages;
5579 } 5589 }
5580 5590
5581 for_each_zone(zone) { 5591 for_each_zone(zone) {
5582 u64 tmp; 5592 u64 tmp;
5583 5593
5584 spin_lock_irqsave(&zone->lock, flags); 5594 spin_lock_irqsave(&zone->lock, flags);
5585 tmp = (u64)pages_min * zone->managed_pages; 5595 tmp = (u64)pages_min * zone->managed_pages;
5586 do_div(tmp, lowmem_pages); 5596 do_div(tmp, lowmem_pages);
5587 if (is_highmem(zone)) { 5597 if (is_highmem(zone)) {
5588 /* 5598 /*
5589 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5599 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5590 * need highmem pages, so cap pages_min to a small 5600 * need highmem pages, so cap pages_min to a small
5591 * value here. 5601 * value here.
5592 * 5602 *
5593 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5603 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5594 * deltas control async page reclaim, and so should 5604
5595 * not be capped for highmem. 5605 * not be capped for highmem.
5596 */ 5606 */
5597 unsigned long min_pages; 5607 unsigned long min_pages;
5598 5608
5599 min_pages = zone->managed_pages / 1024; 5609 min_pages = zone->managed_pages / 1024;
5600 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 5610 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
5601 zone->watermark[WMARK_MIN] = min_pages; 5611 zone->watermark[WMARK_MIN] = min_pages;
5602 } else { 5612 } else {
5603 /* 5613 /*
5604 * If it's a lowmem zone, reserve a number of pages 5614 * If it's a lowmem zone, reserve a number of pages
5605 * proportionate to the zone's size. 5615 * proportionate to the zone's size.
5606 */ 5616 */
5607 zone->watermark[WMARK_MIN] = tmp; 5617 zone->watermark[WMARK_MIN] = tmp;
5608 } 5618 }
5609 5619
5610 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5620 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5611 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5621 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5612 5622
5613 __mod_zone_page_state(zone, NR_ALLOC_BATCH, 5623 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
5614 high_wmark_pages(zone) - 5624 high_wmark_pages(zone) -
5615 low_wmark_pages(zone) - 5625 low_wmark_pages(zone) -
5616 zone_page_state(zone, NR_ALLOC_BATCH)); 5626 zone_page_state(zone, NR_ALLOC_BATCH));
5617 5627
5618 setup_zone_migrate_reserve(zone); 5628 setup_zone_migrate_reserve(zone);
5619 spin_unlock_irqrestore(&zone->lock, flags); 5629 spin_unlock_irqrestore(&zone->lock, flags);
5620 } 5630 }
5621 5631
5622 /* update totalreserve_pages */ 5632 /* update totalreserve_pages */
5623 calculate_totalreserve_pages(); 5633 calculate_totalreserve_pages();
5624 } 5634 }
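For a lowmem zone the watermarks follow directly from min_free_kbytes: the zone receives a share of pages_min proportional to its managed pages, and WMARK_LOW/WMARK_HIGH sit 25% and 50% above WMARK_MIN. A quick worked example, assuming 4K pages, a single 2GB lowmem zone and min_free_kbytes = 4096:

#include <stdio.h>

int main(void)
{
    /* Assumptions: 4K pages, one lowmem zone holding all of lowmem. */
    unsigned long min_free_kbytes = 4096;
    unsigned long pages_min = min_free_kbytes >> (12 - 10); /* kbytes -> pages */
    unsigned long managed_pages = 524288;                   /* 2GB / 4K        */
    unsigned long lowmem_pages = managed_pages;              /* only one zone   */

    unsigned long long tmp = (unsigned long long)pages_min * managed_pages;
    tmp /= lowmem_pages;                                     /* zone's share    */

    unsigned long wmark_min  = tmp;
    unsigned long wmark_low  = wmark_min + (tmp >> 2);       /* +25% */
    unsigned long wmark_high = wmark_min + (tmp >> 1);       /* +50% */

    printf("min=%lu low=%lu high=%lu pages\n",
           wmark_min, wmark_low, wmark_high);                /* 1024 1280 1536 */
    return 0;
}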
5625 5635
5626 /** 5636 /**
5627 * setup_per_zone_wmarks - called when min_free_kbytes changes 5637 * setup_per_zone_wmarks - called when min_free_kbytes changes
5628 * or when memory is hot-{added|removed} 5638 * or when memory is hot-{added|removed}
5629 * 5639 *
5630 * Ensures that the watermark[min,low,high] values for each zone are set 5640 * Ensures that the watermark[min,low,high] values for each zone are set
5631 * correctly with respect to min_free_kbytes. 5641 * correctly with respect to min_free_kbytes.
5632 */ 5642 */
5633 void setup_per_zone_wmarks(void) 5643 void setup_per_zone_wmarks(void)
5634 { 5644 {
5635 mutex_lock(&zonelists_mutex); 5645 mutex_lock(&zonelists_mutex);
5636 __setup_per_zone_wmarks(); 5646 __setup_per_zone_wmarks();
5637 mutex_unlock(&zonelists_mutex); 5647 mutex_unlock(&zonelists_mutex);
5638 } 5648 }
5639 5649
5640 /* 5650 /*
5641 * The inactive anon list should be small enough that the VM never has to 5651 * The inactive anon list should be small enough that the VM never has to
5642 * do too much work, but large enough that each inactive page has a chance 5652 * do too much work, but large enough that each inactive page has a chance
5643 * to be referenced again before it is swapped out. 5653 * to be referenced again before it is swapped out.
5644 * 5654 *
5645 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5655 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5646 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5656 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5647 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5657 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5648 * the anonymous pages are kept on the inactive list. 5658 * the anonymous pages are kept on the inactive list.
5649 * 5659 *
5650 * total target max 5660 * total target max
5651 * memory ratio inactive anon 5661 * memory ratio inactive anon
5652 * ------------------------------------- 5662 * -------------------------------------
5653 * 10MB 1 5MB 5663 * 10MB 1 5MB
5654 * 100MB 1 50MB 5664 * 100MB 1 50MB
5655 * 1GB 3 250MB 5665 * 1GB 3 250MB
5656 * 10GB 10 0.9GB 5666 * 10GB 10 0.9GB
5657 * 100GB 31 3GB 5667 * 100GB 31 3GB
5658 * 1TB 101 10GB 5668 * 1TB 101 10GB
5659 * 10TB 320 32GB 5669 * 10TB 320 32GB
5660 */ 5670 */
5661 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5671 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5662 { 5672 {
5663 unsigned int gb, ratio; 5673 unsigned int gb, ratio;
5664 5674
5665 /* Zone size in gigabytes */ 5675 /* Zone size in gigabytes */
5666 gb = zone->managed_pages >> (30 - PAGE_SHIFT); 5676 gb = zone->managed_pages >> (30 - PAGE_SHIFT);
5667 if (gb) 5677 if (gb)
5668 ratio = int_sqrt(10 * gb); 5678 ratio = int_sqrt(10 * gb);
5669 else 5679 else
5670 ratio = 1; 5680 ratio = 1;
5671 5681
5672 zone->inactive_ratio = ratio; 5682 zone->inactive_ratio = ratio;
5673 } 5683 }
5674 5684
5675 static void __meminit setup_per_zone_inactive_ratio(void) 5685 static void __meminit setup_per_zone_inactive_ratio(void)
5676 { 5686 {
5677 struct zone *zone; 5687 struct zone *zone;
5678 5688
5679 for_each_zone(zone) 5689 for_each_zone(zone)
5680 calculate_zone_inactive_ratio(zone); 5690 calculate_zone_inactive_ratio(zone);
5681 } 5691 }
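The inactive ratio is simply int_sqrt(10 * zone_size_in_GB) with a floor of 1, which is where the 1/3/10/31/101/320 column in the table above comes from. The sketch below reproduces those rows; isqrt() is a naive stand-in for the kernel's int_sqrt():

#include <stdio.h>

/* Naive integer square root; stand-in for the kernel's int_sqrt(). */
static unsigned int isqrt(unsigned long x)
{
    unsigned long r = 0;

    while ((r + 1) * (r + 1) <= x)
        r++;
    return r;
}

int main(void)
{
    /* Zone sizes in GB matching the table in the comment above. */
    unsigned long gbs[] = { 0, 0, 1, 10, 100, 1024, 10240 };
    const char *labels[] = { "10MB", "100MB", "1GB", "10GB",
                             "100GB", "1TB", "10TB" };

    for (int i = 0; i < 7; i++) {
        unsigned long gb = gbs[i];
        unsigned int ratio = gb ? isqrt(10UL * gb) : 1;

        printf("%6s -> inactive_ratio %u\n", labels[i], ratio);
    }
    return 0;
}

The output (1, 1, 3, 10, 31, 101, 320) matches the target-ratio column above.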
5682 5692
5683 /* 5693 /*
5684 * Initialise min_free_kbytes. 5694 * Initialise min_free_kbytes.
5685 * 5695 *
5686 * For small machines we want it small (128k min). For large machines 5696 * For small machines we want it small (128k min). For large machines
5687 * we want it large (64MB max). But it is not linear, because network 5697 * we want it large (64MB max). But it is not linear, because network
5688 * bandwidth does not increase linearly with machine size. We use 5698 * bandwidth does not increase linearly with machine size. We use
5689 * 5699 *
5690 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5700 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5691 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5701 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5692 * 5702 *
5693 * which yields 5703 * which yields
5694 * 5704 *
5695 * 16MB: 512k 5705 * 16MB: 512k
5696 * 32MB: 724k 5706 * 32MB: 724k
5697 * 64MB: 1024k 5707 * 64MB: 1024k
5698 * 128MB: 1448k 5708 * 128MB: 1448k
5699 * 256MB: 2048k 5709 * 256MB: 2048k
5700 * 512MB: 2896k 5710 * 512MB: 2896k
5701 * 1024MB: 4096k 5711 * 1024MB: 4096k
5702 * 2048MB: 5792k 5712 * 2048MB: 5792k
5703 * 4096MB: 8192k 5713 * 4096MB: 8192k
5704 * 8192MB: 11584k 5714 * 8192MB: 11584k
5705 * 16384MB: 16384k 5715 * 16384MB: 16384k
5706 */ 5716 */
5707 int __meminit init_per_zone_wmark_min(void) 5717 int __meminit init_per_zone_wmark_min(void)
5708 { 5718 {
5709 unsigned long lowmem_kbytes; 5719 unsigned long lowmem_kbytes;
5710 int new_min_free_kbytes; 5720 int new_min_free_kbytes;
5711 5721
5712 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5722 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5713 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5723 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5714 5724
5715 if (new_min_free_kbytes > user_min_free_kbytes) { 5725 if (new_min_free_kbytes > user_min_free_kbytes) {
5716 min_free_kbytes = new_min_free_kbytes; 5726 min_free_kbytes = new_min_free_kbytes;
5717 if (min_free_kbytes < 128) 5727 if (min_free_kbytes < 128)
5718 min_free_kbytes = 128; 5728 min_free_kbytes = 128;
5719 if (min_free_kbytes > 65536) 5729 if (min_free_kbytes > 65536)
5720 min_free_kbytes = 65536; 5730 min_free_kbytes = 65536;
5721 } else { 5731 } else {
5722 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 5732 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
5723 new_min_free_kbytes, user_min_free_kbytes); 5733 new_min_free_kbytes, user_min_free_kbytes);
5724 } 5734 }
5725 setup_per_zone_wmarks(); 5735 setup_per_zone_wmarks();
5726 refresh_zone_stat_thresholds(); 5736 refresh_zone_stat_thresholds();
5727 setup_per_zone_lowmem_reserve(); 5737 setup_per_zone_lowmem_reserve();
5728 setup_per_zone_inactive_ratio(); 5738 setup_per_zone_inactive_ratio();
5729 return 0; 5739 return 0;
5730 } 5740 }
5731 module_init(init_per_zone_wmark_min) 5741 module_init(init_per_zone_wmark_min)
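The boot-time min_free_kbytes is just int_sqrt(lowmem_kbytes * 16) clamped to the 128..65536 range, which generates the 16MB -> 512k ... 16384MB -> 16384k table in the comment above. A sketch reproducing a few of those rows, again with a naive integer square root standing in for int_sqrt():

#include <stdio.h>

/* Naive integer square root; stand-in for the kernel's int_sqrt(). */
static unsigned long isqrt(unsigned long x)
{
    unsigned long r = 0;

    while ((r + 1) * (r + 1) <= x)
        r++;
    return r;
}

int main(void)
{
    /* Low memory sizes in MB matching a few rows of the table above. */
    unsigned long lowmem_mb[] = { 16, 128, 1024, 4096, 16384 };

    for (int i = 0; i < 5; i++) {
        unsigned long lowmem_kbytes = lowmem_mb[i] << 10;
        long min_free_kbytes = isqrt(lowmem_kbytes * 16);

        if (min_free_kbytes < 128)
            min_free_kbytes = 128;
        if (min_free_kbytes > 65536)
            min_free_kbytes = 65536;

        printf("%6luMB lowmem -> min_free_kbytes = %ldk\n",
               lowmem_mb[i], min_free_kbytes);
    }
    return 0;
}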
5732 5742
5733 /* 5743 /*
5734 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5744 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5735 * that we can call two helper functions whenever min_free_kbytes 5745 * that we can call two helper functions whenever min_free_kbytes
5736 * changes. 5746 * changes.
5737 */ 5747 */
5738 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5748 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5739 void __user *buffer, size_t *length, loff_t *ppos) 5749 void __user *buffer, size_t *length, loff_t *ppos)
5740 { 5750 {
5741 int rc; 5751 int rc;
5742 5752
5743 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5753 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5744 if (rc) 5754 if (rc)
5745 return rc; 5755 return rc;
5746 5756
5747 if (write) { 5757 if (write) {
5748 user_min_free_kbytes = min_free_kbytes; 5758 user_min_free_kbytes = min_free_kbytes;
5749 setup_per_zone_wmarks(); 5759 setup_per_zone_wmarks();
5750 } 5760 }
5751 return 0; 5761 return 0;
5752 } 5762 }
5753 5763
5754 #ifdef CONFIG_NUMA 5764 #ifdef CONFIG_NUMA
5755 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5765 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5756 void __user *buffer, size_t *length, loff_t *ppos) 5766 void __user *buffer, size_t *length, loff_t *ppos)
5757 { 5767 {
5758 struct zone *zone; 5768 struct zone *zone;
5759 int rc; 5769 int rc;
5760 5770
5761 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5771 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5762 if (rc) 5772 if (rc)
5763 return rc; 5773 return rc;
5764 5774
5765 for_each_zone(zone) 5775 for_each_zone(zone)
5766 zone->min_unmapped_pages = (zone->managed_pages * 5776 zone->min_unmapped_pages = (zone->managed_pages *
5767 sysctl_min_unmapped_ratio) / 100; 5777 sysctl_min_unmapped_ratio) / 100;
5768 return 0; 5778 return 0;
5769 } 5779 }
5770 5780
5771 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5781 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5772 void __user *buffer, size_t *length, loff_t *ppos) 5782 void __user *buffer, size_t *length, loff_t *ppos)
5773 { 5783 {
5774 struct zone *zone; 5784 struct zone *zone;
5775 int rc; 5785 int rc;
5776 5786
5777 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5787 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5778 if (rc) 5788 if (rc)
5779 return rc; 5789 return rc;
5780 5790
5781 for_each_zone(zone) 5791 for_each_zone(zone)
5782 zone->min_slab_pages = (zone->managed_pages * 5792 zone->min_slab_pages = (zone->managed_pages *
5783 sysctl_min_slab_ratio) / 100; 5793 sysctl_min_slab_ratio) / 100;
5784 return 0; 5794 return 0;
5785 } 5795 }
5786 #endif 5796 #endif
5787 5797
5788 /* 5798 /*
5789 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5799 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5790 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5800 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5791 * whenever sysctl_lowmem_reserve_ratio changes. 5801 * whenever sysctl_lowmem_reserve_ratio changes.
5792 * 5802 *
5793 * The reserve ratio obviously has absolutely no relation with the 5803 * The reserve ratio obviously has absolutely no relation with the
5794 * minimum watermarks. The lowmem reserve ratio can only make sense 5804 * minimum watermarks. The lowmem reserve ratio can only make sense
5795 * as a function of the boot time zone sizes. 5805
5796 */ 5806 */
5797 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5807 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5798 void __user *buffer, size_t *length, loff_t *ppos) 5808 void __user *buffer, size_t *length, loff_t *ppos)
5799 { 5809 {
5800 proc_dointvec_minmax(table, write, buffer, length, ppos); 5810 proc_dointvec_minmax(table, write, buffer, length, ppos);
5801 setup_per_zone_lowmem_reserve(); 5811 setup_per_zone_lowmem_reserve();
5802 return 0; 5812 return 0;
5803 } 5813 }
5804 5814
5805 /* 5815 /*
5806 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5816 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5807 * cpu. It is the fraction of total pages in each zone that a hot per cpu 5817 * cpu. It is the fraction of total pages in each zone that a hot per cpu
5808 * pagelist can have before it gets flushed back to buddy allocator. 5818 * pagelist can have before it gets flushed back to buddy allocator.
5809 */ 5819 */
5810 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5820 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5811 void __user *buffer, size_t *length, loff_t *ppos) 5821 void __user *buffer, size_t *length, loff_t *ppos)
5812 { 5822 {
5813 struct zone *zone; 5823 struct zone *zone;
5814 int old_percpu_pagelist_fraction; 5824 int old_percpu_pagelist_fraction;
5815 int ret; 5825 int ret;
5816 5826
5817 mutex_lock(&pcp_batch_high_lock); 5827 mutex_lock(&pcp_batch_high_lock);
5818 old_percpu_pagelist_fraction = percpu_pagelist_fraction; 5828 old_percpu_pagelist_fraction = percpu_pagelist_fraction;
5819 5829
5820 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5830 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5821 if (!write || ret < 0) 5831 if (!write || ret < 0)
5822 goto out; 5832 goto out;
5823 5833
5824 /* Sanity checking to avoid pcp imbalance */ 5834 /* Sanity checking to avoid pcp imbalance */
5825 if (percpu_pagelist_fraction && 5835 if (percpu_pagelist_fraction &&
5826 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { 5836 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
5827 percpu_pagelist_fraction = old_percpu_pagelist_fraction; 5837 percpu_pagelist_fraction = old_percpu_pagelist_fraction;
5828 ret = -EINVAL; 5838 ret = -EINVAL;
5829 goto out; 5839 goto out;
5830 } 5840 }
5831 5841
5832 /* No change? */ 5842 /* No change? */
5833 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) 5843 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
5834 goto out; 5844 goto out;
5835 5845
5836 for_each_populated_zone(zone) { 5846 for_each_populated_zone(zone) {
5837 unsigned int cpu; 5847 unsigned int cpu;
5838 5848
5839 for_each_possible_cpu(cpu) 5849 for_each_possible_cpu(cpu)
5840 pageset_set_high_and_batch(zone, 5850 pageset_set_high_and_batch(zone,
5841 per_cpu_ptr(zone->pageset, cpu)); 5851 per_cpu_ptr(zone->pageset, cpu));
5842 } 5852 }
5843 out: 5853 out:
5844 mutex_unlock(&pcp_batch_high_lock); 5854 mutex_unlock(&pcp_batch_high_lock);
5845 return ret; 5855 return ret;
5846 } 5856 }
5847 5857
5848 int hashdist = HASHDIST_DEFAULT; 5858 int hashdist = HASHDIST_DEFAULT;
5849 5859
5850 #ifdef CONFIG_NUMA 5860 #ifdef CONFIG_NUMA
5851 static int __init set_hashdist(char *str) 5861 static int __init set_hashdist(char *str)
5852 { 5862 {
5853 if (!str) 5863 if (!str)
5854 return 0; 5864 return 0;
5855 hashdist = simple_strtoul(str, &str, 0); 5865 hashdist = simple_strtoul(str, &str, 0);
5856 return 1; 5866 return 1;
5857 } 5867 }
5858 __setup("hashdist=", set_hashdist); 5868 __setup("hashdist=", set_hashdist);
5859 #endif 5869 #endif
5860 5870
5861 /* 5871 /*
5862 * allocate a large system hash table from bootmem 5872 * allocate a large system hash table from bootmem
5863 * - it is assumed that the hash table must contain an exact power-of-2 5873 * - it is assumed that the hash table must contain an exact power-of-2
5864 * quantity of entries 5874 * quantity of entries
5865 * - limit is the number of hash buckets, not the total allocation size 5875 * - limit is the number of hash buckets, not the total allocation size
5866 */ 5876 */
5867 void *__init alloc_large_system_hash(const char *tablename, 5877 void *__init alloc_large_system_hash(const char *tablename,
5868 unsigned long bucketsize, 5878 unsigned long bucketsize,
5869 unsigned long numentries, 5879 unsigned long numentries,
5870 int scale, 5880 int scale,
5871 int flags, 5881 int flags,
5872 unsigned int *_hash_shift, 5882 unsigned int *_hash_shift,
5873 unsigned int *_hash_mask, 5883 unsigned int *_hash_mask,
5874 unsigned long low_limit, 5884 unsigned long low_limit,
5875 unsigned long high_limit) 5885 unsigned long high_limit)
5876 { 5886 {
5877 unsigned long long max = high_limit; 5887 unsigned long long max = high_limit;
5878 unsigned long log2qty, size; 5888 unsigned long log2qty, size;
5879 void *table = NULL; 5889 void *table = NULL;
5880 5890
5881 /* allow the kernel cmdline to have a say */ 5891 /* allow the kernel cmdline to have a say */
5882 if (!numentries) { 5892 if (!numentries) {
5883 /* round applicable memory size up to nearest megabyte */ 5893 /* round applicable memory size up to nearest megabyte */
5884 numentries = nr_kernel_pages; 5894 numentries = nr_kernel_pages;
5885 5895
5886 /* It isn't necessary when PAGE_SIZE >= 1MB */ 5896 /* It isn't necessary when PAGE_SIZE >= 1MB */
5887 if (PAGE_SHIFT < 20) 5897 if (PAGE_SHIFT < 20)
5888 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 5898 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
5889 5899
5890 /* limit to 1 bucket per 2^scale bytes of low memory */ 5900 /* limit to 1 bucket per 2^scale bytes of low memory */
5891 if (scale > PAGE_SHIFT) 5901 if (scale > PAGE_SHIFT)
5892 numentries >>= (scale - PAGE_SHIFT); 5902 numentries >>= (scale - PAGE_SHIFT);
5893 else 5903 else
5894 numentries <<= (PAGE_SHIFT - scale); 5904 numentries <<= (PAGE_SHIFT - scale);
5895 5905
5896 /* Make sure we've got at least a 0-order allocation.. */ 5906 /* Make sure we've got at least a 0-order allocation.. */
5897 if (unlikely(flags & HASH_SMALL)) { 5907 if (unlikely(flags & HASH_SMALL)) {
5898 /* Makes no sense without HASH_EARLY */ 5908 /* Makes no sense without HASH_EARLY */
5899 WARN_ON(!(flags & HASH_EARLY)); 5909 WARN_ON(!(flags & HASH_EARLY));
5900 if (!(numentries >> *_hash_shift)) { 5910 if (!(numentries >> *_hash_shift)) {
5901 numentries = 1UL << *_hash_shift; 5911 numentries = 1UL << *_hash_shift;
5902 BUG_ON(!numentries); 5912 BUG_ON(!numentries);
5903 } 5913 }
5904 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5914 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5905 numentries = PAGE_SIZE / bucketsize; 5915 numentries = PAGE_SIZE / bucketsize;
5906 } 5916 }
5907 numentries = roundup_pow_of_two(numentries); 5917 numentries = roundup_pow_of_two(numentries);
5908 5918
5909 /* limit allocation size to 1/16 total memory by default */ 5919 /* limit allocation size to 1/16 total memory by default */
5910 if (max == 0) { 5920 if (max == 0) {
5911 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5921 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5912 do_div(max, bucketsize); 5922 do_div(max, bucketsize);
5913 } 5923 }
5914 max = min(max, 0x80000000ULL); 5924 max = min(max, 0x80000000ULL);
5915 5925
5916 if (numentries < low_limit) 5926 if (numentries < low_limit)
5917 numentries = low_limit; 5927 numentries = low_limit;
5918 if (numentries > max) 5928 if (numentries > max)
5919 numentries = max; 5929 numentries = max;
5920 5930
5921 log2qty = ilog2(numentries); 5931 log2qty = ilog2(numentries);
5922 5932
5923 do { 5933 do {
5924 size = bucketsize << log2qty; 5934 size = bucketsize << log2qty;
5925 if (flags & HASH_EARLY) 5935 if (flags & HASH_EARLY)
5926 table = alloc_bootmem_nopanic(size); 5936 table = alloc_bootmem_nopanic(size);
5927 else if (hashdist) 5937 else if (hashdist)
5928 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5938 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5929 else { 5939 else {
5930 /* 5940 /*
5931 * If bucketsize is not a power-of-two, we may free 5941 * If bucketsize is not a power-of-two, we may free
5932 * some pages at the end of hash table which 5942 * some pages at the end of hash table which
5933 * alloc_pages_exact() automatically does 5943 * alloc_pages_exact() automatically does
5934 */ 5944 */
5935 if (get_order(size) < MAX_ORDER) { 5945 if (get_order(size) < MAX_ORDER) {
5936 table = alloc_pages_exact(size, GFP_ATOMIC); 5946 table = alloc_pages_exact(size, GFP_ATOMIC);
5937 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5947 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5938 } 5948 }
5939 } 5949 }
5940 } while (!table && size > PAGE_SIZE && --log2qty); 5950 } while (!table && size > PAGE_SIZE && --log2qty);
5941 5951
5942 if (!table) 5952 if (!table)
5943 panic("Failed to allocate %s hash table\n", tablename); 5953 panic("Failed to allocate %s hash table\n", tablename);
5944 5954
5945 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5955 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5946 tablename, 5956 tablename,
5947 (1UL << log2qty), 5957 (1UL << log2qty),
5948 ilog2(size) - PAGE_SHIFT, 5958 ilog2(size) - PAGE_SHIFT,
5949 size); 5959 size);
5950 5960
5951 if (_hash_shift) 5961 if (_hash_shift)
5952 *_hash_shift = log2qty; 5962 *_hash_shift = log2qty;
5953 if (_hash_mask) 5963 if (_hash_mask)
5954 *_hash_mask = (1 << log2qty) - 1; 5964 *_hash_mask = (1 << log2qty) - 1;
5955 5965
5956 return table; 5966 return table;
5957 } 5967 }
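The core of alloc_large_system_hash() is its sizing arithmetic: derive a bucket count from the number of kernel pages and the scale shift, round it up to a power of two, and cap the table at 1/16 of memory (the shrink-on-failure loop is skipped here). The sketch below runs those steps for a hypothetical 4GB machine with 4K pages, a 64-byte bucket and scale = 14; all of these values are invented for illustration:

#include <stdio.h>

static unsigned long roundup_pow_of_two(unsigned long x)
{
    unsigned long r = 1;

    while (r < x)
        r <<= 1;
    return r;
}

int main(void)
{
    /* Assumed machine: 4GB of RAM, 4K pages. */
    int page_shift = 12;
    unsigned long nr_kernel_pages = 1UL << 20;        /* 4GB / 4K          */
    unsigned long bucketsize = 64;                    /* hypothetical      */
    int scale = 14;                                   /* 1 bucket per 16K  */

    /* One bucket per 2^scale bytes of low memory. */
    unsigned long numentries = nr_kernel_pages;
    if (scale > page_shift)
        numentries >>= (scale - page_shift);
    else
        numentries <<= (page_shift - scale);

    numentries = roundup_pow_of_two(numentries);

    /* Cap the table at 1/16 of total memory. */
    unsigned long long max = ((unsigned long long)nr_kernel_pages << page_shift) >> 4;
    max /= bucketsize;
    if (numentries > max)
        numentries = max;

    printf("entries = %lu (%lu KiB of buckets)\n",
           numentries, numentries * bucketsize >> 10);
    return 0;
}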
5958 5968
5959 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5969 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5960 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5970 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5961 unsigned long pfn) 5971 unsigned long pfn)
5962 { 5972 {
5963 #ifdef CONFIG_SPARSEMEM 5973 #ifdef CONFIG_SPARSEMEM
5964 return __pfn_to_section(pfn)->pageblock_flags; 5974 return __pfn_to_section(pfn)->pageblock_flags;
5965 #else 5975 #else
5966 return zone->pageblock_flags; 5976 return zone->pageblock_flags;
5967 #endif /* CONFIG_SPARSEMEM */ 5977 #endif /* CONFIG_SPARSEMEM */
5968 } 5978 }
5969 5979
5970 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 5980 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5971 { 5981 {
5972 #ifdef CONFIG_SPARSEMEM 5982 #ifdef CONFIG_SPARSEMEM
5973 pfn &= (PAGES_PER_SECTION-1); 5983 pfn &= (PAGES_PER_SECTION-1);
5974 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5984 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5975 #else 5985 #else
5976 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); 5986 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
5977 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5987 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5978 #endif /* CONFIG_SPARSEMEM */ 5988 #endif /* CONFIG_SPARSEMEM */
5979 } 5989 }
5980 5990
5981 /** 5991 /**
5982 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages 5992 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
5983 * @page: The page within the block of interest 5993 * @page: The page within the block of interest
5984 * @start_bitidx: The first bit of interest to retrieve 5994 * @start_bitidx: The first bit of interest to retrieve
5985 * @end_bitidx: The last bit of interest 5995 * @end_bitidx: The last bit of interest
5986 * returns pageblock_bits flags 5996 * returns pageblock_bits flags
5987 */ 5997 */
5988 unsigned long get_pageblock_flags_mask(struct page *page, 5998 unsigned long get_pageblock_flags_mask(struct page *page,
5989 unsigned long end_bitidx, 5999 unsigned long end_bitidx,
5990 unsigned long mask) 6000 unsigned long mask)
5991 { 6001 {
5992 struct zone *zone; 6002 struct zone *zone;
5993 unsigned long *bitmap; 6003 unsigned long *bitmap;
5994 unsigned long pfn, bitidx, word_bitidx; 6004 unsigned long pfn, bitidx, word_bitidx;
5995 unsigned long word; 6005 unsigned long word;
5996 6006
5997 zone = page_zone(page); 6007 zone = page_zone(page);
5998 pfn = page_to_pfn(page); 6008 pfn = page_to_pfn(page);
5999 bitmap = get_pageblock_bitmap(zone, pfn); 6009 bitmap = get_pageblock_bitmap(zone, pfn);
6000 bitidx = pfn_to_bitidx(zone, pfn); 6010 bitidx = pfn_to_bitidx(zone, pfn);
6001 word_bitidx = bitidx / BITS_PER_LONG; 6011 word_bitidx = bitidx / BITS_PER_LONG;
6002 bitidx &= (BITS_PER_LONG-1); 6012 bitidx &= (BITS_PER_LONG-1);
6003 6013
6004 word = bitmap[word_bitidx]; 6014 word = bitmap[word_bitidx];
6005 bitidx += end_bitidx; 6015 bitidx += end_bitidx;
6006 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; 6016 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
6007 } 6017 }
6008 6018
6009 /** 6019 /**
6010 * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages 6020 * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
6011 * @page: The page within the block of interest 6021 * @page: The page within the block of interest
6012 * @start_bitidx: The first bit of interest 6022 * @start_bitidx: The first bit of interest
6013 * @end_bitidx: The last bit of interest 6023 * @end_bitidx: The last bit of interest
6014 * @flags: The flags to set 6024 * @flags: The flags to set
6015 */ 6025 */
6016 void set_pageblock_flags_mask(struct page *page, unsigned long flags, 6026 void set_pageblock_flags_mask(struct page *page, unsigned long flags,
6017 unsigned long end_bitidx, 6027 unsigned long end_bitidx,
6018 unsigned long mask) 6028 unsigned long mask)
6019 { 6029 {
6020 struct zone *zone; 6030 struct zone *zone;
6021 unsigned long *bitmap; 6031 unsigned long *bitmap;
6022 unsigned long pfn, bitidx, word_bitidx; 6032 unsigned long pfn, bitidx, word_bitidx;
6023 unsigned long old_word, word; 6033 unsigned long old_word, word;
6024 6034
6025 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 6035 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
6026 6036
6027 zone = page_zone(page); 6037 zone = page_zone(page);
6028 pfn = page_to_pfn(page); 6038 pfn = page_to_pfn(page);
6029 bitmap = get_pageblock_bitmap(zone, pfn); 6039 bitmap = get_pageblock_bitmap(zone, pfn);
6030 bitidx = pfn_to_bitidx(zone, pfn); 6040 bitidx = pfn_to_bitidx(zone, pfn);
6031 word_bitidx = bitidx / BITS_PER_LONG; 6041 word_bitidx = bitidx / BITS_PER_LONG;
6032 bitidx &= (BITS_PER_LONG-1); 6042 bitidx &= (BITS_PER_LONG-1);
6033 6043
6034 VM_BUG_ON(!zone_spans_pfn(zone, pfn)); 6044 VM_BUG_ON(!zone_spans_pfn(zone, pfn));
6035 6045
6036 bitidx += end_bitidx; 6046 bitidx += end_bitidx;
6037 mask <<= (BITS_PER_LONG - bitidx - 1); 6047 mask <<= (BITS_PER_LONG - bitidx - 1);
6038 flags <<= (BITS_PER_LONG - bitidx - 1); 6048 flags <<= (BITS_PER_LONG - bitidx - 1);
6039 6049
6040 word = ACCESS_ONCE(bitmap[word_bitidx]); 6050 word = ACCESS_ONCE(bitmap[word_bitidx]);
6041 for (;;) { 6051 for (;;) {
6042 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 6052 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6043 if (word == old_word) 6053 if (word == old_word)
6044 break; 6054 break;
6045 word = old_word; 6055 word = old_word;
6046 } 6056 }
6047 } 6057 }
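Both pageblock flag helpers above boil down to shifting a small field in and out of one unsigned long, with the field addressed from the most significant end; the set side additionally retries with cmpxchg(). A non-atomic userspace sketch of the same shift-and-mask arithmetic, with invented bit positions:

#include <stdio.h>

#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

/* Extract a small flags field at offset 'bitidx' counted from the most
 * significant end; end_bitidx and mask describe the field, e.g.
 * end_bitidx = 2, mask = 0x7 for a 3-bit field. */
static unsigned long get_flags(unsigned long word, unsigned long bitidx,
                               unsigned long end_bitidx, unsigned long mask)
{
    bitidx += end_bitidx;
    return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
}

static unsigned long set_flags(unsigned long word, unsigned long flags,
                               unsigned long bitidx,
                               unsigned long end_bitidx, unsigned long mask)
{
    bitidx += end_bitidx;
    mask <<= (BITS_PER_LONG - bitidx - 1);
    flags <<= (BITS_PER_LONG - bitidx - 1);
    /* The kernel does this update with a cmpxchg() loop; a plain
     * read-modify-write is enough for this single-threaded sketch. */
    return (word & ~mask) | flags;
}

int main(void)
{
    unsigned long word = 0;

    /* Pretend one pageblock's 3-bit field starts 12 bits into the word. */
    unsigned long bitidx = 12, end_bitidx = 2, mask = 0x7;

    word = set_flags(word, 5, bitidx, end_bitidx, mask);   /* store 5 */
    printf("stored %lu, read back %lu\n", 5UL,
           get_flags(word, bitidx, end_bitidx, mask));     /* prints 5 */
    return 0;
}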
6048 6058
6049 /* 6059 /*
6050 * This function checks whether pageblock includes unmovable pages or not. 6060 * This function checks whether pageblock includes unmovable pages or not.
6051 * If @count is not zero, it is okay to include fewer than @count unmovable pages 6061
6052 * 6062 *
6053 * PageLRU check without isolation or lru_lock could race so that 6063 * PageLRU check without isolation or lru_lock could race so that
6054 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 6064 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
6055 * expect this function to be exact. 6065
6056 */ 6066 */
6057 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 6067 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6058 bool skip_hwpoisoned_pages) 6068 bool skip_hwpoisoned_pages)
6059 { 6069 {
6060 unsigned long pfn, iter, found; 6070 unsigned long pfn, iter, found;
6061 int mt; 6071 int mt;
6062 6072
6063 /* 6073 /*
6064 * To avoid noisy data, lru_add_drain_all() should be called 6074
6065 * If ZONE_MOVABLE, the zone never contains unmovable pages 6075 * If ZONE_MOVABLE, the zone never contains unmovable pages
6066 */ 6076 */
6067 if (zone_idx(zone) == ZONE_MOVABLE) 6077 if (zone_idx(zone) == ZONE_MOVABLE)
6068 return false; 6078 return false;
6069 mt = get_pageblock_migratetype(page); 6079 mt = get_pageblock_migratetype(page);
6070 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 6080 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
6071 return false; 6081 return false;
6072 6082
6073 pfn = page_to_pfn(page); 6083 pfn = page_to_pfn(page);
6074 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 6084 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
6075 unsigned long check = pfn + iter; 6085 unsigned long check = pfn + iter;
6076 6086
6077 if (!pfn_valid_within(check)) 6087 if (!pfn_valid_within(check))
6078 continue; 6088 continue;
6079 6089
6080 page = pfn_to_page(check); 6090 page = pfn_to_page(check);
6081 6091
6082 /* 6092 /*
6083 * Hugepages are not in LRU lists, but they're movable. 6093 * Hugepages are not in LRU lists, but they're movable.
6084 * We need not scan over tail pages because we don't 6094
6085 * handle each tail page individually in migration. 6095 * handle each tail page individually in migration.
6086 */ 6096 */
6087 if (PageHuge(page)) { 6097 if (PageHuge(page)) {
6088 iter = round_up(iter + 1, 1<<compound_order(page)) - 1; 6098 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
6089 continue; 6099 continue;
6090 } 6100 }
6091 6101
6092 /* 6102 /*
6093 * We can't use page_count without pinning the page 6103
6094 * because another CPU can free compound page. 6104 * because another CPU can free compound page.
6095 * This check already skips compound tails of THP 6105 * This check already skips compound tails of THP
6096 * because their page->_count is zero at all time. 6106 * because their page->_count is zero at all time.
6097 */ 6107 */
6098 if (!atomic_read(&page->_count)) { 6108 if (!atomic_read(&page->_count)) {
6099 if (PageBuddy(page)) 6109 if (PageBuddy(page))
6100 iter += (1 << page_order(page)) - 1; 6110 iter += (1 << page_order(page)) - 1;
6101 continue; 6111 continue;
6102 } 6112 }
6103 6113
6104 /* 6114 /*
6105 * The HWPoisoned page may be not in buddy system, and 6115 * The HWPoisoned page may be not in buddy system, and
6106 * page_count() is not 0. 6116 * page_count() is not 0.
6107 */ 6117 */
6108 if (skip_hwpoisoned_pages && PageHWPoison(page)) 6118 if (skip_hwpoisoned_pages && PageHWPoison(page))
6109 continue; 6119 continue;
6110 6120
6111 if (!PageLRU(page)) 6121 if (!PageLRU(page))
6112 found++; 6122 found++;
6113 /* 6123 /*
6114 * If there are RECLAIMABLE pages, we need to check it. 6124 * If there are RECLAIMABLE pages, we need to check it.
6115 * But now, memory offline itself doesn't call shrink_slab() 6125 * But now, memory offline itself doesn't call shrink_slab()
6116 * and this still needs to be fixed. 6126
6117 */ 6127 */
6118 /* 6128 /*
6119 * If the page is not RAM, page_count() should be 0. 6129
6120 * We need no further checks. This is a _used_, non-movable page. 6130
6121 * 6131 *
6122 * The problematic thing here is PG_reserved pages. PG_reserved 6132 * The problematic thing here is PG_reserved pages. PG_reserved
6123 * is set to both of a memory hole page and a _used_ kernel 6133 * is set to both of a memory hole page and a _used_ kernel
6124 * page at boot. 6134 * page at boot.
6125 */ 6135 */
6126 if (found > count) 6136 if (found > count)
6127 return true; 6137 return true;
6128 } 6138 }
6129 return false; 6139 return false;
6130 } 6140 }
6131 6141
6132 bool is_pageblock_removable_nolock(struct page *page) 6142 bool is_pageblock_removable_nolock(struct page *page)
6133 { 6143 {
6134 struct zone *zone; 6144 struct zone *zone;
6135 unsigned long pfn; 6145 unsigned long pfn;
6136 6146
6137 /* 6147 /*
6138 * We have to be careful here because we are iterating over memory 6148 * We have to be careful here because we are iterating over memory
6139 * sections which are not zone aware so we might end up outside of 6149 * sections which are not zone aware so we might end up outside of
6140 * the zone but still within the section. 6150 * the zone but still within the section.
6141 * We have to take care about the node as well. If the node is offline 6151 * We have to take care about the node as well. If the node is offline
6142 * its NODE_DATA will be NULL - see page_zone. 6152 * its NODE_DATA will be NULL - see page_zone.
6143 */ 6153 */
6144 if (!node_online(page_to_nid(page))) 6154 if (!node_online(page_to_nid(page)))
6145 return false; 6155 return false;
6146 6156
6147 zone = page_zone(page); 6157 zone = page_zone(page);
6148 pfn = page_to_pfn(page); 6158 pfn = page_to_pfn(page);
6149 if (!zone_spans_pfn(zone, pfn)) 6159 if (!zone_spans_pfn(zone, pfn))
6150 return false; 6160 return false;
6151 6161
6152 return !has_unmovable_pages(zone, page, 0, true); 6162 return !has_unmovable_pages(zone, page, 0, true);
6153 } 6163 }
6154 6164
6155 #ifdef CONFIG_CMA 6165 #ifdef CONFIG_CMA
6156 6166
6157 static unsigned long pfn_max_align_down(unsigned long pfn) 6167 static unsigned long pfn_max_align_down(unsigned long pfn)
6158 { 6168 {
6159 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 6169 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
6160 pageblock_nr_pages) - 1); 6170 pageblock_nr_pages) - 1);
6161 } 6171 }
6162 6172
6163 static unsigned long pfn_max_align_up(unsigned long pfn) 6173 static unsigned long pfn_max_align_up(unsigned long pfn)
6164 { 6174 {
6165 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 6175 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
6166 pageblock_nr_pages)); 6176 pageblock_nr_pages));
6167 } 6177 }
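These helpers widen a PFN range to the larger of MAX_ORDER_NR_PAGES and pageblock_nr_pages, i.e. a plain round-down/round-up to a power-of-two boundary. A sketch with assumed values (1024-page MAX_ORDER blocks, 512-page pageblocks) and an arbitrary PFN range:

#include <stdio.h>

int main(void)
{
    /* Assumptions: MAX_ORDER_NR_PAGES = 1024, pageblock_nr_pages = 512. */
    unsigned long max_order_nr_pages = 1024;
    unsigned long pageblock_nr_pages = 512;
    unsigned long align = max_order_nr_pages > pageblock_nr_pages ?
                          max_order_nr_pages : pageblock_nr_pages;

    unsigned long start = 100000, end = 100700;               /* arbitrary PFNs */
    unsigned long down = start & ~(align - 1);                /* round down     */
    unsigned long up   = (end + align - 1) & ~(align - 1);    /* round up       */

    printf("isolating [%lu, %lu) to cover [%lu, %lu)\n", down, up, start, end);
    return 0;
}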
6168 6178
6169 /* [start, end) must belong to a single zone. */ 6179 /* [start, end) must belong to a single zone. */
6170 static int __alloc_contig_migrate_range(struct compact_control *cc, 6180 static int __alloc_contig_migrate_range(struct compact_control *cc,
6171 unsigned long start, unsigned long end) 6181 unsigned long start, unsigned long end)
6172 { 6182 {
6173 /* This function is based on compact_zone() from compaction.c. */ 6183 /* This function is based on compact_zone() from compaction.c. */
6174 unsigned long nr_reclaimed; 6184 unsigned long nr_reclaimed;
6175 unsigned long pfn = start; 6185 unsigned long pfn = start;
6176 unsigned int tries = 0; 6186 unsigned int tries = 0;
6177 int ret = 0; 6187 int ret = 0;
6178 6188
6179 migrate_prep(); 6189 migrate_prep();
6180 6190
6181 while (pfn < end || !list_empty(&cc->migratepages)) { 6191 while (pfn < end || !list_empty(&cc->migratepages)) {
6182 if (fatal_signal_pending(current)) { 6192 if (fatal_signal_pending(current)) {
6183 ret = -EINTR; 6193 ret = -EINTR;
6184 break; 6194 break;
6185 } 6195 }
6186 6196
6187 if (list_empty(&cc->migratepages)) { 6197 if (list_empty(&cc->migratepages)) {
6188 cc->nr_migratepages = 0; 6198 cc->nr_migratepages = 0;
6189 pfn = isolate_migratepages_range(cc->zone, cc, 6199 pfn = isolate_migratepages_range(cc->zone, cc,
6190 pfn, end, true); 6200 pfn, end, true);
6191 if (!pfn) { 6201 if (!pfn) {
6192 ret = -EINTR; 6202 ret = -EINTR;
6193 break; 6203 break;
6194 } 6204 }
6195 tries = 0; 6205 tries = 0;
6196 } else if (++tries == 5) { 6206 } else if (++tries == 5) {
6197 ret = ret < 0 ? ret : -EBUSY; 6207 ret = ret < 0 ? ret : -EBUSY;
6198 break; 6208 break;
6199 } 6209 }
6200 6210
6201 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 6211 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
6202 &cc->migratepages); 6212 &cc->migratepages);
6203 cc->nr_migratepages -= nr_reclaimed; 6213 cc->nr_migratepages -= nr_reclaimed;
6204 6214
6205 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 6215 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
6206 NULL, 0, cc->mode, MR_CMA); 6216 NULL, 0, cc->mode, MR_CMA);
6207 } 6217 }
6208 if (ret < 0) { 6218 if (ret < 0) {
6209 putback_movable_pages(&cc->migratepages); 6219 putback_movable_pages(&cc->migratepages);
6210 return ret; 6220 return ret;
6211 } 6221 }
6212 return 0; 6222 return 0;
6213 } 6223 }
6214 6224
6215 /** 6225 /**
6216 * alloc_contig_range() -- tries to allocate given range of pages 6226 * alloc_contig_range() -- tries to allocate given range of pages
6217 * @start: start PFN to allocate 6227 * @start: start PFN to allocate
6218 * @end: one-past-the-last PFN to allocate 6228 * @end: one-past-the-last PFN to allocate
6219 * @migratetype: migratetype of the underlying pageblocks (either 6229
6220 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 6230 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
6221 * in range must have the same migratetype and it must 6231 * in range must have the same migratetype and it must
6222 * be either of the two. 6232 * be either of the two.
6223 * 6233 *
6224 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 6234 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
6225 * aligned, however it's the caller's responsibility to guarantee that 6235 * aligned, however it's the caller's responsibility to guarantee that
6226 * we are the only thread that changes migrate type of pageblocks the 6236 * we are the only thread that changes migrate type of pageblocks the
6227 * pages fall in. 6237 * pages fall in.
6228 * 6238 *
6229 * The PFN range must belong to a single zone. 6239 * The PFN range must belong to a single zone.
6230 * 6240 *
6231 * Returns zero on success or negative error code. On success all 6241 * Returns zero on success or negative error code. On success all
6232 * pages which PFN is in [start, end) are allocated for the caller and 6242 * pages which PFN is in [start, end) are allocated for the caller and
6233 * need to be freed with free_contig_range(). 6243 * need to be freed with free_contig_range().
6234 */ 6244 */
6235 int alloc_contig_range(unsigned long start, unsigned long end, 6245 int alloc_contig_range(unsigned long start, unsigned long end,
6236 unsigned migratetype) 6246 unsigned migratetype)
6237 { 6247 {
6238 unsigned long outer_start, outer_end; 6248 unsigned long outer_start, outer_end;
6239 int ret = 0, order; 6249 int ret = 0, order;
6240 6250
6241 struct compact_control cc = { 6251 struct compact_control cc = {
6242 .nr_migratepages = 0, 6252 .nr_migratepages = 0,
6243 .order = -1, 6253 .order = -1,
6244 .zone = page_zone(pfn_to_page(start)), 6254 .zone = page_zone(pfn_to_page(start)),
6245 .mode = MIGRATE_SYNC, 6255 .mode = MIGRATE_SYNC,
6246 .ignore_skip_hint = true, 6256 .ignore_skip_hint = true,
6247 }; 6257 };
6248 INIT_LIST_HEAD(&cc.migratepages); 6258 INIT_LIST_HEAD(&cc.migratepages);
6249 6259
	/*
	 * What we do here is mark all pageblocks in the range as
	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
	 * have different sizes, and due to the way the page allocator
	 * works, we align the range to the bigger of the two so that
	 * the page allocator won't try to merge buddies from
	 * different pageblocks and change MIGRATE_ISOLATE to some
	 * other migration type.
	 *
	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
	 * migrate the pages from the unaligned range (ie. the pages
	 * we are interested in).  This puts all the pages in the
	 * range back into the page allocator as MIGRATE_ISOLATE.
	 *
	 * When this is done, we take the pages in the range from the
	 * page allocator, removing them from the buddy system.  This
	 * way the page allocator will never consider using them.
	 *
	 * This lets us mark the pageblocks back as
	 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
	 * aligned range but not in the unaligned, original range are
	 * put back into the page allocator so that buddy can use them.
	 */

	ret = start_isolate_page_range(pfn_max_align_down(start),
				       pfn_max_align_up(end), migratetype,
				       false);
	if (ret)
		return ret;

	ret = __alloc_contig_migrate_range(&cc, start, end);
	if (ret)
		goto done;

	/*
	 * Pages from [start, end) are within MAX_ORDER_NR_PAGES
	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
	 * more, all pages in [start, end) are free in the page allocator.
	 * What we are going to do is to allocate all pages from
	 * [start, end) (that is, remove them from the page allocator).
	 *
	 * The only problem is that pages at the beginning and at the
	 * end of the interesting range may not be aligned with pages
	 * that the page allocator holds, ie. they can be part of higher
	 * order pages.  Because of this, we reserve the bigger range and
	 * once this is done free the pages we are not interested in.
	 *
	 * We don't have to hold zone->lock here because the pages are
	 * isolated thus they won't get removed from buddy.
	 */

	lru_add_drain_all();
	drain_all_pages();

	order = 0;
	outer_start = start;
	while (!PageBuddy(pfn_to_page(outer_start))) {
		if (++order >= MAX_ORDER) {
			ret = -EBUSY;
			goto done;
		}
		outer_start &= ~0UL << order;
	}

	/* Make sure the range is really isolated. */
	if (test_pages_isolated(outer_start, end, false)) {
		pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
			outer_start, end);
		ret = -EBUSY;
		goto done;
	}

	/* Grab isolated pages from freelists. */
	outer_end = isolate_freepages_range(&cc, outer_start, end);
	if (!outer_end) {
		ret = -EBUSY;
		goto done;
	}

	/* Free head and tail (if any) */
	if (start != outer_start)
		free_contig_range(outer_start, start - outer_start);
	if (end != outer_end)
		free_contig_range(end, outer_end - end);

done:
	undo_isolate_page_range(pfn_max_align_down(start),
				pfn_max_align_up(end), migratetype);
	return ret;
}
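
/*
 * Illustrative sketch (not part of this file or commit): how a CMA-style
 * caller might drive the interface documented above.  The PFN range, page
 * count and the helper name example_grab_contig() are hypothetical; real
 * users such as the CMA allocator layer their own bookkeeping on top.
 * Guarded by #if 0 because it is an example only.
 */
#if 0
#include <linux/gfp.h>
#include <linux/mmzone.h>

static int example_grab_contig(unsigned long base_pfn, unsigned long nr_pages)
{
	int ret;

	/* [base_pfn, base_pfn + nr_pages) must lie within a single zone. */
	ret = alloc_contig_range(base_pfn, base_pfn + nr_pages, MIGRATE_CMA);
	if (ret)
		return ret;	/* commonly -EBUSY when isolation or migration fails */

	/* ... use the pages, starting at pfn_to_page(base_pfn) ... */

	/* Everything handed out above must go back through free_contig_range(). */
	free_contig_range(base_pfn, nr_pages);
	return 0;
}
#endif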

void free_contig_range(unsigned long pfn, unsigned nr_pages)
{
	unsigned int count = 0;

	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		count += page_count(page) != 1;
		__free_page(page);
	}
	WARN(count != 0, "%d pages are still in use!\n", count);
}
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * The zone indicated has a new number of managed_pages; batch sizes and percpu
 * page high values need to be recalculated.
 */
void __meminit zone_pcp_update(struct zone *zone)
{
	unsigned cpu;
	mutex_lock(&pcp_batch_high_lock);
	for_each_possible_cpu(cpu)
		pageset_set_high_and_batch(zone,
				per_cpu_ptr(zone->pageset, cpu));
	mutex_unlock(&pcp_batch_high_lock);
}
#endif

void zone_pcp_reset(struct zone *zone)
{
	unsigned long flags;
	int cpu;
	struct per_cpu_pageset *pset;

	/* avoid races with drain_pages() */
	local_irq_save(flags);
	if (zone->pageset != &boot_pageset) {
		for_each_online_cpu(cpu) {
			pset = per_cpu_ptr(zone->pageset, cpu);
			drain_zonestat(zone, pset);
		}
		free_percpu(zone->pageset);
		zone->pageset = &boot_pageset;
	}
	local_irq_restore(flags);
}

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * All pages in the range must be isolated before calling this.
 */
void
__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	struct page *page;
	struct zone *zone;
	int order, i;
	unsigned long pfn;
	unsigned long flags;
	/* find the first valid pfn */
	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		if (pfn_valid(pfn))
			break;
	if (pfn == end_pfn)
		return;
	zone = page_zone(pfn_to_page(pfn));
	spin_lock_irqsave(&zone->lock, flags);
	pfn = start_pfn;
	while (pfn < end_pfn) {
		if (!pfn_valid(pfn)) {
			pfn++;
			continue;
		}
		page = pfn_to_page(pfn);
		/*
		 * A HWPoisoned page may not be in the buddy system,
		 * and its page_count() is not 0.
		 */
		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
			pfn++;
			SetPageReserved(page);
			continue;
		}

		BUG_ON(page_count(page));
		BUG_ON(!PageBuddy(page));
		order = page_order(page);
#ifdef CONFIG_DEBUG_VM
		printk(KERN_INFO "remove from free list %lx %d %lx\n",
		       pfn, 1 << order, end_pfn);
#endif
		list_del(&page->lru);
		rmv_page_order(page);
		zone->free_area[order].nr_free--;
		for (i = 0; i < (1 << order); i++)
			SetPageReserved((page+i));
		pfn += (1 << order);
	}
	spin_unlock_irqrestore(&zone->lock, flags);
}
#endif

#ifdef CONFIG_MEMORY_FAILURE
bool is_free_buddy_page(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long pfn = page_to_pfn(page);
	unsigned long flags;
	int order;

	spin_lock_irqsave(&zone->lock, flags);
	for (order = 0; order < MAX_ORDER; order++) {
		struct page *page_head = page - (pfn & ((1 << order) - 1));

		if (PageBuddy(page_head) && page_order(page_head) >= order)
			break;
	}
	spin_unlock_irqrestore(&zone->lock, flags);

	return order < MAX_ORDER;
}
#endif
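
/*
 * Standalone illustration (editor's sketch, not kernel code) of the
 * head-page arithmetic used in is_free_buddy_page() above: subtracting
 * pfn's offset within a 2^order-aligned block yields the first pfn of
 * that block.  The pfn value below is made up.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long pfn = 0x12345;	/* hypothetical page frame number */
	int order;

	for (order = 0; order < 4; order++) {
		unsigned long head_pfn = pfn - (pfn & ((1UL << order) - 1));

		printf("order %d: block starts at pfn %#lx\n", order, head_pfn);
	}
	/* prints 0x12345, 0x12344, 0x12344, 0x12340 */
	return 0;
}
#endif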

static const struct trace_print_flags pageflag_names[] = {
	{1UL << PG_locked, "locked" },
	{1UL << PG_error, "error" },
	{1UL << PG_referenced, "referenced" },
	{1UL << PG_uptodate, "uptodate" },
	{1UL << PG_dirty, "dirty" },
	{1UL << PG_lru, "lru" },
	{1UL << PG_active, "active" },
	{1UL << PG_slab, "slab" },
	{1UL << PG_owner_priv_1, "owner_priv_1" },
	{1UL << PG_arch_1, "arch_1" },
	{1UL << PG_reserved, "reserved" },
	{1UL << PG_private, "private" },
	{1UL << PG_private_2, "private_2" },
	{1UL << PG_writeback, "writeback" },
#ifdef CONFIG_PAGEFLAGS_EXTENDED
	{1UL << PG_head, "head" },
	{1UL << PG_tail, "tail" },
#else
	{1UL << PG_compound, "compound" },
#endif
	{1UL << PG_swapcache, "swapcache" },
	{1UL << PG_mappedtodisk, "mappedtodisk" },
	{1UL << PG_reclaim, "reclaim" },
	{1UL << PG_swapbacked, "swapbacked" },
	{1UL << PG_unevictable, "unevictable" },
#ifdef CONFIG_MMU
	{1UL << PG_mlocked, "mlocked" },
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
	{1UL << PG_uncached, "uncached" },
#endif
#ifdef CONFIG_MEMORY_FAILURE
	{1UL << PG_hwpoison, "hwpoison" },
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	{1UL << PG_compound_lock, "compound_lock" },
#endif
};

static void dump_page_flags(unsigned long flags)
{
	const char *delim = "";
	unsigned long mask;
	int i;

	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);

	printk(KERN_ALERT "page flags: %#lx(", flags);

	/* remove zone id */
	flags &= (1UL << NR_PAGEFLAGS) - 1;

	for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {

		mask = pageflag_names[i].mask;
		if ((flags & mask) != mask)
			continue;

		flags &= ~mask;
		printk("%s%s", delim, pageflag_names[i].name);
		delim = "|";
	}

	/* check for left over flags */
	if (flags)
		printk("%s%#lx", delim, flags);

	printk(")\n");
}
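
/*
 * Standalone illustration (editor's sketch, not kernel code) of the
 * table-driven decode idiom dump_page_flags() uses: match each known
 * mask, clear it, then print any leftover bits in hex.  The flag names
 * and the sample value below are invented.
 */
#if 0
#include <stdio.h>

struct flag_name {
	unsigned long mask;
	const char *name;
};

static const struct flag_name names[] = {
	{ 1UL << 0, "locked" },
	{ 1UL << 1, "dirty" },
	{ 1UL << 2, "lru" },
};

int main(void)
{
	unsigned long flags = (1UL << 0) | (1UL << 2) | (1UL << 7);
	const char *delim = "";
	unsigned int i;

	printf("flags: %#lx(", flags);
	for (i = 0; i < sizeof(names) / sizeof(names[0]) && flags; i++) {
		if ((flags & names[i].mask) != names[i].mask)
			continue;
		flags &= ~names[i].mask;
		printf("%s%s", delim, names[i].name);
		delim = "|";
	}
	if (flags)	/* bits with no name in the table */
		printf("%s%#lx", delim, flags);
	printf(")\n");	/* -> flags: 0x85(locked|lru|0x80) */
	return 0;
}
#endif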

void dump_page(struct page *page)
{
	printk(KERN_ALERT
	       "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
	       page, atomic_read(&page->_count), page_mapcount(page),
	       page->mapping, page->index);
	dump_page_flags(page->flags);
	mem_cgroup_print_bad_page(page);