Commit 6be7db23181974114af002ebfe875ceaf23f24af

Authored by Eric Lee
1 parent d999f49ca0

Drop PFNs busy printk in an expected path

Showing 1 changed file with 2 additions and 3 deletions

1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kasan.h> 27 #include <linux/kasan.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/memremap.h> 45 #include <linux/memremap.h>
46 #include <linux/stop_machine.h> 46 #include <linux/stop_machine.h>
47 #include <linux/sort.h> 47 #include <linux/sort.h>
48 #include <linux/pfn.h> 48 #include <linux/pfn.h>
49 #include <linux/backing-dev.h> 49 #include <linux/backing-dev.h>
50 #include <linux/fault-inject.h> 50 #include <linux/fault-inject.h>
51 #include <linux/page-isolation.h> 51 #include <linux/page-isolation.h>
52 #include <linux/page_ext.h> 52 #include <linux/page_ext.h>
53 #include <linux/debugobjects.h> 53 #include <linux/debugobjects.h>
54 #include <linux/kmemleak.h> 54 #include <linux/kmemleak.h>
55 #include <linux/compaction.h> 55 #include <linux/compaction.h>
56 #include <trace/events/kmem.h> 56 #include <trace/events/kmem.h>
57 #include <trace/events/oom.h> 57 #include <trace/events/oom.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/mm_inline.h> 59 #include <linux/mm_inline.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/hugetlb.h> 61 #include <linux/hugetlb.h>
62 #include <linux/sched/rt.h> 62 #include <linux/sched/rt.h>
63 #include <linux/sched/mm.h> 63 #include <linux/sched/mm.h>
64 #include <linux/page_owner.h> 64 #include <linux/page_owner.h>
65 #include <linux/kthread.h> 65 #include <linux/kthread.h>
66 #include <linux/memcontrol.h> 66 #include <linux/memcontrol.h>
67 #include <linux/ftrace.h> 67 #include <linux/ftrace.h>
68 #include <linux/lockdep.h> 68 #include <linux/lockdep.h>
69 #include <linux/nmi.h> 69 #include <linux/nmi.h>
70 70
71 #include <asm/sections.h> 71 #include <asm/sections.h>
72 #include <asm/tlbflush.h> 72 #include <asm/tlbflush.h>
73 #include <asm/div64.h> 73 #include <asm/div64.h>
74 #include "internal.h" 74 #include "internal.h"
75 75
76 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 76 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
77 static DEFINE_MUTEX(pcp_batch_high_lock); 77 static DEFINE_MUTEX(pcp_batch_high_lock);
78 #define MIN_PERCPU_PAGELIST_FRACTION (8) 78 #define MIN_PERCPU_PAGELIST_FRACTION (8)
79 79
80 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 80 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
81 DEFINE_PER_CPU(int, numa_node); 81 DEFINE_PER_CPU(int, numa_node);
82 EXPORT_PER_CPU_SYMBOL(numa_node); 82 EXPORT_PER_CPU_SYMBOL(numa_node);
83 #endif 83 #endif
84 84
85 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 85 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
86 /* 86 /*
87 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 87 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
88 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 88 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
89 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 89 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
90 * defined in <linux/topology.h>. 90 * defined in <linux/topology.h>.
91 */ 91 */
92 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 92 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
93 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 93 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
94 int _node_numa_mem_[MAX_NUMNODES]; 94 int _node_numa_mem_[MAX_NUMNODES];
95 #endif 95 #endif
96 96
97 /* work_structs for global per-cpu drains */ 97 /* work_structs for global per-cpu drains */
98 DEFINE_MUTEX(pcpu_drain_mutex); 98 DEFINE_MUTEX(pcpu_drain_mutex);
99 DEFINE_PER_CPU(struct work_struct, pcpu_drain); 99 DEFINE_PER_CPU(struct work_struct, pcpu_drain);
100 100
101 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY 101 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
102 volatile unsigned long latent_entropy __latent_entropy; 102 volatile unsigned long latent_entropy __latent_entropy;
103 EXPORT_SYMBOL(latent_entropy); 103 EXPORT_SYMBOL(latent_entropy);
104 #endif 104 #endif
105 105
106 /* 106 /*
107 * Array of node states. 107 * Array of node states.
108 */ 108 */
109 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 109 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
110 [N_POSSIBLE] = NODE_MASK_ALL, 110 [N_POSSIBLE] = NODE_MASK_ALL,
111 [N_ONLINE] = { { [0] = 1UL } }, 111 [N_ONLINE] = { { [0] = 1UL } },
112 #ifndef CONFIG_NUMA 112 #ifndef CONFIG_NUMA
113 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 113 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
114 #ifdef CONFIG_HIGHMEM 114 #ifdef CONFIG_HIGHMEM
115 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 115 [N_HIGH_MEMORY] = { { [0] = 1UL } },
116 #endif 116 #endif
117 [N_MEMORY] = { { [0] = 1UL } }, 117 [N_MEMORY] = { { [0] = 1UL } },
118 [N_CPU] = { { [0] = 1UL } }, 118 [N_CPU] = { { [0] = 1UL } },
119 #endif /* NUMA */ 119 #endif /* NUMA */
120 }; 120 };
121 EXPORT_SYMBOL(node_states); 121 EXPORT_SYMBOL(node_states);
122 122
123 /* Protect totalram_pages and zone->managed_pages */ 123 /* Protect totalram_pages and zone->managed_pages */
124 static DEFINE_SPINLOCK(managed_page_count_lock); 124 static DEFINE_SPINLOCK(managed_page_count_lock);
125 125
126 unsigned long totalram_pages __read_mostly; 126 unsigned long totalram_pages __read_mostly;
127 unsigned long totalreserve_pages __read_mostly; 127 unsigned long totalreserve_pages __read_mostly;
128 unsigned long totalcma_pages __read_mostly; 128 unsigned long totalcma_pages __read_mostly;
129 129
130 int percpu_pagelist_fraction; 130 int percpu_pagelist_fraction;
131 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 131 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
132 132
133 /* 133 /*
134 * A cached value of the page's pageblock's migratetype, used when the page is 134 * A cached value of the page's pageblock's migratetype, used when the page is
135 * put on a pcplist. Used to avoid the pageblock migratetype lookup when 135 * put on a pcplist. Used to avoid the pageblock migratetype lookup when
136 * freeing from pcplists in most cases, at the cost of possibly becoming stale. 136 * freeing from pcplists in most cases, at the cost of possibly becoming stale.
137 * Also the migratetype set in the page does not necessarily match the pcplist 137 * Also the migratetype set in the page does not necessarily match the pcplist
138 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any 138 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
139 * other index - this ensures that it will be put on the correct CMA freelist. 139 * other index - this ensures that it will be put on the correct CMA freelist.
140 */ 140 */
141 static inline int get_pcppage_migratetype(struct page *page) 141 static inline int get_pcppage_migratetype(struct page *page)
142 { 142 {
143 return page->index; 143 return page->index;
144 } 144 }
145 145
146 static inline void set_pcppage_migratetype(struct page *page, int migratetype) 146 static inline void set_pcppage_migratetype(struct page *page, int migratetype)
147 { 147 {
148 page->index = migratetype; 148 page->index = migratetype;
149 } 149 }
150 150
151 #ifdef CONFIG_PM_SLEEP 151 #ifdef CONFIG_PM_SLEEP
152 /* 152 /*
153 * The following functions are used by the suspend/hibernate code to temporarily 153 * The following functions are used by the suspend/hibernate code to temporarily
154 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 154 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
155 * while devices are suspended. To avoid races with the suspend/hibernate code, 155 * while devices are suspended. To avoid races with the suspend/hibernate code,
156 * they should always be called with pm_mutex held (gfp_allowed_mask also should 156 * they should always be called with pm_mutex held (gfp_allowed_mask also should
157 * only be modified with pm_mutex held, unless the suspend/hibernate code is 157 * only be modified with pm_mutex held, unless the suspend/hibernate code is
158 * guaranteed not to run in parallel with that modification). 158 * guaranteed not to run in parallel with that modification).
159 */ 159 */
160 160
161 static gfp_t saved_gfp_mask; 161 static gfp_t saved_gfp_mask;
162 162
163 void pm_restore_gfp_mask(void) 163 void pm_restore_gfp_mask(void)
164 { 164 {
165 WARN_ON(!mutex_is_locked(&pm_mutex)); 165 WARN_ON(!mutex_is_locked(&pm_mutex));
166 if (saved_gfp_mask) { 166 if (saved_gfp_mask) {
167 gfp_allowed_mask = saved_gfp_mask; 167 gfp_allowed_mask = saved_gfp_mask;
168 saved_gfp_mask = 0; 168 saved_gfp_mask = 0;
169 } 169 }
170 } 170 }
171 171
172 void pm_restrict_gfp_mask(void) 172 void pm_restrict_gfp_mask(void)
173 { 173 {
174 WARN_ON(!mutex_is_locked(&pm_mutex)); 174 WARN_ON(!mutex_is_locked(&pm_mutex));
175 WARN_ON(saved_gfp_mask); 175 WARN_ON(saved_gfp_mask);
176 saved_gfp_mask = gfp_allowed_mask; 176 saved_gfp_mask = gfp_allowed_mask;
177 gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); 177 gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
178 } 178 }
179 179
180 bool pm_suspended_storage(void) 180 bool pm_suspended_storage(void)
181 { 181 {
182 if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) 182 if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
183 return false; 183 return false;
184 return true; 184 return true;
185 } 185 }
186 #endif /* CONFIG_PM_SLEEP */ 186 #endif /* CONFIG_PM_SLEEP */
187 187
188 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 188 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
189 unsigned int pageblock_order __read_mostly; 189 unsigned int pageblock_order __read_mostly;
190 #endif 190 #endif
191 191
192 static void __free_pages_ok(struct page *page, unsigned int order); 192 static void __free_pages_ok(struct page *page, unsigned int order);
193 193
194 /* 194 /*
195 * results with 256, 32 in the lowmem_reserve sysctl: 195 * results with 256, 32 in the lowmem_reserve sysctl:
196 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 196 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
197 * 1G machine -> (16M dma, 784M normal, 224M high) 197 * 1G machine -> (16M dma, 784M normal, 224M high)
198 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 198 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
199 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 199 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
200 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA 200 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
201 * 201 *
202 * TBD: should special case ZONE_DMA32 machines here - in those we normally 202 * TBD: should special case ZONE_DMA32 machines here - in those we normally
203 * don't need any ZONE_NORMAL reservation 203 * don't need any ZONE_NORMAL reservation
204 */ 204 */
205 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 205 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
206 #ifdef CONFIG_ZONE_DMA 206 #ifdef CONFIG_ZONE_DMA
207 256, 207 256,
208 #endif 208 #endif
209 #ifdef CONFIG_ZONE_DMA32 209 #ifdef CONFIG_ZONE_DMA32
210 256, 210 256,
211 #endif 211 #endif
212 #ifdef CONFIG_HIGHMEM 212 #ifdef CONFIG_HIGHMEM
213 32, 213 32,
214 #endif 214 #endif
215 32, 215 32,
216 }; 216 };
217 217
218 EXPORT_SYMBOL(totalram_pages); 218 EXPORT_SYMBOL(totalram_pages);
219 219
220 static char * const zone_names[MAX_NR_ZONES] = { 220 static char * const zone_names[MAX_NR_ZONES] = {
221 #ifdef CONFIG_ZONE_DMA 221 #ifdef CONFIG_ZONE_DMA
222 "DMA", 222 "DMA",
223 #endif 223 #endif
224 #ifdef CONFIG_ZONE_DMA32 224 #ifdef CONFIG_ZONE_DMA32
225 "DMA32", 225 "DMA32",
226 #endif 226 #endif
227 "Normal", 227 "Normal",
228 #ifdef CONFIG_HIGHMEM 228 #ifdef CONFIG_HIGHMEM
229 "HighMem", 229 "HighMem",
230 #endif 230 #endif
231 "Movable", 231 "Movable",
232 #ifdef CONFIG_ZONE_DEVICE 232 #ifdef CONFIG_ZONE_DEVICE
233 "Device", 233 "Device",
234 #endif 234 #endif
235 }; 235 };
236 236
237 char * const migratetype_names[MIGRATE_TYPES] = { 237 char * const migratetype_names[MIGRATE_TYPES] = {
238 "Unmovable", 238 "Unmovable",
239 "Movable", 239 "Movable",
240 "Reclaimable", 240 "Reclaimable",
241 "HighAtomic", 241 "HighAtomic",
242 #ifdef CONFIG_CMA 242 #ifdef CONFIG_CMA
243 "CMA", 243 "CMA",
244 #endif 244 #endif
245 #ifdef CONFIG_MEMORY_ISOLATION 245 #ifdef CONFIG_MEMORY_ISOLATION
246 "Isolate", 246 "Isolate",
247 #endif 247 #endif
248 }; 248 };
249 249
250 compound_page_dtor * const compound_page_dtors[] = { 250 compound_page_dtor * const compound_page_dtors[] = {
251 NULL, 251 NULL,
252 free_compound_page, 252 free_compound_page,
253 #ifdef CONFIG_HUGETLB_PAGE 253 #ifdef CONFIG_HUGETLB_PAGE
254 free_huge_page, 254 free_huge_page,
255 #endif 255 #endif
256 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 256 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
257 free_transhuge_page, 257 free_transhuge_page,
258 #endif 258 #endif
259 }; 259 };
260 260
261 /* 261 /*
262 * Try to keep at least this much lowmem free. Do not allow normal 262 * Try to keep at least this much lowmem free. Do not allow normal
263 * allocations below this point, only high priority ones. Automatically 263 * allocations below this point, only high priority ones. Automatically
264 * tuned according to the amount of memory in the system. 264 * tuned according to the amount of memory in the system.
265 */ 265 */
266 int min_free_kbytes = 1024; 266 int min_free_kbytes = 1024;
267 int user_min_free_kbytes = -1; 267 int user_min_free_kbytes = -1;
268 int watermark_scale_factor = 10; 268 int watermark_scale_factor = 10;
269 269
270 /* 270 /*
271 * Extra memory for the system to try freeing. Used to temporarily 271 * Extra memory for the system to try freeing. Used to temporarily
272 * free memory, to make space for new workloads. Anyone can allocate 272 * free memory, to make space for new workloads. Anyone can allocate
273 * down to the min watermarks controlled by min_free_kbytes above. 273 * down to the min watermarks controlled by min_free_kbytes above.
274 */ 274 */
275 int extra_free_kbytes = 0; 275 int extra_free_kbytes = 0;
276 276
277 static unsigned long __meminitdata nr_kernel_pages; 277 static unsigned long __meminitdata nr_kernel_pages;
278 static unsigned long __meminitdata nr_all_pages; 278 static unsigned long __meminitdata nr_all_pages;
279 static unsigned long __meminitdata dma_reserve; 279 static unsigned long __meminitdata dma_reserve;
280 280
281 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 281 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
282 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 282 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
283 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 283 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
284 static unsigned long __initdata required_kernelcore; 284 static unsigned long __initdata required_kernelcore;
285 static unsigned long __initdata required_movablecore; 285 static unsigned long __initdata required_movablecore;
286 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 286 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
287 static bool mirrored_kernelcore; 287 static bool mirrored_kernelcore;
288 288
289 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 289 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
290 int movable_zone; 290 int movable_zone;
291 EXPORT_SYMBOL(movable_zone); 291 EXPORT_SYMBOL(movable_zone);
292 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 292 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
293 293
294 #if MAX_NUMNODES > 1 294 #if MAX_NUMNODES > 1
295 int nr_node_ids __read_mostly = MAX_NUMNODES; 295 int nr_node_ids __read_mostly = MAX_NUMNODES;
296 int nr_online_nodes __read_mostly = 1; 296 int nr_online_nodes __read_mostly = 1;
297 EXPORT_SYMBOL(nr_node_ids); 297 EXPORT_SYMBOL(nr_node_ids);
298 EXPORT_SYMBOL(nr_online_nodes); 298 EXPORT_SYMBOL(nr_online_nodes);
299 #endif 299 #endif
300 300
301 int page_group_by_mobility_disabled __read_mostly; 301 int page_group_by_mobility_disabled __read_mostly;
302 302
303 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 303 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
304 304
305 /* 305 /*
306 * Determine how many pages need to be initialized durig early boot 306 * Determine how many pages need to be initialized durig early boot
307 * (non-deferred initialization). 307 * (non-deferred initialization).
308 * The value of first_deferred_pfn will be set later, once non-deferred pages 308 * The value of first_deferred_pfn will be set later, once non-deferred pages
309 * are initialized, but for now set it ULONG_MAX. 309 * are initialized, but for now set it ULONG_MAX.
310 */ 310 */
311 static inline void reset_deferred_meminit(pg_data_t *pgdat) 311 static inline void reset_deferred_meminit(pg_data_t *pgdat)
312 { 312 {
313 phys_addr_t start_addr, end_addr; 313 phys_addr_t start_addr, end_addr;
314 unsigned long max_pgcnt; 314 unsigned long max_pgcnt;
315 unsigned long reserved; 315 unsigned long reserved;
316 316
317 /* 317 /*
318 * Initialise at least 2G of a node but also take into account that 318 * Initialise at least 2G of a node but also take into account that
319 * two large system hashes that can take up 1GB for 0.25TB/node. 319 * two large system hashes that can take up 1GB for 0.25TB/node.
320 */ 320 */
321 max_pgcnt = max(2UL << (30 - PAGE_SHIFT), 321 max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
322 (pgdat->node_spanned_pages >> 8)); 322 (pgdat->node_spanned_pages >> 8));
323 323
324 /* 324 /*
325 * Compensate the all the memblock reservations (e.g. crash kernel) 325 * Compensate the all the memblock reservations (e.g. crash kernel)
326 * from the initial estimation to make sure we will initialize enough 326 * from the initial estimation to make sure we will initialize enough
327 * memory to boot. 327 * memory to boot.
328 */ 328 */
329 start_addr = PFN_PHYS(pgdat->node_start_pfn); 329 start_addr = PFN_PHYS(pgdat->node_start_pfn);
330 end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); 330 end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
331 reserved = memblock_reserved_memory_within(start_addr, end_addr); 331 reserved = memblock_reserved_memory_within(start_addr, end_addr);
332 max_pgcnt += PHYS_PFN(reserved); 332 max_pgcnt += PHYS_PFN(reserved);
333 333
334 pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); 334 pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
335 pgdat->first_deferred_pfn = ULONG_MAX; 335 pgdat->first_deferred_pfn = ULONG_MAX;
336 } 336 }
337 337
338 /* Returns true if the struct page for the pfn is uninitialised */ 338 /* Returns true if the struct page for the pfn is uninitialised */
339 static inline bool __meminit early_page_uninitialised(unsigned long pfn) 339 static inline bool __meminit early_page_uninitialised(unsigned long pfn)
340 { 340 {
341 int nid = early_pfn_to_nid(pfn); 341 int nid = early_pfn_to_nid(pfn);
342 342
343 if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) 343 if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
344 return true; 344 return true;
345 345
346 return false; 346 return false;
347 } 347 }
348 348
349 /* 349 /*
350 * Returns false when the remaining initialisation should be deferred until 350 * Returns false when the remaining initialisation should be deferred until
351 * later in the boot cycle when it can be parallelised. 351 * later in the boot cycle when it can be parallelised.
352 */ 352 */
353 static inline bool update_defer_init(pg_data_t *pgdat, 353 static inline bool update_defer_init(pg_data_t *pgdat,
354 unsigned long pfn, unsigned long zone_end, 354 unsigned long pfn, unsigned long zone_end,
355 unsigned long *nr_initialised) 355 unsigned long *nr_initialised)
356 { 356 {
357 /* Always populate low zones for address-contrained allocations */ 357 /* Always populate low zones for address-contrained allocations */
358 if (zone_end < pgdat_end_pfn(pgdat)) 358 if (zone_end < pgdat_end_pfn(pgdat))
359 return true; 359 return true;
360 (*nr_initialised)++; 360 (*nr_initialised)++;
361 if ((*nr_initialised > pgdat->static_init_pgcnt) && 361 if ((*nr_initialised > pgdat->static_init_pgcnt) &&
362 (pfn & (PAGES_PER_SECTION - 1)) == 0) { 362 (pfn & (PAGES_PER_SECTION - 1)) == 0) {
363 pgdat->first_deferred_pfn = pfn; 363 pgdat->first_deferred_pfn = pfn;
364 return false; 364 return false;
365 } 365 }
366 366
367 return true; 367 return true;
368 } 368 }
369 #else 369 #else
370 static inline void reset_deferred_meminit(pg_data_t *pgdat) 370 static inline void reset_deferred_meminit(pg_data_t *pgdat)
371 { 371 {
372 } 372 }
373 373
374 static inline bool early_page_uninitialised(unsigned long pfn) 374 static inline bool early_page_uninitialised(unsigned long pfn)
375 { 375 {
376 return false; 376 return false;
377 } 377 }
378 378
379 static inline bool update_defer_init(pg_data_t *pgdat, 379 static inline bool update_defer_init(pg_data_t *pgdat,
380 unsigned long pfn, unsigned long zone_end, 380 unsigned long pfn, unsigned long zone_end,
381 unsigned long *nr_initialised) 381 unsigned long *nr_initialised)
382 { 382 {
383 return true; 383 return true;
384 } 384 }
385 #endif 385 #endif
386 386
387 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 387 /* Return a pointer to the bitmap storing bits affecting a block of pages */
388 static inline unsigned long *get_pageblock_bitmap(struct page *page, 388 static inline unsigned long *get_pageblock_bitmap(struct page *page,
389 unsigned long pfn) 389 unsigned long pfn)
390 { 390 {
391 #ifdef CONFIG_SPARSEMEM 391 #ifdef CONFIG_SPARSEMEM
392 return __pfn_to_section(pfn)->pageblock_flags; 392 return __pfn_to_section(pfn)->pageblock_flags;
393 #else 393 #else
394 return page_zone(page)->pageblock_flags; 394 return page_zone(page)->pageblock_flags;
395 #endif /* CONFIG_SPARSEMEM */ 395 #endif /* CONFIG_SPARSEMEM */
396 } 396 }
397 397
398 static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) 398 static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
399 { 399 {
400 #ifdef CONFIG_SPARSEMEM 400 #ifdef CONFIG_SPARSEMEM
401 pfn &= (PAGES_PER_SECTION-1); 401 pfn &= (PAGES_PER_SECTION-1);
402 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 402 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
403 #else 403 #else
404 pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); 404 pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
405 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 405 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
406 #endif /* CONFIG_SPARSEMEM */ 406 #endif /* CONFIG_SPARSEMEM */
407 } 407 }
408 408
409 /** 409 /**
410 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages 410 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
411 * @page: The page within the block of interest 411 * @page: The page within the block of interest
412 * @pfn: The target page frame number 412 * @pfn: The target page frame number
413 * @end_bitidx: The last bit of interest to retrieve 413 * @end_bitidx: The last bit of interest to retrieve
414 * @mask: mask of bits that the caller is interested in 414 * @mask: mask of bits that the caller is interested in
415 * 415 *
416 * Return: pageblock_bits flags 416 * Return: pageblock_bits flags
417 */ 417 */
418 static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page, 418 static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
419 unsigned long pfn, 419 unsigned long pfn,
420 unsigned long end_bitidx, 420 unsigned long end_bitidx,
421 unsigned long mask) 421 unsigned long mask)
422 { 422 {
423 unsigned long *bitmap; 423 unsigned long *bitmap;
424 unsigned long bitidx, word_bitidx; 424 unsigned long bitidx, word_bitidx;
425 unsigned long word; 425 unsigned long word;
426 426
427 bitmap = get_pageblock_bitmap(page, pfn); 427 bitmap = get_pageblock_bitmap(page, pfn);
428 bitidx = pfn_to_bitidx(page, pfn); 428 bitidx = pfn_to_bitidx(page, pfn);
429 word_bitidx = bitidx / BITS_PER_LONG; 429 word_bitidx = bitidx / BITS_PER_LONG;
430 bitidx &= (BITS_PER_LONG-1); 430 bitidx &= (BITS_PER_LONG-1);
431 431
432 word = bitmap[word_bitidx]; 432 word = bitmap[word_bitidx];
433 bitidx += end_bitidx; 433 bitidx += end_bitidx;
434 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; 434 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
435 } 435 }
436 436
437 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, 437 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
438 unsigned long end_bitidx, 438 unsigned long end_bitidx,
439 unsigned long mask) 439 unsigned long mask)
440 { 440 {
441 return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask); 441 return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
442 } 442 }
443 443
444 static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) 444 static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
445 { 445 {
446 return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK); 446 return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
447 } 447 }
448 448
449 /** 449 /**
450 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages 450 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
451 * @page: The page within the block of interest 451 * @page: The page within the block of interest
452 * @flags: The flags to set 452 * @flags: The flags to set
453 * @pfn: The target page frame number 453 * @pfn: The target page frame number
454 * @end_bitidx: The last bit of interest 454 * @end_bitidx: The last bit of interest
455 * @mask: mask of bits that the caller is interested in 455 * @mask: mask of bits that the caller is interested in
456 */ 456 */
457 void set_pfnblock_flags_mask(struct page *page, unsigned long flags, 457 void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
458 unsigned long pfn, 458 unsigned long pfn,
459 unsigned long end_bitidx, 459 unsigned long end_bitidx,
460 unsigned long mask) 460 unsigned long mask)
461 { 461 {
462 unsigned long *bitmap; 462 unsigned long *bitmap;
463 unsigned long bitidx, word_bitidx; 463 unsigned long bitidx, word_bitidx;
464 unsigned long old_word, word; 464 unsigned long old_word, word;
465 465
466 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 466 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
467 467
468 bitmap = get_pageblock_bitmap(page, pfn); 468 bitmap = get_pageblock_bitmap(page, pfn);
469 bitidx = pfn_to_bitidx(page, pfn); 469 bitidx = pfn_to_bitidx(page, pfn);
470 word_bitidx = bitidx / BITS_PER_LONG; 470 word_bitidx = bitidx / BITS_PER_LONG;
471 bitidx &= (BITS_PER_LONG-1); 471 bitidx &= (BITS_PER_LONG-1);
472 472
473 VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); 473 VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
474 474
475 bitidx += end_bitidx; 475 bitidx += end_bitidx;
476 mask <<= (BITS_PER_LONG - bitidx - 1); 476 mask <<= (BITS_PER_LONG - bitidx - 1);
477 flags <<= (BITS_PER_LONG - bitidx - 1); 477 flags <<= (BITS_PER_LONG - bitidx - 1);
478 478
479 word = READ_ONCE(bitmap[word_bitidx]); 479 word = READ_ONCE(bitmap[word_bitidx]);
480 for (;;) { 480 for (;;) {
481 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 481 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
482 if (word == old_word) 482 if (word == old_word)
483 break; 483 break;
484 word = old_word; 484 word = old_word;
485 } 485 }
486 } 486 }
487 487
488 void set_pageblock_migratetype(struct page *page, int migratetype) 488 void set_pageblock_migratetype(struct page *page, int migratetype)
489 { 489 {
490 if (unlikely(page_group_by_mobility_disabled && 490 if (unlikely(page_group_by_mobility_disabled &&
491 migratetype < MIGRATE_PCPTYPES)) 491 migratetype < MIGRATE_PCPTYPES))
492 migratetype = MIGRATE_UNMOVABLE; 492 migratetype = MIGRATE_UNMOVABLE;
493 493
494 set_pageblock_flags_group(page, (unsigned long)migratetype, 494 set_pageblock_flags_group(page, (unsigned long)migratetype,
495 PB_migrate, PB_migrate_end); 495 PB_migrate, PB_migrate_end);
496 } 496 }
497 497
498 #ifdef CONFIG_DEBUG_VM 498 #ifdef CONFIG_DEBUG_VM
499 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 499 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
500 { 500 {
501 int ret = 0; 501 int ret = 0;
502 unsigned seq; 502 unsigned seq;
503 unsigned long pfn = page_to_pfn(page); 503 unsigned long pfn = page_to_pfn(page);
504 unsigned long sp, start_pfn; 504 unsigned long sp, start_pfn;
505 505
506 do { 506 do {
507 seq = zone_span_seqbegin(zone); 507 seq = zone_span_seqbegin(zone);
508 start_pfn = zone->zone_start_pfn; 508 start_pfn = zone->zone_start_pfn;
509 sp = zone->spanned_pages; 509 sp = zone->spanned_pages;
510 if (!zone_spans_pfn(zone, pfn)) 510 if (!zone_spans_pfn(zone, pfn))
511 ret = 1; 511 ret = 1;
512 } while (zone_span_seqretry(zone, seq)); 512 } while (zone_span_seqretry(zone, seq));
513 513
514 if (ret) 514 if (ret)
515 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", 515 pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
516 pfn, zone_to_nid(zone), zone->name, 516 pfn, zone_to_nid(zone), zone->name,
517 start_pfn, start_pfn + sp); 517 start_pfn, start_pfn + sp);
518 518
519 return ret; 519 return ret;
520 } 520 }
521 521
522 static int page_is_consistent(struct zone *zone, struct page *page) 522 static int page_is_consistent(struct zone *zone, struct page *page)
523 { 523 {
524 if (!pfn_valid_within(page_to_pfn(page))) 524 if (!pfn_valid_within(page_to_pfn(page)))
525 return 0; 525 return 0;
526 if (zone != page_zone(page)) 526 if (zone != page_zone(page))
527 return 0; 527 return 0;
528 528
529 return 1; 529 return 1;
530 } 530 }
531 /* 531 /*
532 * Temporary debugging check for pages not lying within a given zone. 532 * Temporary debugging check for pages not lying within a given zone.
533 */ 533 */
534 static int __maybe_unused bad_range(struct zone *zone, struct page *page) 534 static int __maybe_unused bad_range(struct zone *zone, struct page *page)
535 { 535 {
536 if (page_outside_zone_boundaries(zone, page)) 536 if (page_outside_zone_boundaries(zone, page))
537 return 1; 537 return 1;
538 if (!page_is_consistent(zone, page)) 538 if (!page_is_consistent(zone, page))
539 return 1; 539 return 1;
540 540
541 return 0; 541 return 0;
542 } 542 }
543 #else 543 #else
544 static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) 544 static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
545 { 545 {
546 return 0; 546 return 0;
547 } 547 }
548 #endif 548 #endif
549 549
550 static void bad_page(struct page *page, const char *reason, 550 static void bad_page(struct page *page, const char *reason,
551 unsigned long bad_flags) 551 unsigned long bad_flags)
552 { 552 {
553 static unsigned long resume; 553 static unsigned long resume;
554 static unsigned long nr_shown; 554 static unsigned long nr_shown;
555 static unsigned long nr_unshown; 555 static unsigned long nr_unshown;
556 556
557 /* 557 /*
558 * Allow a burst of 60 reports, then keep quiet for that minute; 558 * Allow a burst of 60 reports, then keep quiet for that minute;
559 * or allow a steady drip of one report per second. 559 * or allow a steady drip of one report per second.
560 */ 560 */
561 if (nr_shown == 60) { 561 if (nr_shown == 60) {
562 if (time_before(jiffies, resume)) { 562 if (time_before(jiffies, resume)) {
563 nr_unshown++; 563 nr_unshown++;
564 goto out; 564 goto out;
565 } 565 }
566 if (nr_unshown) { 566 if (nr_unshown) {
567 pr_alert( 567 pr_alert(
568 "BUG: Bad page state: %lu messages suppressed\n", 568 "BUG: Bad page state: %lu messages suppressed\n",
569 nr_unshown); 569 nr_unshown);
570 nr_unshown = 0; 570 nr_unshown = 0;
571 } 571 }
572 nr_shown = 0; 572 nr_shown = 0;
573 } 573 }
574 if (nr_shown++ == 0) 574 if (nr_shown++ == 0)
575 resume = jiffies + 60 * HZ; 575 resume = jiffies + 60 * HZ;
576 576
577 pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", 577 pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
578 current->comm, page_to_pfn(page)); 578 current->comm, page_to_pfn(page));
579 __dump_page(page, reason); 579 __dump_page(page, reason);
580 bad_flags &= page->flags; 580 bad_flags &= page->flags;
581 if (bad_flags) 581 if (bad_flags)
582 pr_alert("bad because of flags: %#lx(%pGp)\n", 582 pr_alert("bad because of flags: %#lx(%pGp)\n",
583 bad_flags, &bad_flags); 583 bad_flags, &bad_flags);
584 dump_page_owner(page); 584 dump_page_owner(page);
585 585
586 print_modules(); 586 print_modules();
587 dump_stack(); 587 dump_stack();
588 out: 588 out:
589 /* Leave bad fields for debug, except PageBuddy could make trouble */ 589 /* Leave bad fields for debug, except PageBuddy could make trouble */
590 page_mapcount_reset(page); /* remove PageBuddy */ 590 page_mapcount_reset(page); /* remove PageBuddy */
591 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 591 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
592 } 592 }
593 593
594 /* 594 /*
595 * Higher-order pages are called "compound pages". They are structured thusly: 595 * Higher-order pages are called "compound pages". They are structured thusly:
596 * 596 *
597 * The first PAGE_SIZE page is called the "head page" and have PG_head set. 597 * The first PAGE_SIZE page is called the "head page" and have PG_head set.
598 * 598 *
599 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded 599 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
600 * in bit 0 of page->compound_head. The rest of bits is pointer to head page. 600 * in bit 0 of page->compound_head. The rest of bits is pointer to head page.
601 * 601 *
602 * The first tail page's ->compound_dtor holds the offset in array of compound 602 * The first tail page's ->compound_dtor holds the offset in array of compound
603 * page destructors. See compound_page_dtors. 603 * page destructors. See compound_page_dtors.
604 * 604 *
605 * The first tail page's ->compound_order holds the order of allocation. 605 * The first tail page's ->compound_order holds the order of allocation.
606 * This usage means that zero-order pages may not be compound. 606 * This usage means that zero-order pages may not be compound.
607 */ 607 */
608 608
609 void free_compound_page(struct page *page) 609 void free_compound_page(struct page *page)
610 { 610 {
611 __free_pages_ok(page, compound_order(page)); 611 __free_pages_ok(page, compound_order(page));
612 } 612 }
613 613
614 void prep_compound_page(struct page *page, unsigned int order) 614 void prep_compound_page(struct page *page, unsigned int order)
615 { 615 {
616 int i; 616 int i;
617 int nr_pages = 1 << order; 617 int nr_pages = 1 << order;
618 618
619 set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); 619 set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
620 set_compound_order(page, order); 620 set_compound_order(page, order);
621 __SetPageHead(page); 621 __SetPageHead(page);
622 for (i = 1; i < nr_pages; i++) { 622 for (i = 1; i < nr_pages; i++) {
623 struct page *p = page + i; 623 struct page *p = page + i;
624 set_page_count(p, 0); 624 set_page_count(p, 0);
625 p->mapping = TAIL_MAPPING; 625 p->mapping = TAIL_MAPPING;
626 set_compound_head(p, page); 626 set_compound_head(p, page);
627 } 627 }
628 atomic_set(compound_mapcount_ptr(page), -1); 628 atomic_set(compound_mapcount_ptr(page), -1);
629 } 629 }
630 630
631 #ifdef CONFIG_DEBUG_PAGEALLOC 631 #ifdef CONFIG_DEBUG_PAGEALLOC
632 unsigned int _debug_guardpage_minorder; 632 unsigned int _debug_guardpage_minorder;
633 bool _debug_pagealloc_enabled __read_mostly 633 bool _debug_pagealloc_enabled __read_mostly
634 = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); 634 = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
635 EXPORT_SYMBOL(_debug_pagealloc_enabled); 635 EXPORT_SYMBOL(_debug_pagealloc_enabled);
636 bool _debug_guardpage_enabled __read_mostly; 636 bool _debug_guardpage_enabled __read_mostly;
637 637
638 static int __init early_debug_pagealloc(char *buf) 638 static int __init early_debug_pagealloc(char *buf)
639 { 639 {
640 if (!buf) 640 if (!buf)
641 return -EINVAL; 641 return -EINVAL;
642 return kstrtobool(buf, &_debug_pagealloc_enabled); 642 return kstrtobool(buf, &_debug_pagealloc_enabled);
643 } 643 }
644 early_param("debug_pagealloc", early_debug_pagealloc); 644 early_param("debug_pagealloc", early_debug_pagealloc);
645 645
646 static bool need_debug_guardpage(void) 646 static bool need_debug_guardpage(void)
647 { 647 {
648 /* If we don't use debug_pagealloc, we don't need guard page */ 648 /* If we don't use debug_pagealloc, we don't need guard page */
649 if (!debug_pagealloc_enabled()) 649 if (!debug_pagealloc_enabled())
650 return false; 650 return false;
651 651
652 if (!debug_guardpage_minorder()) 652 if (!debug_guardpage_minorder())
653 return false; 653 return false;
654 654
655 return true; 655 return true;
656 } 656 }
657 657
658 static void init_debug_guardpage(void) 658 static void init_debug_guardpage(void)
659 { 659 {
660 if (!debug_pagealloc_enabled()) 660 if (!debug_pagealloc_enabled())
661 return; 661 return;
662 662
663 if (!debug_guardpage_minorder()) 663 if (!debug_guardpage_minorder())
664 return; 664 return;
665 665
666 _debug_guardpage_enabled = true; 666 _debug_guardpage_enabled = true;
667 } 667 }
668 668
669 struct page_ext_operations debug_guardpage_ops = { 669 struct page_ext_operations debug_guardpage_ops = {
670 .need = need_debug_guardpage, 670 .need = need_debug_guardpage,
671 .init = init_debug_guardpage, 671 .init = init_debug_guardpage,
672 }; 672 };
673 673
674 static int __init debug_guardpage_minorder_setup(char *buf) 674 static int __init debug_guardpage_minorder_setup(char *buf)
675 { 675 {
676 unsigned long res; 676 unsigned long res;
677 677
678 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 678 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
679 pr_err("Bad debug_guardpage_minorder value\n"); 679 pr_err("Bad debug_guardpage_minorder value\n");
680 return 0; 680 return 0;
681 } 681 }
682 _debug_guardpage_minorder = res; 682 _debug_guardpage_minorder = res;
683 pr_info("Setting debug_guardpage_minorder to %lu\n", res); 683 pr_info("Setting debug_guardpage_minorder to %lu\n", res);
684 return 0; 684 return 0;
685 } 685 }
686 early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); 686 early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
687 687
688 static inline bool set_page_guard(struct zone *zone, struct page *page, 688 static inline bool set_page_guard(struct zone *zone, struct page *page,
689 unsigned int order, int migratetype) 689 unsigned int order, int migratetype)
690 { 690 {
691 struct page_ext *page_ext; 691 struct page_ext *page_ext;
692 692
693 if (!debug_guardpage_enabled()) 693 if (!debug_guardpage_enabled())
694 return false; 694 return false;
695 695
696 if (order >= debug_guardpage_minorder()) 696 if (order >= debug_guardpage_minorder())
697 return false; 697 return false;
698 698
699 page_ext = lookup_page_ext(page); 699 page_ext = lookup_page_ext(page);
700 if (unlikely(!page_ext)) 700 if (unlikely(!page_ext))
701 return false; 701 return false;
702 702
703 __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); 703 __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
704 704
705 INIT_LIST_HEAD(&page->lru); 705 INIT_LIST_HEAD(&page->lru);
706 set_page_private(page, order); 706 set_page_private(page, order);
707 /* Guard pages are not available for any usage */ 707 /* Guard pages are not available for any usage */
708 __mod_zone_freepage_state(zone, -(1 << order), migratetype); 708 __mod_zone_freepage_state(zone, -(1 << order), migratetype);
709 709
710 return true; 710 return true;
711 } 711 }
712 712
713 static inline void clear_page_guard(struct zone *zone, struct page *page, 713 static inline void clear_page_guard(struct zone *zone, struct page *page,
714 unsigned int order, int migratetype) 714 unsigned int order, int migratetype)
715 { 715 {
716 struct page_ext *page_ext; 716 struct page_ext *page_ext;
717 717
718 if (!debug_guardpage_enabled()) 718 if (!debug_guardpage_enabled())
719 return; 719 return;
720 720
721 page_ext = lookup_page_ext(page); 721 page_ext = lookup_page_ext(page);
722 if (unlikely(!page_ext)) 722 if (unlikely(!page_ext))
723 return; 723 return;
724 724
725 __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); 725 __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
726 726
727 set_page_private(page, 0); 727 set_page_private(page, 0);
728 if (!is_migrate_isolate(migratetype)) 728 if (!is_migrate_isolate(migratetype))
729 __mod_zone_freepage_state(zone, (1 << order), migratetype); 729 __mod_zone_freepage_state(zone, (1 << order), migratetype);
730 } 730 }
731 #else 731 #else
732 struct page_ext_operations debug_guardpage_ops; 732 struct page_ext_operations debug_guardpage_ops;
733 static inline bool set_page_guard(struct zone *zone, struct page *page, 733 static inline bool set_page_guard(struct zone *zone, struct page *page,
734 unsigned int order, int migratetype) { return false; } 734 unsigned int order, int migratetype) { return false; }
735 static inline void clear_page_guard(struct zone *zone, struct page *page, 735 static inline void clear_page_guard(struct zone *zone, struct page *page,
736 unsigned int order, int migratetype) {} 736 unsigned int order, int migratetype) {}
737 #endif 737 #endif
738 738
739 static inline void set_page_order(struct page *page, unsigned int order) 739 static inline void set_page_order(struct page *page, unsigned int order)
740 { 740 {
741 set_page_private(page, order); 741 set_page_private(page, order);
742 __SetPageBuddy(page); 742 __SetPageBuddy(page);
743 } 743 }
744 744
745 static inline void rmv_page_order(struct page *page) 745 static inline void rmv_page_order(struct page *page)
746 { 746 {
747 __ClearPageBuddy(page); 747 __ClearPageBuddy(page);
748 set_page_private(page, 0); 748 set_page_private(page, 0);
749 } 749 }
750 750
751 /* 751 /*
752 * This function checks whether a page is free && is the buddy 752 * This function checks whether a page is free && is the buddy
753 * we can do coalesce a page and its buddy if 753 * we can do coalesce a page and its buddy if
754 * (a) the buddy is not in a hole (check before calling!) && 754 * (a) the buddy is not in a hole (check before calling!) &&
755 * (b) the buddy is in the buddy system && 755 * (b) the buddy is in the buddy system &&
756 * (c) a page and its buddy have the same order && 756 * (c) a page and its buddy have the same order &&
757 * (d) a page and its buddy are in the same zone. 757 * (d) a page and its buddy are in the same zone.
758 * 758 *
759 * For recording whether a page is in the buddy system, we set ->_mapcount 759 * For recording whether a page is in the buddy system, we set ->_mapcount
760 * PAGE_BUDDY_MAPCOUNT_VALUE. 760 * PAGE_BUDDY_MAPCOUNT_VALUE.
761 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is 761 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
762 * serialized by zone->lock. 762 * serialized by zone->lock.
763 * 763 *
764 * For recording page's order, we use page_private(page). 764 * For recording page's order, we use page_private(page).
765 */ 765 */
766 static inline int page_is_buddy(struct page *page, struct page *buddy, 766 static inline int page_is_buddy(struct page *page, struct page *buddy,
767 unsigned int order) 767 unsigned int order)
768 { 768 {
769 if (page_is_guard(buddy) && page_order(buddy) == order) { 769 if (page_is_guard(buddy) && page_order(buddy) == order) {
770 if (page_zone_id(page) != page_zone_id(buddy)) 770 if (page_zone_id(page) != page_zone_id(buddy))
771 return 0; 771 return 0;
772 772
773 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); 773 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
774 774
775 return 1; 775 return 1;
776 } 776 }
777 777
778 if (PageBuddy(buddy) && page_order(buddy) == order) { 778 if (PageBuddy(buddy) && page_order(buddy) == order) {
779 /* 779 /*
780 * zone check is done late to avoid uselessly 780 * zone check is done late to avoid uselessly
781 * calculating zone/node ids for pages that could 781 * calculating zone/node ids for pages that could
782 * never merge. 782 * never merge.
783 */ 783 */
784 if (page_zone_id(page) != page_zone_id(buddy)) 784 if (page_zone_id(page) != page_zone_id(buddy))
785 return 0; 785 return 0;
786 786
787 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); 787 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
788 788
789 return 1; 789 return 1;
790 } 790 }
791 return 0; 791 return 0;
792 } 792 }
793 793
794 /* 794 /*
795 * Freeing function for a buddy system allocator. 795 * Freeing function for a buddy system allocator.
796 * 796 *
797 * The concept of a buddy system is to maintain direct-mapped table 797 * The concept of a buddy system is to maintain direct-mapped table
798 * (containing bit values) for memory blocks of various "orders". 798 * (containing bit values) for memory blocks of various "orders".
799 * The bottom level table contains the map for the smallest allocatable 799 * The bottom level table contains the map for the smallest allocatable
800 * units of memory (here, pages), and each level above it describes 800 * units of memory (here, pages), and each level above it describes
801 * pairs of units from the levels below, hence, "buddies". 801 * pairs of units from the levels below, hence, "buddies".
802 * At a high level, all that happens here is marking the table entry 802 * At a high level, all that happens here is marking the table entry
803 * at the bottom level available, and propagating the changes upward 803 * at the bottom level available, and propagating the changes upward
804 * as necessary, plus some accounting needed to play nicely with other 804 * as necessary, plus some accounting needed to play nicely with other
805 * parts of the VM system. 805 * parts of the VM system.
806 * At each level, we keep a list of pages, which are heads of continuous 806 * At each level, we keep a list of pages, which are heads of continuous
807 * free pages of length of (1 << order) and marked with _mapcount 807 * free pages of length of (1 << order) and marked with _mapcount
808 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) 808 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
809 * field. 809 * field.
810 * So when we are allocating or freeing one, we can derive the state of the 810 * So when we are allocating or freeing one, we can derive the state of the
811 * other. That is, if we allocate a small block, and both were 811 * other. That is, if we allocate a small block, and both were
812 * free, the remainder of the region must be split into blocks. 812 * free, the remainder of the region must be split into blocks.
813 * If a block is freed, and its buddy is also free, then this 813 * If a block is freed, and its buddy is also free, then this
814 * triggers coalescing into a block of larger size. 814 * triggers coalescing into a block of larger size.
815 * 815 *
816 * -- nyc 816 * -- nyc
817 */ 817 */
818 818
819 static inline void __free_one_page(struct page *page, 819 static inline void __free_one_page(struct page *page,
820 unsigned long pfn, 820 unsigned long pfn,
821 struct zone *zone, unsigned int order, 821 struct zone *zone, unsigned int order,
822 int migratetype) 822 int migratetype)
823 { 823 {
824 unsigned long combined_pfn; 824 unsigned long combined_pfn;
825 unsigned long uninitialized_var(buddy_pfn); 825 unsigned long uninitialized_var(buddy_pfn);
826 struct page *buddy; 826 struct page *buddy;
827 unsigned int max_order; 827 unsigned int max_order;
828 828
829 max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); 829 max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
830 830
831 VM_BUG_ON(!zone_is_initialized(zone)); 831 VM_BUG_ON(!zone_is_initialized(zone));
832 VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); 832 VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
833 833
834 VM_BUG_ON(migratetype == -1); 834 VM_BUG_ON(migratetype == -1);
835 if (likely(!is_migrate_isolate(migratetype))) 835 if (likely(!is_migrate_isolate(migratetype)))
836 __mod_zone_freepage_state(zone, 1 << order, migratetype); 836 __mod_zone_freepage_state(zone, 1 << order, migratetype);
837 837
838 VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); 838 VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
839 VM_BUG_ON_PAGE(bad_range(zone, page), page); 839 VM_BUG_ON_PAGE(bad_range(zone, page), page);
840 840
841 continue_merging: 841 continue_merging:
842 while (order < max_order - 1) { 842 while (order < max_order - 1) {
843 buddy_pfn = __find_buddy_pfn(pfn, order); 843 buddy_pfn = __find_buddy_pfn(pfn, order);
844 buddy = page + (buddy_pfn - pfn); 844 buddy = page + (buddy_pfn - pfn);
845 845
846 if (!pfn_valid_within(buddy_pfn)) 846 if (!pfn_valid_within(buddy_pfn))
847 goto done_merging; 847 goto done_merging;
848 if (!page_is_buddy(page, buddy, order)) 848 if (!page_is_buddy(page, buddy, order))
849 goto done_merging; 849 goto done_merging;
850 /* 850 /*
851 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 851 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
852 * merge with it and move up one order. 852 * merge with it and move up one order.
853 */ 853 */
854 if (page_is_guard(buddy)) { 854 if (page_is_guard(buddy)) {
855 clear_page_guard(zone, buddy, order, migratetype); 855 clear_page_guard(zone, buddy, order, migratetype);
856 } else { 856 } else {
857 list_del(&buddy->lru); 857 list_del(&buddy->lru);
858 zone->free_area[order].nr_free--; 858 zone->free_area[order].nr_free--;
859 rmv_page_order(buddy); 859 rmv_page_order(buddy);
860 } 860 }
861 combined_pfn = buddy_pfn & pfn; 861 combined_pfn = buddy_pfn & pfn;
862 page = page + (combined_pfn - pfn); 862 page = page + (combined_pfn - pfn);
863 pfn = combined_pfn; 863 pfn = combined_pfn;
864 order++; 864 order++;
865 } 865 }
866 if (max_order < MAX_ORDER) { 866 if (max_order < MAX_ORDER) {
867 /* If we are here, it means order is >= pageblock_order. 867 /* If we are here, it means order is >= pageblock_order.
868 * We want to prevent merge between freepages on isolate 868 * We want to prevent merge between freepages on isolate
869 * pageblock and normal pageblock. Without this, pageblock 869 * pageblock and normal pageblock. Without this, pageblock
870 * isolation could cause incorrect freepage or CMA accounting. 870 * isolation could cause incorrect freepage or CMA accounting.
871 * 871 *
872 * We don't want to hit this code for the more frequent 872 * We don't want to hit this code for the more frequent
873 * low-order merging. 873 * low-order merging.
874 */ 874 */
875 if (unlikely(has_isolate_pageblock(zone))) { 875 if (unlikely(has_isolate_pageblock(zone))) {
876 int buddy_mt; 876 int buddy_mt;
877 877
878 buddy_pfn = __find_buddy_pfn(pfn, order); 878 buddy_pfn = __find_buddy_pfn(pfn, order);
879 buddy = page + (buddy_pfn - pfn); 879 buddy = page + (buddy_pfn - pfn);
880 buddy_mt = get_pageblock_migratetype(buddy); 880 buddy_mt = get_pageblock_migratetype(buddy);
881 881
882 if (migratetype != buddy_mt 882 if (migratetype != buddy_mt
883 && (is_migrate_isolate(migratetype) || 883 && (is_migrate_isolate(migratetype) ||
884 is_migrate_isolate(buddy_mt))) 884 is_migrate_isolate(buddy_mt)))
885 goto done_merging; 885 goto done_merging;
886 } 886 }
887 max_order++; 887 max_order++;
888 goto continue_merging; 888 goto continue_merging;
889 } 889 }
890 890
891 done_merging: 891 done_merging:
892 set_page_order(page, order); 892 set_page_order(page, order);
893 893
894 /* 894 /*
895 * If this is not the largest possible page, check if the buddy 895 * If this is not the largest possible page, check if the buddy
896 * of the next-highest order is free. If it is, it's possible 896 * of the next-highest order is free. If it is, it's possible
897 * that pages are being freed that will coalesce soon. In case, 897 * that pages are being freed that will coalesce soon. In case,
898 * that is happening, add the free page to the tail of the list 898 * that is happening, add the free page to the tail of the list
899 * so it's less likely to be used soon and more likely to be merged 899 * so it's less likely to be used soon and more likely to be merged
900 * as a higher order page 900 * as a higher order page
901 */ 901 */
902 if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) { 902 if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
903 struct page *higher_page, *higher_buddy; 903 struct page *higher_page, *higher_buddy;
904 combined_pfn = buddy_pfn & pfn; 904 combined_pfn = buddy_pfn & pfn;
905 higher_page = page + (combined_pfn - pfn); 905 higher_page = page + (combined_pfn - pfn);
906 buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); 906 buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
907 higher_buddy = higher_page + (buddy_pfn - combined_pfn); 907 higher_buddy = higher_page + (buddy_pfn - combined_pfn);
908 if (pfn_valid_within(buddy_pfn) && 908 if (pfn_valid_within(buddy_pfn) &&
909 page_is_buddy(higher_page, higher_buddy, order + 1)) { 909 page_is_buddy(higher_page, higher_buddy, order + 1)) {
910 list_add_tail(&page->lru, 910 list_add_tail(&page->lru,
911 &zone->free_area[order].free_list[migratetype]); 911 &zone->free_area[order].free_list[migratetype]);
912 goto out; 912 goto out;
913 } 913 }
914 } 914 }
915 915
916 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 916 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
917 out: 917 out:
918 zone->free_area[order].nr_free++; 918 zone->free_area[order].nr_free++;
919 } 919 }
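
The merging above is pure PFN arithmetic: a free block of a given order has its buddy at pfn ^ (1 << order), and the merged, larger block starts at buddy_pfn & pfn. A minimal userspace sketch of that pairing (find_buddy_pfn below is a local stand-in for __find_buddy_pfn, and the sample PFN is arbitrary):

    /* Illustrative userspace model of the buddy PFN arithmetic above. */
    #include <stdio.h>

    static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
    {
            return pfn ^ (1UL << order);   /* flip the bit that selects the pair member */
    }

    int main(void)
    {
            unsigned long pfn = 0x1234;    /* arbitrary order-2 aligned PFN */
            unsigned int order = 2;
            unsigned long buddy = find_buddy_pfn(pfn, order);
            unsigned long combined = buddy & pfn;   /* start of the merged order+1 block */

            printf("pfn=%#lx buddy=%#lx merged block starts at %#lx (order %u)\n",
                   pfn, buddy, combined, order + 1);
            return 0;
    }
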
920 920
921 /* 921 /*
922 * A bad page could be due to a number of fields. Instead of multiple branches, 922 * A bad page could be due to a number of fields. Instead of multiple branches,
923 * try to check multiple fields with one check. The caller must do a detailed 923 * try to check multiple fields with one check. The caller must do a detailed
924 * check if necessary. 924 * check if necessary.
925 */ 925 */
926 static inline bool page_expected_state(struct page *page, 926 static inline bool page_expected_state(struct page *page,
927 unsigned long check_flags) 927 unsigned long check_flags)
928 { 928 {
929 if (unlikely(atomic_read(&page->_mapcount) != -1)) 929 if (unlikely(atomic_read(&page->_mapcount) != -1))
930 return false; 930 return false;
931 931
932 if (unlikely((unsigned long)page->mapping | 932 if (unlikely((unsigned long)page->mapping |
933 page_ref_count(page) | 933 page_ref_count(page) |
934 #ifdef CONFIG_MEMCG 934 #ifdef CONFIG_MEMCG
935 (unsigned long)page->mem_cgroup | 935 (unsigned long)page->mem_cgroup |
936 #endif 936 #endif
937 (page->flags & check_flags))) 937 (page->flags & check_flags)))
938 return false; 938 return false;
939 939
940 return true; 940 return true;
941 } 941 }
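
page_expected_state() folds every field that must be zero or NULL into a single bitwise OR, so the common case costs one test and one branch. A rough userspace illustration of the same trick; the struct and field names here are invented for the sketch and are not struct page:

    #include <stdbool.h>
    #include <stdio.h>

    struct fake_page {                 /* hypothetical stand-in, not struct page */
            unsigned long mapping;
            int refcount;
            unsigned long flags;
    };

    /* One branch instead of three: OR together everything that must be zero. */
    static bool fields_all_clear(const struct fake_page *p, unsigned long check_flags)
    {
            return ((unsigned long)p->mapping |
                    (unsigned long)p->refcount |
                    (p->flags & check_flags)) == 0;
    }

    int main(void)
    {
            struct fake_page ok  = { 0, 0, 0 };
            struct fake_page bad = { 0, 1, 0 };

            printf("ok: %d, bad: %d\n",
                   fields_all_clear(&ok, ~0UL), fields_all_clear(&bad, ~0UL));
            return 0;
    }
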
942 942
943 static void free_pages_check_bad(struct page *page) 943 static void free_pages_check_bad(struct page *page)
944 { 944 {
945 const char *bad_reason; 945 const char *bad_reason;
946 unsigned long bad_flags; 946 unsigned long bad_flags;
947 947
948 bad_reason = NULL; 948 bad_reason = NULL;
949 bad_flags = 0; 949 bad_flags = 0;
950 950
951 if (unlikely(atomic_read(&page->_mapcount) != -1)) 951 if (unlikely(atomic_read(&page->_mapcount) != -1))
952 bad_reason = "nonzero mapcount"; 952 bad_reason = "nonzero mapcount";
953 if (unlikely(page->mapping != NULL)) 953 if (unlikely(page->mapping != NULL))
954 bad_reason = "non-NULL mapping"; 954 bad_reason = "non-NULL mapping";
955 if (unlikely(page_ref_count(page) != 0)) 955 if (unlikely(page_ref_count(page) != 0))
956 bad_reason = "nonzero _refcount"; 956 bad_reason = "nonzero _refcount";
957 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { 957 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
958 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; 958 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
959 bad_flags = PAGE_FLAGS_CHECK_AT_FREE; 959 bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
960 } 960 }
961 #ifdef CONFIG_MEMCG 961 #ifdef CONFIG_MEMCG
962 if (unlikely(page->mem_cgroup)) 962 if (unlikely(page->mem_cgroup))
963 bad_reason = "page still charged to cgroup"; 963 bad_reason = "page still charged to cgroup";
964 #endif 964 #endif
965 bad_page(page, bad_reason, bad_flags); 965 bad_page(page, bad_reason, bad_flags);
966 } 966 }
967 967
968 static inline int free_pages_check(struct page *page) 968 static inline int free_pages_check(struct page *page)
969 { 969 {
970 if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) 970 if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
971 return 0; 971 return 0;
972 972
973 /* Something has gone sideways, find it */ 973 /* Something has gone sideways, find it */
974 free_pages_check_bad(page); 974 free_pages_check_bad(page);
975 return 1; 975 return 1;
976 } 976 }
977 977
978 static int free_tail_pages_check(struct page *head_page, struct page *page) 978 static int free_tail_pages_check(struct page *head_page, struct page *page)
979 { 979 {
980 int ret = 1; 980 int ret = 1;
981 981
982 /* 982 /*
983 * We rely on page->lru.next never having bit 0 set, unless the page 983 * We rely on page->lru.next never having bit 0 set, unless the page
984 * is PageTail(). Let's make sure that's true even for poisoned ->lru. 984 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
985 */ 985 */
986 BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); 986 BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
987 987
988 if (!IS_ENABLED(CONFIG_DEBUG_VM)) { 988 if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
989 ret = 0; 989 ret = 0;
990 goto out; 990 goto out;
991 } 991 }
992 switch (page - head_page) { 992 switch (page - head_page) {
993 case 1: 993 case 1:
994 /* the first tail page: ->mapping is compound_mapcount() */ 994 /* the first tail page: ->mapping is compound_mapcount() */
995 if (unlikely(compound_mapcount(page))) { 995 if (unlikely(compound_mapcount(page))) {
996 bad_page(page, "nonzero compound_mapcount", 0); 996 bad_page(page, "nonzero compound_mapcount", 0);
997 goto out; 997 goto out;
998 } 998 }
999 break; 999 break;
1000 case 2: 1000 case 2:
1001 /* 1001 /*
1002 * the second tail page: ->mapping is 1002 * the second tail page: ->mapping is
1003 * page_deferred_list().next -- ignore value. 1003 * page_deferred_list().next -- ignore value.
1004 */ 1004 */
1005 break; 1005 break;
1006 default: 1006 default:
1007 if (page->mapping != TAIL_MAPPING) { 1007 if (page->mapping != TAIL_MAPPING) {
1008 bad_page(page, "corrupted mapping in tail page", 0); 1008 bad_page(page, "corrupted mapping in tail page", 0);
1009 goto out; 1009 goto out;
1010 } 1010 }
1011 break; 1011 break;
1012 } 1012 }
1013 if (unlikely(!PageTail(page))) { 1013 if (unlikely(!PageTail(page))) {
1014 bad_page(page, "PageTail not set", 0); 1014 bad_page(page, "PageTail not set", 0);
1015 goto out; 1015 goto out;
1016 } 1016 }
1017 if (unlikely(compound_head(page) != head_page)) { 1017 if (unlikely(compound_head(page) != head_page)) {
1018 bad_page(page, "compound_head not consistent", 0); 1018 bad_page(page, "compound_head not consistent", 0);
1019 goto out; 1019 goto out;
1020 } 1020 }
1021 ret = 0; 1021 ret = 0;
1022 out: 1022 out:
1023 page->mapping = NULL; 1023 page->mapping = NULL;
1024 clear_compound_head(page); 1024 clear_compound_head(page);
1025 return ret; 1025 return ret;
1026 } 1026 }
1027 1027
1028 static __always_inline bool free_pages_prepare(struct page *page, 1028 static __always_inline bool free_pages_prepare(struct page *page,
1029 unsigned int order, bool check_free) 1029 unsigned int order, bool check_free)
1030 { 1030 {
1031 int bad = 0; 1031 int bad = 0;
1032 1032
1033 VM_BUG_ON_PAGE(PageTail(page), page); 1033 VM_BUG_ON_PAGE(PageTail(page), page);
1034 1034
1035 trace_mm_page_free(page, order); 1035 trace_mm_page_free(page, order);
1036 1036
1037 /* 1037 /*
1038 * Check tail pages before head page information is cleared to 1038 * Check tail pages before head page information is cleared to
1039 * avoid checking PageCompound for order-0 pages. 1039 * avoid checking PageCompound for order-0 pages.
1040 */ 1040 */
1041 if (unlikely(order)) { 1041 if (unlikely(order)) {
1042 bool compound = PageCompound(page); 1042 bool compound = PageCompound(page);
1043 int i; 1043 int i;
1044 1044
1045 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); 1045 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
1046 1046
1047 if (compound) 1047 if (compound)
1048 ClearPageDoubleMap(page); 1048 ClearPageDoubleMap(page);
1049 for (i = 1; i < (1 << order); i++) { 1049 for (i = 1; i < (1 << order); i++) {
1050 if (compound) 1050 if (compound)
1051 bad += free_tail_pages_check(page, page + i); 1051 bad += free_tail_pages_check(page, page + i);
1052 if (unlikely(free_pages_check(page + i))) { 1052 if (unlikely(free_pages_check(page + i))) {
1053 bad++; 1053 bad++;
1054 continue; 1054 continue;
1055 } 1055 }
1056 (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 1056 (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1057 } 1057 }
1058 } 1058 }
1059 if (PageMappingFlags(page)) 1059 if (PageMappingFlags(page))
1060 page->mapping = NULL; 1060 page->mapping = NULL;
1061 if (memcg_kmem_enabled() && PageKmemcg(page)) 1061 if (memcg_kmem_enabled() && PageKmemcg(page))
1062 memcg_kmem_uncharge(page, order); 1062 memcg_kmem_uncharge(page, order);
1063 if (check_free) 1063 if (check_free)
1064 bad += free_pages_check(page); 1064 bad += free_pages_check(page);
1065 if (bad) 1065 if (bad)
1066 return false; 1066 return false;
1067 1067
1068 page_cpupid_reset_last(page); 1068 page_cpupid_reset_last(page);
1069 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 1069 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1070 reset_page_owner(page, order); 1070 reset_page_owner(page, order);
1071 1071
1072 if (!PageHighMem(page)) { 1072 if (!PageHighMem(page)) {
1073 debug_check_no_locks_freed(page_address(page), 1073 debug_check_no_locks_freed(page_address(page),
1074 PAGE_SIZE << order); 1074 PAGE_SIZE << order);
1075 debug_check_no_obj_freed(page_address(page), 1075 debug_check_no_obj_freed(page_address(page),
1076 PAGE_SIZE << order); 1076 PAGE_SIZE << order);
1077 } 1077 }
1078 arch_free_page(page, order); 1078 arch_free_page(page, order);
1079 kernel_poison_pages(page, 1 << order, 0); 1079 kernel_poison_pages(page, 1 << order, 0);
1080 kernel_map_pages(page, 1 << order, 0); 1080 kernel_map_pages(page, 1 << order, 0);
1081 kasan_free_pages(page, order); 1081 kasan_free_pages(page, order);
1082 1082
1083 return true; 1083 return true;
1084 } 1084 }
1085 1085
1086 #ifdef CONFIG_DEBUG_VM 1086 #ifdef CONFIG_DEBUG_VM
1087 static inline bool free_pcp_prepare(struct page *page) 1087 static inline bool free_pcp_prepare(struct page *page)
1088 { 1088 {
1089 return free_pages_prepare(page, 0, true); 1089 return free_pages_prepare(page, 0, true);
1090 } 1090 }
1091 1091
1092 static inline bool bulkfree_pcp_prepare(struct page *page) 1092 static inline bool bulkfree_pcp_prepare(struct page *page)
1093 { 1093 {
1094 return false; 1094 return false;
1095 } 1095 }
1096 #else 1096 #else
1097 static bool free_pcp_prepare(struct page *page) 1097 static bool free_pcp_prepare(struct page *page)
1098 { 1098 {
1099 return free_pages_prepare(page, 0, false); 1099 return free_pages_prepare(page, 0, false);
1100 } 1100 }
1101 1101
1102 static bool bulkfree_pcp_prepare(struct page *page) 1102 static bool bulkfree_pcp_prepare(struct page *page)
1103 { 1103 {
1104 return free_pages_check(page); 1104 return free_pages_check(page);
1105 } 1105 }
1106 #endif /* CONFIG_DEBUG_VM */ 1106 #endif /* CONFIG_DEBUG_VM */
1107 1107
1108 /* 1108 /*
1109 * Frees a number of pages from the PCP lists 1109 * Frees a number of pages from the PCP lists
1110 * Assumes all pages on list are in same zone, and of same order. 1110 * Assumes all pages on list are in same zone, and of same order.
1111 * count is the number of pages to free. 1111 * count is the number of pages to free.
1112 * 1112 *
1113 * If the zone was previously in an "all pages pinned" state then look to 1113 * If the zone was previously in an "all pages pinned" state then look to
1114 * see if this freeing clears that state. 1114 * see if this freeing clears that state.
1115 * 1115 *
1116 * And clear the zone's pages_scanned counter, to hold off the "all pages are 1116 * And clear the zone's pages_scanned counter, to hold off the "all pages are
1117 * pinned" detection logic. 1117 * pinned" detection logic.
1118 */ 1118 */
1119 static void free_pcppages_bulk(struct zone *zone, int count, 1119 static void free_pcppages_bulk(struct zone *zone, int count,
1120 struct per_cpu_pages *pcp) 1120 struct per_cpu_pages *pcp)
1121 { 1121 {
1122 int migratetype = 0; 1122 int migratetype = 0;
1123 int batch_free = 0; 1123 int batch_free = 0;
1124 bool isolated_pageblocks; 1124 bool isolated_pageblocks;
1125 1125
1126 spin_lock(&zone->lock); 1126 spin_lock(&zone->lock);
1127 isolated_pageblocks = has_isolate_pageblock(zone); 1127 isolated_pageblocks = has_isolate_pageblock(zone);
1128 1128
1129 while (count) { 1129 while (count) {
1130 struct page *page; 1130 struct page *page;
1131 struct list_head *list; 1131 struct list_head *list;
1132 1132
1133 /* 1133 /*
1134 * Remove pages from lists in a round-robin fashion. A 1134 * Remove pages from lists in a round-robin fashion. A
1135 * batch_free count is maintained that is incremented when an 1135 * batch_free count is maintained that is incremented when an
1136 * empty list is encountered. This is so more pages are freed 1136 * empty list is encountered. This is so more pages are freed
1137 * off fuller lists instead of spinning excessively around empty 1137 * off fuller lists instead of spinning excessively around empty
1138 * lists 1138 * lists
1139 */ 1139 */
1140 do { 1140 do {
1141 batch_free++; 1141 batch_free++;
1142 if (++migratetype == MIGRATE_PCPTYPES) 1142 if (++migratetype == MIGRATE_PCPTYPES)
1143 migratetype = 0; 1143 migratetype = 0;
1144 list = &pcp->lists[migratetype]; 1144 list = &pcp->lists[migratetype];
1145 } while (list_empty(list)); 1145 } while (list_empty(list));
1146 1146
1147 /* This is the only non-empty list. Free them all. */ 1147 /* This is the only non-empty list. Free them all. */
1148 if (batch_free == MIGRATE_PCPTYPES) 1148 if (batch_free == MIGRATE_PCPTYPES)
1149 batch_free = count; 1149 batch_free = count;
1150 1150
1151 do { 1151 do {
1152 int mt; /* migratetype of the to-be-freed page */ 1152 int mt; /* migratetype of the to-be-freed page */
1153 1153
1154 page = list_last_entry(list, struct page, lru); 1154 page = list_last_entry(list, struct page, lru);
1155 /* must delete as __free_one_page list manipulates */ 1155 /* must delete as __free_one_page list manipulates */
1156 list_del(&page->lru); 1156 list_del(&page->lru);
1157 1157
1158 mt = get_pcppage_migratetype(page); 1158 mt = get_pcppage_migratetype(page);
1159 /* MIGRATE_ISOLATE page should not go to pcplists */ 1159 /* MIGRATE_ISOLATE page should not go to pcplists */
1160 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); 1160 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1161 /* Pageblock could have been isolated meanwhile */ 1161 /* Pageblock could have been isolated meanwhile */
1162 if (unlikely(isolated_pageblocks)) 1162 if (unlikely(isolated_pageblocks))
1163 mt = get_pageblock_migratetype(page); 1163 mt = get_pageblock_migratetype(page);
1164 1164
1165 if (bulkfree_pcp_prepare(page)) 1165 if (bulkfree_pcp_prepare(page))
1166 continue; 1166 continue;
1167 1167
1168 __free_one_page(page, page_to_pfn(page), zone, 0, mt); 1168 __free_one_page(page, page_to_pfn(page), zone, 0, mt);
1169 trace_mm_page_pcpu_drain(page, 0, mt); 1169 trace_mm_page_pcpu_drain(page, 0, mt);
1170 } while (--count && --batch_free && !list_empty(list)); 1170 } while (--count && --batch_free && !list_empty(list));
1171 } 1171 }
1172 spin_unlock(&zone->lock); 1172 spin_unlock(&zone->lock);
1173 } 1173 }
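
The round-robin draining above can be modelled outside the kernel: every time an empty list is skipped, batch_free grows, so the next non-empty list gives up a larger batch and fuller lists drain faster. A small sketch under that reading, with made-up list lengths and NTYPES standing in for MIGRATE_PCPTYPES:

    #include <stdio.h>

    #define NTYPES 3   /* stand-in for MIGRATE_PCPTYPES */

    int main(void)
    {
            int lists[NTYPES] = { 5, 0, 12 };   /* pages queued per migratetype */
            int count = 10;                     /* pages we want to free */
            int migratetype = 0, batch_free = 0;

            while (count) {
                    /* find the next non-empty list, growing the batch as we skip empties */
                    do {
                            batch_free++;
                            if (++migratetype == NTYPES)
                                    migratetype = 0;
                    } while (lists[migratetype] == 0);

                    if (batch_free == NTYPES)   /* only one list left: take everything */
                            batch_free = count;

                    do {
                            lists[migratetype]--;
                            printf("freed one page from list %d\n", migratetype);
                    } while (--count && --batch_free && lists[migratetype]);
            }
            return 0;
    }
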
1174 1174
1175 static void free_one_page(struct zone *zone, 1175 static void free_one_page(struct zone *zone,
1176 struct page *page, unsigned long pfn, 1176 struct page *page, unsigned long pfn,
1177 unsigned int order, 1177 unsigned int order,
1178 int migratetype) 1178 int migratetype)
1179 { 1179 {
1180 spin_lock(&zone->lock); 1180 spin_lock(&zone->lock);
1181 if (unlikely(has_isolate_pageblock(zone) || 1181 if (unlikely(has_isolate_pageblock(zone) ||
1182 is_migrate_isolate(migratetype))) { 1182 is_migrate_isolate(migratetype))) {
1183 migratetype = get_pfnblock_migratetype(page, pfn); 1183 migratetype = get_pfnblock_migratetype(page, pfn);
1184 } 1184 }
1185 __free_one_page(page, pfn, zone, order, migratetype); 1185 __free_one_page(page, pfn, zone, order, migratetype);
1186 spin_unlock(&zone->lock); 1186 spin_unlock(&zone->lock);
1187 } 1187 }
1188 1188
1189 static void __meminit __init_single_page(struct page *page, unsigned long pfn, 1189 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
1190 unsigned long zone, int nid) 1190 unsigned long zone, int nid)
1191 { 1191 {
1192 set_page_links(page, zone, nid, pfn); 1192 set_page_links(page, zone, nid, pfn);
1193 init_page_count(page); 1193 init_page_count(page);
1194 page_mapcount_reset(page); 1194 page_mapcount_reset(page);
1195 page_cpupid_reset_last(page); 1195 page_cpupid_reset_last(page);
1196 1196
1197 INIT_LIST_HEAD(&page->lru); 1197 INIT_LIST_HEAD(&page->lru);
1198 #ifdef WANT_PAGE_VIRTUAL 1198 #ifdef WANT_PAGE_VIRTUAL
1199 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 1199 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1200 if (!is_highmem_idx(zone)) 1200 if (!is_highmem_idx(zone))
1201 set_page_address(page, __va(pfn << PAGE_SHIFT)); 1201 set_page_address(page, __va(pfn << PAGE_SHIFT));
1202 #endif 1202 #endif
1203 } 1203 }
1204 1204
1205 static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, 1205 static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
1206 int nid) 1206 int nid)
1207 { 1207 {
1208 return __init_single_page(pfn_to_page(pfn), pfn, zone, nid); 1208 return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
1209 } 1209 }
1210 1210
1211 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1211 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1212 static void __meminit init_reserved_page(unsigned long pfn) 1212 static void __meminit init_reserved_page(unsigned long pfn)
1213 { 1213 {
1214 pg_data_t *pgdat; 1214 pg_data_t *pgdat;
1215 int nid, zid; 1215 int nid, zid;
1216 1216
1217 if (!early_page_uninitialised(pfn)) 1217 if (!early_page_uninitialised(pfn))
1218 return; 1218 return;
1219 1219
1220 nid = early_pfn_to_nid(pfn); 1220 nid = early_pfn_to_nid(pfn);
1221 pgdat = NODE_DATA(nid); 1221 pgdat = NODE_DATA(nid);
1222 1222
1223 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 1223 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1224 struct zone *zone = &pgdat->node_zones[zid]; 1224 struct zone *zone = &pgdat->node_zones[zid];
1225 1225
1226 if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) 1226 if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
1227 break; 1227 break;
1228 } 1228 }
1229 __init_single_pfn(pfn, zid, nid); 1229 __init_single_pfn(pfn, zid, nid);
1230 } 1230 }
1231 #else 1231 #else
1232 static inline void init_reserved_page(unsigned long pfn) 1232 static inline void init_reserved_page(unsigned long pfn)
1233 { 1233 {
1234 } 1234 }
1235 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 1235 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1236 1236
1237 /* 1237 /*
1238 * Initialised pages do not have PageReserved set. This function is 1238 * Initialised pages do not have PageReserved set. This function is
1239 * called for each range allocated by the bootmem allocator and 1239 * called for each range allocated by the bootmem allocator and
1240 * marks the pages PageReserved. The remaining valid pages are later 1240 * marks the pages PageReserved. The remaining valid pages are later
1241 * sent to the buddy page allocator. 1241 * sent to the buddy page allocator.
1242 */ 1242 */
1243 void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) 1243 void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
1244 { 1244 {
1245 unsigned long start_pfn = PFN_DOWN(start); 1245 unsigned long start_pfn = PFN_DOWN(start);
1246 unsigned long end_pfn = PFN_UP(end); 1246 unsigned long end_pfn = PFN_UP(end);
1247 1247
1248 for (; start_pfn < end_pfn; start_pfn++) { 1248 for (; start_pfn < end_pfn; start_pfn++) {
1249 if (pfn_valid(start_pfn)) { 1249 if (pfn_valid(start_pfn)) {
1250 struct page *page = pfn_to_page(start_pfn); 1250 struct page *page = pfn_to_page(start_pfn);
1251 1251
1252 init_reserved_page(start_pfn); 1252 init_reserved_page(start_pfn);
1253 1253
1254 /* Avoid false-positive PageTail() */ 1254 /* Avoid false-positive PageTail() */
1255 INIT_LIST_HEAD(&page->lru); 1255 INIT_LIST_HEAD(&page->lru);
1256 1256
1257 SetPageReserved(page); 1257 SetPageReserved(page);
1258 } 1258 }
1259 } 1259 }
1260 } 1260 }
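
PFN_DOWN()/PFN_UP() round the physical range outward to whole page frames, so every frame touched by [start, end) ends up reserved. A quick userspace model of just the rounding, assuming 4 KiB pages and a made-up address range:

    #include <stdio.h>

    #define PAGE_SHIFT 12                          /* assume 4 KiB pages */
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
    #define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

    int main(void)
    {
            unsigned long start = 0x1000800;       /* made-up bootmem range */
            unsigned long end   = 0x1003400;

            /* Every page frame touched by [start, end) gets reserved. */
            for (unsigned long pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++)
                    printf("reserve pfn %#lx\n", pfn);
            return 0;
    }
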
1261 1261
1262 static void __free_pages_ok(struct page *page, unsigned int order) 1262 static void __free_pages_ok(struct page *page, unsigned int order)
1263 { 1263 {
1264 unsigned long flags; 1264 unsigned long flags;
1265 int migratetype; 1265 int migratetype;
1266 unsigned long pfn = page_to_pfn(page); 1266 unsigned long pfn = page_to_pfn(page);
1267 1267
1268 if (!free_pages_prepare(page, order, true)) 1268 if (!free_pages_prepare(page, order, true))
1269 return; 1269 return;
1270 1270
1271 migratetype = get_pfnblock_migratetype(page, pfn); 1271 migratetype = get_pfnblock_migratetype(page, pfn);
1272 local_irq_save(flags); 1272 local_irq_save(flags);
1273 __count_vm_events(PGFREE, 1 << order); 1273 __count_vm_events(PGFREE, 1 << order);
1274 free_one_page(page_zone(page), page, pfn, order, migratetype); 1274 free_one_page(page_zone(page), page, pfn, order, migratetype);
1275 local_irq_restore(flags); 1275 local_irq_restore(flags);
1276 } 1276 }
1277 1277
1278 static void __init __free_pages_boot_core(struct page *page, unsigned int order) 1278 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
1279 { 1279 {
1280 unsigned int nr_pages = 1 << order; 1280 unsigned int nr_pages = 1 << order;
1281 struct page *p = page; 1281 struct page *p = page;
1282 unsigned int loop; 1282 unsigned int loop;
1283 1283
1284 prefetchw(p); 1284 prefetchw(p);
1285 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 1285 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
1286 prefetchw(p + 1); 1286 prefetchw(p + 1);
1287 __ClearPageReserved(p); 1287 __ClearPageReserved(p);
1288 set_page_count(p, 0); 1288 set_page_count(p, 0);
1289 } 1289 }
1290 __ClearPageReserved(p); 1290 __ClearPageReserved(p);
1291 set_page_count(p, 0); 1291 set_page_count(p, 0);
1292 1292
1293 page_zone(page)->managed_pages += nr_pages; 1293 page_zone(page)->managed_pages += nr_pages;
1294 set_page_refcounted(page); 1294 set_page_refcounted(page);
1295 __free_pages(page, order); 1295 __free_pages(page, order);
1296 } 1296 }
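
__free_pages_boot_core() uses a prefetch-one-ahead loop: each iteration works on the current page while already prefetching the next, and the final element is handled after the loop. The shape of that idiom, sketched with GCC/Clang's __builtin_prefetch standing in for prefetchw():

    #include <stdio.h>

    int main(void)
    {
            int data[8] = { 0 };
            int *p = data;
            unsigned int n = sizeof(data) / sizeof(data[0]);

            __builtin_prefetch(p, 1);                  /* prefetch for write */
            for (unsigned int i = 0; i < n - 1; i++, p++) {
                    __builtin_prefetch(p + 1, 1);      /* start fetching the next element */
                    *p = 0;                            /* work on the current one */
            }
            *p = 0;                                    /* last element, nothing left to prefetch */

            printf("initialised %u elements\n", n);
            return 0;
    }
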
1297 1297
1298 #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ 1298 #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
1299 defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) 1299 defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
1300 1300
1301 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; 1301 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
1302 1302
1303 int __meminit early_pfn_to_nid(unsigned long pfn) 1303 int __meminit early_pfn_to_nid(unsigned long pfn)
1304 { 1304 {
1305 static DEFINE_SPINLOCK(early_pfn_lock); 1305 static DEFINE_SPINLOCK(early_pfn_lock);
1306 int nid; 1306 int nid;
1307 1307
1308 spin_lock(&early_pfn_lock); 1308 spin_lock(&early_pfn_lock);
1309 nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); 1309 nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
1310 if (nid < 0) 1310 if (nid < 0)
1311 nid = first_online_node; 1311 nid = first_online_node;
1312 spin_unlock(&early_pfn_lock); 1312 spin_unlock(&early_pfn_lock);
1313 1313
1314 return nid; 1314 return nid;
1315 } 1315 }
1316 #endif 1316 #endif
1317 1317
1318 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 1318 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
1319 static inline bool __meminit __maybe_unused 1319 static inline bool __meminit __maybe_unused
1320 meminit_pfn_in_nid(unsigned long pfn, int node, 1320 meminit_pfn_in_nid(unsigned long pfn, int node,
1321 struct mminit_pfnnid_cache *state) 1321 struct mminit_pfnnid_cache *state)
1322 { 1322 {
1323 int nid; 1323 int nid;
1324 1324
1325 nid = __early_pfn_to_nid(pfn, state); 1325 nid = __early_pfn_to_nid(pfn, state);
1326 if (nid >= 0 && nid != node) 1326 if (nid >= 0 && nid != node)
1327 return false; 1327 return false;
1328 return true; 1328 return true;
1329 } 1329 }
1330 1330
1331 /* Only safe to use early in boot when initialisation is single-threaded */ 1331 /* Only safe to use early in boot when initialisation is single-threaded */
1332 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 1332 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1333 { 1333 {
1334 return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); 1334 return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
1335 } 1335 }
1336 1336
1337 #else 1337 #else
1338 1338
1339 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 1339 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1340 { 1340 {
1341 return true; 1341 return true;
1342 } 1342 }
1343 static inline bool __meminit __maybe_unused 1343 static inline bool __meminit __maybe_unused
1344 meminit_pfn_in_nid(unsigned long pfn, int node, 1344 meminit_pfn_in_nid(unsigned long pfn, int node,
1345 struct mminit_pfnnid_cache *state) 1345 struct mminit_pfnnid_cache *state)
1346 { 1346 {
1347 return true; 1347 return true;
1348 } 1348 }
1349 #endif 1349 #endif
1350 1350
1351 1351
1352 void __init __free_pages_bootmem(struct page *page, unsigned long pfn, 1352 void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
1353 unsigned int order) 1353 unsigned int order)
1354 { 1354 {
1355 if (early_page_uninitialised(pfn)) 1355 if (early_page_uninitialised(pfn))
1356 return; 1356 return;
1357 return __free_pages_boot_core(page, order); 1357 return __free_pages_boot_core(page, order);
1358 } 1358 }
1359 1359
1360 /* 1360 /*
1361 * Check that the whole of (or a subset of) a pageblock given by the interval of 1361 * Check that the whole of (or a subset of) a pageblock given by the interval of
1362 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it 1362 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
1363 * with the migration or free compaction scanner. The scanners then need to 1363 * with the migration or free compaction scanner. The scanners then need to
1364 * use only pfn_valid_within() check for arches that allow holes within 1364 * use only pfn_valid_within() check for arches that allow holes within
1365 * pageblocks. 1365 * pageblocks.
1366 * 1366 *
1367 * Return struct page pointer of start_pfn, or NULL if checks were not passed. 1367 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
1368 * 1368 *
1369 * It's possible on some configurations to have a setup like node0 node1 node0 1369 * It's possible on some configurations to have a setup like node0 node1 node0
1370 * i.e. it's possible that all pages within a zone's range of pages do not 1370 * i.e. it's possible that all pages within a zone's range of pages do not
1371 * belong to a single zone. We assume that a border between node0 and node1 1371 * belong to a single zone. We assume that a border between node0 and node1
1372 * can occur within a single pageblock, but not a node0 node1 node0 1372 * can occur within a single pageblock, but not a node0 node1 node0
1373 * interleaving within a single pageblock. It is therefore sufficient to check 1373 * interleaving within a single pageblock. It is therefore sufficient to check
1374 * the first and last page of a pageblock and avoid checking each individual 1374 * the first and last page of a pageblock and avoid checking each individual
1375 * page in a pageblock. 1375 * page in a pageblock.
1376 */ 1376 */
1377 struct page *__pageblock_pfn_to_page(unsigned long start_pfn, 1377 struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
1378 unsigned long end_pfn, struct zone *zone) 1378 unsigned long end_pfn, struct zone *zone)
1379 { 1379 {
1380 struct page *start_page; 1380 struct page *start_page;
1381 struct page *end_page; 1381 struct page *end_page;
1382 1382
1383 /* end_pfn is one past the range we are checking */ 1383 /* end_pfn is one past the range we are checking */
1384 end_pfn--; 1384 end_pfn--;
1385 1385
1386 if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) 1386 if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
1387 return NULL; 1387 return NULL;
1388 1388
1389 start_page = pfn_to_online_page(start_pfn); 1389 start_page = pfn_to_online_page(start_pfn);
1390 if (!start_page) 1390 if (!start_page)
1391 return NULL; 1391 return NULL;
1392 1392
1393 if (page_zone(start_page) != zone) 1393 if (page_zone(start_page) != zone)
1394 return NULL; 1394 return NULL;
1395 1395
1396 end_page = pfn_to_page(end_pfn); 1396 end_page = pfn_to_page(end_pfn);
1397 1397
1398 /* This gives shorter code than deriving page_zone(end_page) */ 1398 /* This gives shorter code than deriving page_zone(end_page) */
1399 if (page_zone_id(start_page) != page_zone_id(end_page)) 1399 if (page_zone_id(start_page) != page_zone_id(end_page))
1400 return NULL; 1400 return NULL;
1401 1401
1402 return start_page; 1402 return start_page;
1403 } 1403 }
1404 1404
1405 void set_zone_contiguous(struct zone *zone) 1405 void set_zone_contiguous(struct zone *zone)
1406 { 1406 {
1407 unsigned long block_start_pfn = zone->zone_start_pfn; 1407 unsigned long block_start_pfn = zone->zone_start_pfn;
1408 unsigned long block_end_pfn; 1408 unsigned long block_end_pfn;
1409 1409
1410 block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); 1410 block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
1411 for (; block_start_pfn < zone_end_pfn(zone); 1411 for (; block_start_pfn < zone_end_pfn(zone);
1412 block_start_pfn = block_end_pfn, 1412 block_start_pfn = block_end_pfn,
1413 block_end_pfn += pageblock_nr_pages) { 1413 block_end_pfn += pageblock_nr_pages) {
1414 1414
1415 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); 1415 block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
1416 1416
1417 if (!__pageblock_pfn_to_page(block_start_pfn, 1417 if (!__pageblock_pfn_to_page(block_start_pfn,
1418 block_end_pfn, zone)) 1418 block_end_pfn, zone))
1419 return; 1419 return;
1420 } 1420 }
1421 1421
1422 /* No hole was found, so the whole zone is contiguous */ 1422 /* No hole was found, so the whole zone is contiguous */
1423 zone->contiguous = true; 1423 zone->contiguous = true;
1424 } 1424 }
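
set_zone_contiguous() walks the zone one pageblock at a time: the end of the first step is aligned up to a pageblock boundary and every step is clamped at the zone end. The iteration pattern on its own, with made-up zone bounds and an assumed pageblock size of 512 pages:

    #include <stdio.h>

    #define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

    int main(void)
    {
            const unsigned long pageblock_nr_pages = 512;     /* assumed block size */
            unsigned long zone_start = 1000, zone_end = 3000; /* made-up, unaligned bounds */

            unsigned long block_start = zone_start;
            unsigned long block_end = ALIGN(block_start + 1, pageblock_nr_pages);

            for (; block_start < zone_end;
                 block_start = block_end, block_end += pageblock_nr_pages) {
                    if (block_end > zone_end)
                            block_end = zone_end;
                    /* here the kernel would validate the block with
                     * __pageblock_pfn_to_page() and bail out on failure */
                    printf("check pageblock [%lu, %lu)\n", block_start, block_end);
            }
            return 0;
    }
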
1425 1425
1426 void clear_zone_contiguous(struct zone *zone) 1426 void clear_zone_contiguous(struct zone *zone)
1427 { 1427 {
1428 zone->contiguous = false; 1428 zone->contiguous = false;
1429 } 1429 }
1430 1430
1431 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1431 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1432 static void __init deferred_free_range(struct page *page, 1432 static void __init deferred_free_range(struct page *page,
1433 unsigned long pfn, int nr_pages) 1433 unsigned long pfn, int nr_pages)
1434 { 1434 {
1435 int i; 1435 int i;
1436 1436
1437 if (!page) 1437 if (!page)
1438 return; 1438 return;
1439 1439
1440 /* Free a large naturally-aligned chunk if possible */ 1440 /* Free a large naturally-aligned chunk if possible */
1441 if (nr_pages == pageblock_nr_pages && 1441 if (nr_pages == pageblock_nr_pages &&
1442 (pfn & (pageblock_nr_pages - 1)) == 0) { 1442 (pfn & (pageblock_nr_pages - 1)) == 0) {
1443 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1443 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1444 __free_pages_boot_core(page, pageblock_order); 1444 __free_pages_boot_core(page, pageblock_order);
1445 return; 1445 return;
1446 } 1446 }
1447 1447
1448 for (i = 0; i < nr_pages; i++, page++, pfn++) { 1448 for (i = 0; i < nr_pages; i++, page++, pfn++) {
1449 if ((pfn & (pageblock_nr_pages - 1)) == 0) 1449 if ((pfn & (pageblock_nr_pages - 1)) == 0)
1450 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1450 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1451 __free_pages_boot_core(page, 0); 1451 __free_pages_boot_core(page, 0);
1452 } 1452 }
1453 } 1453 }
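
The "naturally aligned" test above is a mask check: a pageblock-sized run can be freed as a single high-order page only if its first PFN is a multiple of pageblock_nr_pages. A tiny sketch of that check, with the block size assumed to be 512 pages:

    #include <stdbool.h>
    #include <stdio.h>

    static bool can_free_as_pageblock(unsigned long pfn, int nr_pages,
                                      unsigned long pageblock_nr_pages)
    {
            return nr_pages == (int)pageblock_nr_pages &&
                   (pfn & (pageblock_nr_pages - 1)) == 0;
    }

    int main(void)
    {
            const unsigned long pageblock_nr_pages = 512;    /* assumed block size */

            printf("%d\n", can_free_as_pageblock(0x40000, 512, pageblock_nr_pages)); /* 1 */
            printf("%d\n", can_free_as_pageblock(0x40010, 512, pageblock_nr_pages)); /* 0 */
            printf("%d\n", can_free_as_pageblock(0x40000, 100, pageblock_nr_pages)); /* 0 */
            return 0;
    }
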
1454 1454
1455 /* Completion tracking for deferred_init_memmap() threads */ 1455 /* Completion tracking for deferred_init_memmap() threads */
1456 static atomic_t pgdat_init_n_undone __initdata; 1456 static atomic_t pgdat_init_n_undone __initdata;
1457 static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp); 1457 static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
1458 1458
1459 static inline void __init pgdat_init_report_one_done(void) 1459 static inline void __init pgdat_init_report_one_done(void)
1460 { 1460 {
1461 if (atomic_dec_and_test(&pgdat_init_n_undone)) 1461 if (atomic_dec_and_test(&pgdat_init_n_undone))
1462 complete(&pgdat_init_all_done_comp); 1462 complete(&pgdat_init_all_done_comp);
1463 } 1463 }
1464 1464
1465 /* Initialise remaining memory on a node */ 1465 /* Initialise remaining memory on a node */
1466 static int __init deferred_init_memmap(void *data) 1466 static int __init deferred_init_memmap(void *data)
1467 { 1467 {
1468 pg_data_t *pgdat = data; 1468 pg_data_t *pgdat = data;
1469 int nid = pgdat->node_id; 1469 int nid = pgdat->node_id;
1470 struct mminit_pfnnid_cache nid_init_state = { }; 1470 struct mminit_pfnnid_cache nid_init_state = { };
1471 unsigned long start = jiffies; 1471 unsigned long start = jiffies;
1472 unsigned long nr_pages = 0; 1472 unsigned long nr_pages = 0;
1473 unsigned long walk_start, walk_end; 1473 unsigned long walk_start, walk_end;
1474 int i, zid; 1474 int i, zid;
1475 struct zone *zone; 1475 struct zone *zone;
1476 unsigned long first_init_pfn = pgdat->first_deferred_pfn; 1476 unsigned long first_init_pfn = pgdat->first_deferred_pfn;
1477 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 1477 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1478 1478
1479 if (first_init_pfn == ULONG_MAX) { 1479 if (first_init_pfn == ULONG_MAX) {
1480 pgdat_init_report_one_done(); 1480 pgdat_init_report_one_done();
1481 return 0; 1481 return 0;
1482 } 1482 }
1483 1483
1484 /* Bind memory initialisation thread to a local node if possible */ 1484 /* Bind memory initialisation thread to a local node if possible */
1485 if (!cpumask_empty(cpumask)) 1485 if (!cpumask_empty(cpumask))
1486 set_cpus_allowed_ptr(current, cpumask); 1486 set_cpus_allowed_ptr(current, cpumask);
1487 1487
1488 /* Sanity check boundaries */ 1488 /* Sanity check boundaries */
1489 BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); 1489 BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
1490 BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); 1490 BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
1491 pgdat->first_deferred_pfn = ULONG_MAX; 1491 pgdat->first_deferred_pfn = ULONG_MAX;
1492 1492
1493 /* Only the highest zone is deferred so find it */ 1493 /* Only the highest zone is deferred so find it */
1494 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 1494 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1495 zone = pgdat->node_zones + zid; 1495 zone = pgdat->node_zones + zid;
1496 if (first_init_pfn < zone_end_pfn(zone)) 1496 if (first_init_pfn < zone_end_pfn(zone))
1497 break; 1497 break;
1498 } 1498 }
1499 1499
1500 for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { 1500 for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) {
1501 unsigned long pfn, end_pfn; 1501 unsigned long pfn, end_pfn;
1502 struct page *page = NULL; 1502 struct page *page = NULL;
1503 struct page *free_base_page = NULL; 1503 struct page *free_base_page = NULL;
1504 unsigned long free_base_pfn = 0; 1504 unsigned long free_base_pfn = 0;
1505 int nr_to_free = 0; 1505 int nr_to_free = 0;
1506 1506
1507 end_pfn = min(walk_end, zone_end_pfn(zone)); 1507 end_pfn = min(walk_end, zone_end_pfn(zone));
1508 pfn = first_init_pfn; 1508 pfn = first_init_pfn;
1509 if (pfn < walk_start) 1509 if (pfn < walk_start)
1510 pfn = walk_start; 1510 pfn = walk_start;
1511 if (pfn < zone->zone_start_pfn) 1511 if (pfn < zone->zone_start_pfn)
1512 pfn = zone->zone_start_pfn; 1512 pfn = zone->zone_start_pfn;
1513 1513
1514 for (; pfn < end_pfn; pfn++) { 1514 for (; pfn < end_pfn; pfn++) {
1515 if (!pfn_valid_within(pfn)) 1515 if (!pfn_valid_within(pfn))
1516 goto free_range; 1516 goto free_range;
1517 1517
1518 /* 1518 /*
1519 * Ensure pfn_valid is checked every 1519 * Ensure pfn_valid is checked every
1520 * pageblock_nr_pages for memory holes 1520 * pageblock_nr_pages for memory holes
1521 */ 1521 */
1522 if ((pfn & (pageblock_nr_pages - 1)) == 0) { 1522 if ((pfn & (pageblock_nr_pages - 1)) == 0) {
1523 if (!pfn_valid(pfn)) { 1523 if (!pfn_valid(pfn)) {
1524 page = NULL; 1524 page = NULL;
1525 goto free_range; 1525 goto free_range;
1526 } 1526 }
1527 } 1527 }
1528 1528
1529 if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { 1529 if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
1530 page = NULL; 1530 page = NULL;
1531 goto free_range; 1531 goto free_range;
1532 } 1532 }
1533 1533
1534 /* Minimise pfn page lookups and scheduler checks */ 1534 /* Minimise pfn page lookups and scheduler checks */
1535 if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { 1535 if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
1536 page++; 1536 page++;
1537 } else { 1537 } else {
1538 nr_pages += nr_to_free; 1538 nr_pages += nr_to_free;
1539 deferred_free_range(free_base_page, 1539 deferred_free_range(free_base_page,
1540 free_base_pfn, nr_to_free); 1540 free_base_pfn, nr_to_free);
1541 free_base_page = NULL; 1541 free_base_page = NULL;
1542 free_base_pfn = nr_to_free = 0; 1542 free_base_pfn = nr_to_free = 0;
1543 1543
1544 page = pfn_to_page(pfn); 1544 page = pfn_to_page(pfn);
1545 cond_resched(); 1545 cond_resched();
1546 } 1546 }
1547 1547
1548 if (page->flags) { 1548 if (page->flags) {
1549 VM_BUG_ON(page_zone(page) != zone); 1549 VM_BUG_ON(page_zone(page) != zone);
1550 goto free_range; 1550 goto free_range;
1551 } 1551 }
1552 1552
1553 __init_single_page(page, pfn, zid, nid); 1553 __init_single_page(page, pfn, zid, nid);
1554 if (!free_base_page) { 1554 if (!free_base_page) {
1555 free_base_page = page; 1555 free_base_page = page;
1556 free_base_pfn = pfn; 1556 free_base_pfn = pfn;
1557 nr_to_free = 0; 1557 nr_to_free = 0;
1558 } 1558 }
1559 nr_to_free++; 1559 nr_to_free++;
1560 1560
1561 /* Where possible, batch up pages for a single free */ 1561 /* Where possible, batch up pages for a single free */
1562 continue; 1562 continue;
1563 free_range: 1563 free_range:
1564 /* Free the current block of pages to allocator */ 1564 /* Free the current block of pages to allocator */
1565 nr_pages += nr_to_free; 1565 nr_pages += nr_to_free;
1566 deferred_free_range(free_base_page, free_base_pfn, 1566 deferred_free_range(free_base_page, free_base_pfn,
1567 nr_to_free); 1567 nr_to_free);
1568 free_base_page = NULL; 1568 free_base_page = NULL;
1569 free_base_pfn = nr_to_free = 0; 1569 free_base_pfn = nr_to_free = 0;
1570 } 1570 }
1571 /* Free the last block of pages to allocator */ 1571 /* Free the last block of pages to allocator */
1572 nr_pages += nr_to_free; 1572 nr_pages += nr_to_free;
1573 deferred_free_range(free_base_page, free_base_pfn, nr_to_free); 1573 deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
1574 1574
1575 first_init_pfn = max(end_pfn, first_init_pfn); 1575 first_init_pfn = max(end_pfn, first_init_pfn);
1576 } 1576 }
1577 1577
1578 /* Sanity check that the next zone really is unpopulated */ 1578 /* Sanity check that the next zone really is unpopulated */
1579 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); 1579 WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
1580 1580
1581 pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, 1581 pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
1582 jiffies_to_msecs(jiffies - start)); 1582 jiffies_to_msecs(jiffies - start));
1583 1583
1584 pgdat_init_report_one_done(); 1584 pgdat_init_report_one_done();
1585 return 0; 1585 return 0;
1586 } 1586 }
1587 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 1587 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1588 1588
1589 void __init page_alloc_init_late(void) 1589 void __init page_alloc_init_late(void)
1590 { 1590 {
1591 struct zone *zone; 1591 struct zone *zone;
1592 1592
1593 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1593 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1594 int nid; 1594 int nid;
1595 1595
1596 /* There will be num_node_state(N_MEMORY) threads */ 1596 /* There will be num_node_state(N_MEMORY) threads */
1597 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); 1597 atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
1598 for_each_node_state(nid, N_MEMORY) { 1598 for_each_node_state(nid, N_MEMORY) {
1599 kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); 1599 kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
1600 } 1600 }
1601 1601
1602 /* Block until all are initialised */ 1602 /* Block until all are initialised */
1603 wait_for_completion(&pgdat_init_all_done_comp); 1603 wait_for_completion(&pgdat_init_all_done_comp);
1604 1604
1605 /* Reinit limits that are based on free pages after the kernel is up */ 1605 /* Reinit limits that are based on free pages after the kernel is up */
1606 files_maxfiles_init(); 1606 files_maxfiles_init();
1607 #endif 1607 #endif
1608 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK 1608 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
1609 /* Discard memblock private memory */ 1609 /* Discard memblock private memory */
1610 memblock_discard(); 1610 memblock_discard();
1611 #endif 1611 #endif
1612 1612
1613 for_each_populated_zone(zone) 1613 for_each_populated_zone(zone)
1614 set_zone_contiguous(zone); 1614 set_zone_contiguous(zone);
1615 } 1615 }
1616 1616
1617 #ifdef CONFIG_CMA 1617 #ifdef CONFIG_CMA
1618 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 1618 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
1619 void __init init_cma_reserved_pageblock(struct page *page) 1619 void __init init_cma_reserved_pageblock(struct page *page)
1620 { 1620 {
1621 unsigned i = pageblock_nr_pages; 1621 unsigned i = pageblock_nr_pages;
1622 struct page *p = page; 1622 struct page *p = page;
1623 1623
1624 do { 1624 do {
1625 __ClearPageReserved(p); 1625 __ClearPageReserved(p);
1626 set_page_count(p, 0); 1626 set_page_count(p, 0);
1627 } while (++p, --i); 1627 } while (++p, --i);
1628 1628
1629 set_pageblock_migratetype(page, MIGRATE_CMA); 1629 set_pageblock_migratetype(page, MIGRATE_CMA);
1630 1630
1631 if (pageblock_order >= MAX_ORDER) { 1631 if (pageblock_order >= MAX_ORDER) {
1632 i = pageblock_nr_pages; 1632 i = pageblock_nr_pages;
1633 p = page; 1633 p = page;
1634 do { 1634 do {
1635 set_page_refcounted(p); 1635 set_page_refcounted(p);
1636 __free_pages(p, MAX_ORDER - 1); 1636 __free_pages(p, MAX_ORDER - 1);
1637 p += MAX_ORDER_NR_PAGES; 1637 p += MAX_ORDER_NR_PAGES;
1638 } while (i -= MAX_ORDER_NR_PAGES); 1638 } while (i -= MAX_ORDER_NR_PAGES);
1639 } else { 1639 } else {
1640 set_page_refcounted(page); 1640 set_page_refcounted(page);
1641 __free_pages(page, pageblock_order); 1641 __free_pages(page, pageblock_order);
1642 } 1642 }
1643 1643
1644 adjust_managed_page_count(page, pageblock_nr_pages); 1644 adjust_managed_page_count(page, pageblock_nr_pages);
1645 } 1645 }
1646 #endif 1646 #endif
1647 1647
1648 /* 1648 /*
1649 * The order of subdivision here is critical for the IO subsystem. 1649 * The order of subdivision here is critical for the IO subsystem.
1650 * Please do not alter this order without good reasons and regression 1650 * Please do not alter this order without good reasons and regression
1651 * testing. Specifically, as large blocks of memory are subdivided, 1651 * testing. Specifically, as large blocks of memory are subdivided,
1652 * the order in which smaller blocks are delivered depends on the order 1652 * the order in which smaller blocks are delivered depends on the order
1653 * they're subdivided in this function. This is the primary factor 1653 * they're subdivided in this function. This is the primary factor
1654 * influencing the order in which pages are delivered to the IO 1654 * influencing the order in which pages are delivered to the IO
1655 * subsystem according to empirical testing, and this is also justified 1655 * subsystem according to empirical testing, and this is also justified
1656 * by considering the behavior of a buddy system containing a single 1656 * by considering the behavior of a buddy system containing a single
1657 * large block of memory acted on by a series of small allocations. 1657 * large block of memory acted on by a series of small allocations.
1658 * This behavior is a critical factor in sglist merging's success. 1658 * This behavior is a critical factor in sglist merging's success.
1659 * 1659 *
1660 * -- nyc 1660 * -- nyc
1661 */ 1661 */
1662 static inline void expand(struct zone *zone, struct page *page, 1662 static inline void expand(struct zone *zone, struct page *page,
1663 int low, int high, struct free_area *area, 1663 int low, int high, struct free_area *area,
1664 int migratetype) 1664 int migratetype)
1665 { 1665 {
1666 unsigned long size = 1 << high; 1666 unsigned long size = 1 << high;
1667 1667
1668 while (high > low) { 1668 while (high > low) {
1669 area--; 1669 area--;
1670 high--; 1670 high--;
1671 size >>= 1; 1671 size >>= 1;
1672 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); 1672 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
1673 1673
1674 /* 1674 /*
1675 * Mark as guard pages (or page); this allows the block to 1675 * Mark as guard pages (or page); this allows the block to
1676 * merge back into the allocator when the buddy is freed. 1676 * merge back into the allocator when the buddy is freed.
1677 * The corresponding page table entries are not touched; 1677 * The corresponding page table entries are not touched;
1678 * the pages stay not present in the virtual address space. 1678 * the pages stay not present in the virtual address space.
1679 */ 1679 */
1680 if (set_page_guard(zone, &page[size], high, migratetype)) 1680 if (set_page_guard(zone, &page[size], high, migratetype))
1681 continue; 1681 continue;
1682 1682
1683 list_add(&page[size].lru, &area->free_list[migratetype]); 1683 list_add(&page[size].lru, &area->free_list[migratetype]);
1684 area->nr_free++; 1684 area->nr_free++;
1685 set_page_order(&page[size], high); 1685 set_page_order(&page[size], high);
1686 } 1686 }
1687 } 1687 }
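
expand() repeatedly halves the block it was handed: at each step the upper half (page[size]) goes back on the free list at the reduced order and the lower half is split further, until only a block of the requested order remains. A userspace sketch of just that splitting arithmetic, with an arbitrary base PFN:

    #include <stdio.h>

    int main(void)
    {
            unsigned long base_pfn = 0x8000;   /* start of a free block, arbitrary */
            int low = 1, high = 4;             /* want order 1 out of an order-4 block */
            unsigned long size = 1UL << high;

            while (high > low) {
                    high--;
                    size >>= 1;
                    /* the upper half goes back to the order-'high' free list */
                    printf("return pfn %#lx as an order-%d block\n",
                           base_pfn + size, high);
            }
            printf("allocate pfn %#lx as an order-%d block\n", base_pfn, low);
            return 0;
    }
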
1688 1688
1689 static void check_new_page_bad(struct page *page) 1689 static void check_new_page_bad(struct page *page)
1690 { 1690 {
1691 const char *bad_reason = NULL; 1691 const char *bad_reason = NULL;
1692 unsigned long bad_flags = 0; 1692 unsigned long bad_flags = 0;
1693 1693
1694 if (unlikely(atomic_read(&page->_mapcount) != -1)) 1694 if (unlikely(atomic_read(&page->_mapcount) != -1))
1695 bad_reason = "nonzero mapcount"; 1695 bad_reason = "nonzero mapcount";
1696 if (unlikely(page->mapping != NULL)) 1696 if (unlikely(page->mapping != NULL))
1697 bad_reason = "non-NULL mapping"; 1697 bad_reason = "non-NULL mapping";
1698 if (unlikely(page_ref_count(page) != 0)) 1698 if (unlikely(page_ref_count(page) != 0))
1699 bad_reason = "nonzero _count"; 1699 bad_reason = "nonzero _count";
1700 if (unlikely(page->flags & __PG_HWPOISON)) { 1700 if (unlikely(page->flags & __PG_HWPOISON)) {
1701 bad_reason = "HWPoisoned (hardware-corrupted)"; 1701 bad_reason = "HWPoisoned (hardware-corrupted)";
1702 bad_flags = __PG_HWPOISON; 1702 bad_flags = __PG_HWPOISON;
1703 /* Don't complain about hwpoisoned pages */ 1703 /* Don't complain about hwpoisoned pages */
1704 page_mapcount_reset(page); /* remove PageBuddy */ 1704 page_mapcount_reset(page); /* remove PageBuddy */
1705 return; 1705 return;
1706 } 1706 }
1707 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { 1707 if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
1708 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; 1708 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
1709 bad_flags = PAGE_FLAGS_CHECK_AT_PREP; 1709 bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
1710 } 1710 }
1711 #ifdef CONFIG_MEMCG 1711 #ifdef CONFIG_MEMCG
1712 if (unlikely(page->mem_cgroup)) 1712 if (unlikely(page->mem_cgroup))
1713 bad_reason = "page still charged to cgroup"; 1713 bad_reason = "page still charged to cgroup";
1714 #endif 1714 #endif
1715 bad_page(page, bad_reason, bad_flags); 1715 bad_page(page, bad_reason, bad_flags);
1716 } 1716 }
1717 1717
1718 /* 1718 /*
1719 * This page is about to be returned from the page allocator 1719 * This page is about to be returned from the page allocator
1720 */ 1720 */
1721 static inline int check_new_page(struct page *page) 1721 static inline int check_new_page(struct page *page)
1722 { 1722 {
1723 if (likely(page_expected_state(page, 1723 if (likely(page_expected_state(page,
1724 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) 1724 PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
1725 return 0; 1725 return 0;
1726 1726
1727 check_new_page_bad(page); 1727 check_new_page_bad(page);
1728 return 1; 1728 return 1;
1729 } 1729 }
1730 1730
1731 static inline bool free_pages_prezeroed(void) 1731 static inline bool free_pages_prezeroed(void)
1732 { 1732 {
1733 return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && 1733 return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
1734 page_poisoning_enabled(); 1734 page_poisoning_enabled();
1735 } 1735 }
1736 1736
1737 #ifdef CONFIG_DEBUG_VM 1737 #ifdef CONFIG_DEBUG_VM
1738 static bool check_pcp_refill(struct page *page) 1738 static bool check_pcp_refill(struct page *page)
1739 { 1739 {
1740 return false; 1740 return false;
1741 } 1741 }
1742 1742
1743 static bool check_new_pcp(struct page *page) 1743 static bool check_new_pcp(struct page *page)
1744 { 1744 {
1745 return check_new_page(page); 1745 return check_new_page(page);
1746 } 1746 }
1747 #else 1747 #else
1748 static bool check_pcp_refill(struct page *page) 1748 static bool check_pcp_refill(struct page *page)
1749 { 1749 {
1750 return check_new_page(page); 1750 return check_new_page(page);
1751 } 1751 }
1752 static bool check_new_pcp(struct page *page) 1752 static bool check_new_pcp(struct page *page)
1753 { 1753 {
1754 return false; 1754 return false;
1755 } 1755 }
1756 #endif /* CONFIG_DEBUG_VM */ 1756 #endif /* CONFIG_DEBUG_VM */
1757 1757
1758 static bool check_new_pages(struct page *page, unsigned int order) 1758 static bool check_new_pages(struct page *page, unsigned int order)
1759 { 1759 {
1760 int i; 1760 int i;
1761 for (i = 0; i < (1 << order); i++) { 1761 for (i = 0; i < (1 << order); i++) {
1762 struct page *p = page + i; 1762 struct page *p = page + i;
1763 1763
1764 if (unlikely(check_new_page(p))) 1764 if (unlikely(check_new_page(p)))
1765 return true; 1765 return true;
1766 } 1766 }
1767 1767
1768 return false; 1768 return false;
1769 } 1769 }
1770 1770
1771 inline void post_alloc_hook(struct page *page, unsigned int order, 1771 inline void post_alloc_hook(struct page *page, unsigned int order,
1772 gfp_t gfp_flags) 1772 gfp_t gfp_flags)
1773 { 1773 {
1774 set_page_private(page, 0); 1774 set_page_private(page, 0);
1775 set_page_refcounted(page); 1775 set_page_refcounted(page);
1776 1776
1777 arch_alloc_page(page, order); 1777 arch_alloc_page(page, order);
1778 kernel_map_pages(page, 1 << order, 1); 1778 kernel_map_pages(page, 1 << order, 1);
1779 kernel_poison_pages(page, 1 << order, 1); 1779 kernel_poison_pages(page, 1 << order, 1);
1780 kasan_alloc_pages(page, order); 1780 kasan_alloc_pages(page, order);
1781 set_page_owner(page, order, gfp_flags); 1781 set_page_owner(page, order, gfp_flags);
1782 } 1782 }
1783 1783
1784 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, 1784 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
1785 unsigned int alloc_flags) 1785 unsigned int alloc_flags)
1786 { 1786 {
1787 int i; 1787 int i;
1788 1788
1789 post_alloc_hook(page, order, gfp_flags); 1789 post_alloc_hook(page, order, gfp_flags);
1790 1790
1791 if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) 1791 if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
1792 for (i = 0; i < (1 << order); i++) 1792 for (i = 0; i < (1 << order); i++)
1793 clear_highpage(page + i); 1793 clear_highpage(page + i);
1794 1794
1795 if (order && (gfp_flags & __GFP_COMP)) 1795 if (order && (gfp_flags & __GFP_COMP))
1796 prep_compound_page(page, order); 1796 prep_compound_page(page, order);
1797 1797
1798 /* 1798 /*
1799 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to 1799 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
1800 * allocate the page. The expectation is that the caller is taking 1800 * allocate the page. The expectation is that the caller is taking
1801 * steps that will free more memory. The caller should avoid the page 1801 * steps that will free more memory. The caller should avoid the page
1802 * being used for !PFMEMALLOC purposes. 1802 * being used for !PFMEMALLOC purposes.
1803 */ 1803 */
1804 if (alloc_flags & ALLOC_NO_WATERMARKS) 1804 if (alloc_flags & ALLOC_NO_WATERMARKS)
1805 set_page_pfmemalloc(page); 1805 set_page_pfmemalloc(page);
1806 else 1806 else
1807 clear_page_pfmemalloc(page); 1807 clear_page_pfmemalloc(page);
1808 } 1808 }
1809 1809
1810 /* 1810 /*
1811 * Go through the free lists for the given migratetype and remove 1811 * Go through the free lists for the given migratetype and remove
1812 * the smallest available page from the freelists 1812 * the smallest available page from the freelists
1813 */ 1813 */
1814 static inline 1814 static inline
1815 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 1815 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1816 int migratetype) 1816 int migratetype)
1817 { 1817 {
1818 unsigned int current_order; 1818 unsigned int current_order;
1819 struct free_area *area; 1819 struct free_area *area;
1820 struct page *page; 1820 struct page *page;
1821 1821
1822 /* Find a page of the appropriate size in the preferred list */ 1822 /* Find a page of the appropriate size in the preferred list */
1823 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 1823 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
1824 area = &(zone->free_area[current_order]); 1824 area = &(zone->free_area[current_order]);
1825 page = list_first_entry_or_null(&area->free_list[migratetype], 1825 page = list_first_entry_or_null(&area->free_list[migratetype],
1826 struct page, lru); 1826 struct page, lru);
1827 if (!page) 1827 if (!page)
1828 continue; 1828 continue;
1829 list_del(&page->lru); 1829 list_del(&page->lru);
1830 rmv_page_order(page); 1830 rmv_page_order(page);
1831 area->nr_free--; 1831 area->nr_free--;
1832 expand(zone, page, order, current_order, area, migratetype); 1832 expand(zone, page, order, current_order, area, migratetype);
1833 set_pcppage_migratetype(page, migratetype); 1833 set_pcppage_migratetype(page, migratetype);
1834 return page; 1834 return page;
1835 } 1835 }
1836 1836
1837 return NULL; 1837 return NULL;
1838 } 1838 }
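
__rmqueue_smallest() is a smallest-fit search: start at the requested order, walk upward until some free list is non-empty, take a block from it, and let expand() return the surplus halves. Modelled with a plain array of per-order counts (the counts are invented):

    #include <stdio.h>

    #define MAX_ORDER 11

    int main(void)
    {
            int nr_free[MAX_ORDER] = { 0, 0, 0, 3, 0, 1 };  /* free blocks per order */
            unsigned int order = 1;                          /* requested order */

            for (unsigned int current_order = order;
                 current_order < MAX_ORDER; current_order++) {
                    if (!nr_free[current_order])
                            continue;
                    nr_free[current_order]--;
                    printf("took an order-%u block for an order-%u request; "
                           "expand() would return orders %u..%u to the free lists\n",
                           current_order, order, order, current_order - 1);
                    return 0;
            }
            printf("no block large enough\n");
            return 0;
    }
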
1839 1839
1840 1840
1841 /* 1841 /*
1842 * This array describes the order in which lists are fallen back to when 1842 * This array describes the order in which lists are fallen back to when
1843 * the free lists for the desired migrate type are depleted 1843 * the free lists for the desired migrate type are depleted
1844 */ 1844 */
1845 static int fallbacks[MIGRATE_TYPES][4] = { 1845 static int fallbacks[MIGRATE_TYPES][4] = {
1846 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, 1846 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
1847 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, 1847 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
1848 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, 1848 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
1849 #ifdef CONFIG_CMA 1849 #ifdef CONFIG_CMA
1850 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ 1850 [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
1851 #endif 1851 #endif
1852 #ifdef CONFIG_MEMORY_ISOLATION 1852 #ifdef CONFIG_MEMORY_ISOLATION
1853 [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ 1853 [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
1854 #endif 1854 #endif
1855 }; 1855 };
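
The fallback table is read left to right: when an allocation's preferred free lists are empty, the other migratetypes are tried in the listed order, with MIGRATE_TYPES acting as a terminator. A sketch of such a table walk; the enum values and free-block counts below are local to the sketch:

    #include <stdio.h>

    enum { UNMOVABLE, RECLAIMABLE, MOVABLE, NR_REAL_TYPES, TERMINATOR = NR_REAL_TYPES };

    static const int fallbacks[NR_REAL_TYPES][4] = {
            [UNMOVABLE]   = { RECLAIMABLE, MOVABLE,   TERMINATOR },
            [RECLAIMABLE] = { UNMOVABLE,   MOVABLE,   TERMINATOR },
            [MOVABLE]     = { RECLAIMABLE, UNMOVABLE, TERMINATOR },
    };

    int main(void)
    {
            int want = MOVABLE;
            int free_blocks[NR_REAL_TYPES] = { 4, 0, 0 };   /* only UNMOVABLE has free blocks */

            for (int i = 0; fallbacks[want][i] != TERMINATOR; i++) {
                    int mt = fallbacks[want][i];
                    if (free_blocks[mt]) {
                            printf("MOVABLE request falls back to migratetype %d\n", mt);
                            return 0;
                    }
            }
            printf("no fallback available\n");
            return 0;
    }
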
1856 1856
1857 #ifdef CONFIG_CMA 1857 #ifdef CONFIG_CMA
1858 static struct page *__rmqueue_cma_fallback(struct zone *zone, 1858 static struct page *__rmqueue_cma_fallback(struct zone *zone,
1859 unsigned int order) 1859 unsigned int order)
1860 { 1860 {
1861 return __rmqueue_smallest(zone, order, MIGRATE_CMA); 1861 return __rmqueue_smallest(zone, order, MIGRATE_CMA);
1862 } 1862 }
1863 #else 1863 #else
1864 static inline struct page *__rmqueue_cma_fallback(struct zone *zone, 1864 static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1865 unsigned int order) { return NULL; } 1865 unsigned int order) { return NULL; }
1866 #endif 1866 #endif
1867 1867
1868 /* 1868 /*
1869 * Move the free pages in a range to the free lists of the requested type. 1869 * Move the free pages in a range to the free lists of the requested type.
1870 * Note that start_page and end_page are not aligned on a pageblock 1870 * Note that start_page and end_page are not aligned on a pageblock
1871 * boundary. If alignment is required, use move_freepages_block() 1871 * boundary. If alignment is required, use move_freepages_block()
1872 */ 1872 */
1873 static int move_freepages(struct zone *zone, 1873 static int move_freepages(struct zone *zone,
1874 struct page *start_page, struct page *end_page, 1874 struct page *start_page, struct page *end_page,
1875 int migratetype, int *num_movable) 1875 int migratetype, int *num_movable)
1876 { 1876 {
1877 struct page *page; 1877 struct page *page;
1878 unsigned int order; 1878 unsigned int order;
1879 int pages_moved = 0; 1879 int pages_moved = 0;
1880 1880
1881 #ifndef CONFIG_HOLES_IN_ZONE 1881 #ifndef CONFIG_HOLES_IN_ZONE
1882 /* 1882 /*
1883 * page_zone is not safe to call in this context when 1883 * page_zone is not safe to call in this context when
1884 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 1884 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
1885 * anyway as we check zone boundaries in move_freepages_block(). 1885 * anyway as we check zone boundaries in move_freepages_block().
1886 * Remove at a later date when no bug reports exist related to 1886 * Remove at a later date when no bug reports exist related to
1887 * grouping pages by mobility 1887 * grouping pages by mobility
1888 */ 1888 */
1889 VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); 1889 VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
1890 #endif 1890 #endif
1891 1891
1892 if (num_movable) 1892 if (num_movable)
1893 *num_movable = 0; 1893 *num_movable = 0;
1894 1894
1895 for (page = start_page; page <= end_page;) { 1895 for (page = start_page; page <= end_page;) {
1896 if (!pfn_valid_within(page_to_pfn(page))) { 1896 if (!pfn_valid_within(page_to_pfn(page))) {
1897 page++; 1897 page++;
1898 continue; 1898 continue;
1899 } 1899 }
1900 1900
1901 /* Make sure we are not inadvertently changing nodes */ 1901 /* Make sure we are not inadvertently changing nodes */
1902 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); 1902 VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
1903 1903
1904 if (!PageBuddy(page)) { 1904 if (!PageBuddy(page)) {
1905 /* 1905 /*
1906 * We assume that pages that could be isolated for 1906 * We assume that pages that could be isolated for
1907 * migration are movable. But we don't actually try 1907 * migration are movable. But we don't actually try
1908 * isolating, as that would be expensive. 1908 * isolating, as that would be expensive.
1909 */ 1909 */
1910 if (num_movable && 1910 if (num_movable &&
1911 (PageLRU(page) || __PageMovable(page))) 1911 (PageLRU(page) || __PageMovable(page)))
1912 (*num_movable)++; 1912 (*num_movable)++;
1913 1913
1914 page++; 1914 page++;
1915 continue; 1915 continue;
1916 } 1916 }
1917 1917
1918 order = page_order(page); 1918 order = page_order(page);
1919 list_move(&page->lru, 1919 list_move(&page->lru,
1920 &zone->free_area[order].free_list[migratetype]); 1920 &zone->free_area[order].free_list[migratetype]);
1921 page += 1 << order; 1921 page += 1 << order;
1922 pages_moved += 1 << order; 1922 pages_moved += 1 << order;
1923 } 1923 }
1924 1924
1925 return pages_moved; 1925 return pages_moved;
1926 } 1926 }
1927 1927
1928 int move_freepages_block(struct zone *zone, struct page *page, 1928 int move_freepages_block(struct zone *zone, struct page *page,
1929 int migratetype, int *num_movable) 1929 int migratetype, int *num_movable)
1930 { 1930 {
1931 unsigned long start_pfn, end_pfn; 1931 unsigned long start_pfn, end_pfn;
1932 struct page *start_page, *end_page; 1932 struct page *start_page, *end_page;
1933 1933
1934 start_pfn = page_to_pfn(page); 1934 start_pfn = page_to_pfn(page);
1935 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 1935 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
1936 start_page = pfn_to_page(start_pfn); 1936 start_page = pfn_to_page(start_pfn);
1937 end_page = start_page + pageblock_nr_pages - 1; 1937 end_page = start_page + pageblock_nr_pages - 1;
1938 end_pfn = start_pfn + pageblock_nr_pages - 1; 1938 end_pfn = start_pfn + pageblock_nr_pages - 1;
1939 1939
1940 /* Do not cross zone boundaries */ 1940 /* Do not cross zone boundaries */
1941 if (!zone_spans_pfn(zone, start_pfn)) 1941 if (!zone_spans_pfn(zone, start_pfn))
1942 start_page = page; 1942 start_page = page;
1943 if (!zone_spans_pfn(zone, end_pfn)) 1943 if (!zone_spans_pfn(zone, end_pfn))
1944 return 0; 1944 return 0;
1945 1945
1946 return move_freepages(zone, start_page, end_page, migratetype, 1946 return move_freepages(zone, start_page, end_page, migratetype,
1947 num_movable); 1947 num_movable);
1948 } 1948 }
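As a standalone sketch of the alignment arithmetic used above, assuming 512-page (order-9) pageblocks; the DEMO_ constant is an assumption for this example only.

#include <stdio.h>

#define DEMO_PAGEBLOCK_NR_PAGES 512UL

int main(void)
{
	unsigned long pfn = 1000;
	unsigned long start_pfn = pfn & ~(DEMO_PAGEBLOCK_NR_PAGES - 1);
	unsigned long end_pfn = start_pfn + DEMO_PAGEBLOCK_NR_PAGES - 1;

	/* pfn 1000 rounds down to 512; the pageblock spans pfns 512..1023 */
	printf("pfn %lu -> pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
	return 0;
}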
1949 1949
1950 static void change_pageblock_range(struct page *pageblock_page, 1950 static void change_pageblock_range(struct page *pageblock_page,
1951 int start_order, int migratetype) 1951 int start_order, int migratetype)
1952 { 1952 {
1953 int nr_pageblocks = 1 << (start_order - pageblock_order); 1953 int nr_pageblocks = 1 << (start_order - pageblock_order);
1954 1954
1955 while (nr_pageblocks--) { 1955 while (nr_pageblocks--) {
1956 set_pageblock_migratetype(pageblock_page, migratetype); 1956 set_pageblock_migratetype(pageblock_page, migratetype);
1957 pageblock_page += pageblock_nr_pages; 1957 pageblock_page += pageblock_nr_pages;
1958 } 1958 }
1959 } 1959 }
1960 1960
1961 /* 1961 /*
1962 * When we are falling back to another migratetype during allocation, try to 1962 * When we are falling back to another migratetype during allocation, try to
1963 * steal extra free pages from the same pageblocks to satisfy further 1963 * steal extra free pages from the same pageblocks to satisfy further
1964 * allocations, instead of polluting multiple pageblocks. 1964 * allocations, instead of polluting multiple pageblocks.
1965 * 1965 *
1966 * If we are stealing a relatively large buddy page, it is likely there will 1966 * If we are stealing a relatively large buddy page, it is likely there will
1967 * be more free pages in the pageblock, so try to steal them all. For 1967 * be more free pages in the pageblock, so try to steal them all. For
1968 * reclaimable and unmovable allocations, we steal regardless of page size, 1968 * reclaimable and unmovable allocations, we steal regardless of page size,
1969 * as fragmentation caused by those allocations polluting movable pageblocks 1969 * as fragmentation caused by those allocations polluting movable pageblocks
1970 * is worse than movable allocations stealing from unmovable and reclaimable 1970 * is worse than movable allocations stealing from unmovable and reclaimable
1971 * pageblocks. 1971 * pageblocks.
1972 */ 1972 */
1973 static bool can_steal_fallback(unsigned int order, int start_mt) 1973 static bool can_steal_fallback(unsigned int order, int start_mt)
1974 { 1974 {
1975 /* 1975 /*
1976 * This order check is intentionally kept even though the next 1976 * This order check is intentionally kept even though the next
1977 * check uses a more relaxed order. The reason is that we can 1977 * check uses a more relaxed order. The reason is that we can
1978 * actually steal the whole pageblock if this condition is met, 1978 * actually steal the whole pageblock if this condition is met,
1979 * but the check below doesn't guarantee it and is just a heuristic, 1979 * but the check below doesn't guarantee it and is just a heuristic,
1980 * so it could be changed at any time. 1980 * so it could be changed at any time.
1981 */ 1981 */
1982 if (order >= pageblock_order) 1982 if (order >= pageblock_order)
1983 return true; 1983 return true;
1984 1984
1985 if (order >= pageblock_order / 2 || 1985 if (order >= pageblock_order / 2 ||
1986 start_mt == MIGRATE_RECLAIMABLE || 1986 start_mt == MIGRATE_RECLAIMABLE ||
1987 start_mt == MIGRATE_UNMOVABLE || 1987 start_mt == MIGRATE_UNMOVABLE ||
1988 page_group_by_mobility_disabled) 1988 page_group_by_mobility_disabled)
1989 return true; 1989 return true;
1990 1990
1991 return false; 1991 return false;
1992 } 1992 }
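As a worked example, assuming pageblock_order is 9 (a common configuration): any request of order 9 or above may steal the whole pageblock; a movable request of order 4 or above also qualifies because order >= 9/2; an order-2 movable request does not (unless page_group_by_mobility_disabled is set), while an order-2 unmovable or reclaimable request still does.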
1993 1993
1994 /* 1994 /*
1995 * This function implements actual steal behaviour. If order is large enough, 1995 * This function implements actual steal behaviour. If order is large enough,
1996 * we can steal whole pageblock. If not, we first move freepages in this 1996 * we can steal whole pageblock. If not, we first move freepages in this
1997 * pageblock to our migratetype and determine how many already-allocated pages 1997 * pageblock to our migratetype and determine how many already-allocated pages
1998 * there are in the pageblock with a compatible migratetype. If at least half 1998 * there are in the pageblock with a compatible migratetype. If at least half
1999 * of pages are free or compatible, we can change migratetype of the pageblock 1999 * of pages are free or compatible, we can change migratetype of the pageblock
2000 * itself, so pages freed in the future will be put on the correct free list. 2000 * itself, so pages freed in the future will be put on the correct free list.
2001 */ 2001 */
2002 static void steal_suitable_fallback(struct zone *zone, struct page *page, 2002 static void steal_suitable_fallback(struct zone *zone, struct page *page,
2003 int start_type, bool whole_block) 2003 int start_type, bool whole_block)
2004 { 2004 {
2005 unsigned int current_order = page_order(page); 2005 unsigned int current_order = page_order(page);
2006 struct free_area *area; 2006 struct free_area *area;
2007 int free_pages, movable_pages, alike_pages; 2007 int free_pages, movable_pages, alike_pages;
2008 int old_block_type; 2008 int old_block_type;
2009 2009
2010 old_block_type = get_pageblock_migratetype(page); 2010 old_block_type = get_pageblock_migratetype(page);
2011 2011
2012 /* 2012 /*
2013 * This can happen due to races and we want to prevent broken 2013 * This can happen due to races and we want to prevent broken
2014 * highatomic accounting. 2014 * highatomic accounting.
2015 */ 2015 */
2016 if (is_migrate_highatomic(old_block_type)) 2016 if (is_migrate_highatomic(old_block_type))
2017 goto single_page; 2017 goto single_page;
2018 2018
2019 /* Take ownership for orders >= pageblock_order */ 2019 /* Take ownership for orders >= pageblock_order */
2020 if (current_order >= pageblock_order) { 2020 if (current_order >= pageblock_order) {
2021 change_pageblock_range(page, current_order, start_type); 2021 change_pageblock_range(page, current_order, start_type);
2022 goto single_page; 2022 goto single_page;
2023 } 2023 }
2024 2024
2025 /* We are not allowed to try stealing from the whole block */ 2025 /* We are not allowed to try stealing from the whole block */
2026 if (!whole_block) 2026 if (!whole_block)
2027 goto single_page; 2027 goto single_page;
2028 2028
2029 free_pages = move_freepages_block(zone, page, start_type, 2029 free_pages = move_freepages_block(zone, page, start_type,
2030 &movable_pages); 2030 &movable_pages);
2031 /* 2031 /*
2032 * Determine how many pages are compatible with our allocation. 2032 * Determine how many pages are compatible with our allocation.
2033 * For movable allocation, it's the number of movable pages which 2033 * For movable allocation, it's the number of movable pages which
2034 * we just obtained. For other types it's a bit more tricky. 2034 * we just obtained. For other types it's a bit more tricky.
2035 */ 2035 */
2036 if (start_type == MIGRATE_MOVABLE) { 2036 if (start_type == MIGRATE_MOVABLE) {
2037 alike_pages = movable_pages; 2037 alike_pages = movable_pages;
2038 } else { 2038 } else {
2039 /* 2039 /*
2040 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation 2040 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
2041 * to MOVABLE pageblock, consider all non-movable pages as 2041 * to MOVABLE pageblock, consider all non-movable pages as
2042 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or 2042 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
2043 * vice versa, be conservative since we can't distinguish the 2043 * vice versa, be conservative since we can't distinguish the
2044 * exact migratetype of non-movable pages. 2044 * exact migratetype of non-movable pages.
2045 */ 2045 */
2046 if (old_block_type == MIGRATE_MOVABLE) 2046 if (old_block_type == MIGRATE_MOVABLE)
2047 alike_pages = pageblock_nr_pages 2047 alike_pages = pageblock_nr_pages
2048 - (free_pages + movable_pages); 2048 - (free_pages + movable_pages);
2049 else 2049 else
2050 alike_pages = 0; 2050 alike_pages = 0;
2051 } 2051 }
2052 2052
2053 /* moving whole block can fail due to zone boundary conditions */ 2053 /* moving whole block can fail due to zone boundary conditions */
2054 if (!free_pages) 2054 if (!free_pages)
2055 goto single_page; 2055 goto single_page;
2056 2056
2057 /* 2057 /*
2058 * If a sufficient number of pages in the block are either free or of 2058 * If a sufficient number of pages in the block are either free or of
2059 * comparable migratability as our allocation, claim the whole block. 2059 * comparable migratability as our allocation, claim the whole block.
2060 */ 2060 */
2061 if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || 2061 if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
2062 page_group_by_mobility_disabled) 2062 page_group_by_mobility_disabled)
2063 set_pageblock_migratetype(page, start_type); 2063 set_pageblock_migratetype(page, start_type);
2064 2064
2065 return; 2065 return;
2066 2066
2067 single_page: 2067 single_page:
2068 area = &zone->free_area[current_order]; 2068 area = &zone->free_area[current_order];
2069 list_move(&page->lru, &area->free_list[start_type]); 2069 list_move(&page->lru, &area->free_list[start_type]);
2070 } 2070 }
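As a worked example, assuming 512-page (order-9) pageblocks: the claim threshold 1 << (pageblock_order - 1) is 256, so the pageblock's migratetype is changed only when at least 256 of its 512 pages are free or counted as compatible; otherwise only the single buddy page is moved to the new free list.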
2071 2071
2072 /* 2072 /*
2073 * Check whether there is a suitable fallback freepage with requested order. 2073 * Check whether there is a suitable fallback freepage with requested order.
2074 * If only_stealable is true, this function returns fallback_mt only if 2074 * If only_stealable is true, this function returns fallback_mt only if
2075 * we can steal all the other freepages together. This would help to reduce 2075 * we can steal all the other freepages together. This would help to reduce
2076 * fragmentation due to mixed migratetype pages in one pageblock. 2076 * fragmentation due to mixed migratetype pages in one pageblock.
2077 */ 2077 */
2078 int find_suitable_fallback(struct free_area *area, unsigned int order, 2078 int find_suitable_fallback(struct free_area *area, unsigned int order,
2079 int migratetype, bool only_stealable, bool *can_steal) 2079 int migratetype, bool only_stealable, bool *can_steal)
2080 { 2080 {
2081 int i; 2081 int i;
2082 int fallback_mt; 2082 int fallback_mt;
2083 2083
2084 if (area->nr_free == 0) 2084 if (area->nr_free == 0)
2085 return -1; 2085 return -1;
2086 2086
2087 *can_steal = false; 2087 *can_steal = false;
2088 for (i = 0;; i++) { 2088 for (i = 0;; i++) {
2089 fallback_mt = fallbacks[migratetype][i]; 2089 fallback_mt = fallbacks[migratetype][i];
2090 if (fallback_mt == MIGRATE_TYPES) 2090 if (fallback_mt == MIGRATE_TYPES)
2091 break; 2091 break;
2092 2092
2093 if (list_empty(&area->free_list[fallback_mt])) 2093 if (list_empty(&area->free_list[fallback_mt]))
2094 continue; 2094 continue;
2095 2095
2096 if (can_steal_fallback(order, migratetype)) 2096 if (can_steal_fallback(order, migratetype))
2097 *can_steal = true; 2097 *can_steal = true;
2098 2098
2099 if (!only_stealable) 2099 if (!only_stealable)
2100 return fallback_mt; 2100 return fallback_mt;
2101 2101
2102 if (*can_steal) 2102 if (*can_steal)
2103 return fallback_mt; 2103 return fallback_mt;
2104 } 2104 }
2105 2105
2106 return -1; 2106 return -1;
2107 } 2107 }
2108 2108
2109 /* 2109 /*
2110 * Reserve a pageblock for exclusive use of high-order atomic allocations if 2110 * Reserve a pageblock for exclusive use of high-order atomic allocations if
2111 * there are no empty page blocks that contain a page with a suitable order 2111 * there are no empty page blocks that contain a page with a suitable order
2112 */ 2112 */
2113 static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, 2113 static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
2114 unsigned int alloc_order) 2114 unsigned int alloc_order)
2115 { 2115 {
2116 int mt; 2116 int mt;
2117 unsigned long max_managed, flags; 2117 unsigned long max_managed, flags;
2118 2118
2119 /* 2119 /*
2120 * Limit the number reserved to 1 pageblock or roughly 1% of a zone. 2120 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
2121 * Check is race-prone but harmless. 2121 * Check is race-prone but harmless.
2122 */ 2122 */
2123 max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; 2123 max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
2124 if (zone->nr_reserved_highatomic >= max_managed) 2124 if (zone->nr_reserved_highatomic >= max_managed)
2125 return; 2125 return;
2126 2126
2127 spin_lock_irqsave(&zone->lock, flags); 2127 spin_lock_irqsave(&zone->lock, flags);
2128 2128
2129 /* Recheck the nr_reserved_highatomic limit under the lock */ 2129 /* Recheck the nr_reserved_highatomic limit under the lock */
2130 if (zone->nr_reserved_highatomic >= max_managed) 2130 if (zone->nr_reserved_highatomic >= max_managed)
2131 goto out_unlock; 2131 goto out_unlock;
2132 2132
2133 /* Yoink! */ 2133 /* Yoink! */
2134 mt = get_pageblock_migratetype(page); 2134 mt = get_pageblock_migratetype(page);
2135 if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) 2135 if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
2136 && !is_migrate_cma(mt)) { 2136 && !is_migrate_cma(mt)) {
2137 zone->nr_reserved_highatomic += pageblock_nr_pages; 2137 zone->nr_reserved_highatomic += pageblock_nr_pages;
2138 set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); 2138 set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
2139 move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); 2139 move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
2140 } 2140 }
2141 2141
2142 out_unlock: 2142 out_unlock:
2143 spin_unlock_irqrestore(&zone->lock, flags); 2143 spin_unlock_irqrestore(&zone->lock, flags);
2144 } 2144 }
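As a worked example, for a hypothetical zone with 1,000,000 managed pages and 512-page pageblocks, max_managed is 1,000,000/100 + 512 = 10,512 pages, i.e. roughly 20 pageblocks at most can be reserved as MIGRATE_HIGHATOMIC.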
2145 2145
2146 /* 2146 /*
2147 * Used when an allocation is about to fail under memory pressure. This 2147 * Used when an allocation is about to fail under memory pressure. This
2148 * potentially hurts the reliability of high-order allocations when under 2148 * potentially hurts the reliability of high-order allocations when under
2149 * intense memory pressure but failed atomic allocations should be easier 2149 * intense memory pressure but failed atomic allocations should be easier
2150 * to recover from than an OOM. 2150 * to recover from than an OOM.
2151 * 2151 *
2152 * If @force is true, try to unreserve a pageblock even though highatomic 2152 * If @force is true, try to unreserve a pageblock even though highatomic
2153 * pageblock is exhausted. 2153 * pageblock is exhausted.
2154 */ 2154 */
2155 static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, 2155 static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2156 bool force) 2156 bool force)
2157 { 2157 {
2158 struct zonelist *zonelist = ac->zonelist; 2158 struct zonelist *zonelist = ac->zonelist;
2159 unsigned long flags; 2159 unsigned long flags;
2160 struct zoneref *z; 2160 struct zoneref *z;
2161 struct zone *zone; 2161 struct zone *zone;
2162 struct page *page; 2162 struct page *page;
2163 int order; 2163 int order;
2164 bool ret; 2164 bool ret;
2165 2165
2166 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, 2166 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2167 ac->nodemask) { 2167 ac->nodemask) {
2168 /* 2168 /*
2169 * Preserve at least one pageblock unless memory pressure 2169 * Preserve at least one pageblock unless memory pressure
2170 * is really high. 2170 * is really high.
2171 */ 2171 */
2172 if (!force && zone->nr_reserved_highatomic <= 2172 if (!force && zone->nr_reserved_highatomic <=
2173 pageblock_nr_pages) 2173 pageblock_nr_pages)
2174 continue; 2174 continue;
2175 2175
2176 spin_lock_irqsave(&zone->lock, flags); 2176 spin_lock_irqsave(&zone->lock, flags);
2177 for (order = 0; order < MAX_ORDER; order++) { 2177 for (order = 0; order < MAX_ORDER; order++) {
2178 struct free_area *area = &(zone->free_area[order]); 2178 struct free_area *area = &(zone->free_area[order]);
2179 2179
2180 page = list_first_entry_or_null( 2180 page = list_first_entry_or_null(
2181 &area->free_list[MIGRATE_HIGHATOMIC], 2181 &area->free_list[MIGRATE_HIGHATOMIC],
2182 struct page, lru); 2182 struct page, lru);
2183 if (!page) 2183 if (!page)
2184 continue; 2184 continue;
2185 2185
2186 /* 2186 /*
2187 * In the page freeing path, the migratetype change is racy, so 2187 * In the page freeing path, the migratetype change is racy, so
2188 * we can encounter several free pages in a pageblock 2188 * we can encounter several free pages in a pageblock
2189 * in this loop although we changed the pageblock type 2189 * in this loop although we changed the pageblock type
2190 * from highatomic to ac->migratetype. So we should 2190 * from highatomic to ac->migratetype. So we should
2191 * adjust the count only once. 2191 * adjust the count only once.
2192 */ 2192 */
2193 if (is_migrate_highatomic_page(page)) { 2193 if (is_migrate_highatomic_page(page)) {
2194 /* 2194 /*
2195 * It should never happen but changes to 2195 * It should never happen but changes to
2196 * locking could inadvertently allow a per-cpu 2196 * locking could inadvertently allow a per-cpu
2197 * drain to add pages to MIGRATE_HIGHATOMIC 2197 * drain to add pages to MIGRATE_HIGHATOMIC
2198 * while unreserving so be safe and watch for 2198 * while unreserving so be safe and watch for
2199 * underflows. 2199 * underflows.
2200 */ 2200 */
2201 zone->nr_reserved_highatomic -= min( 2201 zone->nr_reserved_highatomic -= min(
2202 pageblock_nr_pages, 2202 pageblock_nr_pages,
2203 zone->nr_reserved_highatomic); 2203 zone->nr_reserved_highatomic);
2204 } 2204 }
2205 2205
2206 /* 2206 /*
2207 * Convert to ac->migratetype and avoid the normal 2207 * Convert to ac->migratetype and avoid the normal
2208 * pageblock stealing heuristics. Minimally, the caller 2208 * pageblock stealing heuristics. Minimally, the caller
2209 * is doing the work and needs the pages. More 2209 * is doing the work and needs the pages. More
2210 * importantly, if the block was always converted to 2210 * importantly, if the block was always converted to
2211 * MIGRATE_UNMOVABLE or another type then the number 2211 * MIGRATE_UNMOVABLE or another type then the number
2212 * of pageblocks that cannot be completely freed 2212 * of pageblocks that cannot be completely freed
2213 * may increase. 2213 * may increase.
2214 */ 2214 */
2215 set_pageblock_migratetype(page, ac->migratetype); 2215 set_pageblock_migratetype(page, ac->migratetype);
2216 ret = move_freepages_block(zone, page, ac->migratetype, 2216 ret = move_freepages_block(zone, page, ac->migratetype,
2217 NULL); 2217 NULL);
2218 if (ret) { 2218 if (ret) {
2219 spin_unlock_irqrestore(&zone->lock, flags); 2219 spin_unlock_irqrestore(&zone->lock, flags);
2220 return ret; 2220 return ret;
2221 } 2221 }
2222 } 2222 }
2223 spin_unlock_irqrestore(&zone->lock, flags); 2223 spin_unlock_irqrestore(&zone->lock, flags);
2224 } 2224 }
2225 2225
2226 return false; 2226 return false;
2227 } 2227 }
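As a worked example of the underflow guard above: if only 200 reserved pages remain accounted in nr_reserved_highatomic while pageblock_nr_pages is 512, the min() clamps the subtraction to 200, so the counter drops to zero rather than wrapping around.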
2228 2228
2229 /* 2229 /*
2230 * Try finding a free buddy page on the fallback list and put it on the free 2230 * Try finding a free buddy page on the fallback list and put it on the free
2231 * list of requested migratetype, possibly along with other pages from the same 2231 * list of requested migratetype, possibly along with other pages from the same
2232 * block, depending on fragmentation avoidance heuristics. Returns true if 2232 * block, depending on fragmentation avoidance heuristics. Returns true if
2233 * fallback was found so that __rmqueue_smallest() can grab it. 2233 * fallback was found so that __rmqueue_smallest() can grab it.
2234 * 2234 *
2235 * The use of signed ints for order and current_order is a deliberate 2235 * The use of signed ints for order and current_order is a deliberate
2236 * deviation from the rest of this file, to make the for loop 2236 * deviation from the rest of this file, to make the for loop
2237 * condition simpler. 2237 * condition simpler.
2238 */ 2238 */
2239 static inline bool 2239 static inline bool
2240 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 2240 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
2241 { 2241 {
2242 struct free_area *area; 2242 struct free_area *area;
2243 int current_order; 2243 int current_order;
2244 struct page *page; 2244 struct page *page;
2245 int fallback_mt; 2245 int fallback_mt;
2246 bool can_steal; 2246 bool can_steal;
2247 2247
2248 /* 2248 /*
2249 * Find the largest available free page in the other list. This roughly 2249 * Find the largest available free page in the other list. This roughly
2250 * approximates finding the pageblock with the most free pages, which 2250 * approximates finding the pageblock with the most free pages, which
2251 * would be too costly to do exactly. 2251 * would be too costly to do exactly.
2252 */ 2252 */
2253 for (current_order = MAX_ORDER - 1; current_order >= order; 2253 for (current_order = MAX_ORDER - 1; current_order >= order;
2254 --current_order) { 2254 --current_order) {
2255 area = &(zone->free_area[current_order]); 2255 area = &(zone->free_area[current_order]);
2256 fallback_mt = find_suitable_fallback(area, current_order, 2256 fallback_mt = find_suitable_fallback(area, current_order,
2257 start_migratetype, false, &can_steal); 2257 start_migratetype, false, &can_steal);
2258 if (fallback_mt == -1) 2258 if (fallback_mt == -1)
2259 continue; 2259 continue;
2260 2260
2261 /* 2261 /*
2262 * We cannot steal all free pages from the pageblock and the 2262 * We cannot steal all free pages from the pageblock and the
2263 * requested migratetype is movable. In that case it's better to 2263 * requested migratetype is movable. In that case it's better to
2264 * steal and split the smallest available page instead of the 2264 * steal and split the smallest available page instead of the
2265 * largest available page, because even if the next movable 2265 * largest available page, because even if the next movable
2266 * allocation falls back into a different pageblock than this 2266 * allocation falls back into a different pageblock than this
2267 * one, it won't cause permanent fragmentation. 2267 * one, it won't cause permanent fragmentation.
2268 */ 2268 */
2269 if (!can_steal && start_migratetype == MIGRATE_MOVABLE 2269 if (!can_steal && start_migratetype == MIGRATE_MOVABLE
2270 && current_order > order) 2270 && current_order > order)
2271 goto find_smallest; 2271 goto find_smallest;
2272 2272
2273 goto do_steal; 2273 goto do_steal;
2274 } 2274 }
2275 2275
2276 return false; 2276 return false;
2277 2277
2278 find_smallest: 2278 find_smallest:
2279 for (current_order = order; current_order < MAX_ORDER; 2279 for (current_order = order; current_order < MAX_ORDER;
2280 current_order++) { 2280 current_order++) {
2281 area = &(zone->free_area[current_order]); 2281 area = &(zone->free_area[current_order]);
2282 fallback_mt = find_suitable_fallback(area, current_order, 2282 fallback_mt = find_suitable_fallback(area, current_order,
2283 start_migratetype, false, &can_steal); 2283 start_migratetype, false, &can_steal);
2284 if (fallback_mt != -1) 2284 if (fallback_mt != -1)
2285 break; 2285 break;
2286 } 2286 }
2287 2287
2288 /* 2288 /*
2289 * This should not happen - we already found a suitable fallback 2289 * This should not happen - we already found a suitable fallback
2290 * when looking for the largest page. 2290 * when looking for the largest page.
2291 */ 2291 */
2292 VM_BUG_ON(current_order == MAX_ORDER); 2292 VM_BUG_ON(current_order == MAX_ORDER);
2293 2293
2294 do_steal: 2294 do_steal:
2295 page = list_first_entry(&area->free_list[fallback_mt], 2295 page = list_first_entry(&area->free_list[fallback_mt],
2296 struct page, lru); 2296 struct page, lru);
2297 2297
2298 steal_suitable_fallback(zone, page, start_migratetype, can_steal); 2298 steal_suitable_fallback(zone, page, start_migratetype, can_steal);
2299 2299
2300 trace_mm_page_alloc_extfrag(page, order, current_order, 2300 trace_mm_page_alloc_extfrag(page, order, current_order,
2301 start_migratetype, fallback_mt); 2301 start_migratetype, fallback_mt);
2302 2302
2303 return true; 2303 return true;
2304 2304
2305 } 2305 }
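To illustrate the two search directions above: the search always starts at MAX_ORDER - 1 and walks downwards so that stealing pulls in as much of the foreign pageblock as possible; only a movable request that cannot steal the whole block jumps to find_smallest and walks upwards from the requested order, splitting the smallest suitable page instead.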
2306 2306
2307 /* 2307 /*
2308 * Do the hard work of removing an element from the buddy allocator. 2308 * Do the hard work of removing an element from the buddy allocator.
2309 * Call me with the zone->lock already held. 2309 * Call me with the zone->lock already held.
2310 */ 2310 */
2311 static struct page *__rmqueue(struct zone *zone, unsigned int order, 2311 static struct page *__rmqueue(struct zone *zone, unsigned int order,
2312 int migratetype) 2312 int migratetype)
2313 { 2313 {
2314 struct page *page; 2314 struct page *page;
2315 2315
2316 retry: 2316 retry:
2317 page = __rmqueue_smallest(zone, order, migratetype); 2317 page = __rmqueue_smallest(zone, order, migratetype);
2318 if (unlikely(!page)) { 2318 if (unlikely(!page)) {
2319 if (migratetype == MIGRATE_MOVABLE) 2319 if (migratetype == MIGRATE_MOVABLE)
2320 page = __rmqueue_cma_fallback(zone, order); 2320 page = __rmqueue_cma_fallback(zone, order);
2321 2321
2322 if (!page && __rmqueue_fallback(zone, order, migratetype)) 2322 if (!page && __rmqueue_fallback(zone, order, migratetype))
2323 goto retry; 2323 goto retry;
2324 } 2324 }
2325 2325
2326 trace_mm_page_alloc_zone_locked(page, order, migratetype); 2326 trace_mm_page_alloc_zone_locked(page, order, migratetype);
2327 return page; 2327 return page;
2328 } 2328 }
2329 2329
2330 /* 2330 /*
2331 * Obtain a specified number of elements from the buddy allocator, all under 2331 * Obtain a specified number of elements from the buddy allocator, all under
2332 * a single hold of the lock, for efficiency. Add them to the supplied list. 2332 * a single hold of the lock, for efficiency. Add them to the supplied list.
2333 * Returns the number of new pages which were placed at *list. 2333 * Returns the number of new pages which were placed at *list.
2334 */ 2334 */
2335 static int rmqueue_bulk(struct zone *zone, unsigned int order, 2335 static int rmqueue_bulk(struct zone *zone, unsigned int order,
2336 unsigned long count, struct list_head *list, 2336 unsigned long count, struct list_head *list,
2337 int migratetype, bool cold) 2337 int migratetype, bool cold)
2338 { 2338 {
2339 int i, alloced = 0; 2339 int i, alloced = 0;
2340 2340
2341 spin_lock(&zone->lock); 2341 spin_lock(&zone->lock);
2342 for (i = 0; i < count; ++i) { 2342 for (i = 0; i < count; ++i) {
2343 struct page *page = __rmqueue(zone, order, migratetype); 2343 struct page *page = __rmqueue(zone, order, migratetype);
2344 if (unlikely(page == NULL)) 2344 if (unlikely(page == NULL))
2345 break; 2345 break;
2346 2346
2347 if (unlikely(check_pcp_refill(page))) 2347 if (unlikely(check_pcp_refill(page)))
2348 continue; 2348 continue;
2349 2349
2350 /* 2350 /*
2351 * Split buddy pages returned by expand() are received here 2351 * Split buddy pages returned by expand() are received here
2352 * in physical page order. The page is added to the caller's 2352 * in physical page order. The page is added to the caller's
2353 * list and the list head then moves forward. From the caller's 2353 * list and the list head then moves forward. From the caller's
2354 * perspective, the linked list is ordered by page number in 2354 * perspective, the linked list is ordered by page number in
2355 * some conditions. This is useful for IO devices that can 2355 * some conditions. This is useful for IO devices that can
2356 * merge IO requests if the physical pages are ordered 2356 * merge IO requests if the physical pages are ordered
2357 * properly. 2357 * properly.
2358 */ 2358 */
2359 if (likely(!cold)) 2359 if (likely(!cold))
2360 list_add(&page->lru, list); 2360 list_add(&page->lru, list);
2361 else 2361 else
2362 list_add_tail(&page->lru, list); 2362 list_add_tail(&page->lru, list);
2363 list = &page->lru; 2363 list = &page->lru;
2364 alloced++; 2364 alloced++;
2365 if (is_migrate_cma(get_pcppage_migratetype(page))) 2365 if (is_migrate_cma(get_pcppage_migratetype(page)))
2366 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 2366 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
2367 -(1 << order)); 2367 -(1 << order));
2368 } 2368 }
2369 2369
2370 /* 2370 /*
2371 * i pages were removed from the buddy list even if some leak due 2371 * i pages were removed from the buddy list even if some leak due
2372 * to check_pcp_refill failing so adjust NR_FREE_PAGES based 2372 * to check_pcp_refill failing so adjust NR_FREE_PAGES based
2373 * on i. Do not confuse with 'alloced' which is the number of 2373 * on i. Do not confuse with 'alloced' which is the number of
2374 * pages added to the pcp list. 2374 * pages added to the pcp list.
2375 */ 2375 */
2376 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 2376 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
2377 spin_unlock(&zone->lock); 2377 spin_unlock(&zone->lock);
2378 return alloced; 2378 return alloced;
2379 } 2379 }
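As a worked example of the accounting above: if count is 31 and two of the removed pages fail check_pcp_refill(), 'alloced' ends up at 29, yet NR_FREE_PAGES is still decreased by the full 31 << order because all 31 pages left the buddy lists.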
2380 2380
2381 #ifdef CONFIG_NUMA 2381 #ifdef CONFIG_NUMA
2382 /* 2382 /*
2383 * Called from the vmstat counter updater to drain pagesets of this 2383 * Called from the vmstat counter updater to drain pagesets of this
2384 * currently executing processor on remote nodes after they have 2384 * currently executing processor on remote nodes after they have
2385 * expired. 2385 * expired.
2386 * 2386 *
2387 * Note that this function must be called with the thread pinned to 2387 * Note that this function must be called with the thread pinned to
2388 * a single processor. 2388 * a single processor.
2389 */ 2389 */
2390 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 2390 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
2391 { 2391 {
2392 unsigned long flags; 2392 unsigned long flags;
2393 int to_drain, batch; 2393 int to_drain, batch;
2394 2394
2395 local_irq_save(flags); 2395 local_irq_save(flags);
2396 batch = READ_ONCE(pcp->batch); 2396 batch = READ_ONCE(pcp->batch);
2397 to_drain = min(pcp->count, batch); 2397 to_drain = min(pcp->count, batch);
2398 if (to_drain > 0) { 2398 if (to_drain > 0) {
2399 free_pcppages_bulk(zone, to_drain, pcp); 2399 free_pcppages_bulk(zone, to_drain, pcp);
2400 pcp->count -= to_drain; 2400 pcp->count -= to_drain;
2401 } 2401 }
2402 local_irq_restore(flags); 2402 local_irq_restore(flags);
2403 } 2403 }
2404 #endif 2404 #endif
2405 2405
2406 /* 2406 /*
2407 * Drain pcplists of the indicated processor and zone. 2407 * Drain pcplists of the indicated processor and zone.
2408 * 2408 *
2409 * The processor must either be the current processor and the 2409 * The processor must either be the current processor and the
2410 * thread pinned to the current processor or a processor that 2410 * thread pinned to the current processor or a processor that
2411 * is not online. 2411 * is not online.
2412 */ 2412 */
2413 static void drain_pages_zone(unsigned int cpu, struct zone *zone) 2413 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
2414 { 2414 {
2415 unsigned long flags; 2415 unsigned long flags;
2416 struct per_cpu_pageset *pset; 2416 struct per_cpu_pageset *pset;
2417 struct per_cpu_pages *pcp; 2417 struct per_cpu_pages *pcp;
2418 2418
2419 local_irq_save(flags); 2419 local_irq_save(flags);
2420 pset = per_cpu_ptr(zone->pageset, cpu); 2420 pset = per_cpu_ptr(zone->pageset, cpu);
2421 2421
2422 pcp = &pset->pcp; 2422 pcp = &pset->pcp;
2423 if (pcp->count) { 2423 if (pcp->count) {
2424 free_pcppages_bulk(zone, pcp->count, pcp); 2424 free_pcppages_bulk(zone, pcp->count, pcp);
2425 pcp->count = 0; 2425 pcp->count = 0;
2426 } 2426 }
2427 local_irq_restore(flags); 2427 local_irq_restore(flags);
2428 } 2428 }
2429 2429
2430 /* 2430 /*
2431 * Drain pcplists of all zones on the indicated processor. 2431 * Drain pcplists of all zones on the indicated processor.
2432 * 2432 *
2433 * The processor must either be the current processor and the 2433 * The processor must either be the current processor and the
2434 * thread pinned to the current processor or a processor that 2434 * thread pinned to the current processor or a processor that
2435 * is not online. 2435 * is not online.
2436 */ 2436 */
2437 static void drain_pages(unsigned int cpu) 2437 static void drain_pages(unsigned int cpu)
2438 { 2438 {
2439 struct zone *zone; 2439 struct zone *zone;
2440 2440
2441 for_each_populated_zone(zone) { 2441 for_each_populated_zone(zone) {
2442 drain_pages_zone(cpu, zone); 2442 drain_pages_zone(cpu, zone);
2443 } 2443 }
2444 } 2444 }
2445 2445
2446 /* 2446 /*
2447 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 2447 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
2448 * 2448 *
2449 * The CPU has to be pinned. When zone parameter is non-NULL, spill just 2449 * The CPU has to be pinned. When zone parameter is non-NULL, spill just
2450 * the single zone's pages. 2450 * the single zone's pages.
2451 */ 2451 */
2452 void drain_local_pages(struct zone *zone) 2452 void drain_local_pages(struct zone *zone)
2453 { 2453 {
2454 int cpu = smp_processor_id(); 2454 int cpu = smp_processor_id();
2455 2455
2456 if (zone) 2456 if (zone)
2457 drain_pages_zone(cpu, zone); 2457 drain_pages_zone(cpu, zone);
2458 else 2458 else
2459 drain_pages(cpu); 2459 drain_pages(cpu);
2460 } 2460 }
2461 2461
2462 static void drain_local_pages_wq(struct work_struct *work) 2462 static void drain_local_pages_wq(struct work_struct *work)
2463 { 2463 {
2464 /* 2464 /*
2465 * drain_all_pages doesn't use proper cpu hotplug protection so 2465 * drain_all_pages doesn't use proper cpu hotplug protection so
2466 * we can race with cpu offline when the WQ can move this from 2466 * we can race with cpu offline when the WQ can move this from
2467 * a cpu pinned worker to an unbound one. We can operate on a different 2467 * a cpu pinned worker to an unbound one. We can operate on a different
2468 * cpu, which is all right, but we also have to make sure not to move to 2468 * cpu, which is all right, but we also have to make sure not to move to
2469 * a different one. 2469 * a different one.
2470 */ 2470 */
2471 preempt_disable(); 2471 preempt_disable();
2472 drain_local_pages(NULL); 2472 drain_local_pages(NULL);
2473 preempt_enable(); 2473 preempt_enable();
2474 } 2474 }
2475 2475
2476 /* 2476 /*
2477 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 2477 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2478 * 2478 *
2479 * When zone parameter is non-NULL, spill just the single zone's pages. 2479 * When zone parameter is non-NULL, spill just the single zone's pages.
2480 * 2480 *
2481 * Note that this can be extremely slow as the draining happens in a workqueue. 2481 * Note that this can be extremely slow as the draining happens in a workqueue.
2482 */ 2482 */
2483 void drain_all_pages(struct zone *zone) 2483 void drain_all_pages(struct zone *zone)
2484 { 2484 {
2485 int cpu; 2485 int cpu;
2486 2486
2487 /* 2487 /*
2488 * Allocate in the BSS so we won't require allocation in 2488 * Allocate in the BSS so we won't require allocation in
2489 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 2489 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
2490 */ 2490 */
2491 static cpumask_t cpus_with_pcps; 2491 static cpumask_t cpus_with_pcps;
2492 2492
2493 /* 2493 /*
2494 * Make sure nobody triggers this path before mm_percpu_wq is fully 2494 * Make sure nobody triggers this path before mm_percpu_wq is fully
2495 * initialized. 2495 * initialized.
2496 */ 2496 */
2497 if (WARN_ON_ONCE(!mm_percpu_wq)) 2497 if (WARN_ON_ONCE(!mm_percpu_wq))
2498 return; 2498 return;
2499 2499
2500 /* 2500 /*
2501 * Do not drain if one is already in progress unless it's specific to 2501 * Do not drain if one is already in progress unless it's specific to
2502 * a zone. Such callers are primarily CMA and memory hotplug and need 2502 * a zone. Such callers are primarily CMA and memory hotplug and need
2503 * the drain to be complete when the call returns. 2503 * the drain to be complete when the call returns.
2504 */ 2504 */
2505 if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) { 2505 if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
2506 if (!zone) 2506 if (!zone)
2507 return; 2507 return;
2508 mutex_lock(&pcpu_drain_mutex); 2508 mutex_lock(&pcpu_drain_mutex);
2509 } 2509 }
2510 2510
2511 /* 2511 /*
2512 * We don't care about racing with the CPU hotplug event, 2512 * We don't care about racing with the CPU hotplug event,
2513 * as the offline notification will cause the notified 2513 * as the offline notification will cause the notified
2514 * cpu to drain that CPU's pcps, and on_each_cpu_mask 2514 * cpu to drain that CPU's pcps, and on_each_cpu_mask
2515 * disables preemption as part of its processing 2515 * disables preemption as part of its processing
2516 */ 2516 */
2517 for_each_online_cpu(cpu) { 2517 for_each_online_cpu(cpu) {
2518 struct per_cpu_pageset *pcp; 2518 struct per_cpu_pageset *pcp;
2519 struct zone *z; 2519 struct zone *z;
2520 bool has_pcps = false; 2520 bool has_pcps = false;
2521 2521
2522 if (zone) { 2522 if (zone) {
2523 pcp = per_cpu_ptr(zone->pageset, cpu); 2523 pcp = per_cpu_ptr(zone->pageset, cpu);
2524 if (pcp->pcp.count) 2524 if (pcp->pcp.count)
2525 has_pcps = true; 2525 has_pcps = true;
2526 } else { 2526 } else {
2527 for_each_populated_zone(z) { 2527 for_each_populated_zone(z) {
2528 pcp = per_cpu_ptr(z->pageset, cpu); 2528 pcp = per_cpu_ptr(z->pageset, cpu);
2529 if (pcp->pcp.count) { 2529 if (pcp->pcp.count) {
2530 has_pcps = true; 2530 has_pcps = true;
2531 break; 2531 break;
2532 } 2532 }
2533 } 2533 }
2534 } 2534 }
2535 2535
2536 if (has_pcps) 2536 if (has_pcps)
2537 cpumask_set_cpu(cpu, &cpus_with_pcps); 2537 cpumask_set_cpu(cpu, &cpus_with_pcps);
2538 else 2538 else
2539 cpumask_clear_cpu(cpu, &cpus_with_pcps); 2539 cpumask_clear_cpu(cpu, &cpus_with_pcps);
2540 } 2540 }
2541 2541
2542 for_each_cpu(cpu, &cpus_with_pcps) { 2542 for_each_cpu(cpu, &cpus_with_pcps) {
2543 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); 2543 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
2544 INIT_WORK(work, drain_local_pages_wq); 2544 INIT_WORK(work, drain_local_pages_wq);
2545 queue_work_on(cpu, mm_percpu_wq, work); 2545 queue_work_on(cpu, mm_percpu_wq, work);
2546 } 2546 }
2547 for_each_cpu(cpu, &cpus_with_pcps) 2547 for_each_cpu(cpu, &cpus_with_pcps)
2548 flush_work(per_cpu_ptr(&pcpu_drain, cpu)); 2548 flush_work(per_cpu_ptr(&pcpu_drain, cpu));
2549 2549
2550 mutex_unlock(&pcpu_drain_mutex); 2550 mutex_unlock(&pcpu_drain_mutex);
2551 } 2551 }
2552 2552
2553 #ifdef CONFIG_HIBERNATION 2553 #ifdef CONFIG_HIBERNATION
2554 2554
2555 /* 2555 /*
2556 * Touch the watchdog for every WD_PAGE_COUNT pages. 2556 * Touch the watchdog for every WD_PAGE_COUNT pages.
2557 */ 2557 */
2558 #define WD_PAGE_COUNT (128*1024) 2558 #define WD_PAGE_COUNT (128*1024)
2559 2559
2560 void mark_free_pages(struct zone *zone) 2560 void mark_free_pages(struct zone *zone)
2561 { 2561 {
2562 unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; 2562 unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
2563 unsigned long flags; 2563 unsigned long flags;
2564 unsigned int order, t; 2564 unsigned int order, t;
2565 struct page *page; 2565 struct page *page;
2566 2566
2567 if (zone_is_empty(zone)) 2567 if (zone_is_empty(zone))
2568 return; 2568 return;
2569 2569
2570 spin_lock_irqsave(&zone->lock, flags); 2570 spin_lock_irqsave(&zone->lock, flags);
2571 2571
2572 max_zone_pfn = zone_end_pfn(zone); 2572 max_zone_pfn = zone_end_pfn(zone);
2573 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 2573 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
2574 if (pfn_valid(pfn)) { 2574 if (pfn_valid(pfn)) {
2575 page = pfn_to_page(pfn); 2575 page = pfn_to_page(pfn);
2576 2576
2577 if (!--page_count) { 2577 if (!--page_count) {
2578 touch_nmi_watchdog(); 2578 touch_nmi_watchdog();
2579 page_count = WD_PAGE_COUNT; 2579 page_count = WD_PAGE_COUNT;
2580 } 2580 }
2581 2581
2582 if (page_zone(page) != zone) 2582 if (page_zone(page) != zone)
2583 continue; 2583 continue;
2584 2584
2585 if (!swsusp_page_is_forbidden(page)) 2585 if (!swsusp_page_is_forbidden(page))
2586 swsusp_unset_page_free(page); 2586 swsusp_unset_page_free(page);
2587 } 2587 }
2588 2588
2589 for_each_migratetype_order(order, t) { 2589 for_each_migratetype_order(order, t) {
2590 list_for_each_entry(page, 2590 list_for_each_entry(page,
2591 &zone->free_area[order].free_list[t], lru) { 2591 &zone->free_area[order].free_list[t], lru) {
2592 unsigned long i; 2592 unsigned long i;
2593 2593
2594 pfn = page_to_pfn(page); 2594 pfn = page_to_pfn(page);
2595 for (i = 0; i < (1UL << order); i++) { 2595 for (i = 0; i < (1UL << order); i++) {
2596 if (!--page_count) { 2596 if (!--page_count) {
2597 touch_nmi_watchdog(); 2597 touch_nmi_watchdog();
2598 page_count = WD_PAGE_COUNT; 2598 page_count = WD_PAGE_COUNT;
2599 } 2599 }
2600 swsusp_set_page_free(pfn_to_page(pfn + i)); 2600 swsusp_set_page_free(pfn_to_page(pfn + i));
2601 } 2601 }
2602 } 2602 }
2603 } 2603 }
2604 spin_unlock_irqrestore(&zone->lock, flags); 2604 spin_unlock_irqrestore(&zone->lock, flags);
2605 } 2605 }
2606 #endif /* CONFIG_HIBERNATION */ 2606 #endif /* CONFIG_HIBERNATION */
2607 2607
2608 /* 2608 /*
2609 * Free a 0-order page 2609 * Free a 0-order page
2610 * cold == true ? free a cold page : free a hot page 2610 * cold == true ? free a cold page : free a hot page
2611 */ 2611 */
2612 void free_hot_cold_page(struct page *page, bool cold) 2612 void free_hot_cold_page(struct page *page, bool cold)
2613 { 2613 {
2614 struct zone *zone = page_zone(page); 2614 struct zone *zone = page_zone(page);
2615 struct per_cpu_pages *pcp; 2615 struct per_cpu_pages *pcp;
2616 unsigned long flags; 2616 unsigned long flags;
2617 unsigned long pfn = page_to_pfn(page); 2617 unsigned long pfn = page_to_pfn(page);
2618 int migratetype; 2618 int migratetype;
2619 2619
2620 if (!free_pcp_prepare(page)) 2620 if (!free_pcp_prepare(page))
2621 return; 2621 return;
2622 2622
2623 migratetype = get_pfnblock_migratetype(page, pfn); 2623 migratetype = get_pfnblock_migratetype(page, pfn);
2624 set_pcppage_migratetype(page, migratetype); 2624 set_pcppage_migratetype(page, migratetype);
2625 local_irq_save(flags); 2625 local_irq_save(flags);
2626 __count_vm_event(PGFREE); 2626 __count_vm_event(PGFREE);
2627 2627
2628 /* 2628 /*
2629 * We only track unmovable, reclaimable and movable on pcp lists. 2629 * We only track unmovable, reclaimable and movable on pcp lists.
2630 * Free ISOLATE pages back to the allocator because they are being 2630 * Free ISOLATE pages back to the allocator because they are being
2631 * offlined, but treat HIGHATOMIC as movable pages so we can get those 2631 * offlined, but treat HIGHATOMIC as movable pages so we can get those
2632 * areas back if necessary. Otherwise, we may have to free 2632 * areas back if necessary. Otherwise, we may have to free
2633 * excessively into the page allocator. 2633 * excessively into the page allocator.
2634 */ 2634 */
2635 if (migratetype >= MIGRATE_PCPTYPES) { 2635 if (migratetype >= MIGRATE_PCPTYPES) {
2636 if (unlikely(is_migrate_isolate(migratetype))) { 2636 if (unlikely(is_migrate_isolate(migratetype))) {
2637 free_one_page(zone, page, pfn, 0, migratetype); 2637 free_one_page(zone, page, pfn, 0, migratetype);
2638 goto out; 2638 goto out;
2639 } 2639 }
2640 migratetype = MIGRATE_MOVABLE; 2640 migratetype = MIGRATE_MOVABLE;
2641 } 2641 }
2642 2642
2643 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2643 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2644 if (!cold) 2644 if (!cold)
2645 list_add(&page->lru, &pcp->lists[migratetype]); 2645 list_add(&page->lru, &pcp->lists[migratetype]);
2646 else 2646 else
2647 list_add_tail(&page->lru, &pcp->lists[migratetype]); 2647 list_add_tail(&page->lru, &pcp->lists[migratetype]);
2648 pcp->count++; 2648 pcp->count++;
2649 if (pcp->count >= pcp->high) { 2649 if (pcp->count >= pcp->high) {
2650 unsigned long batch = READ_ONCE(pcp->batch); 2650 unsigned long batch = READ_ONCE(pcp->batch);
2651 free_pcppages_bulk(zone, batch, pcp); 2651 free_pcppages_bulk(zone, batch, pcp);
2652 pcp->count -= batch; 2652 pcp->count -= batch;
2653 } 2653 }
2654 2654
2655 out: 2655 out:
2656 local_irq_restore(flags); 2656 local_irq_restore(flags);
2657 } 2657 }
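As a worked example of the drain above, with hypothetical values pcp->high = 90 and pcp->batch = 30: the free that takes pcp->count to 90 triggers free_pcppages_bulk() for 30 pages, leaving 60 pages on the per-cpu list.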
2658 2658
2659 /* 2659 /*
2660 * Free a list of 0-order pages 2660 * Free a list of 0-order pages
2661 */ 2661 */
2662 void free_hot_cold_page_list(struct list_head *list, bool cold) 2662 void free_hot_cold_page_list(struct list_head *list, bool cold)
2663 { 2663 {
2664 struct page *page, *next; 2664 struct page *page, *next;
2665 2665
2666 list_for_each_entry_safe(page, next, list, lru) { 2666 list_for_each_entry_safe(page, next, list, lru) {
2667 trace_mm_page_free_batched(page, cold); 2667 trace_mm_page_free_batched(page, cold);
2668 free_hot_cold_page(page, cold); 2668 free_hot_cold_page(page, cold);
2669 } 2669 }
2670 } 2670 }
2671 2671
2672 /* 2672 /*
2673 * split_page takes a non-compound higher-order page, and splits it into 2673 * split_page takes a non-compound higher-order page, and splits it into
2674 * n (1<<order) sub-pages: page[0..n-1]. 2674 * n (1<<order) sub-pages: page[0..n-1].
2675 * Each sub-page must be freed individually. 2675 * Each sub-page must be freed individually.
2676 * 2676 *
2677 * Note: this is probably too low level an operation for use in drivers. 2677 * Note: this is probably too low level an operation for use in drivers.
2678 * Please consult with lkml before using this in your driver. 2678 * Please consult with lkml before using this in your driver.
2679 */ 2679 */
2680 void split_page(struct page *page, unsigned int order) 2680 void split_page(struct page *page, unsigned int order)
2681 { 2681 {
2682 int i; 2682 int i;
2683 2683
2684 VM_BUG_ON_PAGE(PageCompound(page), page); 2684 VM_BUG_ON_PAGE(PageCompound(page), page);
2685 VM_BUG_ON_PAGE(!page_count(page), page); 2685 VM_BUG_ON_PAGE(!page_count(page), page);
2686 2686
2687 for (i = 1; i < (1 << order); i++) 2687 for (i = 1; i < (1 << order); i++)
2688 set_page_refcounted(page + i); 2688 set_page_refcounted(page + i);
2689 split_page_owner(page, order); 2689 split_page_owner(page, order);
2690 } 2690 }
2691 EXPORT_SYMBOL_GPL(split_page); 2691 EXPORT_SYMBOL_GPL(split_page);
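A minimal caller sketch (kernel context assumed, not taken from this file) showing the intended pattern: allocate a higher-order block, split it, then free each order-0 sub-page individually, as the comment above requires.

	struct page *page;
	int i;

	page = alloc_pages(GFP_KERNEL, 2);	/* order-2 block: 4 pages */
	if (page) {
		split_page(page, 2);		/* page[0..3] become independent */
		for (i = 0; i < (1 << 2); i++)
			__free_page(page + i);	/* each sub-page freed on its own */
	}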
2692 2692
2693 int __isolate_free_page(struct page *page, unsigned int order) 2693 int __isolate_free_page(struct page *page, unsigned int order)
2694 { 2694 {
2695 unsigned long watermark; 2695 unsigned long watermark;
2696 struct zone *zone; 2696 struct zone *zone;
2697 int mt; 2697 int mt;
2698 2698
2699 BUG_ON(!PageBuddy(page)); 2699 BUG_ON(!PageBuddy(page));
2700 2700
2701 zone = page_zone(page); 2701 zone = page_zone(page);
2702 mt = get_pageblock_migratetype(page); 2702 mt = get_pageblock_migratetype(page);
2703 2703
2704 if (!is_migrate_isolate(mt)) { 2704 if (!is_migrate_isolate(mt)) {
2705 /* 2705 /*
2706 * Obey watermarks as if the page was being allocated. We can 2706 * Obey watermarks as if the page was being allocated. We can
2707 * emulate a high-order watermark check with a raised order-0 2707 * emulate a high-order watermark check with a raised order-0
2708 * watermark, because we already know our high-order page 2708 * watermark, because we already know our high-order page
2709 * exists. 2709 * exists.
2710 */ 2710 */
2711 watermark = min_wmark_pages(zone) + (1UL << order); 2711 watermark = min_wmark_pages(zone) + (1UL << order);
2712 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) 2712 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
2713 return 0; 2713 return 0;
2714 2714
2715 __mod_zone_freepage_state(zone, -(1UL << order), mt); 2715 __mod_zone_freepage_state(zone, -(1UL << order), mt);
2716 } 2716 }
2717 2717
2718 /* Remove page from free list */ 2718 /* Remove page from free list */
2719 list_del(&page->lru); 2719 list_del(&page->lru);
2720 zone->free_area[order].nr_free--; 2720 zone->free_area[order].nr_free--;
2721 rmv_page_order(page); 2721 rmv_page_order(page);
2722 2722
2723 /* 2723 /*
2724 * Set the pageblock if the isolated page is at least half of a 2724 * Set the pageblock if the isolated page is at least half of a
2725 * pageblock 2725 * pageblock
2726 */ 2726 */
2727 if (order >= pageblock_order - 1) { 2727 if (order >= pageblock_order - 1) {
2728 struct page *endpage = page + (1 << order) - 1; 2728 struct page *endpage = page + (1 << order) - 1;
2729 for (; page < endpage; page += pageblock_nr_pages) { 2729 for (; page < endpage; page += pageblock_nr_pages) {
2730 int mt = get_pageblock_migratetype(page); 2730 int mt = get_pageblock_migratetype(page);
2731 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) 2731 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
2732 && !is_migrate_highatomic(mt)) 2732 && !is_migrate_highatomic(mt))
2733 set_pageblock_migratetype(page, 2733 set_pageblock_migratetype(page,
2734 MIGRATE_MOVABLE); 2734 MIGRATE_MOVABLE);
2735 } 2735 }
2736 } 2736 }
2737 2737
2738 2738
2739 return 1UL << order; 2739 return 1UL << order;
2740 } 2740 }
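As a worked example of the watermark emulation above: isolating an order-3 page raises the order-0 check by 1 << 3 = 8 pages above the zone's min watermark, which stands in for a high-order watermark check since the high-order page is already known to exist.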
2741 2741
2742 /* 2742 /*
2743 * Update NUMA hit/miss statistics 2743 * Update NUMA hit/miss statistics
2744 * 2744 *
2745 * Must be called with interrupts disabled. 2745 * Must be called with interrupts disabled.
2746 */ 2746 */
2747 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) 2747 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
2748 { 2748 {
2749 #ifdef CONFIG_NUMA 2749 #ifdef CONFIG_NUMA
2750 enum numa_stat_item local_stat = NUMA_LOCAL; 2750 enum numa_stat_item local_stat = NUMA_LOCAL;
2751 2751
2752 if (z->node != numa_node_id()) 2752 if (z->node != numa_node_id())
2753 local_stat = NUMA_OTHER; 2753 local_stat = NUMA_OTHER;
2754 2754
2755 if (z->node == preferred_zone->node) 2755 if (z->node == preferred_zone->node)
2756 __inc_numa_state(z, NUMA_HIT); 2756 __inc_numa_state(z, NUMA_HIT);
2757 else { 2757 else {
2758 __inc_numa_state(z, NUMA_MISS); 2758 __inc_numa_state(z, NUMA_MISS);
2759 __inc_numa_state(preferred_zone, NUMA_FOREIGN); 2759 __inc_numa_state(preferred_zone, NUMA_FOREIGN);
2760 } 2760 }
2761 __inc_numa_state(z, local_stat); 2761 __inc_numa_state(z, local_stat);
2762 #endif 2762 #endif
2763 } 2763 }
2764 2764
2765 /* Remove page from the per-cpu list, caller must protect the list */ 2765 /* Remove page from the per-cpu list, caller must protect the list */
2766 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, 2766 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2767 bool cold, struct per_cpu_pages *pcp, 2767 bool cold, struct per_cpu_pages *pcp,
2768 struct list_head *list) 2768 struct list_head *list)
2769 { 2769 {
2770 struct page *page; 2770 struct page *page;
2771 2771
2772 do { 2772 do {
2773 if (list_empty(list)) { 2773 if (list_empty(list)) {
2774 pcp->count += rmqueue_bulk(zone, 0, 2774 pcp->count += rmqueue_bulk(zone, 0,
2775 pcp->batch, list, 2775 pcp->batch, list,
2776 migratetype, cold); 2776 migratetype, cold);
2777 if (unlikely(list_empty(list))) 2777 if (unlikely(list_empty(list)))
2778 return NULL; 2778 return NULL;
2779 } 2779 }
2780 2780
2781 if (cold) 2781 if (cold)
2782 page = list_last_entry(list, struct page, lru); 2782 page = list_last_entry(list, struct page, lru);
2783 else 2783 else
2784 page = list_first_entry(list, struct page, lru); 2784 page = list_first_entry(list, struct page, lru);
2785 2785
2786 list_del(&page->lru); 2786 list_del(&page->lru);
2787 pcp->count--; 2787 pcp->count--;
2788 } while (check_new_pcp(page)); 2788 } while (check_new_pcp(page));
2789 2789
2790 return page; 2790 return page;
2791 } 2791 }
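Note how this mirrors free_hot_cold_page() above: hot pages are added at the head and cold pages at the tail of the per-cpu list, so a cold request takes from the tail (list_last_entry) while a hot request takes the most recently freed, cache-warm page from the head.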
2792 2792
2793 /* Lock and remove page from the per-cpu list */ 2793 /* Lock and remove page from the per-cpu list */
2794 static struct page *rmqueue_pcplist(struct zone *preferred_zone, 2794 static struct page *rmqueue_pcplist(struct zone *preferred_zone,
2795 struct zone *zone, unsigned int order, 2795 struct zone *zone, unsigned int order,
2796 gfp_t gfp_flags, int migratetype) 2796 gfp_t gfp_flags, int migratetype)
2797 { 2797 {
2798 struct per_cpu_pages *pcp; 2798 struct per_cpu_pages *pcp;
2799 struct list_head *list; 2799 struct list_head *list;
2800 bool cold = ((gfp_flags & __GFP_COLD) != 0); 2800 bool cold = ((gfp_flags & __GFP_COLD) != 0);
2801 struct page *page; 2801 struct page *page;
2802 unsigned long flags; 2802 unsigned long flags;
2803 2803
2804 local_irq_save(flags); 2804 local_irq_save(flags);
2805 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2805 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2806 list = &pcp->lists[migratetype]; 2806 list = &pcp->lists[migratetype];
2807 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); 2807 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
2808 if (page) { 2808 if (page) {
2809 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 2809 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2810 zone_statistics(preferred_zone, zone); 2810 zone_statistics(preferred_zone, zone);
2811 } 2811 }
2812 local_irq_restore(flags); 2812 local_irq_restore(flags);
2813 return page; 2813 return page;
2814 } 2814 }
2815 2815
2816 /* 2816 /*
2817 * Allocate a page from the given zone. Use pcplists for order-0 allocations. 2817 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
2818 */ 2818 */
2819 static inline 2819 static inline
2820 struct page *rmqueue(struct zone *preferred_zone, 2820 struct page *rmqueue(struct zone *preferred_zone,
2821 struct zone *zone, unsigned int order, 2821 struct zone *zone, unsigned int order,
2822 gfp_t gfp_flags, unsigned int alloc_flags, 2822 gfp_t gfp_flags, unsigned int alloc_flags,
2823 int migratetype) 2823 int migratetype)
2824 { 2824 {
2825 unsigned long flags; 2825 unsigned long flags;
2826 struct page *page; 2826 struct page *page;
2827 2827
2828 if (likely(order == 0)) { 2828 if (likely(order == 0)) {
2829 page = rmqueue_pcplist(preferred_zone, zone, order, 2829 page = rmqueue_pcplist(preferred_zone, zone, order,
2830 gfp_flags, migratetype); 2830 gfp_flags, migratetype);
2831 goto out; 2831 goto out;
2832 } 2832 }
2833 2833
2834 /* 2834 /*
2835 * We most definitely don't want callers attempting to 2835 * We most definitely don't want callers attempting to
2836 * allocate greater than order-1 page units with __GFP_NOFAIL. 2836 * allocate greater than order-1 page units with __GFP_NOFAIL.
2837 */ 2837 */
2838 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); 2838 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
2839 spin_lock_irqsave(&zone->lock, flags); 2839 spin_lock_irqsave(&zone->lock, flags);
2840 2840
2841 do { 2841 do {
2842 page = NULL; 2842 page = NULL;
2843 if (alloc_flags & ALLOC_HARDER) { 2843 if (alloc_flags & ALLOC_HARDER) {
2844 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); 2844 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
2845 if (page) 2845 if (page)
2846 trace_mm_page_alloc_zone_locked(page, order, migratetype); 2846 trace_mm_page_alloc_zone_locked(page, order, migratetype);
2847 } 2847 }
2848 if (!page) 2848 if (!page)
2849 page = __rmqueue(zone, order, migratetype); 2849 page = __rmqueue(zone, order, migratetype);
2850 } while (page && check_new_pages(page, order)); 2850 } while (page && check_new_pages(page, order));
2851 spin_unlock(&zone->lock); 2851 spin_unlock(&zone->lock);
2852 if (!page) 2852 if (!page)
2853 goto failed; 2853 goto failed;
2854 __mod_zone_freepage_state(zone, -(1 << order), 2854 __mod_zone_freepage_state(zone, -(1 << order),
2855 get_pcppage_migratetype(page)); 2855 get_pcppage_migratetype(page));
2856 2856
2857 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 2857 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2858 zone_statistics(preferred_zone, zone); 2858 zone_statistics(preferred_zone, zone);
2859 local_irq_restore(flags); 2859 local_irq_restore(flags);
2860 2860
2861 out: 2861 out:
2862 VM_BUG_ON_PAGE(page && bad_range(zone, page), page); 2862 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
2863 return page; 2863 return page;
2864 2864
2865 failed: 2865 failed:
2866 local_irq_restore(flags); 2866 local_irq_restore(flags);
2867 return NULL; 2867 return NULL;
2868 } 2868 }
2869 2869
2870 #ifdef CONFIG_FAIL_PAGE_ALLOC 2870 #ifdef CONFIG_FAIL_PAGE_ALLOC
2871 2871
2872 static struct { 2872 static struct {
2873 struct fault_attr attr; 2873 struct fault_attr attr;
2874 2874
2875 bool ignore_gfp_highmem; 2875 bool ignore_gfp_highmem;
2876 bool ignore_gfp_reclaim; 2876 bool ignore_gfp_reclaim;
2877 u32 min_order; 2877 u32 min_order;
2878 } fail_page_alloc = { 2878 } fail_page_alloc = {
2879 .attr = FAULT_ATTR_INITIALIZER, 2879 .attr = FAULT_ATTR_INITIALIZER,
2880 .ignore_gfp_reclaim = true, 2880 .ignore_gfp_reclaim = true,
2881 .ignore_gfp_highmem = true, 2881 .ignore_gfp_highmem = true,
2882 .min_order = 1, 2882 .min_order = 1,
2883 }; 2883 };
2884 2884
2885 static int __init setup_fail_page_alloc(char *str) 2885 static int __init setup_fail_page_alloc(char *str)
2886 { 2886 {
2887 return setup_fault_attr(&fail_page_alloc.attr, str); 2887 return setup_fault_attr(&fail_page_alloc.attr, str);
2888 } 2888 }
2889 __setup("fail_page_alloc=", setup_fail_page_alloc); 2889 __setup("fail_page_alloc=", setup_fail_page_alloc);
2890 2890
2891 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 2891 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
2892 { 2892 {
2893 if (order < fail_page_alloc.min_order) 2893 if (order < fail_page_alloc.min_order)
2894 return false; 2894 return false;
2895 if (gfp_mask & __GFP_NOFAIL) 2895 if (gfp_mask & __GFP_NOFAIL)
2896 return false; 2896 return false;
2897 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 2897 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
2898 return false; 2898 return false;
2899 if (fail_page_alloc.ignore_gfp_reclaim && 2899 if (fail_page_alloc.ignore_gfp_reclaim &&
2900 (gfp_mask & __GFP_DIRECT_RECLAIM)) 2900 (gfp_mask & __GFP_DIRECT_RECLAIM))
2901 return false; 2901 return false;
2902 2902
2903 return should_fail(&fail_page_alloc.attr, 1 << order); 2903 return should_fail(&fail_page_alloc.attr, 1 << order);
2904 } 2904 }
2905 2905
2906 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 2906 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
2907 2907
2908 static int __init fail_page_alloc_debugfs(void) 2908 static int __init fail_page_alloc_debugfs(void)
2909 { 2909 {
2910 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 2910 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
2911 struct dentry *dir; 2911 struct dentry *dir;
2912 2912
2913 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 2913 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
2914 &fail_page_alloc.attr); 2914 &fail_page_alloc.attr);
2915 if (IS_ERR(dir)) 2915 if (IS_ERR(dir))
2916 return PTR_ERR(dir); 2916 return PTR_ERR(dir);
2917 2917
2918 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 2918 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
2919 &fail_page_alloc.ignore_gfp_reclaim)) 2919 &fail_page_alloc.ignore_gfp_reclaim))
2920 goto fail; 2920 goto fail;
2921 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 2921 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
2922 &fail_page_alloc.ignore_gfp_highmem)) 2922 &fail_page_alloc.ignore_gfp_highmem))
2923 goto fail; 2923 goto fail;
2924 if (!debugfs_create_u32("min-order", mode, dir, 2924 if (!debugfs_create_u32("min-order", mode, dir,
2925 &fail_page_alloc.min_order)) 2925 &fail_page_alloc.min_order))
2926 goto fail; 2926 goto fail;
2927 2927
2928 return 0; 2928 return 0;
2929 fail: 2929 fail:
2930 debugfs_remove_recursive(dir); 2930 debugfs_remove_recursive(dir);
2931 2931
2932 return -ENOMEM; 2932 return -ENOMEM;
2933 } 2933 }
2934 2934
2935 late_initcall(fail_page_alloc_debugfs); 2935 late_initcall(fail_page_alloc_debugfs);
2936 2936
2937 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 2937 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
2938 2938
2939 #else /* CONFIG_FAIL_PAGE_ALLOC */ 2939 #else /* CONFIG_FAIL_PAGE_ALLOC */
2940 2940
2941 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 2941 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
2942 { 2942 {
2943 return false; 2943 return false;
2944 } 2944 }
2945 2945
2946 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 2946 #endif /* CONFIG_FAIL_PAGE_ALLOC */
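For reference, the fail_page_alloc knobs above are driven through the generic fault-injection framework. A minimal usage sketch, assuming the standard fault_attr boot-parameter format ("<interval>,<probability>,<space>,<times>") and the conventional debugfs mount point, both of which come from the fault-injection framework rather than from anything defined in this file:

    fail_page_alloc=1,10,0,-1                       boot parameter: every call eligible, 10% failure probability, no limit on failures
    /sys/kernel/debug/fail_page_alloc/min-order
    /sys/kernel/debug/fail_page_alloc/ignore-gfp-wait
    /sys/kernel/debug/fail_page_alloc/ignore-gfp-highmem

The three debugfs files correspond directly to the attributes created in fail_page_alloc_debugfs() above, and the default min_order of 1 means order-0 allocations are never failed by injection.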
2947 2947
2948 /* 2948 /*
2949 * Return true if free base pages are above 'mark'. For high-order checks it 2949 * Return true if free base pages are above 'mark'. For high-order checks it
2950 * will return true if the order-0 watermark is reached and there is at least 2950 * will return true if the order-0 watermark is reached and there is at least
2951 * one free page of a suitable size. Checking now avoids taking the zone lock 2951 * one free page of a suitable size. Checking now avoids taking the zone lock
2952 * to check in the allocation paths if no pages are free. 2952 * to check in the allocation paths if no pages are free.
2953 */ 2953 */
2954 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 2954 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
2955 int classzone_idx, unsigned int alloc_flags, 2955 int classzone_idx, unsigned int alloc_flags,
2956 long free_pages) 2956 long free_pages)
2957 { 2957 {
2958 long min = mark; 2958 long min = mark;
2959 int o; 2959 int o;
2960 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); 2960 const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
2961 2961
2962 /* free_pages may go negative - that's OK */ 2962 /* free_pages may go negative - that's OK */
2963 free_pages -= (1 << order) - 1; 2963 free_pages -= (1 << order) - 1;
2964 2964
2965 if (alloc_flags & ALLOC_HIGH) 2965 if (alloc_flags & ALLOC_HIGH)
2966 min -= min / 2; 2966 min -= min / 2;
2967 2967
2968 /* 2968 /*
2969 * If the caller does not have rights to ALLOC_HARDER then subtract 2969 * If the caller does not have rights to ALLOC_HARDER then subtract
2970 * the high-atomic reserves. This will over-estimate the size of the 2970 * the high-atomic reserves. This will over-estimate the size of the
2971 * atomic reserve but it avoids a search. 2971 * atomic reserve but it avoids a search.
2972 */ 2972 */
2973 if (likely(!alloc_harder)) { 2973 if (likely(!alloc_harder)) {
2974 free_pages -= z->nr_reserved_highatomic; 2974 free_pages -= z->nr_reserved_highatomic;
2975 } else { 2975 } else {
2976 /* 2976 /*
2977 * OOM victims can try even harder than normal ALLOC_HARDER 2977 * OOM victims can try even harder than normal ALLOC_HARDER
2978 * users on the grounds that they are definitely going to exit 2978 * users on the grounds that they are definitely going to exit
2979 * shortly and free memory. Any allocation they make during the 2979 * shortly and free memory. Any allocation they make during the
2980 * exit path will be small and short-lived. 2980 * exit path will be small and short-lived.
2981 */ 2981 */
2982 if (alloc_flags & ALLOC_OOM) 2982 if (alloc_flags & ALLOC_OOM)
2983 min -= min / 2; 2983 min -= min / 2;
2984 else 2984 else
2985 min -= min / 4; 2985 min -= min / 4;
2986 } 2986 }
2987 2987
2988 2988
2989 #ifdef CONFIG_CMA 2989 #ifdef CONFIG_CMA
2990 /* If allocation can't use CMA areas don't use free CMA pages */ 2990 /* If allocation can't use CMA areas don't use free CMA pages */
2991 if (!(alloc_flags & ALLOC_CMA)) 2991 if (!(alloc_flags & ALLOC_CMA))
2992 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); 2992 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
2993 #endif 2993 #endif
2994 2994
2995 /* 2995 /*
2996 * Check watermarks for an order-0 allocation request. If these 2996 * Check watermarks for an order-0 allocation request. If these
2997 * are not met, then a high-order request also cannot go ahead 2997 * are not met, then a high-order request also cannot go ahead
2998 * even if a suitable page happened to be free. 2998 * even if a suitable page happened to be free.
2999 */ 2999 */
3000 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 3000 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
3001 return false; 3001 return false;
3002 3002
3003 /* If this is an order-0 request then the watermark is fine */ 3003 /* If this is an order-0 request then the watermark is fine */
3004 if (!order) 3004 if (!order)
3005 return true; 3005 return true;
3006 3006
3007 /* For a high-order request, check at least one suitable page is free */ 3007 /* For a high-order request, check at least one suitable page is free */
3008 for (o = order; o < MAX_ORDER; o++) { 3008 for (o = order; o < MAX_ORDER; o++) {
3009 struct free_area *area = &z->free_area[o]; 3009 struct free_area *area = &z->free_area[o];
3010 int mt; 3010 int mt;
3011 3011
3012 if (!area->nr_free) 3012 if (!area->nr_free)
3013 continue; 3013 continue;
3014 3014
3015 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { 3015 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
3016 if (!list_empty(&area->free_list[mt])) 3016 if (!list_empty(&area->free_list[mt]))
3017 return true; 3017 return true;
3018 } 3018 }
3019 3019
3020 #ifdef CONFIG_CMA 3020 #ifdef CONFIG_CMA
3021 if ((alloc_flags & ALLOC_CMA) && 3021 if ((alloc_flags & ALLOC_CMA) &&
3022 !list_empty(&area->free_list[MIGRATE_CMA])) { 3022 !list_empty(&area->free_list[MIGRATE_CMA])) {
3023 return true; 3023 return true;
3024 } 3024 }
3025 #endif 3025 #endif
3026 if (alloc_harder && 3026 if (alloc_harder &&
3027 !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) 3027 !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
3028 return true; 3028 return true;
3029 } 3029 }
3030 return false; 3030 return false;
3031 } 3031 }
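To make the reductions above concrete, here is a small self-contained sketch (illustrative only, not kernel code) that mirrors how the effective minimum shrinks for the different classes of caller, together with one worked value:

	/* Sketch of the min-watermark reductions performed in __zone_watermark_ok(). */
	static long effective_min(long mark, int alloc_high, int alloc_harder, int alloc_oom)
	{
		long min = mark;

		if (alloc_high)			/* ALLOC_HIGH, i.e. __GFP_HIGH callers */
			min -= min / 2;
		if (alloc_harder || alloc_oom) {
			if (alloc_oom)		/* OOM victims dig deepest */
				min -= min / 2;
			else			/* e.g. GFP_ATOMIC */
				min -= min / 4;
		}
		/* mark = 128, GFP_ATOMIC (high + harder): 128 -> 64 -> 48 */
		return min;
	}

The high-atomic reserve subtraction is deliberately left out of the sketch because it adjusts free_pages rather than the minimum itself.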
3032 3032
3033 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 3033 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3034 int classzone_idx, unsigned int alloc_flags) 3034 int classzone_idx, unsigned int alloc_flags)
3035 { 3035 {
3036 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 3036 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3037 zone_page_state(z, NR_FREE_PAGES)); 3037 zone_page_state(z, NR_FREE_PAGES));
3038 } 3038 }
3039 3039
3040 static inline bool zone_watermark_fast(struct zone *z, unsigned int order, 3040 static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
3041 unsigned long mark, int classzone_idx, unsigned int alloc_flags) 3041 unsigned long mark, int classzone_idx, unsigned int alloc_flags)
3042 { 3042 {
3043 long free_pages = zone_page_state(z, NR_FREE_PAGES); 3043 long free_pages = zone_page_state(z, NR_FREE_PAGES);
3044 long cma_pages = 0; 3044 long cma_pages = 0;
3045 3045
3046 #ifdef CONFIG_CMA 3046 #ifdef CONFIG_CMA
3047 /* If allocation can't use CMA areas don't use free CMA pages */ 3047 /* If allocation can't use CMA areas don't use free CMA pages */
3048 if (!(alloc_flags & ALLOC_CMA)) 3048 if (!(alloc_flags & ALLOC_CMA))
3049 cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); 3049 cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
3050 #endif 3050 #endif
3051 3051
3052 /* 3052 /*
3053 * Fast check for order-0 only. If this fails then the reserves 3053 * Fast check for order-0 only. If this fails then the reserves
3054 * need to be calculated. There is a corner case where the check 3054 * need to be calculated. There is a corner case where the check
3055 * passes but only the high-order atomic reserves are free. If 3055 * passes but only the high-order atomic reserves are free. If
3056 * the caller is !atomic then it'll uselessly search the free 3056 * the caller is !atomic then it'll uselessly search the free
3057 * list. That corner case is then slower but it is harmless. 3057 * list. That corner case is then slower but it is harmless.
3058 */ 3058 */
3059 if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) 3059 if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
3060 return true; 3060 return true;
3061 3061
3062 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 3062 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3063 free_pages); 3063 free_pages);
3064 } 3064 }
3065 3065
3066 bool zone_watermark_ok_safe(struct zone *z, unsigned int order, 3066 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
3067 unsigned long mark, int classzone_idx) 3067 unsigned long mark, int classzone_idx)
3068 { 3068 {
3069 long free_pages = zone_page_state(z, NR_FREE_PAGES); 3069 long free_pages = zone_page_state(z, NR_FREE_PAGES);
3070 3070
3071 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 3071 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
3072 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 3072 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
3073 3073
3074 return __zone_watermark_ok(z, order, mark, classzone_idx, 0, 3074 return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
3075 free_pages); 3075 free_pages);
3076 } 3076 }
3077 3077
3078 #ifdef CONFIG_NUMA 3078 #ifdef CONFIG_NUMA
3079 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3079 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3080 { 3080 {
3081 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= 3081 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
3082 RECLAIM_DISTANCE; 3082 RECLAIM_DISTANCE;
3083 } 3083 }
3084 #else /* CONFIG_NUMA */ 3084 #else /* CONFIG_NUMA */
3085 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3085 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3086 { 3086 {
3087 return true; 3087 return true;
3088 } 3088 }
3089 #endif /* CONFIG_NUMA */ 3089 #endif /* CONFIG_NUMA */
3090 3090
3091 /* 3091 /*
3092 * get_page_from_freelist goes through the zonelist trying to allocate 3092 * get_page_from_freelist goes through the zonelist trying to allocate
3093 * a page. 3093 * a page.
3094 */ 3094 */
3095 static struct page * 3095 static struct page *
3096 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, 3096 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3097 const struct alloc_context *ac) 3097 const struct alloc_context *ac)
3098 { 3098 {
3099 struct zoneref *z = ac->preferred_zoneref; 3099 struct zoneref *z = ac->preferred_zoneref;
3100 struct zone *zone; 3100 struct zone *zone;
3101 struct pglist_data *last_pgdat_dirty_limit = NULL; 3101 struct pglist_data *last_pgdat_dirty_limit = NULL;
3102 3102
3103 /* 3103 /*
3104 * Scan zonelist, looking for a zone with enough free. 3104 * Scan zonelist, looking for a zone with enough free.
3105 * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 3105 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
3106 */ 3106 */
3107 for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, 3107 for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3108 ac->nodemask) { 3108 ac->nodemask) {
3109 struct page *page; 3109 struct page *page;
3110 unsigned long mark; 3110 unsigned long mark;
3111 3111
3112 if (cpusets_enabled() && 3112 if (cpusets_enabled() &&
3113 (alloc_flags & ALLOC_CPUSET) && 3113 (alloc_flags & ALLOC_CPUSET) &&
3114 !__cpuset_zone_allowed(zone, gfp_mask)) 3114 !__cpuset_zone_allowed(zone, gfp_mask))
3115 continue; 3115 continue;
3116 /* 3116 /*
3117 * When allocating a page cache page for writing, we 3117 * When allocating a page cache page for writing, we
3118 * want to get it from a node that is within its dirty 3118 * want to get it from a node that is within its dirty
3119 * limit, such that no single node holds more than its 3119 * limit, such that no single node holds more than its
3120 * proportional share of globally allowed dirty pages. 3120 * proportional share of globally allowed dirty pages.
3121 * The dirty limits take into account the node's 3121 * The dirty limits take into account the node's
3122 * lowmem reserves and high watermark so that kswapd 3122 * lowmem reserves and high watermark so that kswapd
3123 * should be able to balance it without having to 3123 * should be able to balance it without having to
3124 * write pages from its LRU list. 3124 * write pages from its LRU list.
3125 * 3125 *
3126 * XXX: For now, allow allocations to potentially 3126 * XXX: For now, allow allocations to potentially
3127 * exceed the per-node dirty limit in the slowpath 3127 * exceed the per-node dirty limit in the slowpath
3128 * (spread_dirty_pages unset) before going into reclaim, 3128 * (spread_dirty_pages unset) before going into reclaim,
3129 * which is important when on a NUMA setup the allowed 3129 * which is important when on a NUMA setup the allowed
3130 * nodes are together not big enough to reach the 3130 * nodes are together not big enough to reach the
3131 * global limit. The proper fix for these situations 3131 * global limit. The proper fix for these situations
3132 * will require awareness of nodes in the 3132 * will require awareness of nodes in the
3133 * dirty-throttling and the flusher threads. 3133 * dirty-throttling and the flusher threads.
3134 */ 3134 */
3135 if (ac->spread_dirty_pages) { 3135 if (ac->spread_dirty_pages) {
3136 if (last_pgdat_dirty_limit == zone->zone_pgdat) 3136 if (last_pgdat_dirty_limit == zone->zone_pgdat)
3137 continue; 3137 continue;
3138 3138
3139 if (!node_dirty_ok(zone->zone_pgdat)) { 3139 if (!node_dirty_ok(zone->zone_pgdat)) {
3140 last_pgdat_dirty_limit = zone->zone_pgdat; 3140 last_pgdat_dirty_limit = zone->zone_pgdat;
3141 continue; 3141 continue;
3142 } 3142 }
3143 } 3143 }
3144 3144
3145 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 3145 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
3146 if (!zone_watermark_fast(zone, order, mark, 3146 if (!zone_watermark_fast(zone, order, mark,
3147 ac_classzone_idx(ac), alloc_flags)) { 3147 ac_classzone_idx(ac), alloc_flags)) {
3148 int ret; 3148 int ret;
3149 3149
3150 /* Checked here to keep the fast path fast */ 3150 /* Checked here to keep the fast path fast */
3151 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 3151 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
3152 if (alloc_flags & ALLOC_NO_WATERMARKS) 3152 if (alloc_flags & ALLOC_NO_WATERMARKS)
3153 goto try_this_zone; 3153 goto try_this_zone;
3154 3154
3155 if (node_reclaim_mode == 0 || 3155 if (node_reclaim_mode == 0 ||
3156 !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) 3156 !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
3157 continue; 3157 continue;
3158 3158
3159 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); 3159 ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
3160 switch (ret) { 3160 switch (ret) {
3161 case NODE_RECLAIM_NOSCAN: 3161 case NODE_RECLAIM_NOSCAN:
3162 /* did not scan */ 3162 /* did not scan */
3163 continue; 3163 continue;
3164 case NODE_RECLAIM_FULL: 3164 case NODE_RECLAIM_FULL:
3165 /* scanned but unreclaimable */ 3165 /* scanned but unreclaimable */
3166 continue; 3166 continue;
3167 default: 3167 default:
3168 /* did we reclaim enough */ 3168 /* did we reclaim enough */
3169 if (zone_watermark_ok(zone, order, mark, 3169 if (zone_watermark_ok(zone, order, mark,
3170 ac_classzone_idx(ac), alloc_flags)) 3170 ac_classzone_idx(ac), alloc_flags))
3171 goto try_this_zone; 3171 goto try_this_zone;
3172 3172
3173 continue; 3173 continue;
3174 } 3174 }
3175 } 3175 }
3176 3176
3177 try_this_zone: 3177 try_this_zone:
3178 page = rmqueue(ac->preferred_zoneref->zone, zone, order, 3178 page = rmqueue(ac->preferred_zoneref->zone, zone, order,
3179 gfp_mask, alloc_flags, ac->migratetype); 3179 gfp_mask, alloc_flags, ac->migratetype);
3180 if (page) { 3180 if (page) {
3181 prep_new_page(page, order, gfp_mask, alloc_flags); 3181 prep_new_page(page, order, gfp_mask, alloc_flags);
3182 3182
3183 /* 3183 /*
3184 * If this is a high-order atomic allocation then check 3184 * If this is a high-order atomic allocation then check
3185 * if the pageblock should be reserved for the future 3185 * if the pageblock should be reserved for the future
3186 */ 3186 */
3187 if (unlikely(order && (alloc_flags & ALLOC_HARDER))) 3187 if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
3188 reserve_highatomic_pageblock(page, zone, order); 3188 reserve_highatomic_pageblock(page, zone, order);
3189 3189
3190 return page; 3190 return page;
3191 } 3191 }
3192 } 3192 }
3193 3193
3194 return NULL; 3194 return NULL;
3195 } 3195 }
3196 3196
3197 /* 3197 /*
3198 * Large machines with many possible nodes should not always dump per-node 3198 * Large machines with many possible nodes should not always dump per-node
3199 * meminfo in irq context. 3199 * meminfo in irq context.
3200 */ 3200 */
3201 static inline bool should_suppress_show_mem(void) 3201 static inline bool should_suppress_show_mem(void)
3202 { 3202 {
3203 bool ret = false; 3203 bool ret = false;
3204 3204
3205 #if NODES_SHIFT > 8 3205 #if NODES_SHIFT > 8
3206 ret = in_interrupt(); 3206 ret = in_interrupt();
3207 #endif 3207 #endif
3208 return ret; 3208 return ret;
3209 } 3209 }
3210 3210
3211 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) 3211 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
3212 { 3212 {
3213 unsigned int filter = SHOW_MEM_FILTER_NODES; 3213 unsigned int filter = SHOW_MEM_FILTER_NODES;
3214 static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); 3214 static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
3215 3215
3216 if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) 3216 if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
3217 return; 3217 return;
3218 3218
3219 /* 3219 /*
3220 * This documents exceptions given to allocations in certain 3220 * This documents exceptions given to allocations in certain
3221 * contexts that are allowed to allocate outside current's set 3221 * contexts that are allowed to allocate outside current's set
3222 * of allowed nodes. 3222 * of allowed nodes.
3223 */ 3223 */
3224 if (!(gfp_mask & __GFP_NOMEMALLOC)) 3224 if (!(gfp_mask & __GFP_NOMEMALLOC))
3225 if (tsk_is_oom_victim(current) || 3225 if (tsk_is_oom_victim(current) ||
3226 (current->flags & (PF_MEMALLOC | PF_EXITING))) 3226 (current->flags & (PF_MEMALLOC | PF_EXITING)))
3227 filter &= ~SHOW_MEM_FILTER_NODES; 3227 filter &= ~SHOW_MEM_FILTER_NODES;
3228 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) 3228 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
3229 filter &= ~SHOW_MEM_FILTER_NODES; 3229 filter &= ~SHOW_MEM_FILTER_NODES;
3230 3230
3231 show_mem(filter, nodemask); 3231 show_mem(filter, nodemask);
3232 } 3232 }
3233 3233
3234 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) 3234 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
3235 { 3235 {
3236 struct va_format vaf; 3236 struct va_format vaf;
3237 va_list args; 3237 va_list args;
3238 static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, 3238 static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
3239 DEFAULT_RATELIMIT_BURST); 3239 DEFAULT_RATELIMIT_BURST);
3240 3240
3241 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) 3241 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
3242 return; 3242 return;
3243 3243
3244 pr_warn("%s: ", current->comm); 3244 pr_warn("%s: ", current->comm);
3245 3245
3246 va_start(args, fmt); 3246 va_start(args, fmt);
3247 vaf.fmt = fmt; 3247 vaf.fmt = fmt;
3248 vaf.va = &args; 3248 vaf.va = &args;
3249 pr_cont("%pV", &vaf); 3249 pr_cont("%pV", &vaf);
3250 va_end(args); 3250 va_end(args);
3251 3251
3252 pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); 3252 pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
3253 if (nodemask) 3253 if (nodemask)
3254 pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); 3254 pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
3255 else 3255 else
3256 pr_cont("(null)\n"); 3256 pr_cont("(null)\n");
3257 3257
3258 cpuset_print_current_mems_allowed(); 3258 cpuset_print_current_mems_allowed();
3259 3259
3260 dump_stack(); 3260 dump_stack();
3261 warn_alloc_show_mem(gfp_mask, nodemask); 3261 warn_alloc_show_mem(gfp_mask, nodemask);
3262 } 3262 }
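Callers hand warn_alloc() the gfp mask, the nodemask and a printf-style reason that ends up in the %pV above; the allocator slow path, for instance, reports a failure roughly along these lines (an illustrative call-site sketch, not an addition to this file):

	warn_alloc(gfp_mask, ac->nodemask,
			"page allocation failure: order:%u", order);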
3263 3263
3264 static inline struct page * 3264 static inline struct page *
3265 __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, 3265 __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
3266 unsigned int alloc_flags, 3266 unsigned int alloc_flags,
3267 const struct alloc_context *ac) 3267 const struct alloc_context *ac)
3268 { 3268 {
3269 struct page *page; 3269 struct page *page;
3270 3270
3271 page = get_page_from_freelist(gfp_mask, order, 3271 page = get_page_from_freelist(gfp_mask, order,
3272 alloc_flags|ALLOC_CPUSET, ac); 3272 alloc_flags|ALLOC_CPUSET, ac);
3273 /* 3273 /*
3274 * fallback to ignore cpuset restriction if our nodes 3274 * fallback to ignore cpuset restriction if our nodes
3275 * are depleted 3275 * are depleted
3276 */ 3276 */
3277 if (!page) 3277 if (!page)
3278 page = get_page_from_freelist(gfp_mask, order, 3278 page = get_page_from_freelist(gfp_mask, order,
3279 alloc_flags, ac); 3279 alloc_flags, ac);
3280 3280
3281 return page; 3281 return page;
3282 } 3282 }
3283 3283
3284 static inline struct page * 3284 static inline struct page *
3285 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 3285 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
3286 const struct alloc_context *ac, unsigned long *did_some_progress) 3286 const struct alloc_context *ac, unsigned long *did_some_progress)
3287 { 3287 {
3288 struct oom_control oc = { 3288 struct oom_control oc = {
3289 .zonelist = ac->zonelist, 3289 .zonelist = ac->zonelist,
3290 .nodemask = ac->nodemask, 3290 .nodemask = ac->nodemask,
3291 .memcg = NULL, 3291 .memcg = NULL,
3292 .gfp_mask = gfp_mask, 3292 .gfp_mask = gfp_mask,
3293 .order = order, 3293 .order = order,
3294 }; 3294 };
3295 struct page *page; 3295 struct page *page;
3296 3296
3297 *did_some_progress = 0; 3297 *did_some_progress = 0;
3298 3298
3299 /* 3299 /*
3300 * Acquire the oom lock. If that fails, somebody else is 3300 * Acquire the oom lock. If that fails, somebody else is
3301 * making progress for us. 3301 * making progress for us.
3302 */ 3302 */
3303 if (!mutex_trylock(&oom_lock)) { 3303 if (!mutex_trylock(&oom_lock)) {
3304 *did_some_progress = 1; 3304 *did_some_progress = 1;
3305 schedule_timeout_uninterruptible(1); 3305 schedule_timeout_uninterruptible(1);
3306 return NULL; 3306 return NULL;
3307 } 3307 }
3308 3308
3309 /* 3309 /*
3310 * Go through the zonelist yet one more time, keeping a very high watermark 3310 * Go through the zonelist yet one more time, keeping a very high watermark
3311 * here; this is only to catch a parallel oom killing, and we must fail if 3311 * here; this is only to catch a parallel oom killing, and we must fail if
3312 * we're still under heavy pressure. But make sure that this reclaim 3312 * we're still under heavy pressure. But make sure that this reclaim
3313 * attempt does not depend on a __GFP_DIRECT_RECLAIM && !__GFP_NORETRY 3313 * attempt does not depend on a __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
3314 * allocation, which would never fail because oom_lock is already held. 3314 * allocation, which would never fail because oom_lock is already held.
3315 */ 3315 */
3316 page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) & 3316 page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
3317 ~__GFP_DIRECT_RECLAIM, order, 3317 ~__GFP_DIRECT_RECLAIM, order,
3318 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); 3318 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
3319 if (page) 3319 if (page)
3320 goto out; 3320 goto out;
3321 3321
3322 /* Coredumps can quickly deplete all memory reserves */ 3322 /* Coredumps can quickly deplete all memory reserves */
3323 if (current->flags & PF_DUMPCORE) 3323 if (current->flags & PF_DUMPCORE)
3324 goto out; 3324 goto out;
3325 /* The OOM killer will not help higher order allocs */ 3325 /* The OOM killer will not help higher order allocs */
3326 if (order > PAGE_ALLOC_COSTLY_ORDER) 3326 if (order > PAGE_ALLOC_COSTLY_ORDER)
3327 goto out; 3327 goto out;
3328 /* 3328 /*
3329 * We have already exhausted all our reclaim opportunities without any 3329 * We have already exhausted all our reclaim opportunities without any
3330 * success so it is time to admit defeat. We will skip the OOM killer 3330 * success so it is time to admit defeat. We will skip the OOM killer
3331 * because it is very likely that the caller has a more reasonable 3331 * because it is very likely that the caller has a more reasonable
3332 * fallback than shooting a random task. 3332 * fallback than shooting a random task.
3333 */ 3333 */
3334 if (gfp_mask & __GFP_RETRY_MAYFAIL) 3334 if (gfp_mask & __GFP_RETRY_MAYFAIL)
3335 goto out; 3335 goto out;
3336 /* The OOM killer does not needlessly kill tasks for lowmem */ 3336 /* The OOM killer does not needlessly kill tasks for lowmem */
3337 if (ac->high_zoneidx < ZONE_NORMAL) 3337 if (ac->high_zoneidx < ZONE_NORMAL)
3338 goto out; 3338 goto out;
3339 if (pm_suspended_storage()) 3339 if (pm_suspended_storage())
3340 goto out; 3340 goto out;
3341 /* 3341 /*
3342 * XXX: GFP_NOFS allocations should rather fail than rely on 3342 * XXX: GFP_NOFS allocations should rather fail than rely on
3343 * other requests to make forward progress. 3343 * other requests to make forward progress.
3344 * We are in an unfortunate situation where out_of_memory cannot 3344 * We are in an unfortunate situation where out_of_memory cannot
3345 * do much for this context but let's try it to at least get 3345 * do much for this context but let's try it to at least get
3346 * access to memory reserves if the current task is killed (see 3346 * access to memory reserves if the current task is killed (see
3347 * out_of_memory). Once filesystems are ready to handle allocation 3347 * out_of_memory). Once filesystems are ready to handle allocation
3348 * failures more gracefully we should just bail out here. 3348 * failures more gracefully we should just bail out here.
3349 */ 3349 */
3350 3350
3351 /* The OOM killer may not free memory on a specific node */ 3351 /* The OOM killer may not free memory on a specific node */
3352 if (gfp_mask & __GFP_THISNODE) 3352 if (gfp_mask & __GFP_THISNODE)
3353 goto out; 3353 goto out;
3354 3354
3355 /* Exhausted what can be done so it's blamo time */ 3355 /* Exhausted what can be done so it's blamo time */
3356 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { 3356 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
3357 *did_some_progress = 1; 3357 *did_some_progress = 1;
3358 3358
3359 /* 3359 /*
3360 * Help non-failing allocations by giving them access to memory 3360 * Help non-failing allocations by giving them access to memory
3361 * reserves 3361 * reserves
3362 */ 3362 */
3363 if (gfp_mask & __GFP_NOFAIL) 3363 if (gfp_mask & __GFP_NOFAIL)
3364 page = __alloc_pages_cpuset_fallback(gfp_mask, order, 3364 page = __alloc_pages_cpuset_fallback(gfp_mask, order,
3365 ALLOC_NO_WATERMARKS, ac); 3365 ALLOC_NO_WATERMARKS, ac);
3366 } 3366 }
3367 out: 3367 out:
3368 mutex_unlock(&oom_lock); 3368 mutex_unlock(&oom_lock);
3369 return page; 3369 return page;
3370 } 3370 }
3371 3371
3372 /* 3372 /*
3373 * Maximum number of compaction retries with progress before the OOM 3373 * Maximum number of compaction retries with progress before the OOM
3374 * killer is considered the only way to move forward. 3374 * killer is considered the only way to move forward.
3375 */ 3375 */
3376 #define MAX_COMPACT_RETRIES 16 3376 #define MAX_COMPACT_RETRIES 16
3377 3377
3378 #ifdef CONFIG_COMPACTION 3378 #ifdef CONFIG_COMPACTION
3379 /* Try memory compaction for high-order allocations before reclaim */ 3379 /* Try memory compaction for high-order allocations before reclaim */
3380 static struct page * 3380 static struct page *
3381 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 3381 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3382 unsigned int alloc_flags, const struct alloc_context *ac, 3382 unsigned int alloc_flags, const struct alloc_context *ac,
3383 enum compact_priority prio, enum compact_result *compact_result) 3383 enum compact_priority prio, enum compact_result *compact_result)
3384 { 3384 {
3385 struct page *page; 3385 struct page *page;
3386 unsigned int noreclaim_flag; 3386 unsigned int noreclaim_flag;
3387 3387
3388 if (!order) 3388 if (!order)
3389 return NULL; 3389 return NULL;
3390 3390
3391 noreclaim_flag = memalloc_noreclaim_save(); 3391 noreclaim_flag = memalloc_noreclaim_save();
3392 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, 3392 *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3393 prio); 3393 prio);
3394 memalloc_noreclaim_restore(noreclaim_flag); 3394 memalloc_noreclaim_restore(noreclaim_flag);
3395 3395
3396 if (*compact_result <= COMPACT_INACTIVE) 3396 if (*compact_result <= COMPACT_INACTIVE)
3397 return NULL; 3397 return NULL;
3398 3398
3399 /* 3399 /*
3400 * In at least one zone, compaction wasn't deferred or skipped, so let's 3400 * In at least one zone, compaction wasn't deferred or skipped, so let's
3401 * count a compaction stall 3401 * count a compaction stall
3402 */ 3402 */
3403 count_vm_event(COMPACTSTALL); 3403 count_vm_event(COMPACTSTALL);
3404 3404
3405 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3405 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3406 3406
3407 if (page) { 3407 if (page) {
3408 struct zone *zone = page_zone(page); 3408 struct zone *zone = page_zone(page);
3409 3409
3410 zone->compact_blockskip_flush = false; 3410 zone->compact_blockskip_flush = false;
3411 compaction_defer_reset(zone, order, true); 3411 compaction_defer_reset(zone, order, true);
3412 count_vm_event(COMPACTSUCCESS); 3412 count_vm_event(COMPACTSUCCESS);
3413 return page; 3413 return page;
3414 } 3414 }
3415 3415
3416 /* 3416 /*
3417 * It's bad if a compaction run occurs and fails. The most likely reason 3417 * It's bad if a compaction run occurs and fails. The most likely reason
3418 * is that pages exist, but not enough to satisfy watermarks. 3418 * is that pages exist, but not enough to satisfy watermarks.
3419 */ 3419 */
3420 count_vm_event(COMPACTFAIL); 3420 count_vm_event(COMPACTFAIL);
3421 3421
3422 cond_resched(); 3422 cond_resched();
3423 3423
3424 return NULL; 3424 return NULL;
3425 } 3425 }
3426 3426
3427 static inline bool 3427 static inline bool
3428 should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, 3428 should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3429 enum compact_result compact_result, 3429 enum compact_result compact_result,
3430 enum compact_priority *compact_priority, 3430 enum compact_priority *compact_priority,
3431 int *compaction_retries) 3431 int *compaction_retries)
3432 { 3432 {
3433 int max_retries = MAX_COMPACT_RETRIES; 3433 int max_retries = MAX_COMPACT_RETRIES;
3434 int min_priority; 3434 int min_priority;
3435 bool ret = false; 3435 bool ret = false;
3436 int retries = *compaction_retries; 3436 int retries = *compaction_retries;
3437 enum compact_priority priority = *compact_priority; 3437 enum compact_priority priority = *compact_priority;
3438 3438
3439 if (!order) 3439 if (!order)
3440 return false; 3440 return false;
3441 3441
3442 if (compaction_made_progress(compact_result)) 3442 if (compaction_made_progress(compact_result))
3443 (*compaction_retries)++; 3443 (*compaction_retries)++;
3444 3444
3445 /* 3445 /*
3446 * compaction considers all the zones as desperately out of memory 3446 * compaction considers all the zones as desperately out of memory
3447 * so it doesn't really make much sense to retry except when the 3447 * so it doesn't really make much sense to retry except when the
3448 * failure could be caused by insufficient priority 3448 * failure could be caused by insufficient priority
3449 */ 3449 */
3450 if (compaction_failed(compact_result)) 3450 if (compaction_failed(compact_result))
3451 goto check_priority; 3451 goto check_priority;
3452 3452
3453 /* 3453 /*
3454 * make sure the compaction wasn't deferred or didn't bail out early 3454 * make sure the compaction wasn't deferred or didn't bail out early
3455 * due to lock contention before we declare that we should give up. 3455 * due to lock contention before we declare that we should give up.
3456 * But do not retry if the given zonelist is not suitable for 3456 * But do not retry if the given zonelist is not suitable for
3457 * compaction. 3457 * compaction.
3458 */ 3458 */
3459 if (compaction_withdrawn(compact_result)) { 3459 if (compaction_withdrawn(compact_result)) {
3460 ret = compaction_zonelist_suitable(ac, order, alloc_flags); 3460 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
3461 goto out; 3461 goto out;
3462 } 3462 }
3463 3463
3464 /* 3464 /*
3465 * !costly requests are much more important than __GFP_RETRY_MAYFAIL 3465 * !costly requests are much more important than __GFP_RETRY_MAYFAIL
3466 * costly ones because they are de facto nofail and invoke the OOM 3466 * costly ones because they are de facto nofail and invoke the OOM
3467 * killer to move on, while costly ones can fail and their users are 3467 * killer to move on, while costly ones can fail and their users are
3468 * ready to cope with that. 1/4 retries is rather arbitrary but we 3468 * ready to cope with that. 1/4 retries is rather arbitrary but we
3469 * would need much more detailed feedback from compaction to 3469 * would need much more detailed feedback from compaction to
3470 * make a better decision. 3470 * make a better decision.
3471 */ 3471 */
3472 if (order > PAGE_ALLOC_COSTLY_ORDER) 3472 if (order > PAGE_ALLOC_COSTLY_ORDER)
3473 max_retries /= 4; 3473 max_retries /= 4;
3474 if (*compaction_retries <= max_retries) { 3474 if (*compaction_retries <= max_retries) {
3475 ret = true; 3475 ret = true;
3476 goto out; 3476 goto out;
3477 } 3477 }
3478 3478
3479 /* 3479 /*
3480 * Make sure there are attempts at the highest priority if we exhausted 3480 * Make sure there are attempts at the highest priority if we exhausted
3481 * all retries or failed at the lower priorities. 3481 * all retries or failed at the lower priorities.
3482 */ 3482 */
3483 check_priority: 3483 check_priority:
3484 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? 3484 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
3485 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; 3485 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
3486 3486
3487 if (*compact_priority > min_priority) { 3487 if (*compact_priority > min_priority) {
3488 (*compact_priority)--; 3488 (*compact_priority)--;
3489 *compaction_retries = 0; 3489 *compaction_retries = 0;
3490 ret = true; 3490 ret = true;
3491 } 3491 }
3492 out: 3492 out:
3493 trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); 3493 trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
3494 return ret; 3494 return ret;
3495 } 3495 }
3496 #else 3496 #else
3497 static inline struct page * 3497 static inline struct page *
3498 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 3498 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3499 unsigned int alloc_flags, const struct alloc_context *ac, 3499 unsigned int alloc_flags, const struct alloc_context *ac,
3500 enum compact_priority prio, enum compact_result *compact_result) 3500 enum compact_priority prio, enum compact_result *compact_result)
3501 { 3501 {
3502 *compact_result = COMPACT_SKIPPED; 3502 *compact_result = COMPACT_SKIPPED;
3503 return NULL; 3503 return NULL;
3504 } 3504 }
3505 3505
3506 static inline bool 3506 static inline bool
3507 should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, 3507 should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
3508 enum compact_result compact_result, 3508 enum compact_result compact_result,
3509 enum compact_priority *compact_priority, 3509 enum compact_priority *compact_priority,
3510 int *compaction_retries) 3510 int *compaction_retries)
3511 { 3511 {
3512 struct zone *zone; 3512 struct zone *zone;
3513 struct zoneref *z; 3513 struct zoneref *z;
3514 3514
3515 if (!order || order > PAGE_ALLOC_COSTLY_ORDER) 3515 if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
3516 return false; 3516 return false;
3517 3517
3518 /* 3518 /*
3519 * There are setups with compaction disabled which would prefer to loop 3519 * There are setups with compaction disabled which would prefer to loop
3520 * inside the allocator rather than hit the oom killer prematurely. 3520 * inside the allocator rather than hit the oom killer prematurely.
3521 * Let's give them some hope and keep retrying while the order-0 3521 * Let's give them some hope and keep retrying while the order-0
3522 * watermarks are OK. 3522 * watermarks are OK.
3523 */ 3523 */
3524 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, 3524 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3525 ac->nodemask) { 3525 ac->nodemask) {
3526 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), 3526 if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
3527 ac_classzone_idx(ac), alloc_flags)) 3527 ac_classzone_idx(ac), alloc_flags))
3528 return true; 3528 return true;
3529 } 3529 }
3530 return false; 3530 return false;
3531 } 3531 }
3532 #endif /* CONFIG_COMPACTION */ 3532 #endif /* CONFIG_COMPACTION */
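A quick worked example of the retry budget used by should_compact_retry() above, assuming PAGE_ALLOC_COSTLY_ORDER is 3 as in mainline: a non-costly request gets the full MAX_COMPACT_RETRIES = 16 attempts at a given compaction priority, while a costly request (order > 3) gets 16 / 4 = 4 attempts before the priority is raised and the retry counter reset.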
3533 3533
3534 #ifdef CONFIG_LOCKDEP 3534 #ifdef CONFIG_LOCKDEP
3535 struct lockdep_map __fs_reclaim_map = 3535 struct lockdep_map __fs_reclaim_map =
3536 STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); 3536 STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
3537 3537
3538 static bool __need_fs_reclaim(gfp_t gfp_mask) 3538 static bool __need_fs_reclaim(gfp_t gfp_mask)
3539 { 3539 {
3540 gfp_mask = current_gfp_context(gfp_mask); 3540 gfp_mask = current_gfp_context(gfp_mask);
3541 3541
3542 /* no reclaim without waiting on it */ 3542 /* no reclaim without waiting on it */
3543 if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) 3543 if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
3544 return false; 3544 return false;
3545 3545
3546 /* this guy won't enter reclaim */ 3546 /* this guy won't enter reclaim */
3547 if (current->flags & PF_MEMALLOC) 3547 if (current->flags & PF_MEMALLOC)
3548 return false; 3548 return false;
3549 3549
3550 /* We're only interested in __GFP_FS allocations for now */ 3550 /* We're only interested in __GFP_FS allocations for now */
3551 if (!(gfp_mask & __GFP_FS)) 3551 if (!(gfp_mask & __GFP_FS))
3552 return false; 3552 return false;
3553 3553
3554 if (gfp_mask & __GFP_NOLOCKDEP) 3554 if (gfp_mask & __GFP_NOLOCKDEP)
3555 return false; 3555 return false;
3556 3556
3557 return true; 3557 return true;
3558 } 3558 }
3559 3559
3560 void fs_reclaim_acquire(gfp_t gfp_mask) 3560 void fs_reclaim_acquire(gfp_t gfp_mask)
3561 { 3561 {
3562 if (__need_fs_reclaim(gfp_mask)) 3562 if (__need_fs_reclaim(gfp_mask))
3563 lock_map_acquire(&__fs_reclaim_map); 3563 lock_map_acquire(&__fs_reclaim_map);
3564 } 3564 }
3565 EXPORT_SYMBOL_GPL(fs_reclaim_acquire); 3565 EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
3566 3566
3567 void fs_reclaim_release(gfp_t gfp_mask) 3567 void fs_reclaim_release(gfp_t gfp_mask)
3568 { 3568 {
3569 if (__need_fs_reclaim(gfp_mask)) 3569 if (__need_fs_reclaim(gfp_mask))
3570 lock_map_release(&__fs_reclaim_map); 3570 lock_map_release(&__fs_reclaim_map);
3571 } 3571 }
3572 EXPORT_SYMBOL_GPL(fs_reclaim_release); 3572 EXPORT_SYMBOL_GPL(fs_reclaim_release);
3573 #endif 3573 #endif
3574 3574
3575 /* Perform direct synchronous page reclaim */ 3575 /* Perform direct synchronous page reclaim */
3576 static int 3576 static int
3577 __perform_reclaim(gfp_t gfp_mask, unsigned int order, 3577 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
3578 const struct alloc_context *ac) 3578 const struct alloc_context *ac)
3579 { 3579 {
3580 struct reclaim_state reclaim_state; 3580 struct reclaim_state reclaim_state;
3581 int progress; 3581 int progress;
3582 unsigned int noreclaim_flag; 3582 unsigned int noreclaim_flag;
3583 3583
3584 cond_resched(); 3584 cond_resched();
3585 3585
3586 /* We now go into synchronous reclaim */ 3586 /* We now go into synchronous reclaim */
3587 cpuset_memory_pressure_bump(); 3587 cpuset_memory_pressure_bump();
3588 noreclaim_flag = memalloc_noreclaim_save(); 3588 noreclaim_flag = memalloc_noreclaim_save();
3589 fs_reclaim_acquire(gfp_mask); 3589 fs_reclaim_acquire(gfp_mask);
3590 reclaim_state.reclaimed_slab = 0; 3590 reclaim_state.reclaimed_slab = 0;
3591 current->reclaim_state = &reclaim_state; 3591 current->reclaim_state = &reclaim_state;
3592 3592
3593 progress = try_to_free_pages(ac->zonelist, order, gfp_mask, 3593 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
3594 ac->nodemask); 3594 ac->nodemask);
3595 3595
3596 current->reclaim_state = NULL; 3596 current->reclaim_state = NULL;
3597 fs_reclaim_release(gfp_mask); 3597 fs_reclaim_release(gfp_mask);
3598 memalloc_noreclaim_restore(noreclaim_flag); 3598 memalloc_noreclaim_restore(noreclaim_flag);
3599 3599
3600 cond_resched(); 3600 cond_resched();
3601 3601
3602 return progress; 3602 return progress;
3603 } 3603 }
3604 3604
3605 /* The really slow allocator path where we enter direct reclaim */ 3605 /* The really slow allocator path where we enter direct reclaim */
3606 static inline struct page * 3606 static inline struct page *
3607 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 3607 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
3608 unsigned int alloc_flags, const struct alloc_context *ac, 3608 unsigned int alloc_flags, const struct alloc_context *ac,
3609 unsigned long *did_some_progress) 3609 unsigned long *did_some_progress)
3610 { 3610 {
3611 struct page *page = NULL; 3611 struct page *page = NULL;
3612 bool drained = false; 3612 bool drained = false;
3613 3613
3614 *did_some_progress = __perform_reclaim(gfp_mask, order, ac); 3614 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
3615 if (unlikely(!(*did_some_progress))) 3615 if (unlikely(!(*did_some_progress)))
3616 return NULL; 3616 return NULL;
3617 3617
3618 retry: 3618 retry:
3619 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3619 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3620 3620
3621 /* 3621 /*
3622 * If an allocation failed after direct reclaim, it could be because 3622 * If an allocation failed after direct reclaim, it could be because
3623 * pages are pinned on the per-cpu lists or in high alloc reserves. 3623 * pages are pinned on the per-cpu lists or in high alloc reserves.
3624 * Shrink them and try again 3624 * Shrink them and try again
3625 */ 3625 */
3626 if (!page && !drained) { 3626 if (!page && !drained) {
3627 unreserve_highatomic_pageblock(ac, false); 3627 unreserve_highatomic_pageblock(ac, false);
3628 drain_all_pages(NULL); 3628 drain_all_pages(NULL);
3629 drained = true; 3629 drained = true;
3630 goto retry; 3630 goto retry;
3631 } 3631 }
3632 3632
3633 return page; 3633 return page;
3634 } 3634 }
3635 3635
3636 static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) 3636 static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
3637 { 3637 {
3638 struct zoneref *z; 3638 struct zoneref *z;
3639 struct zone *zone; 3639 struct zone *zone;
3640 pg_data_t *last_pgdat = NULL; 3640 pg_data_t *last_pgdat = NULL;
3641 3641
3642 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, 3642 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
3643 ac->high_zoneidx, ac->nodemask) { 3643 ac->high_zoneidx, ac->nodemask) {
3644 if (last_pgdat != zone->zone_pgdat) 3644 if (last_pgdat != zone->zone_pgdat)
3645 wakeup_kswapd(zone, order, ac->high_zoneidx); 3645 wakeup_kswapd(zone, order, ac->high_zoneidx);
3646 last_pgdat = zone->zone_pgdat; 3646 last_pgdat = zone->zone_pgdat;
3647 } 3647 }
3648 } 3648 }
3649 3649
3650 static inline unsigned int 3650 static inline unsigned int
3651 gfp_to_alloc_flags(gfp_t gfp_mask) 3651 gfp_to_alloc_flags(gfp_t gfp_mask)
3652 { 3652 {
3653 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 3653 unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
3654 3654
3655 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 3655 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
3656 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 3656 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
3657 3657
3658 /* 3658 /*
3659 * The caller may dip into page reserves a bit more if the caller 3659 * The caller may dip into page reserves a bit more if the caller
3660 * cannot run direct reclaim, or if the caller has a realtime scheduling 3660 * cannot run direct reclaim, or if the caller has a realtime scheduling
3661 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 3661 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
3662 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). 3662 * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
3663 */ 3663 */
3664 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 3664 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
3665 3665
3666 if (gfp_mask & __GFP_ATOMIC) { 3666 if (gfp_mask & __GFP_ATOMIC) {
3667 /* 3667 /*
3668 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 3668 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
3669 * if it can't schedule. 3669 * if it can't schedule.
3670 */ 3670 */
3671 if (!(gfp_mask & __GFP_NOMEMALLOC)) 3671 if (!(gfp_mask & __GFP_NOMEMALLOC))
3672 alloc_flags |= ALLOC_HARDER; 3672 alloc_flags |= ALLOC_HARDER;
3673 /* 3673 /*
3674 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 3674 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
3675 * comment for __cpuset_node_allowed(). 3675 * comment for __cpuset_node_allowed().
3676 */ 3676 */
3677 alloc_flags &= ~ALLOC_CPUSET; 3677 alloc_flags &= ~ALLOC_CPUSET;
3678 } else if (unlikely(rt_task(current)) && !in_interrupt()) 3678 } else if (unlikely(rt_task(current)) && !in_interrupt())
3679 alloc_flags |= ALLOC_HARDER; 3679 alloc_flags |= ALLOC_HARDER;
3680 3680
3681 #ifdef CONFIG_CMA 3681 #ifdef CONFIG_CMA
3682 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 3682 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
3683 alloc_flags |= ALLOC_CMA; 3683 alloc_flags |= ALLOC_CMA;
3684 #endif 3684 #endif
3685 return alloc_flags; 3685 return alloc_flags;
3686 } 3686 }
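As a concrete trace of the mapping above (a sketch, assuming GFP_ATOMIC expands to __GFP_HIGH | __GFP_ATOMIC | __GFP_KSWAPD_RECLAIM and GFP_KERNEL to __GFP_RECLAIM | __GFP_IO | __GFP_FS, as in this kernel generation):

	gfp_to_alloc_flags(GFP_ATOMIC)  -> ALLOC_WMARK_MIN | ALLOC_HIGH | ALLOC_HARDER   (ALLOC_CPUSET cleared)
	gfp_to_alloc_flags(GFP_KERNEL)  -> ALLOC_WMARK_MIN | ALLOC_CPUSET

So an atomic request may dip below the normal min watermark and ignores cpuset restrictions, while a regular GFP_KERNEL request is held to the min watermark and its cpuset; on CONFIG_CMA kernels, requests whose migratetype is MIGRATE_MOVABLE (e.g. GFP_HIGHUSER_MOVABLE) additionally pick up ALLOC_CMA.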
3687 3687
3688 static bool oom_reserves_allowed(struct task_struct *tsk) 3688 static bool oom_reserves_allowed(struct task_struct *tsk)
3689 { 3689 {
3690 if (!tsk_is_oom_victim(tsk)) 3690 if (!tsk_is_oom_victim(tsk))
3691 return false; 3691 return false;
3692 3692
3693 /* 3693 /*
3694 * !MMU does not have an oom reaper, so give access to memory reserves 3694 * !MMU does not have an oom reaper, so give access to memory reserves
3695 * only to the thread with TIF_MEMDIE set 3695 * only to the thread with TIF_MEMDIE set
3696 */ 3696 */
3697 if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE)) 3697 if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
3698 return false; 3698 return false;
3699 3699
3700 return true; 3700 return true;
3701 } 3701 }
3702 3702
3703 /* 3703 /*
3704 * Distinguish requests which really need access to full memory 3704 * Distinguish requests which really need access to full memory
3705 * reserves from oom victims which can live with a portion of it 3705 * reserves from oom victims which can live with a portion of it
3706 */ 3706 */
3707 static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask) 3707 static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
3708 { 3708 {
3709 if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) 3709 if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
3710 return 0; 3710 return 0;
3711 if (gfp_mask & __GFP_MEMALLOC) 3711 if (gfp_mask & __GFP_MEMALLOC)
3712 return ALLOC_NO_WATERMARKS; 3712 return ALLOC_NO_WATERMARKS;
3713 if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 3713 if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
3714 return ALLOC_NO_WATERMARKS; 3714 return ALLOC_NO_WATERMARKS;
3715 if (!in_interrupt()) { 3715 if (!in_interrupt()) {
3716 if (current->flags & PF_MEMALLOC) 3716 if (current->flags & PF_MEMALLOC)
3717 return ALLOC_NO_WATERMARKS; 3717 return ALLOC_NO_WATERMARKS;
3718 else if (oom_reserves_allowed(current)) 3718 else if (oom_reserves_allowed(current))
3719 return ALLOC_OOM; 3719 return ALLOC_OOM;
3720 } 3720 }
3721 3721
3722 return 0; 3722 return 0;
3723 } 3723 }
3724 3724
3725 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 3725 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
3726 { 3726 {
3727 return !!__gfp_pfmemalloc_flags(gfp_mask); 3727 return !!__gfp_pfmemalloc_flags(gfp_mask);
3728 } 3728 }
3729 3729
3730 /* 3730 /*
3731 * Checks whether it makes sense to retry the reclaim to make a forward progress 3731 * Checks whether it makes sense to retry the reclaim to make a forward progress
3732 * for the given allocation request. 3732 * for the given allocation request.
3733 * 3733 *
3734 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row 3734 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
3735 * without success, or when we couldn't even meet the watermark if we 3735 * without success, or when we couldn't even meet the watermark if we
3736 * reclaimed all remaining pages on the LRU lists. 3736 * reclaimed all remaining pages on the LRU lists.
3737 * 3737 *
3738 * Returns true if a retry is viable or false to enter the oom path. 3738 * Returns true if a retry is viable or false to enter the oom path.
3739 */ 3739 */
3740 static inline bool 3740 static inline bool
3741 should_reclaim_retry(gfp_t gfp_mask, unsigned order, 3741 should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3742 struct alloc_context *ac, int alloc_flags, 3742 struct alloc_context *ac, int alloc_flags,
3743 bool did_some_progress, int *no_progress_loops) 3743 bool did_some_progress, int *no_progress_loops)
3744 { 3744 {
3745 struct zone *zone; 3745 struct zone *zone;
3746 struct zoneref *z; 3746 struct zoneref *z;
3747 3747
3748 /* 3748 /*
3749 * Costly allocations might have made some progress, but that doesn't 3749 * Costly allocations might have made some progress, but that doesn't
3750 * mean their order will become available due to high fragmentation, so 3750 * mean their order will become available due to high fragmentation, so
3751 * always increment the no-progress counter for them 3751 * always increment the no-progress counter for them
3752 */ 3752 */
3753 if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) 3753 if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
3754 *no_progress_loops = 0; 3754 *no_progress_loops = 0;
3755 else 3755 else
3756 (*no_progress_loops)++; 3756 (*no_progress_loops)++;
3757 3757
3758 /* 3758 /*
3759 * Make sure we converge to OOM if we cannot make any progress 3759 * Make sure we converge to OOM if we cannot make any progress
3760 * several times in a row. 3760 * several times in a row.
3761 */ 3761 */
3762 if (*no_progress_loops > MAX_RECLAIM_RETRIES) { 3762 if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
3763 /* Before OOM, exhaust highatomic_reserve */ 3763 /* Before OOM, exhaust highatomic_reserve */
3764 return unreserve_highatomic_pageblock(ac, true); 3764 return unreserve_highatomic_pageblock(ac, true);
3765 } 3765 }
3766 3766
3767 /* 3767 /*
3768 * Keep reclaiming pages while there is a chance this will lead 3768 * Keep reclaiming pages while there is a chance this will lead
3769 * somewhere. If none of the target zones can satisfy our allocation 3769 * somewhere. If none of the target zones can satisfy our allocation
3770 * request even if all reclaimable pages are considered then we are 3770 * request even if all reclaimable pages are considered then we are
3771 * screwed and have to go OOM. 3771 * screwed and have to go OOM.
3772 */ 3772 */
3773 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, 3773 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3774 ac->nodemask) { 3774 ac->nodemask) {
3775 unsigned long available; 3775 unsigned long available;
3776 unsigned long reclaimable; 3776 unsigned long reclaimable;
3777 unsigned long min_wmark = min_wmark_pages(zone); 3777 unsigned long min_wmark = min_wmark_pages(zone);
3778 bool wmark; 3778 bool wmark;
3779 3779
3780 available = reclaimable = zone_reclaimable_pages(zone); 3780 available = reclaimable = zone_reclaimable_pages(zone);
3781 available += zone_page_state_snapshot(zone, NR_FREE_PAGES); 3781 available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
3782 3782
3783 /* 3783 /*
3784 * Would the allocation succeed if we reclaimed all 3784 * Would the allocation succeed if we reclaimed all
3785 * reclaimable pages? 3785 * reclaimable pages?
3786 */ 3786 */
3787 wmark = __zone_watermark_ok(zone, order, min_wmark, 3787 wmark = __zone_watermark_ok(zone, order, min_wmark,
3788 ac_classzone_idx(ac), alloc_flags, available); 3788 ac_classzone_idx(ac), alloc_flags, available);
3789 trace_reclaim_retry_zone(z, order, reclaimable, 3789 trace_reclaim_retry_zone(z, order, reclaimable,
3790 available, min_wmark, *no_progress_loops, wmark); 3790 available, min_wmark, *no_progress_loops, wmark);
3791 if (wmark) { 3791 if (wmark) {
3792 /* 3792 /*
3793 * If we didn't make any progress and have a lot of 3793 * If we didn't make any progress and have a lot of
3794 * dirty + writeback pages then we should wait for 3794 * dirty + writeback pages then we should wait for
3795 * an IO to complete to slow down the reclaim and 3795 * an IO to complete to slow down the reclaim and
3796 * prevent premature OOM 3796 * prevent premature OOM
3797 */ 3797 */
3798 if (!did_some_progress) { 3798 if (!did_some_progress) {
3799 unsigned long write_pending; 3799 unsigned long write_pending;
3800 3800
3801 write_pending = zone_page_state_snapshot(zone, 3801 write_pending = zone_page_state_snapshot(zone,
3802 NR_ZONE_WRITE_PENDING); 3802 NR_ZONE_WRITE_PENDING);
3803 3803
3804 if (2 * write_pending > reclaimable) { 3804 if (2 * write_pending > reclaimable) {
3805 congestion_wait(BLK_RW_ASYNC, HZ/10); 3805 congestion_wait(BLK_RW_ASYNC, HZ/10);
3806 return true; 3806 return true;
3807 } 3807 }
3808 } 3808 }
3809 3809
3810 /* 3810 /*
3811 * Memory allocation/reclaim might be called from a WQ 3811 * Memory allocation/reclaim might be called from a WQ
3812 * context and the current implementation of the WQ 3812 * context and the current implementation of the WQ
3813 * concurrency control doesn't recognize that 3813 * concurrency control doesn't recognize that
3814 * a particular WQ is congested if the worker thread is 3814 * a particular WQ is congested if the worker thread is
3815 * looping without ever sleeping. Therefore we have to 3815 * looping without ever sleeping. Therefore we have to
3816 * do a short sleep here rather than calling 3816 * do a short sleep here rather than calling
3817 * cond_resched(). 3817 * cond_resched().
3818 */ 3818 */
3819 if (current->flags & PF_WQ_WORKER) 3819 if (current->flags & PF_WQ_WORKER)
3820 schedule_timeout_uninterruptible(1); 3820 schedule_timeout_uninterruptible(1);
3821 else 3821 else
3822 cond_resched(); 3822 cond_resched();
3823 3823
3824 return true; 3824 return true;
3825 } 3825 }
3826 } 3826 }
3827 3827
3828 return false; 3828 return false;
3829 } 3829 }
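
/*
 * Worked example for the retry check above (made-up numbers): with
 * reclaimable = 20000 pages, NR_FREE_PAGES = 1000 and min_wmark = 500,
 * __zone_watermark_ok() is asked whether the request could succeed with
 * available = 21000 free pages. If it could, but no progress was made and
 * more than half of the reclaimable pages are dirty or under writeback,
 * the caller throttles in congestion_wait() before retrying.
 */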
3830 3830
3831 static inline bool 3831 static inline bool
3832 check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) 3832 check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
3833 { 3833 {
3834 /* 3834 /*
3835 * It's possible that cpuset's mems_allowed and the nodemask from 3835 * It's possible that cpuset's mems_allowed and the nodemask from
3836 * mempolicy don't intersect. This should be normally dealt with by 3836 * mempolicy don't intersect. This should be normally dealt with by
3837 * policy_nodemask(), but it's possible to race with cpuset update in 3837 * policy_nodemask(), but it's possible to race with cpuset update in
3838 * such a way the check therein was true, and then it became false 3838 * such a way the check therein was true, and then it became false
3839 * before we got our cpuset_mems_cookie here. 3839 * before we got our cpuset_mems_cookie here.
3840 * This assumes that for all allocations, ac->nodemask can come only 3840 * This assumes that for all allocations, ac->nodemask can come only
3841 * from MPOL_BIND mempolicy (whose documented semantics are to be ignored 3841 * from MPOL_BIND mempolicy (whose documented semantics are to be ignored
3842 * when it does not intersect with the cpuset restrictions) or the 3842 * when it does not intersect with the cpuset restrictions) or the
3843 * caller can deal with a violated nodemask. 3843 * caller can deal with a violated nodemask.
3844 */ 3844 */
3845 if (cpusets_enabled() && ac->nodemask && 3845 if (cpusets_enabled() && ac->nodemask &&
3846 !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) { 3846 !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
3847 ac->nodemask = NULL; 3847 ac->nodemask = NULL;
3848 return true; 3848 return true;
3849 } 3849 }
3850 3850
3851 /* 3851 /*
3852 * When updating a task's mems_allowed or mempolicy nodemask, it is 3852 * When updating a task's mems_allowed or mempolicy nodemask, it is
3853 * possible to race with parallel threads in such a way that our 3853 * possible to race with parallel threads in such a way that our
3854 * allocation can fail while the mask is being updated. If we are about 3854 * allocation can fail while the mask is being updated. If we are about
3855 * to fail, check if the cpuset changed during allocation and if so, 3855 * to fail, check if the cpuset changed during allocation and if so,
3856 * retry. 3856 * retry.
3857 */ 3857 */
3858 if (read_mems_allowed_retry(cpuset_mems_cookie)) 3858 if (read_mems_allowed_retry(cpuset_mems_cookie))
3859 return true; 3859 return true;
3860 3860
3861 return false; 3861 return false;
3862 } 3862 }
3863 3863
3864 static inline struct page * 3864 static inline struct page *
3865 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 3865 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
3866 struct alloc_context *ac) 3866 struct alloc_context *ac)
3867 { 3867 {
3868 bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; 3868 bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
3869 const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; 3869 const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
3870 struct page *page = NULL; 3870 struct page *page = NULL;
3871 unsigned int alloc_flags; 3871 unsigned int alloc_flags;
3872 unsigned long did_some_progress; 3872 unsigned long did_some_progress;
3873 enum compact_priority compact_priority; 3873 enum compact_priority compact_priority;
3874 enum compact_result compact_result; 3874 enum compact_result compact_result;
3875 int compaction_retries; 3875 int compaction_retries;
3876 int no_progress_loops; 3876 int no_progress_loops;
3877 unsigned int cpuset_mems_cookie; 3877 unsigned int cpuset_mems_cookie;
3878 int reserve_flags; 3878 int reserve_flags;
3879 3879
3880 /* 3880 /*
3881 * We also sanity check to catch abuse of atomic reserves being used by 3881 * We also sanity check to catch abuse of atomic reserves being used by
3882 * callers that are not in atomic context. 3882 * callers that are not in atomic context.
3883 */ 3883 */
3884 if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == 3884 if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
3885 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) 3885 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
3886 gfp_mask &= ~__GFP_ATOMIC; 3886 gfp_mask &= ~__GFP_ATOMIC;
3887 3887
3888 retry_cpuset: 3888 retry_cpuset:
3889 compaction_retries = 0; 3889 compaction_retries = 0;
3890 no_progress_loops = 0; 3890 no_progress_loops = 0;
3891 compact_priority = DEF_COMPACT_PRIORITY; 3891 compact_priority = DEF_COMPACT_PRIORITY;
3892 cpuset_mems_cookie = read_mems_allowed_begin(); 3892 cpuset_mems_cookie = read_mems_allowed_begin();
3893 3893
3894 /* 3894 /*
3895 * The fast path uses conservative alloc_flags to succeed only until 3895 * The fast path uses conservative alloc_flags to succeed only until
3896 * kswapd needs to be woken up, and to avoid the cost of setting up 3896 * kswapd needs to be woken up, and to avoid the cost of setting up
3897 * alloc_flags precisely. So we do that now. 3897 * alloc_flags precisely. So we do that now.
3898 */ 3898 */
3899 alloc_flags = gfp_to_alloc_flags(gfp_mask); 3899 alloc_flags = gfp_to_alloc_flags(gfp_mask);
3900 3900
3901 /* 3901 /*
3902 * We need to recalculate the starting point for the zonelist iterator 3902 * We need to recalculate the starting point for the zonelist iterator
3903 * because we might have used different nodemask in the fast path, or 3903 * because we might have used different nodemask in the fast path, or
3904 * there was a cpuset modification and we are retrying - otherwise we 3904 * there was a cpuset modification and we are retrying - otherwise we
3905 * could end up iterating over non-eligible zones endlessly. 3905 * could end up iterating over non-eligible zones endlessly.
3906 */ 3906 */
3907 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 3907 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
3908 ac->high_zoneidx, ac->nodemask); 3908 ac->high_zoneidx, ac->nodemask);
3909 if (!ac->preferred_zoneref->zone) 3909 if (!ac->preferred_zoneref->zone)
3910 goto nopage; 3910 goto nopage;
3911 3911
3912 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 3912 if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3913 wake_all_kswapds(order, ac); 3913 wake_all_kswapds(order, ac);
3914 3914
3915 /* 3915 /*
3916 * The adjusted alloc_flags might result in immediate success, so try 3916 * The adjusted alloc_flags might result in immediate success, so try
3917 * that first 3917 * that first
3918 */ 3918 */
3919 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3919 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3920 if (page) 3920 if (page)
3921 goto got_pg; 3921 goto got_pg;
3922 3922
3923 /* 3923 /*
3924 * For costly allocations, try direct compaction first, as it's likely 3924 * For costly allocations, try direct compaction first, as it's likely
3925 * that we have enough base pages and don't need to reclaim. For non- 3925 * that we have enough base pages and don't need to reclaim. For non-
3926 * movable high-order allocations, do that as well, as compaction will 3926 * movable high-order allocations, do that as well, as compaction will
3927 * try to prevent permanent fragmentation by migrating from blocks of the 3927 * try to prevent permanent fragmentation by migrating from blocks of the
3928 * same migratetype. 3928 * same migratetype.
3929 * Don't try this for allocations that are allowed to ignore 3929 * Don't try this for allocations that are allowed to ignore
3930 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. 3930 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
3931 */ 3931 */
3932 if (can_direct_reclaim && 3932 if (can_direct_reclaim &&
3933 (costly_order || 3933 (costly_order ||
3934 (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) 3934 (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
3935 && !gfp_pfmemalloc_allowed(gfp_mask)) { 3935 && !gfp_pfmemalloc_allowed(gfp_mask)) {
3936 page = __alloc_pages_direct_compact(gfp_mask, order, 3936 page = __alloc_pages_direct_compact(gfp_mask, order,
3937 alloc_flags, ac, 3937 alloc_flags, ac,
3938 INIT_COMPACT_PRIORITY, 3938 INIT_COMPACT_PRIORITY,
3939 &compact_result); 3939 &compact_result);
3940 if (page) 3940 if (page)
3941 goto got_pg; 3941 goto got_pg;
3942 3942
3943 /* 3943 /*
3944 * Checks for costly allocations with __GFP_NORETRY, which 3944 * Checks for costly allocations with __GFP_NORETRY, which
3945 * includes THP page fault allocations 3945 * includes THP page fault allocations
3946 */ 3946 */
3947 if (costly_order && (gfp_mask & __GFP_NORETRY)) { 3947 if (costly_order && (gfp_mask & __GFP_NORETRY)) {
3948 /* 3948 /*
3949 * If compaction is deferred for high-order allocations, 3949 * If compaction is deferred for high-order allocations,
3950 * it is because sync compaction recently failed. If 3950 * it is because sync compaction recently failed. If
3951 * this is the case and the caller requested a THP 3951 * this is the case and the caller requested a THP
3952 * allocation, we do not want to heavily disrupt the 3952 * allocation, we do not want to heavily disrupt the
3953 * system, so we fail the allocation instead of entering 3953 * system, so we fail the allocation instead of entering
3954 * direct reclaim. 3954 * direct reclaim.
3955 */ 3955 */
3956 if (compact_result == COMPACT_DEFERRED) 3956 if (compact_result == COMPACT_DEFERRED)
3957 goto nopage; 3957 goto nopage;
3958 3958
3959 /* 3959 /*
3960 * Looks like reclaim/compaction is worth trying, but 3960 * Looks like reclaim/compaction is worth trying, but
3961 * sync compaction could be very expensive, so keep 3961 * sync compaction could be very expensive, so keep
3962 * using async compaction. 3962 * using async compaction.
3963 */ 3963 */
3964 compact_priority = INIT_COMPACT_PRIORITY; 3964 compact_priority = INIT_COMPACT_PRIORITY;
3965 } 3965 }
3966 } 3966 }
3967 3967
3968 retry: 3968 retry:
3969 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ 3969 /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
3970 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 3970 if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3971 wake_all_kswapds(order, ac); 3971 wake_all_kswapds(order, ac);
3972 3972
3973 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); 3973 reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
3974 if (reserve_flags) 3974 if (reserve_flags)
3975 alloc_flags = reserve_flags; 3975 alloc_flags = reserve_flags;
3976 3976
3977 /* 3977 /*
3978 * Reset the zonelist iterators if memory policies can be ignored. 3978 * Reset the zonelist iterators if memory policies can be ignored.
3979 * These allocations are high priority and system rather than user 3979 * These allocations are high priority and system rather than user
3980 * oriented. 3980 * oriented.
3981 */ 3981 */
3982 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { 3982 if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
3983 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 3983 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
3984 ac->high_zoneidx, ac->nodemask); 3984 ac->high_zoneidx, ac->nodemask);
3985 } 3985 }
3986 3986
3987 /* Attempt with potentially adjusted zonelist and alloc_flags */ 3987 /* Attempt with potentially adjusted zonelist and alloc_flags */
3988 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); 3988 page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3989 if (page) 3989 if (page)
3990 goto got_pg; 3990 goto got_pg;
3991 3991
3992 /* Caller is not willing to reclaim, we can't balance anything */ 3992 /* Caller is not willing to reclaim, we can't balance anything */
3993 if (!can_direct_reclaim) 3993 if (!can_direct_reclaim)
3994 goto nopage; 3994 goto nopage;
3995 3995
3996 /* Avoid recursion of direct reclaim */ 3996 /* Avoid recursion of direct reclaim */
3997 if (current->flags & PF_MEMALLOC) 3997 if (current->flags & PF_MEMALLOC)
3998 goto nopage; 3998 goto nopage;
3999 3999
4000 /* Try direct reclaim and then allocating */ 4000 /* Try direct reclaim and then allocating */
4001 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, 4001 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
4002 &did_some_progress); 4002 &did_some_progress);
4003 if (page) 4003 if (page)
4004 goto got_pg; 4004 goto got_pg;
4005 4005
4006 /* Try direct compaction and then allocating */ 4006 /* Try direct compaction and then allocating */
4007 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, 4007 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
4008 compact_priority, &compact_result); 4008 compact_priority, &compact_result);
4009 if (page) 4009 if (page)
4010 goto got_pg; 4010 goto got_pg;
4011 4011
4012 /* Do not loop if specifically requested */ 4012 /* Do not loop if specifically requested */
4013 if (gfp_mask & __GFP_NORETRY) 4013 if (gfp_mask & __GFP_NORETRY)
4014 goto nopage; 4014 goto nopage;
4015 4015
4016 /* 4016 /*
4017 * Do not retry costly high order allocations unless they are 4017 * Do not retry costly high order allocations unless they are
4018 * __GFP_RETRY_MAYFAIL 4018 * __GFP_RETRY_MAYFAIL
4019 */ 4019 */
4020 if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) 4020 if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
4021 goto nopage; 4021 goto nopage;
4022 4022
4023 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, 4023 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
4024 did_some_progress > 0, &no_progress_loops)) 4024 did_some_progress > 0, &no_progress_loops))
4025 goto retry; 4025 goto retry;
4026 4026
4027 /* 4027 /*
4028 * It doesn't make any sense to retry for the compaction if the order-0 4028 * It doesn't make any sense to retry for the compaction if the order-0
4029 * reclaim is not able to make any progress because the current 4029 * reclaim is not able to make any progress because the current
4030 * implementation of the compaction depends on the sufficient amount 4030 * implementation of the compaction depends on the sufficient amount
4031 * of free memory (see __compaction_suitable) 4031 * of free memory (see __compaction_suitable)
4032 */ 4032 */
4033 if (did_some_progress > 0 && 4033 if (did_some_progress > 0 &&
4034 should_compact_retry(ac, order, alloc_flags, 4034 should_compact_retry(ac, order, alloc_flags,
4035 compact_result, &compact_priority, 4035 compact_result, &compact_priority,
4036 &compaction_retries)) 4036 &compaction_retries))
4037 goto retry; 4037 goto retry;
4038 4038
4039 4039
4040 /* Deal with possible cpuset update races before we start OOM killing */ 4040 /* Deal with possible cpuset update races before we start OOM killing */
4041 if (check_retry_cpuset(cpuset_mems_cookie, ac)) 4041 if (check_retry_cpuset(cpuset_mems_cookie, ac))
4042 goto retry_cpuset; 4042 goto retry_cpuset;
4043 4043
4044 /* Reclaim has failed us, start killing things */ 4044 /* Reclaim has failed us, start killing things */
4045 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); 4045 page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
4046 if (page) 4046 if (page)
4047 goto got_pg; 4047 goto got_pg;
4048 4048
4049 /* Avoid allocations with no watermarks from looping endlessly */ 4049 /* Avoid allocations with no watermarks from looping endlessly */
4050 if (tsk_is_oom_victim(current) && 4050 if (tsk_is_oom_victim(current) &&
4051 (alloc_flags == ALLOC_OOM || 4051 (alloc_flags == ALLOC_OOM ||
4052 (gfp_mask & __GFP_NOMEMALLOC))) 4052 (gfp_mask & __GFP_NOMEMALLOC)))
4053 goto nopage; 4053 goto nopage;
4054 4054
4055 /* Retry as long as the OOM killer is making progress */ 4055 /* Retry as long as the OOM killer is making progress */
4056 if (did_some_progress) { 4056 if (did_some_progress) {
4057 no_progress_loops = 0; 4057 no_progress_loops = 0;
4058 goto retry; 4058 goto retry;
4059 } 4059 }
4060 4060
4061 nopage: 4061 nopage:
4062 /* Deal with possible cpuset update races before we fail */ 4062 /* Deal with possible cpuset update races before we fail */
4063 if (check_retry_cpuset(cpuset_mems_cookie, ac)) 4063 if (check_retry_cpuset(cpuset_mems_cookie, ac))
4064 goto retry_cpuset; 4064 goto retry_cpuset;
4065 4065
4066 /* 4066 /*
4067 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure 4067 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
4068 * we always retry 4068 * we always retry
4069 */ 4069 */
4070 if (gfp_mask & __GFP_NOFAIL) { 4070 if (gfp_mask & __GFP_NOFAIL) {
4071 /* 4071 /*
4072 * All existing users of the __GFP_NOFAIL are blockable, so warn 4072 * All existing users of the __GFP_NOFAIL are blockable, so warn
4073 * of any new users that actually require GFP_NOWAIT 4073 * of any new users that actually require GFP_NOWAIT
4074 */ 4074 */
4075 if (WARN_ON_ONCE(!can_direct_reclaim)) 4075 if (WARN_ON_ONCE(!can_direct_reclaim))
4076 goto fail; 4076 goto fail;
4077 4077
4078 /* 4078 /*
4079 * PF_MEMALLOC request from this context is rather bizarre 4079 * PF_MEMALLOC request from this context is rather bizarre
4080 * because we cannot reclaim anything and can only loop waiting 4080 * because we cannot reclaim anything and can only loop waiting
4081 * for somebody to do the work for us 4081 * for somebody to do the work for us
4082 */ 4082 */
4083 WARN_ON_ONCE(current->flags & PF_MEMALLOC); 4083 WARN_ON_ONCE(current->flags & PF_MEMALLOC);
4084 4084
4085 /* 4085 /*
4086 * Non-failing costly orders are a hard requirement which we 4086 * Non-failing costly orders are a hard requirement which we
4087 * are not really prepared for, so let's warn about these users 4087 * are not really prepared for, so let's warn about these users
4088 * so that we can identify them and convert them to something 4088 * so that we can identify them and convert them to something
4089 * else. 4089 * else.
4090 */ 4090 */
4091 WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); 4091 WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
4092 4092
4093 /* 4093 /*
4094 * Help non-failing allocations by giving them access to memory 4094 * Help non-failing allocations by giving them access to memory
4095 * reserves but do not use ALLOC_NO_WATERMARKS because this 4095 * reserves but do not use ALLOC_NO_WATERMARKS because this
4096 * could deplete whole memory reserves which would just make 4096 * could deplete whole memory reserves which would just make
4097 * the situation worse 4097 * the situation worse
4098 */ 4098 */
4099 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); 4099 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
4100 if (page) 4100 if (page)
4101 goto got_pg; 4101 goto got_pg;
4102 4102
4103 cond_resched(); 4103 cond_resched();
4104 goto retry; 4104 goto retry;
4105 } 4105 }
4106 fail: 4106 fail:
4107 warn_alloc(gfp_mask, ac->nodemask, 4107 warn_alloc(gfp_mask, ac->nodemask,
4108 "page allocation failure: order:%u", order); 4108 "page allocation failure: order:%u", order);
4109 got_pg: 4109 got_pg:
4110 return page; 4110 return page;
4111 } 4111 }
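
/*
 * Rough shape of the slow path above, for reference: adjust alloc_flags
 * via gfp_to_alloc_flags() and wake kswapd, retry the freelists,
 * optionally attempt direct compaction up front for costly or
 * non-movable high-order requests, then loop over direct reclaim and
 * direct compaction for as long as should_reclaim_retry() and
 * should_compact_retry() allow, and finally fall back to the OOM killer.
 * __GFP_NOFAIL requests keep retrying with ALLOC_HARDER access instead
 * of failing.
 */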
4112 4112
4113 static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, 4113 static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
4114 int preferred_nid, nodemask_t *nodemask, 4114 int preferred_nid, nodemask_t *nodemask,
4115 struct alloc_context *ac, gfp_t *alloc_mask, 4115 struct alloc_context *ac, gfp_t *alloc_mask,
4116 unsigned int *alloc_flags) 4116 unsigned int *alloc_flags)
4117 { 4117 {
4118 ac->high_zoneidx = gfp_zone(gfp_mask); 4118 ac->high_zoneidx = gfp_zone(gfp_mask);
4119 ac->zonelist = node_zonelist(preferred_nid, gfp_mask); 4119 ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
4120 ac->nodemask = nodemask; 4120 ac->nodemask = nodemask;
4121 ac->migratetype = gfpflags_to_migratetype(gfp_mask); 4121 ac->migratetype = gfpflags_to_migratetype(gfp_mask);
4122 4122
4123 if (cpusets_enabled()) { 4123 if (cpusets_enabled()) {
4124 *alloc_mask |= __GFP_HARDWALL; 4124 *alloc_mask |= __GFP_HARDWALL;
4125 if (!ac->nodemask) 4125 if (!ac->nodemask)
4126 ac->nodemask = &cpuset_current_mems_allowed; 4126 ac->nodemask = &cpuset_current_mems_allowed;
4127 else 4127 else
4128 *alloc_flags |= ALLOC_CPUSET; 4128 *alloc_flags |= ALLOC_CPUSET;
4129 } 4129 }
4130 4130
4131 fs_reclaim_acquire(gfp_mask); 4131 fs_reclaim_acquire(gfp_mask);
4132 fs_reclaim_release(gfp_mask); 4132 fs_reclaim_release(gfp_mask);
4133 4133
4134 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); 4134 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
4135 4135
4136 if (should_fail_alloc_page(gfp_mask, order)) 4136 if (should_fail_alloc_page(gfp_mask, order))
4137 return false; 4137 return false;
4138 4138
4139 if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) 4139 if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
4140 *alloc_flags |= ALLOC_CMA; 4140 *alloc_flags |= ALLOC_CMA;
4141 4141
4142 return true; 4142 return true;
4143 } 4143 }
4144 4144
4145 /* Determine whether to spread dirty pages and what the first usable zone is */ 4145 /* Determine whether to spread dirty pages and what the first usable zone is */
4146 static inline void finalise_ac(gfp_t gfp_mask, 4146 static inline void finalise_ac(gfp_t gfp_mask,
4147 unsigned int order, struct alloc_context *ac) 4147 unsigned int order, struct alloc_context *ac)
4148 { 4148 {
4149 /* Dirty zone balancing only done in the fast path */ 4149 /* Dirty zone balancing only done in the fast path */
4150 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); 4150 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
4151 4151
4152 /* 4152 /*
4153 * The preferred zone is used for statistics but crucially it is 4153 * The preferred zone is used for statistics but crucially it is
4154 * also used as the starting point for the zonelist iterator. It 4154 * also used as the starting point for the zonelist iterator. It
4155 * may get reset for allocations that ignore memory policies. 4155 * may get reset for allocations that ignore memory policies.
4156 */ 4156 */
4157 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, 4157 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4158 ac->high_zoneidx, ac->nodemask); 4158 ac->high_zoneidx, ac->nodemask);
4159 } 4159 }
4160 4160
4161 /* 4161 /*
4162 * This is the 'heart' of the zoned buddy allocator. 4162 * This is the 'heart' of the zoned buddy allocator.
4163 */ 4163 */
4164 struct page * 4164 struct page *
4165 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, 4165 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
4166 nodemask_t *nodemask) 4166 nodemask_t *nodemask)
4167 { 4167 {
4168 struct page *page; 4168 struct page *page;
4169 unsigned int alloc_flags = ALLOC_WMARK_LOW; 4169 unsigned int alloc_flags = ALLOC_WMARK_LOW;
4170 gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ 4170 gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
4171 struct alloc_context ac = { }; 4171 struct alloc_context ac = { };
4172 4172
4173 /* 4173 /*
4174 * There are several places where we assume that the order value is sane 4174 * There are several places where we assume that the order value is sane
4175 * so bail out early if the request is out of bound. 4175 * so bail out early if the request is out of bound.
4176 */ 4176 */
4177 if (unlikely(order >= MAX_ORDER)) { 4177 if (unlikely(order >= MAX_ORDER)) {
4178 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 4178 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
4179 return NULL; 4179 return NULL;
4180 } 4180 }
4181 4181
4182 gfp_mask &= gfp_allowed_mask; 4182 gfp_mask &= gfp_allowed_mask;
4183 alloc_mask = gfp_mask; 4183 alloc_mask = gfp_mask;
4184 if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) 4184 if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
4185 return NULL; 4185 return NULL;
4186 4186
4187 finalise_ac(gfp_mask, order, &ac); 4187 finalise_ac(gfp_mask, order, &ac);
4188 4188
4189 /* First allocation attempt */ 4189 /* First allocation attempt */
4190 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); 4190 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
4191 if (likely(page)) 4191 if (likely(page))
4192 goto out; 4192 goto out;
4193 4193
4194 /* 4194 /*
4195 * Apply scoped allocation constraints. This is mainly about GFP_NOFS 4195 * Apply scoped allocation constraints. This is mainly about GFP_NOFS
4196 * and GFP_NOIO, which have to be inherited for all allocation requests 4196 * and GFP_NOIO, which have to be inherited for all allocation requests
4197 * from a particular context which has been marked by 4197 * from a particular context which has been marked by
4198 * memalloc_no{fs,io}_{save,restore}. 4198 * memalloc_no{fs,io}_{save,restore}.
4199 */ 4199 */
4200 alloc_mask = current_gfp_context(gfp_mask); 4200 alloc_mask = current_gfp_context(gfp_mask);
4201 ac.spread_dirty_pages = false; 4201 ac.spread_dirty_pages = false;
4202 4202
4203 /* 4203 /*
4204 * Restore the original nodemask if it was potentially replaced with 4204 * Restore the original nodemask if it was potentially replaced with
4205 * &cpuset_current_mems_allowed to optimize the fast-path attempt. 4205 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
4206 */ 4206 */
4207 if (unlikely(ac.nodemask != nodemask)) 4207 if (unlikely(ac.nodemask != nodemask))
4208 ac.nodemask = nodemask; 4208 ac.nodemask = nodemask;
4209 4209
4210 page = __alloc_pages_slowpath(alloc_mask, order, &ac); 4210 page = __alloc_pages_slowpath(alloc_mask, order, &ac);
4211 4211
4212 out: 4212 out:
4213 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && 4213 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
4214 unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { 4214 unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
4215 __free_pages(page, order); 4215 __free_pages(page, order);
4216 page = NULL; 4216 page = NULL;
4217 } 4217 }
4218 4218
4219 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); 4219 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
4220 4220
4221 return page; 4221 return page;
4222 } 4222 }
4223 EXPORT_SYMBOL(__alloc_pages_nodemask); 4223 EXPORT_SYMBOL(__alloc_pages_nodemask);
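
/*
 * Illustrative usage sketch (not part of this file): a typical caller
 * reaches __alloc_pages_nodemask() through the alloc_pages() wrapper.
 * example_get_buffer() is a hypothetical helper, shown only to
 * demonstrate the calling convention and the matching free.
 */
static struct page *example_get_buffer(void)
{
	/* order-1 request: two physically contiguous pages */
	struct page *page = alloc_pages(GFP_KERNEL, 1);

	if (!page)
		return NULL;

	/*
	 * The caller maps it with page_address(page) and later releases
	 * it with __free_pages(page, 1).
	 */
	return page;
}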
4224 4224
4225 /* 4225 /*
4226 * Common helper functions. 4226 * Common helper functions.
4227 */ 4227 */
4228 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 4228 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
4229 { 4229 {
4230 struct page *page; 4230 struct page *page;
4231 4231
4232 /* 4232 /*
4233 * __get_free_pages() returns a 32-bit address, which cannot represent 4233 * __get_free_pages() returns a 32-bit address, which cannot represent
4234 * a highmem page 4234 * a highmem page
4235 */ 4235 */
4236 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 4236 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
4237 4237
4238 page = alloc_pages(gfp_mask, order); 4238 page = alloc_pages(gfp_mask, order);
4239 if (!page) 4239 if (!page)
4240 return 0; 4240 return 0;
4241 return (unsigned long) page_address(page); 4241 return (unsigned long) page_address(page);
4242 } 4242 }
4243 EXPORT_SYMBOL(__get_free_pages); 4243 EXPORT_SYMBOL(__get_free_pages);
4244 4244
4245 unsigned long get_zeroed_page(gfp_t gfp_mask) 4245 unsigned long get_zeroed_page(gfp_t gfp_mask)
4246 { 4246 {
4247 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 4247 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
4248 } 4248 }
4249 EXPORT_SYMBOL(get_zeroed_page); 4249 EXPORT_SYMBOL(get_zeroed_page);
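
/*
 * Illustrative usage sketch (not part of this file): __get_free_pages()
 * and get_zeroed_page() hand back a kernel virtual address rather than a
 * struct page. example_scratch() is hypothetical.
 */
static int example_scratch(void)
{
	unsigned long addr = get_zeroed_page(GFP_KERNEL);

	if (!addr)
		return -ENOMEM;

	/* ... use the zero-filled page at (void *)addr ... */

	free_page(addr);
	return 0;
}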
4250 4250
4251 void __free_pages(struct page *page, unsigned int order) 4251 void __free_pages(struct page *page, unsigned int order)
4252 { 4252 {
4253 if (put_page_testzero(page)) { 4253 if (put_page_testzero(page)) {
4254 if (order == 0) 4254 if (order == 0)
4255 free_hot_cold_page(page, false); 4255 free_hot_cold_page(page, false);
4256 else 4256 else
4257 __free_pages_ok(page, order); 4257 __free_pages_ok(page, order);
4258 } 4258 }
4259 } 4259 }
4260 4260
4261 EXPORT_SYMBOL(__free_pages); 4261 EXPORT_SYMBOL(__free_pages);
4262 4262
4263 void free_pages(unsigned long addr, unsigned int order) 4263 void free_pages(unsigned long addr, unsigned int order)
4264 { 4264 {
4265 if (addr != 0) { 4265 if (addr != 0) {
4266 VM_BUG_ON(!virt_addr_valid((void *)addr)); 4266 VM_BUG_ON(!virt_addr_valid((void *)addr));
4267 __free_pages(virt_to_page((void *)addr), order); 4267 __free_pages(virt_to_page((void *)addr), order);
4268 } 4268 }
4269 } 4269 }
4270 4270
4271 EXPORT_SYMBOL(free_pages); 4271 EXPORT_SYMBOL(free_pages);
4272 4272
4273 /* 4273 /*
4274 * Page Fragment: 4274 * Page Fragment:
4275 * An arbitrary-length arbitrary-offset area of memory which resides 4275 * An arbitrary-length arbitrary-offset area of memory which resides
4276 * within a 0 or higher order page. Multiple fragments within that page 4276 * within a 0 or higher order page. Multiple fragments within that page
4277 * are individually refcounted, in the page's reference counter. 4277 * are individually refcounted, in the page's reference counter.
4278 * 4278 *
4279 * The page_frag functions below provide a simple allocation framework for 4279 * The page_frag functions below provide a simple allocation framework for
4280 * page fragments. This is used by the network stack and network device 4280 * page fragments. This is used by the network stack and network device
4281 * drivers to provide a backing region of memory for use as either an 4281 * drivers to provide a backing region of memory for use as either an
4282 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. 4282 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
4283 */ 4283 */
4284 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, 4284 static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
4285 gfp_t gfp_mask) 4285 gfp_t gfp_mask)
4286 { 4286 {
4287 struct page *page = NULL; 4287 struct page *page = NULL;
4288 gfp_t gfp = gfp_mask; 4288 gfp_t gfp = gfp_mask;
4289 4289
4290 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 4290 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4291 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | 4291 gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
4292 __GFP_NOMEMALLOC; 4292 __GFP_NOMEMALLOC;
4293 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, 4293 page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
4294 PAGE_FRAG_CACHE_MAX_ORDER); 4294 PAGE_FRAG_CACHE_MAX_ORDER);
4295 nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; 4295 nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
4296 #endif 4296 #endif
4297 if (unlikely(!page)) 4297 if (unlikely(!page))
4298 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); 4298 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
4299 4299
4300 nc->va = page ? page_address(page) : NULL; 4300 nc->va = page ? page_address(page) : NULL;
4301 4301
4302 return page; 4302 return page;
4303 } 4303 }
4304 4304
4305 void __page_frag_cache_drain(struct page *page, unsigned int count) 4305 void __page_frag_cache_drain(struct page *page, unsigned int count)
4306 { 4306 {
4307 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); 4307 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
4308 4308
4309 if (page_ref_sub_and_test(page, count)) { 4309 if (page_ref_sub_and_test(page, count)) {
4310 unsigned int order = compound_order(page); 4310 unsigned int order = compound_order(page);
4311 4311
4312 if (order == 0) 4312 if (order == 0)
4313 free_hot_cold_page(page, false); 4313 free_hot_cold_page(page, false);
4314 else 4314 else
4315 __free_pages_ok(page, order); 4315 __free_pages_ok(page, order);
4316 } 4316 }
4317 } 4317 }
4318 EXPORT_SYMBOL(__page_frag_cache_drain); 4318 EXPORT_SYMBOL(__page_frag_cache_drain);
4319 4319
4320 void *page_frag_alloc(struct page_frag_cache *nc, 4320 void *page_frag_alloc(struct page_frag_cache *nc,
4321 unsigned int fragsz, gfp_t gfp_mask) 4321 unsigned int fragsz, gfp_t gfp_mask)
4322 { 4322 {
4323 unsigned int size = PAGE_SIZE; 4323 unsigned int size = PAGE_SIZE;
4324 struct page *page; 4324 struct page *page;
4325 int offset; 4325 int offset;
4326 4326
4327 if (unlikely(!nc->va)) { 4327 if (unlikely(!nc->va)) {
4328 refill: 4328 refill:
4329 page = __page_frag_cache_refill(nc, gfp_mask); 4329 page = __page_frag_cache_refill(nc, gfp_mask);
4330 if (!page) 4330 if (!page)
4331 return NULL; 4331 return NULL;
4332 4332
4333 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 4333 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4334 /* if size can vary use size else just use PAGE_SIZE */ 4334 /* if size can vary use size else just use PAGE_SIZE */
4335 size = nc->size; 4335 size = nc->size;
4336 #endif 4336 #endif
4337 /* Even if we own the page, we do not use atomic_set(). 4337 /* Even if we own the page, we do not use atomic_set().
4338 * This would break get_page_unless_zero() users. 4338 * This would break get_page_unless_zero() users.
4339 */ 4339 */
4340 page_ref_add(page, size - 1); 4340 page_ref_add(page, size - 1);
4341 4341
4342 /* reset page count bias and offset to start of new frag */ 4342 /* reset page count bias and offset to start of new frag */
4343 nc->pfmemalloc = page_is_pfmemalloc(page); 4343 nc->pfmemalloc = page_is_pfmemalloc(page);
4344 nc->pagecnt_bias = size; 4344 nc->pagecnt_bias = size;
4345 nc->offset = size; 4345 nc->offset = size;
4346 } 4346 }
4347 4347
4348 offset = nc->offset - fragsz; 4348 offset = nc->offset - fragsz;
4349 if (unlikely(offset < 0)) { 4349 if (unlikely(offset < 0)) {
4350 page = virt_to_page(nc->va); 4350 page = virt_to_page(nc->va);
4351 4351
4352 if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) 4352 if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
4353 goto refill; 4353 goto refill;
4354 4354
4355 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) 4355 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4356 /* if size can vary use size else just use PAGE_SIZE */ 4356 /* if size can vary use size else just use PAGE_SIZE */
4357 size = nc->size; 4357 size = nc->size;
4358 #endif 4358 #endif
4359 /* OK, page count is 0, we can safely set it */ 4359 /* OK, page count is 0, we can safely set it */
4360 set_page_count(page, size); 4360 set_page_count(page, size);
4361 4361
4362 /* reset page count bias and offset to start of new frag */ 4362 /* reset page count bias and offset to start of new frag */
4363 nc->pagecnt_bias = size; 4363 nc->pagecnt_bias = size;
4364 offset = size - fragsz; 4364 offset = size - fragsz;
4365 } 4365 }
4366 4366
4367 nc->pagecnt_bias--; 4367 nc->pagecnt_bias--;
4368 nc->offset = offset; 4368 nc->offset = offset;
4369 4369
4370 return nc->va + offset; 4370 return nc->va + offset;
4371 } 4371 }
4372 EXPORT_SYMBOL(page_frag_alloc); 4372 EXPORT_SYMBOL(page_frag_alloc);
4373 4373
4374 /* 4374 /*
4375 * Frees a page fragment allocated out of either a compound or order 0 page. 4375 * Frees a page fragment allocated out of either a compound or order 0 page.
4376 */ 4376 */
4377 void page_frag_free(void *addr) 4377 void page_frag_free(void *addr)
4378 { 4378 {
4379 struct page *page = virt_to_head_page(addr); 4379 struct page *page = virt_to_head_page(addr);
4380 4380
4381 if (unlikely(put_page_testzero(page))) 4381 if (unlikely(put_page_testzero(page)))
4382 __free_pages_ok(page, compound_order(page)); 4382 __free_pages_ok(page, compound_order(page));
4383 } 4383 }
4384 EXPORT_SYMBOL(page_frag_free); 4384 EXPORT_SYMBOL(page_frag_free);
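
/*
 * Illustrative usage sketch (not part of this file): carving small
 * buffers out of a page_frag_cache, much as network drivers do for
 * receive buffers. 'example_cache' and example_alloc_rx_buf() are
 * hypothetical.
 */
static struct page_frag_cache example_cache;

static void *example_alloc_rx_buf(unsigned int len)
{
	void *buf = page_frag_alloc(&example_cache, len, GFP_ATOMIC);

	/* each fragment is released individually with page_frag_free(buf) */
	return buf;
}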
4385 4385
4386 static void *make_alloc_exact(unsigned long addr, unsigned int order, 4386 static void *make_alloc_exact(unsigned long addr, unsigned int order,
4387 size_t size) 4387 size_t size)
4388 { 4388 {
4389 if (addr) { 4389 if (addr) {
4390 unsigned long alloc_end = addr + (PAGE_SIZE << order); 4390 unsigned long alloc_end = addr + (PAGE_SIZE << order);
4391 unsigned long used = addr + PAGE_ALIGN(size); 4391 unsigned long used = addr + PAGE_ALIGN(size);
4392 4392
4393 split_page(virt_to_page((void *)addr), order); 4393 split_page(virt_to_page((void *)addr), order);
4394 while (used < alloc_end) { 4394 while (used < alloc_end) {
4395 free_page(used); 4395 free_page(used);
4396 used += PAGE_SIZE; 4396 used += PAGE_SIZE;
4397 } 4397 }
4398 } 4398 }
4399 return (void *)addr; 4399 return (void *)addr;
4400 } 4400 }
4401 4401
4402 /** 4402 /**
4403 * alloc_pages_exact - allocate an exact number of physically-contiguous pages. 4403 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
4404 * @size: the number of bytes to allocate 4404 * @size: the number of bytes to allocate
4405 * @gfp_mask: GFP flags for the allocation 4405 * @gfp_mask: GFP flags for the allocation
4406 * 4406 *
4407 * This function is similar to alloc_pages(), except that it allocates the 4407 * This function is similar to alloc_pages(), except that it allocates the
4408 * minimum number of pages to satisfy the request. alloc_pages() can only 4408 * minimum number of pages to satisfy the request. alloc_pages() can only
4409 * allocate memory in power-of-two pages. 4409 * allocate memory in power-of-two pages.
4410 * 4410 *
4411 * This function is also limited by MAX_ORDER. 4411 * This function is also limited by MAX_ORDER.
4412 * 4412 *
4413 * Memory allocated by this function must be released by free_pages_exact(). 4413 * Memory allocated by this function must be released by free_pages_exact().
4414 */ 4414 */
4415 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 4415 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
4416 { 4416 {
4417 unsigned int order = get_order(size); 4417 unsigned int order = get_order(size);
4418 unsigned long addr; 4418 unsigned long addr;
4419 4419
4420 addr = __get_free_pages(gfp_mask, order); 4420 addr = __get_free_pages(gfp_mask, order);
4421 return make_alloc_exact(addr, order, size); 4421 return make_alloc_exact(addr, order, size);
4422 } 4422 }
4423 EXPORT_SYMBOL(alloc_pages_exact); 4423 EXPORT_SYMBOL(alloc_pages_exact);
4424 4424
4425 /** 4425 /**
4426 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 4426 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
4427 * pages on a node. 4427 * pages on a node.
4428 * @nid: the preferred node ID where memory should be allocated 4428 * @nid: the preferred node ID where memory should be allocated
4429 * @size: the number of bytes to allocate 4429 * @size: the number of bytes to allocate
4430 * @gfp_mask: GFP flags for the allocation 4430 * @gfp_mask: GFP flags for the allocation
4431 * 4431 *
4432 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 4432 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
4433 * back. 4433 * back.
4434 */ 4434 */
4435 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 4435 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
4436 { 4436 {
4437 unsigned int order = get_order(size); 4437 unsigned int order = get_order(size);
4438 struct page *p = alloc_pages_node(nid, gfp_mask, order); 4438 struct page *p = alloc_pages_node(nid, gfp_mask, order);
4439 if (!p) 4439 if (!p)
4440 return NULL; 4440 return NULL;
4441 return make_alloc_exact((unsigned long)page_address(p), order, size); 4441 return make_alloc_exact((unsigned long)page_address(p), order, size);
4442 } 4442 }
4443 4443
4444 /** 4444 /**
4445 * free_pages_exact - release memory allocated via alloc_pages_exact() 4445 * free_pages_exact - release memory allocated via alloc_pages_exact()
4446 * @virt: the value returned by alloc_pages_exact. 4446 * @virt: the value returned by alloc_pages_exact.
4447 * @size: size of allocation, same value as passed to alloc_pages_exact(). 4447 * @size: size of allocation, same value as passed to alloc_pages_exact().
4448 * 4448 *
4449 * Release the memory allocated by a previous call to alloc_pages_exact. 4449 * Release the memory allocated by a previous call to alloc_pages_exact.
4450 */ 4450 */
4451 void free_pages_exact(void *virt, size_t size) 4451 void free_pages_exact(void *virt, size_t size)
4452 { 4452 {
4453 unsigned long addr = (unsigned long)virt; 4453 unsigned long addr = (unsigned long)virt;
4454 unsigned long end = addr + PAGE_ALIGN(size); 4454 unsigned long end = addr + PAGE_ALIGN(size);
4455 4455
4456 while (addr < end) { 4456 while (addr < end) {
4457 free_page(addr); 4457 free_page(addr);
4458 addr += PAGE_SIZE; 4458 addr += PAGE_SIZE;
4459 } 4459 }
4460 } 4460 }
4461 EXPORT_SYMBOL(free_pages_exact); 4461 EXPORT_SYMBOL(free_pages_exact);
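
/*
 * Illustrative usage sketch (not part of this file): alloc_pages_exact()
 * keeps only as many pages as PAGE_ALIGN(size) requires, so a three-page
 * request carved out of an order-2 allocation gives the fourth page
 * straight back. example_exact_buffer() is hypothetical.
 */
static void *example_exact_buffer(void)
{
	void *buf = alloc_pages_exact(3 * PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);

	if (!buf)
		return NULL;

	/* released later with free_pages_exact(buf, 3 * PAGE_SIZE) */
	return buf;
}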
4462 4462
4463 /** 4463 /**
4464 * nr_free_zone_pages - count number of pages beyond high watermark 4464 * nr_free_zone_pages - count number of pages beyond high watermark
4465 * @offset: The zone index of the highest zone 4465 * @offset: The zone index of the highest zone
4466 * 4466 *
4467 * nr_free_zone_pages() counts the number of pages which are beyond the 4467 * nr_free_zone_pages() counts the number of pages which are beyond the
4468 * high watermark within all zones at or below a given zone index. For each 4468 * high watermark within all zones at or below a given zone index. For each
4469 * zone, the number of pages is calculated as: 4469 * zone, the number of pages is calculated as:
4470 * 4470 *
4471 * nr_free_zone_pages = managed_pages - high_pages 4471 * nr_free_zone_pages = managed_pages - high_pages
4472 */ 4472 */
4473 static unsigned long nr_free_zone_pages(int offset) 4473 static unsigned long nr_free_zone_pages(int offset)
4474 { 4474 {
4475 struct zoneref *z; 4475 struct zoneref *z;
4476 struct zone *zone; 4476 struct zone *zone;
4477 4477
4478 /* Just pick one node, since fallback list is circular */ 4478 /* Just pick one node, since fallback list is circular */
4479 unsigned long sum = 0; 4479 unsigned long sum = 0;
4480 4480
4481 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 4481 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
4482 4482
4483 for_each_zone_zonelist(zone, z, zonelist, offset) { 4483 for_each_zone_zonelist(zone, z, zonelist, offset) {
4484 unsigned long size = zone->managed_pages; 4484 unsigned long size = zone->managed_pages;
4485 unsigned long high = high_wmark_pages(zone); 4485 unsigned long high = high_wmark_pages(zone);
4486 if (size > high) 4486 if (size > high)
4487 sum += size - high; 4487 sum += size - high;
4488 } 4488 }
4489 4489
4490 return sum; 4490 return sum;
4491 } 4491 }
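
/*
 * Worked example for the sum above (made-up numbers): a zone with
 * managed_pages = 100000 and a high watermark of 2500 contributes
 * 100000 - 2500 = 97500 pages, while a zone whose managed_pages do not
 * exceed its high watermark contributes nothing.
 */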
4492 4492
4493 /** 4493 /**
4494 * nr_free_buffer_pages - count number of pages beyond high watermark 4494 * nr_free_buffer_pages - count number of pages beyond high watermark
4495 * 4495 *
4496 * nr_free_buffer_pages() counts the number of pages which are beyond the high 4496 * nr_free_buffer_pages() counts the number of pages which are beyond the high
4497 * watermark within ZONE_DMA and ZONE_NORMAL. 4497 * watermark within ZONE_DMA and ZONE_NORMAL.
4498 */ 4498 */
4499 unsigned long nr_free_buffer_pages(void) 4499 unsigned long nr_free_buffer_pages(void)
4500 { 4500 {
4501 return nr_free_zone_pages(gfp_zone(GFP_USER)); 4501 return nr_free_zone_pages(gfp_zone(GFP_USER));
4502 } 4502 }
4503 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 4503 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
4504 4504
4505 /** 4505 /**
4506 * nr_free_pagecache_pages - count number of pages beyond high watermark 4506 * nr_free_pagecache_pages - count number of pages beyond high watermark
4507 * 4507 *
4508 * nr_free_pagecache_pages() counts the number of pages which are beyond the 4508 * nr_free_pagecache_pages() counts the number of pages which are beyond the
4509 * high watermark within all zones. 4509 * high watermark within all zones.
4510 */ 4510 */
4511 unsigned long nr_free_pagecache_pages(void) 4511 unsigned long nr_free_pagecache_pages(void)
4512 { 4512 {
4513 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 4513 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
4514 } 4514 }
4515 4515
4516 static inline void show_node(struct zone *zone) 4516 static inline void show_node(struct zone *zone)
4517 { 4517 {
4518 if (IS_ENABLED(CONFIG_NUMA)) 4518 if (IS_ENABLED(CONFIG_NUMA))
4519 printk("Node %d ", zone_to_nid(zone)); 4519 printk("Node %d ", zone_to_nid(zone));
4520 } 4520 }
4521 4521
4522 long si_mem_available(void) 4522 long si_mem_available(void)
4523 { 4523 {
4524 long available; 4524 long available;
4525 unsigned long pagecache; 4525 unsigned long pagecache;
4526 unsigned long wmark_low = 0; 4526 unsigned long wmark_low = 0;
4527 unsigned long pages[NR_LRU_LISTS]; 4527 unsigned long pages[NR_LRU_LISTS];
4528 struct zone *zone; 4528 struct zone *zone;
4529 int lru; 4529 int lru;
4530 4530
4531 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) 4531 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
4532 pages[lru] = global_node_page_state(NR_LRU_BASE + lru); 4532 pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
4533 4533
4534 for_each_zone(zone) 4534 for_each_zone(zone)
4535 wmark_low += zone->watermark[WMARK_LOW]; 4535 wmark_low += zone->watermark[WMARK_LOW];
4536 4536
4537 /* 4537 /*
4538 * Estimate the amount of memory available for userspace allocations, 4538 * Estimate the amount of memory available for userspace allocations,
4539 * without causing swapping. 4539 * without causing swapping.
4540 */ 4540 */
4541 available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; 4541 available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
4542 4542
4543 /* 4543 /*
4544 * Not all the page cache can be freed, otherwise the system will 4544 * Not all the page cache can be freed, otherwise the system will
4545 * start swapping. Assume at least half of the page cache, or the 4545 * start swapping. Assume at least half of the page cache, or the
4546 * low watermark worth of cache, needs to stay. 4546 * low watermark worth of cache, needs to stay.
4547 */ 4547 */
4548 pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; 4548 pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
4549 pagecache -= min(pagecache / 2, wmark_low); 4549 pagecache -= min(pagecache / 2, wmark_low);
4550 available += pagecache; 4550 available += pagecache;
4551 4551
4552 /* 4552 /*
4553 * Part of the reclaimable slab consists of items that are in use, 4553 * Part of the reclaimable slab consists of items that are in use,
4554 * and cannot be freed. Cap this estimate at the low watermark. 4554 * and cannot be freed. Cap this estimate at the low watermark.
4555 */ 4555 */
4556 available += global_node_page_state(NR_SLAB_RECLAIMABLE) - 4556 available += global_node_page_state(NR_SLAB_RECLAIMABLE) -
4557 min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, 4557 min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
4558 wmark_low); 4558 wmark_low);
4559 4559
4560 /* 4560 /*
4561 * Part of the kernel memory, which can be released under memory 4561 * Part of the kernel memory, which can be released under memory
4562 * pressure. 4562 * pressure.
4563 */ 4563 */
4564 available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >> 4564 available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >>
4565 PAGE_SHIFT; 4565 PAGE_SHIFT;
4566 4566
4567 if (available < 0) 4567 if (available < 0)
4568 available = 0; 4568 available = 0;
4569 return available; 4569 return available;
4570 } 4570 }
4571 EXPORT_SYMBOL_GPL(si_mem_available); 4571 EXPORT_SYMBOL_GPL(si_mem_available);
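
/*
 * Illustrative sketch (not part of this file): si_mem_available() is the
 * estimate behind MemAvailable in /proc/meminfo; an in-kernel user could
 * report it in kilobytes like this. example_log_available() is
 * hypothetical.
 */
static void example_log_available(void)
{
	long pages = si_mem_available();

	pr_info("estimated %ld kB usable without swapping\n",
		pages << (PAGE_SHIFT - 10));
}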
4572 4572
4573 void si_meminfo(struct sysinfo *val) 4573 void si_meminfo(struct sysinfo *val)
4574 { 4574 {
4575 val->totalram = totalram_pages; 4575 val->totalram = totalram_pages;
4576 val->sharedram = global_node_page_state(NR_SHMEM); 4576 val->sharedram = global_node_page_state(NR_SHMEM);
4577 val->freeram = global_zone_page_state(NR_FREE_PAGES); 4577 val->freeram = global_zone_page_state(NR_FREE_PAGES);
4578 val->bufferram = nr_blockdev_pages(); 4578 val->bufferram = nr_blockdev_pages();
4579 val->totalhigh = totalhigh_pages; 4579 val->totalhigh = totalhigh_pages;
4580 val->freehigh = nr_free_highpages(); 4580 val->freehigh = nr_free_highpages();
4581 val->mem_unit = PAGE_SIZE; 4581 val->mem_unit = PAGE_SIZE;
4582 } 4582 }
4583 4583
4584 EXPORT_SYMBOL(si_meminfo); 4584 EXPORT_SYMBOL(si_meminfo);
4585 4585
4586 #ifdef CONFIG_NUMA 4586 #ifdef CONFIG_NUMA
4587 void si_meminfo_node(struct sysinfo *val, int nid) 4587 void si_meminfo_node(struct sysinfo *val, int nid)
4588 { 4588 {
4589 int zone_type; /* needs to be signed */ 4589 int zone_type; /* needs to be signed */
4590 unsigned long managed_pages = 0; 4590 unsigned long managed_pages = 0;
4591 unsigned long managed_highpages = 0; 4591 unsigned long managed_highpages = 0;
4592 unsigned long free_highpages = 0; 4592 unsigned long free_highpages = 0;
4593 pg_data_t *pgdat = NODE_DATA(nid); 4593 pg_data_t *pgdat = NODE_DATA(nid);
4594 4594
4595 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 4595 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
4596 managed_pages += pgdat->node_zones[zone_type].managed_pages; 4596 managed_pages += pgdat->node_zones[zone_type].managed_pages;
4597 val->totalram = managed_pages; 4597 val->totalram = managed_pages;
4598 val->sharedram = node_page_state(pgdat, NR_SHMEM); 4598 val->sharedram = node_page_state(pgdat, NR_SHMEM);
4599 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); 4599 val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
4600 #ifdef CONFIG_HIGHMEM 4600 #ifdef CONFIG_HIGHMEM
4601 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 4601 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
4602 struct zone *zone = &pgdat->node_zones[zone_type]; 4602 struct zone *zone = &pgdat->node_zones[zone_type];
4603 4603
4604 if (is_highmem(zone)) { 4604 if (is_highmem(zone)) {
4605 managed_highpages += zone->managed_pages; 4605 managed_highpages += zone->managed_pages;
4606 free_highpages += zone_page_state(zone, NR_FREE_PAGES); 4606 free_highpages += zone_page_state(zone, NR_FREE_PAGES);
4607 } 4607 }
4608 } 4608 }
4609 val->totalhigh = managed_highpages; 4609 val->totalhigh = managed_highpages;
4610 val->freehigh = free_highpages; 4610 val->freehigh = free_highpages;
4611 #else 4611 #else
4612 val->totalhigh = managed_highpages; 4612 val->totalhigh = managed_highpages;
4613 val->freehigh = free_highpages; 4613 val->freehigh = free_highpages;
4614 #endif 4614 #endif
4615 val->mem_unit = PAGE_SIZE; 4615 val->mem_unit = PAGE_SIZE;
4616 } 4616 }
4617 #endif 4617 #endif
4618 4618
4619 /* 4619 /*
4620 * Determine whether the node should be displayed or not, depending on whether 4620 * Determine whether the node should be displayed or not, depending on whether
4621 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 4621 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
4622 */ 4622 */
4623 static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) 4623 static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
4624 { 4624 {
4625 if (!(flags & SHOW_MEM_FILTER_NODES)) 4625 if (!(flags & SHOW_MEM_FILTER_NODES))
4626 return false; 4626 return false;
4627 4627
4628 /* 4628 /*
4629 * No nodemask - i.e. the implicit memory NUMA policy. Do not bother 4629 * No nodemask - i.e. the implicit memory NUMA policy. Do not bother
4630 * with the synchronization (read_mems_allowed_begin) because we do not 4630 * with the synchronization (read_mems_allowed_begin) because we do not
4631 * have to be precise here. 4631 * have to be precise here.
4632 */ 4632 */
4633 if (!nodemask) 4633 if (!nodemask)
4634 nodemask = &cpuset_current_mems_allowed; 4634 nodemask = &cpuset_current_mems_allowed;
4635 4635
4636 return !node_isset(nid, *nodemask); 4636 return !node_isset(nid, *nodemask);
4637 } 4637 }
4638 4638
4639 #define K(x) ((x) << (PAGE_SHIFT-10)) 4639 #define K(x) ((x) << (PAGE_SHIFT-10))
4640 4640
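K() converts a page count to kilobytes by shifting by PAGE_SHIFT - 10. A standalone illustration, assuming 4 KiB pages (PAGE_SHIFT = 12):

#include <stdio.h>

#define PAGE_SHIFT 12                    /* assumed 4 KiB pages */
#define K(x) ((x) << (PAGE_SHIFT - 10))  /* pages -> KiB */

int main(void)
{
        unsigned long pages = 300;
        printf("%lu pages = %lukB\n", pages, K(pages));  /* 300 pages = 1200kB */
        return 0;
}
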
4641 static void show_migration_types(unsigned char type) 4641 static void show_migration_types(unsigned char type)
4642 { 4642 {
4643 static const char types[MIGRATE_TYPES] = { 4643 static const char types[MIGRATE_TYPES] = {
4644 [MIGRATE_UNMOVABLE] = 'U', 4644 [MIGRATE_UNMOVABLE] = 'U',
4645 [MIGRATE_MOVABLE] = 'M', 4645 [MIGRATE_MOVABLE] = 'M',
4646 [MIGRATE_RECLAIMABLE] = 'E', 4646 [MIGRATE_RECLAIMABLE] = 'E',
4647 [MIGRATE_HIGHATOMIC] = 'H', 4647 [MIGRATE_HIGHATOMIC] = 'H',
4648 #ifdef CONFIG_CMA 4648 #ifdef CONFIG_CMA
4649 [MIGRATE_CMA] = 'C', 4649 [MIGRATE_CMA] = 'C',
4650 #endif 4650 #endif
4651 #ifdef CONFIG_MEMORY_ISOLATION 4651 #ifdef CONFIG_MEMORY_ISOLATION
4652 [MIGRATE_ISOLATE] = 'I', 4652 [MIGRATE_ISOLATE] = 'I',
4653 #endif 4653 #endif
4654 }; 4654 };
4655 char tmp[MIGRATE_TYPES + 1]; 4655 char tmp[MIGRATE_TYPES + 1];
4656 char *p = tmp; 4656 char *p = tmp;
4657 int i; 4657 int i;
4658 4658
4659 for (i = 0; i < MIGRATE_TYPES; i++) { 4659 for (i = 0; i < MIGRATE_TYPES; i++) {
4660 if (type & (1 << i)) 4660 if (type & (1 << i))
4661 *p++ = types[i]; 4661 *p++ = types[i];
4662 } 4662 }
4663 4663
4664 *p = '\0'; 4664 *p = '\0';
4665 printk(KERN_CONT "(%s) ", tmp); 4665 printk(KERN_CONT "(%s) ", tmp);
4666 } 4666 }
4667 4667
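show_migration_types() turns a bitmask of migrate types into one letter per set bit. A trimmed userspace copy (the CMA and memory-isolation entries are left out for brevity) showing what a mask prints as:

#include <stdio.h>

/* Trimmed copy of the table above: only the unconditional entries. */
enum { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE,
       MIGRATE_HIGHATOMIC, MIGRATE_TYPES };

static void show_migration_types(unsigned char type)
{
        static const char types[MIGRATE_TYPES] = {
                [MIGRATE_UNMOVABLE]   = 'U',
                [MIGRATE_MOVABLE]     = 'M',
                [MIGRATE_RECLAIMABLE] = 'E',
                [MIGRATE_HIGHATOMIC]  = 'H',
        };
        char tmp[MIGRATE_TYPES + 1];
        char *p = tmp;
        int i;

        for (i = 0; i < MIGRATE_TYPES; i++)
                if (type & (1 << i))
                        *p++ = types[i];
        *p = '\0';
        printf("(%s) ", tmp);
}

int main(void)
{
        /* Free lists populated for unmovable and movable pages -> prints "(UM) " */
        show_migration_types((1 << MIGRATE_UNMOVABLE) | (1 << MIGRATE_MOVABLE));
        printf("\n");
        return 0;
}
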
4668 /* 4668 /*
4669 * Show free area list (used inside shift_scroll-lock stuff) 4669 * Show free area list (used inside shift_scroll-lock stuff)
4670 * We also calculate the percentage fragmentation. We do this by counting the 4670 * We also calculate the percentage fragmentation. We do this by counting the
4671 * memory on each free list with the exception of the first item on the list. 4671 * memory on each free list with the exception of the first item on the list.
4672 * 4672 *
4673 * Bits in @filter: 4673 * Bits in @filter:
4674 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's 4674 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
4675 * cpuset. 4675 * cpuset.
4676 */ 4676 */
4677 void show_free_areas(unsigned int filter, nodemask_t *nodemask) 4677 void show_free_areas(unsigned int filter, nodemask_t *nodemask)
4678 { 4678 {
4679 unsigned long free_pcp = 0; 4679 unsigned long free_pcp = 0;
4680 int cpu; 4680 int cpu;
4681 struct zone *zone; 4681 struct zone *zone;
4682 pg_data_t *pgdat; 4682 pg_data_t *pgdat;
4683 4683
4684 for_each_populated_zone(zone) { 4684 for_each_populated_zone(zone) {
4685 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 4685 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
4686 continue; 4686 continue;
4687 4687
4688 for_each_online_cpu(cpu) 4688 for_each_online_cpu(cpu)
4689 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; 4689 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
4690 } 4690 }
4691 4691
4692 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 4692 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
4693 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 4693 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
4694 " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" 4694 " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
4695 " slab_reclaimable:%lu slab_unreclaimable:%lu\n" 4695 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
4696 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 4696 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
4697 " free:%lu free_pcp:%lu free_cma:%lu\n", 4697 " free:%lu free_pcp:%lu free_cma:%lu\n",
4698 global_node_page_state(NR_ACTIVE_ANON), 4698 global_node_page_state(NR_ACTIVE_ANON),
4699 global_node_page_state(NR_INACTIVE_ANON), 4699 global_node_page_state(NR_INACTIVE_ANON),
4700 global_node_page_state(NR_ISOLATED_ANON), 4700 global_node_page_state(NR_ISOLATED_ANON),
4701 global_node_page_state(NR_ACTIVE_FILE), 4701 global_node_page_state(NR_ACTIVE_FILE),
4702 global_node_page_state(NR_INACTIVE_FILE), 4702 global_node_page_state(NR_INACTIVE_FILE),
4703 global_node_page_state(NR_ISOLATED_FILE), 4703 global_node_page_state(NR_ISOLATED_FILE),
4704 global_node_page_state(NR_UNEVICTABLE), 4704 global_node_page_state(NR_UNEVICTABLE),
4705 global_node_page_state(NR_FILE_DIRTY), 4705 global_node_page_state(NR_FILE_DIRTY),
4706 global_node_page_state(NR_WRITEBACK), 4706 global_node_page_state(NR_WRITEBACK),
4707 global_node_page_state(NR_UNSTABLE_NFS), 4707 global_node_page_state(NR_UNSTABLE_NFS),
4708 global_node_page_state(NR_SLAB_RECLAIMABLE), 4708 global_node_page_state(NR_SLAB_RECLAIMABLE),
4709 global_node_page_state(NR_SLAB_UNRECLAIMABLE), 4709 global_node_page_state(NR_SLAB_UNRECLAIMABLE),
4710 global_node_page_state(NR_FILE_MAPPED), 4710 global_node_page_state(NR_FILE_MAPPED),
4711 global_node_page_state(NR_SHMEM), 4711 global_node_page_state(NR_SHMEM),
4712 global_zone_page_state(NR_PAGETABLE), 4712 global_zone_page_state(NR_PAGETABLE),
4713 global_zone_page_state(NR_BOUNCE), 4713 global_zone_page_state(NR_BOUNCE),
4714 global_zone_page_state(NR_FREE_PAGES), 4714 global_zone_page_state(NR_FREE_PAGES),
4715 free_pcp, 4715 free_pcp,
4716 global_zone_page_state(NR_FREE_CMA_PAGES)); 4716 global_zone_page_state(NR_FREE_CMA_PAGES));
4717 4717
4718 for_each_online_pgdat(pgdat) { 4718 for_each_online_pgdat(pgdat) {
4719 if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) 4719 if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
4720 continue; 4720 continue;
4721 4721
4722 printk("Node %d" 4722 printk("Node %d"
4723 " active_anon:%lukB" 4723 " active_anon:%lukB"
4724 " inactive_anon:%lukB" 4724 " inactive_anon:%lukB"
4725 " active_file:%lukB" 4725 " active_file:%lukB"
4726 " inactive_file:%lukB" 4726 " inactive_file:%lukB"
4727 " unevictable:%lukB" 4727 " unevictable:%lukB"
4728 " isolated(anon):%lukB" 4728 " isolated(anon):%lukB"
4729 " isolated(file):%lukB" 4729 " isolated(file):%lukB"
4730 " mapped:%lukB" 4730 " mapped:%lukB"
4731 " dirty:%lukB" 4731 " dirty:%lukB"
4732 " writeback:%lukB" 4732 " writeback:%lukB"
4733 " shmem:%lukB" 4733 " shmem:%lukB"
4734 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4734 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4735 " shmem_thp: %lukB" 4735 " shmem_thp: %lukB"
4736 " shmem_pmdmapped: %lukB" 4736 " shmem_pmdmapped: %lukB"
4737 " anon_thp: %lukB" 4737 " anon_thp: %lukB"
4738 #endif 4738 #endif
4739 " writeback_tmp:%lukB" 4739 " writeback_tmp:%lukB"
4740 " unstable:%lukB" 4740 " unstable:%lukB"
4741 " all_unreclaimable? %s" 4741 " all_unreclaimable? %s"
4742 "\n", 4742 "\n",
4743 pgdat->node_id, 4743 pgdat->node_id,
4744 K(node_page_state(pgdat, NR_ACTIVE_ANON)), 4744 K(node_page_state(pgdat, NR_ACTIVE_ANON)),
4745 K(node_page_state(pgdat, NR_INACTIVE_ANON)), 4745 K(node_page_state(pgdat, NR_INACTIVE_ANON)),
4746 K(node_page_state(pgdat, NR_ACTIVE_FILE)), 4746 K(node_page_state(pgdat, NR_ACTIVE_FILE)),
4747 K(node_page_state(pgdat, NR_INACTIVE_FILE)), 4747 K(node_page_state(pgdat, NR_INACTIVE_FILE)),
4748 K(node_page_state(pgdat, NR_UNEVICTABLE)), 4748 K(node_page_state(pgdat, NR_UNEVICTABLE)),
4749 K(node_page_state(pgdat, NR_ISOLATED_ANON)), 4749 K(node_page_state(pgdat, NR_ISOLATED_ANON)),
4750 K(node_page_state(pgdat, NR_ISOLATED_FILE)), 4750 K(node_page_state(pgdat, NR_ISOLATED_FILE)),
4751 K(node_page_state(pgdat, NR_FILE_MAPPED)), 4751 K(node_page_state(pgdat, NR_FILE_MAPPED)),
4752 K(node_page_state(pgdat, NR_FILE_DIRTY)), 4752 K(node_page_state(pgdat, NR_FILE_DIRTY)),
4753 K(node_page_state(pgdat, NR_WRITEBACK)), 4753 K(node_page_state(pgdat, NR_WRITEBACK)),
4754 K(node_page_state(pgdat, NR_SHMEM)), 4754 K(node_page_state(pgdat, NR_SHMEM)),
4755 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 4755 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4756 K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), 4756 K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
4757 K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) 4757 K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
4758 * HPAGE_PMD_NR), 4758 * HPAGE_PMD_NR),
4759 K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), 4759 K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
4760 #endif 4760 #endif
4761 K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), 4761 K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
4762 K(node_page_state(pgdat, NR_UNSTABLE_NFS)), 4762 K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
4763 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? 4763 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
4764 "yes" : "no"); 4764 "yes" : "no");
4765 } 4765 }
4766 4766
4767 for_each_populated_zone(zone) { 4767 for_each_populated_zone(zone) {
4768 int i; 4768 int i;
4769 4769
4770 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 4770 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
4771 continue; 4771 continue;
4772 4772
4773 free_pcp = 0; 4773 free_pcp = 0;
4774 for_each_online_cpu(cpu) 4774 for_each_online_cpu(cpu)
4775 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; 4775 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
4776 4776
4777 show_node(zone); 4777 show_node(zone);
4778 printk(KERN_CONT 4778 printk(KERN_CONT
4779 "%s" 4779 "%s"
4780 " free:%lukB" 4780 " free:%lukB"
4781 " min:%lukB" 4781 " min:%lukB"
4782 " low:%lukB" 4782 " low:%lukB"
4783 " high:%lukB" 4783 " high:%lukB"
4784 " active_anon:%lukB" 4784 " active_anon:%lukB"
4785 " inactive_anon:%lukB" 4785 " inactive_anon:%lukB"
4786 " active_file:%lukB" 4786 " active_file:%lukB"
4787 " inactive_file:%lukB" 4787 " inactive_file:%lukB"
4788 " unevictable:%lukB" 4788 " unevictable:%lukB"
4789 " writepending:%lukB" 4789 " writepending:%lukB"
4790 " present:%lukB" 4790 " present:%lukB"
4791 " managed:%lukB" 4791 " managed:%lukB"
4792 " mlocked:%lukB" 4792 " mlocked:%lukB"
4793 " kernel_stack:%lukB" 4793 " kernel_stack:%lukB"
4794 " pagetables:%lukB" 4794 " pagetables:%lukB"
4795 " bounce:%lukB" 4795 " bounce:%lukB"
4796 " free_pcp:%lukB" 4796 " free_pcp:%lukB"
4797 " local_pcp:%ukB" 4797 " local_pcp:%ukB"
4798 " free_cma:%lukB" 4798 " free_cma:%lukB"
4799 "\n", 4799 "\n",
4800 zone->name, 4800 zone->name,
4801 K(zone_page_state(zone, NR_FREE_PAGES)), 4801 K(zone_page_state(zone, NR_FREE_PAGES)),
4802 K(min_wmark_pages(zone)), 4802 K(min_wmark_pages(zone)),
4803 K(low_wmark_pages(zone)), 4803 K(low_wmark_pages(zone)),
4804 K(high_wmark_pages(zone)), 4804 K(high_wmark_pages(zone)),
4805 K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), 4805 K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
4806 K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), 4806 K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
4807 K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), 4807 K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
4808 K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), 4808 K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
4809 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), 4809 K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
4810 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), 4810 K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
4811 K(zone->present_pages), 4811 K(zone->present_pages),
4812 K(zone->managed_pages), 4812 K(zone->managed_pages),
4813 K(zone_page_state(zone, NR_MLOCK)), 4813 K(zone_page_state(zone, NR_MLOCK)),
4814 zone_page_state(zone, NR_KERNEL_STACK_KB), 4814 zone_page_state(zone, NR_KERNEL_STACK_KB),
4815 K(zone_page_state(zone, NR_PAGETABLE)), 4815 K(zone_page_state(zone, NR_PAGETABLE)),
4816 K(zone_page_state(zone, NR_BOUNCE)), 4816 K(zone_page_state(zone, NR_BOUNCE)),
4817 K(free_pcp), 4817 K(free_pcp),
4818 K(this_cpu_read(zone->pageset->pcp.count)), 4818 K(this_cpu_read(zone->pageset->pcp.count)),
4819 K(zone_page_state(zone, NR_FREE_CMA_PAGES))); 4819 K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
4820 printk("lowmem_reserve[]:"); 4820 printk("lowmem_reserve[]:");
4821 for (i = 0; i < MAX_NR_ZONES; i++) 4821 for (i = 0; i < MAX_NR_ZONES; i++)
4822 printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); 4822 printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
4823 printk(KERN_CONT "\n"); 4823 printk(KERN_CONT "\n");
4824 } 4824 }
4825 4825
4826 for_each_populated_zone(zone) { 4826 for_each_populated_zone(zone) {
4827 unsigned int order; 4827 unsigned int order;
4828 unsigned long nr[MAX_ORDER], flags, total = 0; 4828 unsigned long nr[MAX_ORDER], flags, total = 0;
4829 unsigned char types[MAX_ORDER]; 4829 unsigned char types[MAX_ORDER];
4830 4830
4831 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) 4831 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
4832 continue; 4832 continue;
4833 show_node(zone); 4833 show_node(zone);
4834 printk(KERN_CONT "%s: ", zone->name); 4834 printk(KERN_CONT "%s: ", zone->name);
4835 4835
4836 spin_lock_irqsave(&zone->lock, flags); 4836 spin_lock_irqsave(&zone->lock, flags);
4837 for (order = 0; order < MAX_ORDER; order++) { 4837 for (order = 0; order < MAX_ORDER; order++) {
4838 struct free_area *area = &zone->free_area[order]; 4838 struct free_area *area = &zone->free_area[order];
4839 int type; 4839 int type;
4840 4840
4841 nr[order] = area->nr_free; 4841 nr[order] = area->nr_free;
4842 total += nr[order] << order; 4842 total += nr[order] << order;
4843 4843
4844 types[order] = 0; 4844 types[order] = 0;
4845 for (type = 0; type < MIGRATE_TYPES; type++) { 4845 for (type = 0; type < MIGRATE_TYPES; type++) {
4846 if (!list_empty(&area->free_list[type])) 4846 if (!list_empty(&area->free_list[type]))
4847 types[order] |= 1 << type; 4847 types[order] |= 1 << type;
4848 } 4848 }
4849 } 4849 }
4850 spin_unlock_irqrestore(&zone->lock, flags); 4850 spin_unlock_irqrestore(&zone->lock, flags);
4851 for (order = 0; order < MAX_ORDER; order++) { 4851 for (order = 0; order < MAX_ORDER; order++) {
4852 printk(KERN_CONT "%lu*%lukB ", 4852 printk(KERN_CONT "%lu*%lukB ",
4853 nr[order], K(1UL) << order); 4853 nr[order], K(1UL) << order);
4854 if (nr[order]) 4854 if (nr[order])
4855 show_migration_types(types[order]); 4855 show_migration_types(types[order]);
4856 } 4856 }
4857 printk(KERN_CONT "= %lukB\n", K(total)); 4857 printk(KERN_CONT "= %lukB\n", K(total));
4858 } 4858 }
4859 4859
4860 hugetlb_show_meminfo(); 4860 hugetlb_show_meminfo();
4861 4861
4862 printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); 4862 printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
4863 4863
4864 show_swap_cache_info(); 4864 show_swap_cache_info();
4865 } 4865 }
4866 4866
4867 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 4867 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
4868 { 4868 {
4869 zoneref->zone = zone; 4869 zoneref->zone = zone;
4870 zoneref->zone_idx = zone_idx(zone); 4870 zoneref->zone_idx = zone_idx(zone);
4871 } 4871 }
4872 4872
4873 /* 4873 /*
4874 * Builds allocation fallback zone lists. 4874 * Builds allocation fallback zone lists.
4875 * 4875 *
4876 * Add all populated zones of a node to the zonelist. 4876 * Add all populated zones of a node to the zonelist.
4877 */ 4877 */
4878 static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) 4878 static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
4879 { 4879 {
4880 struct zone *zone; 4880 struct zone *zone;
4881 enum zone_type zone_type = MAX_NR_ZONES; 4881 enum zone_type zone_type = MAX_NR_ZONES;
4882 int nr_zones = 0; 4882 int nr_zones = 0;
4883 4883
4884 do { 4884 do {
4885 zone_type--; 4885 zone_type--;
4886 zone = pgdat->node_zones + zone_type; 4886 zone = pgdat->node_zones + zone_type;
4887 if (managed_zone(zone)) { 4887 if (managed_zone(zone)) {
4888 zoneref_set_zone(zone, &zonerefs[nr_zones++]); 4888 zoneref_set_zone(zone, &zonerefs[nr_zones++]);
4889 check_highest_zone(zone_type); 4889 check_highest_zone(zone_type);
4890 } 4890 }
4891 } while (zone_type); 4891 } while (zone_type);
4892 4892
4893 return nr_zones; 4893 return nr_zones;
4894 } 4894 }
4895 4895
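build_zonerefs_node() walks a node's zones from the highest index downwards and appends only the managed ones, so higher zones (e.g. Normal) come before lower ones (e.g. DMA) in the fallback list. A sketch with a made-up three-zone node, one of them empty:

#include <stdio.h>

/* Hypothetical zone layout, highest index last, as in the kernel's zone enum. */
static const char *zone_names[] = { "DMA", "DMA32", "Normal" };
static int zone_managed[]       = { 1,     0,       1 };      /* DMA32 empty */
#define MAX_NR_ZONES 3

int main(void)
{
        int zone_type = MAX_NR_ZONES;
        int nr_zones = 0;

        /* Walk from the highest zone down, skipping unmanaged zones. */
        do {
                zone_type--;
                if (zone_managed[zone_type])
                        printf("zoneref %d -> %s\n", nr_zones++,
                               zone_names[zone_type]);
        } while (zone_type);

        return 0;   /* prints: zoneref 0 -> Normal, zoneref 1 -> DMA */
}
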
4896 #ifdef CONFIG_NUMA 4896 #ifdef CONFIG_NUMA
4897 4897
4898 static int __parse_numa_zonelist_order(char *s) 4898 static int __parse_numa_zonelist_order(char *s)
4899 { 4899 {
4900 /* 4900 /*
4901 * We used to support different zonelist modes, but they turned 4901 * We used to support different zonelist modes, but they turned
4902 * out to be just not useful. Let's keep the warning in place 4902 * out to be just not useful. Let's keep the warning in place
4903 * if somebody still uses the cmd line parameter so that we do 4903 * if somebody still uses the cmd line parameter so that we do
4904 * not fail it silently. 4904 * not fail it silently.
4905 */ 4905 */
4906 if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) { 4906 if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
4907 pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s); 4907 pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
4908 return -EINVAL; 4908 return -EINVAL;
4909 } 4909 }
4910 return 0; 4910 return 0;
4911 } 4911 }
4912 4912
4913 static __init int setup_numa_zonelist_order(char *s) 4913 static __init int setup_numa_zonelist_order(char *s)
4914 { 4914 {
4915 if (!s) 4915 if (!s)
4916 return 0; 4916 return 0;
4917 4917
4918 return __parse_numa_zonelist_order(s); 4918 return __parse_numa_zonelist_order(s);
4919 } 4919 }
4920 early_param("numa_zonelist_order", setup_numa_zonelist_order); 4920 early_param("numa_zonelist_order", setup_numa_zonelist_order);
4921 4921
4922 char numa_zonelist_order[] = "Node"; 4922 char numa_zonelist_order[] = "Node";
4923 4923
4924 /* 4924 /*
4925 * sysctl handler for numa_zonelist_order 4925 * sysctl handler for numa_zonelist_order
4926 */ 4926 */
4927 int numa_zonelist_order_handler(struct ctl_table *table, int write, 4927 int numa_zonelist_order_handler(struct ctl_table *table, int write,
4928 void __user *buffer, size_t *length, 4928 void __user *buffer, size_t *length,
4929 loff_t *ppos) 4929 loff_t *ppos)
4930 { 4930 {
4931 char *str; 4931 char *str;
4932 int ret; 4932 int ret;
4933 4933
4934 if (!write) 4934 if (!write)
4935 return proc_dostring(table, write, buffer, length, ppos); 4935 return proc_dostring(table, write, buffer, length, ppos);
4936 str = memdup_user_nul(buffer, 16); 4936 str = memdup_user_nul(buffer, 16);
4937 if (IS_ERR(str)) 4937 if (IS_ERR(str))
4938 return PTR_ERR(str); 4938 return PTR_ERR(str);
4939 4939
4940 ret = __parse_numa_zonelist_order(str); 4940 ret = __parse_numa_zonelist_order(str);
4941 kfree(str); 4941 kfree(str);
4942 return ret; 4942 return ret;
4943 } 4943 }
4944 4944
4945 4945
4946 #define MAX_NODE_LOAD (nr_online_nodes) 4946 #define MAX_NODE_LOAD (nr_online_nodes)
4947 static int node_load[MAX_NUMNODES]; 4947 static int node_load[MAX_NUMNODES];
4948 4948
4949 /** 4949 /**
4950 * find_next_best_node - find the next node that should appear in a given node's fallback list 4950 * find_next_best_node - find the next node that should appear in a given node's fallback list
4951 * @node: node whose fallback list we're appending 4951 * @node: node whose fallback list we're appending
4952 * @used_node_mask: nodemask_t of already used nodes 4952 * @used_node_mask: nodemask_t of already used nodes
4953 * 4953 *
4954 * We use a number of factors to determine which is the next node that should 4954 * We use a number of factors to determine which is the next node that should
4955 * appear on a given node's fallback list. The node should not have appeared 4955 * appear on a given node's fallback list. The node should not have appeared
4956 * already in @node's fallback list, and it should be the next closest node 4956 * already in @node's fallback list, and it should be the next closest node
4957 * according to the distance array (which contains arbitrary distance values 4957 * according to the distance array (which contains arbitrary distance values
4958 * from each node to each node in the system), and should also prefer nodes 4958 * from each node to each node in the system), and should also prefer nodes
4959 * with no CPUs, since presumably they'll have very little allocation pressure 4959 * with no CPUs, since presumably they'll have very little allocation pressure
4960 * on them otherwise. 4960 * on them otherwise.
4961 * It returns -1 if no node is found. 4961 * It returns -1 if no node is found.
4962 */ 4962 */
4963 static int find_next_best_node(int node, nodemask_t *used_node_mask) 4963 static int find_next_best_node(int node, nodemask_t *used_node_mask)
4964 { 4964 {
4965 int n, val; 4965 int n, val;
4966 int min_val = INT_MAX; 4966 int min_val = INT_MAX;
4967 int best_node = NUMA_NO_NODE; 4967 int best_node = NUMA_NO_NODE;
4968 const struct cpumask *tmp = cpumask_of_node(0); 4968 const struct cpumask *tmp = cpumask_of_node(0);
4969 4969
4970 /* Use the local node if we haven't already */ 4970 /* Use the local node if we haven't already */
4971 if (!node_isset(node, *used_node_mask)) { 4971 if (!node_isset(node, *used_node_mask)) {
4972 node_set(node, *used_node_mask); 4972 node_set(node, *used_node_mask);
4973 return node; 4973 return node;
4974 } 4974 }
4975 4975
4976 for_each_node_state(n, N_MEMORY) { 4976 for_each_node_state(n, N_MEMORY) {
4977 4977
4978 /* Don't want a node to appear more than once */ 4978 /* Don't want a node to appear more than once */
4979 if (node_isset(n, *used_node_mask)) 4979 if (node_isset(n, *used_node_mask))
4980 continue; 4980 continue;
4981 4981
4982 /* Use the distance array to find the distance */ 4982 /* Use the distance array to find the distance */
4983 val = node_distance(node, n); 4983 val = node_distance(node, n);
4984 4984
4985 /* Penalize nodes under us ("prefer the next node") */ 4985 /* Penalize nodes under us ("prefer the next node") */
4986 val += (n < node); 4986 val += (n < node);
4987 4987
4988 /* Give preference to headless and unused nodes */ 4988 /* Give preference to headless and unused nodes */
4989 tmp = cpumask_of_node(n); 4989 tmp = cpumask_of_node(n);
4990 if (!cpumask_empty(tmp)) 4990 if (!cpumask_empty(tmp))
4991 val += PENALTY_FOR_NODE_WITH_CPUS; 4991 val += PENALTY_FOR_NODE_WITH_CPUS;
4992 4992
4993 /* Slight preference for less loaded node */ 4993 /* Slight preference for less loaded node */
4994 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 4994 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
4995 val += node_load[n]; 4995 val += node_load[n];
4996 4996
4997 if (val < min_val) { 4997 if (val < min_val) {
4998 min_val = val; 4998 min_val = val;
4999 best_node = n; 4999 best_node = n;
5000 } 5000 }
5001 } 5001 }
5002 5002
5003 if (best_node >= 0) 5003 if (best_node >= 0)
5004 node_set(best_node, *used_node_mask); 5004 node_set(best_node, *used_node_mask);
5005 5005
5006 return best_node; 5006 return best_node;
5007 } 5007 }
5008 5008
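find_next_best_node() scores each candidate by NUMA distance, a +1 nudge toward higher-numbered nodes, a penalty for nodes that have CPUs, and the accumulated node_load, with distance scaled up so it dominates. A toy evaluation with an invented three-node distance table (the penalty of 1 mirrors the kernel's PENALTY_FOR_NODE_WITH_CPUS):

#include <stdio.h>
#include <limits.h>

#define MAX_NUMNODES  3
#define MAX_NODE_LOAD MAX_NUMNODES      /* stands in for nr_online_nodes */
#define PENALTY_FOR_NODE_WITH_CPUS 1    /* small penalty, as in the kernel */

/* Invented SLIT-style distances: 10 = local, 20 = one hop. */
static int distance[MAX_NUMNODES][MAX_NUMNODES] = {
        { 10, 20, 20 },
        { 20, 10, 20 },
        { 20, 20, 10 },
};
static int node_load[MAX_NUMNODES];                     /* all zero here */
static int node_has_cpus[MAX_NUMNODES] = { 1, 1, 0 };   /* node 2 is memory-only */

int main(void)
{
        int node = 0, best = -1, min_val = INT_MAX;

        for (int n = 0; n < MAX_NUMNODES; n++) {
                if (n == node)                  /* local node already used */
                        continue;
                int val = distance[node][n];
                val += (n < node);                              /* prefer the next node */
                if (node_has_cpus[n])
                        val += PENALTY_FOR_NODE_WITH_CPUS;      /* prefer headless nodes */
                val *= MAX_NODE_LOAD * MAX_NUMNODES;            /* distance dominates */
                val += node_load[n];                            /* slight load preference */
                if (val < min_val) {
                        min_val = val;
                        best = n;
                }
        }
        /* Headless node 2 wins over node 1 at equal distance. */
        printf("next best node after node %d: node %d (score %d)\n",
               node, best, min_val);
        return 0;
}
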
5009 5009
5010 /* 5010 /*
5011 * Build zonelists ordered by node and zones within node. 5011 * Build zonelists ordered by node and zones within node.
5012 * This results in maximum locality--normal zone overflows into local 5012 * This results in maximum locality--normal zone overflows into local
5013 * DMA zone, if any--but risks exhausting DMA zone. 5013 * DMA zone, if any--but risks exhausting DMA zone.
5014 */ 5014 */
5015 static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order, 5015 static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
5016 unsigned nr_nodes) 5016 unsigned nr_nodes)
5017 { 5017 {
5018 struct zoneref *zonerefs; 5018 struct zoneref *zonerefs;
5019 int i; 5019 int i;
5020 5020
5021 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; 5021 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5022 5022
5023 for (i = 0; i < nr_nodes; i++) { 5023 for (i = 0; i < nr_nodes; i++) {
5024 int nr_zones; 5024 int nr_zones;
5025 5025
5026 pg_data_t *node = NODE_DATA(node_order[i]); 5026 pg_data_t *node = NODE_DATA(node_order[i]);
5027 5027
5028 nr_zones = build_zonerefs_node(node, zonerefs); 5028 nr_zones = build_zonerefs_node(node, zonerefs);
5029 zonerefs += nr_zones; 5029 zonerefs += nr_zones;
5030 } 5030 }
5031 zonerefs->zone = NULL; 5031 zonerefs->zone = NULL;
5032 zonerefs->zone_idx = 0; 5032 zonerefs->zone_idx = 0;
5033 } 5033 }
5034 5034
5035 /* 5035 /*
5036 * Build gfp_thisnode zonelists 5036 * Build gfp_thisnode zonelists
5037 */ 5037 */
5038 static void build_thisnode_zonelists(pg_data_t *pgdat) 5038 static void build_thisnode_zonelists(pg_data_t *pgdat)
5039 { 5039 {
5040 struct zoneref *zonerefs; 5040 struct zoneref *zonerefs;
5041 int nr_zones; 5041 int nr_zones;
5042 5042
5043 zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs; 5043 zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
5044 nr_zones = build_zonerefs_node(pgdat, zonerefs); 5044 nr_zones = build_zonerefs_node(pgdat, zonerefs);
5045 zonerefs += nr_zones; 5045 zonerefs += nr_zones;
5046 zonerefs->zone = NULL; 5046 zonerefs->zone = NULL;
5047 zonerefs->zone_idx = 0; 5047 zonerefs->zone_idx = 0;
5048 } 5048 }
5049 5049
5050 /* 5050 /*
5051 * Build zonelists ordered by zone and nodes within zones. 5051 * Build zonelists ordered by zone and nodes within zones.
5052 * This results in conserving DMA zone[s] until all Normal memory is 5052 * This results in conserving DMA zone[s] until all Normal memory is
5053 * exhausted, but results in overflowing to remote node while memory 5053 * exhausted, but results in overflowing to remote node while memory
5054 * may still exist in local DMA zone. 5054 * may still exist in local DMA zone.
5055 */ 5055 */
5056 5056
5057 static void build_zonelists(pg_data_t *pgdat) 5057 static void build_zonelists(pg_data_t *pgdat)
5058 { 5058 {
5059 static int node_order[MAX_NUMNODES]; 5059 static int node_order[MAX_NUMNODES];
5060 int node, load, nr_nodes = 0; 5060 int node, load, nr_nodes = 0;
5061 nodemask_t used_mask; 5061 nodemask_t used_mask;
5062 int local_node, prev_node; 5062 int local_node, prev_node;
5063 5063
5064 /* NUMA-aware ordering of nodes */ 5064 /* NUMA-aware ordering of nodes */
5065 local_node = pgdat->node_id; 5065 local_node = pgdat->node_id;
5066 load = nr_online_nodes; 5066 load = nr_online_nodes;
5067 prev_node = local_node; 5067 prev_node = local_node;
5068 nodes_clear(used_mask); 5068 nodes_clear(used_mask);
5069 5069
5070 memset(node_order, 0, sizeof(node_order)); 5070 memset(node_order, 0, sizeof(node_order));
5071 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 5071 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
5072 /* 5072 /*
5073 * We don't want to pressure a particular node. 5073 * We don't want to pressure a particular node.
5074 * So add a penalty to the first node in the same 5074 * So add a penalty to the first node in the same
5075 * distance group to make the ordering round-robin. 5075 * distance group to make the ordering round-robin.
5076 */ 5076 */
5077 if (node_distance(local_node, node) != 5077 if (node_distance(local_node, node) !=
5078 node_distance(local_node, prev_node)) 5078 node_distance(local_node, prev_node))
5079 node_load[node] = load; 5079 node_load[node] = load;
5080 5080
5081 node_order[nr_nodes++] = node; 5081 node_order[nr_nodes++] = node;
5082 prev_node = node; 5082 prev_node = node;
5083 load--; 5083 load--;
5084 } 5084 }
5085 5085
5086 build_zonelists_in_node_order(pgdat, node_order, nr_nodes); 5086 build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
5087 build_thisnode_zonelists(pgdat); 5087 build_thisnode_zonelists(pgdat);
5088 } 5088 }
5089 5089
5090 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 5090 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
5091 /* 5091 /*
5092 * Return node id of node used for "local" allocations. 5092 * Return node id of node used for "local" allocations.
5093 * I.e., first node id of first zone in arg node's generic zonelist. 5093 * I.e., first node id of first zone in arg node's generic zonelist.
5094 * Used for initializing percpu 'numa_mem', which is used primarily 5094 * Used for initializing percpu 'numa_mem', which is used primarily
5095 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 5095 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
5096 */ 5096 */
5097 int local_memory_node(int node) 5097 int local_memory_node(int node)
5098 { 5098 {
5099 struct zoneref *z; 5099 struct zoneref *z;
5100 5100
5101 z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 5101 z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
5102 gfp_zone(GFP_KERNEL), 5102 gfp_zone(GFP_KERNEL),
5103 NULL); 5103 NULL);
5104 return z->zone->node; 5104 return z->zone->node;
5105 } 5105 }
5106 #endif 5106 #endif
5107 5107
5108 static void setup_min_unmapped_ratio(void); 5108 static void setup_min_unmapped_ratio(void);
5109 static void setup_min_slab_ratio(void); 5109 static void setup_min_slab_ratio(void);
5110 #else /* CONFIG_NUMA */ 5110 #else /* CONFIG_NUMA */
5111 5111
5112 static void build_zonelists(pg_data_t *pgdat) 5112 static void build_zonelists(pg_data_t *pgdat)
5113 { 5113 {
5114 int node, local_node; 5114 int node, local_node;
5115 struct zoneref *zonerefs; 5115 struct zoneref *zonerefs;
5116 int nr_zones; 5116 int nr_zones;
5117 5117
5118 local_node = pgdat->node_id; 5118 local_node = pgdat->node_id;
5119 5119
5120 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; 5120 zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5121 nr_zones = build_zonerefs_node(pgdat, zonerefs); 5121 nr_zones = build_zonerefs_node(pgdat, zonerefs);
5122 zonerefs += nr_zones; 5122 zonerefs += nr_zones;
5123 5123
5124 /* 5124 /*
5125 * Now we build the zonelist so that it contains the zones 5125 * Now we build the zonelist so that it contains the zones
5126 * of all the other nodes. 5126 * of all the other nodes.
5127 * We don't want to pressure a particular node, so when 5127 * We don't want to pressure a particular node, so when
5128 * building the zones for node N, we make sure that the 5128 * building the zones for node N, we make sure that the
5129 * zones coming right after the local ones are those from 5129 * zones coming right after the local ones are those from
5130 * node N+1 (modulo N) 5130 * node N+1 (modulo N)
5131 */ 5131 */
5132 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 5132 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
5133 if (!node_online(node)) 5133 if (!node_online(node))
5134 continue; 5134 continue;
5135 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); 5135 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5136 zonerefs += nr_zones; 5136 zonerefs += nr_zones;
5137 } 5137 }
5138 for (node = 0; node < local_node; node++) { 5138 for (node = 0; node < local_node; node++) {
5139 if (!node_online(node)) 5139 if (!node_online(node))
5140 continue; 5140 continue;
5141 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); 5141 nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5142 zonerefs += nr_zones; 5142 zonerefs += nr_zones;
5143 } 5143 }
5144 5144
5145 zonerefs->zone = NULL; 5145 zonerefs->zone = NULL;
5146 zonerefs->zone_idx = 0; 5146 zonerefs->zone_idx = 0;
5147 } 5147 }
5148 5148
5149 #endif /* CONFIG_NUMA */ 5149 #endif /* CONFIG_NUMA */
5150 5150
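In the !CONFIG_NUMA build_zonelists() above, remote nodes are appended starting at local_node + 1 and wrapping around to 0, so every node spreads its overflow onto a different neighbour first. A sketch of the resulting order for a hypothetical four-node machine, all nodes assumed online:

#include <stdio.h>

#define MAX_NUMNODES 4   /* hypothetical node count */

int main(void)
{
        int local_node = 2;
        int node;

        printf("fallback node order for node %d: %d", local_node, local_node);
        for (node = local_node + 1; node < MAX_NUMNODES; node++)
                printf(" %d", node);
        for (node = 0; node < local_node; node++)
                printf(" %d", node);
        printf("\n");           /* prints: 2 3 0 1 */
        return 0;
}
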
5151 /* 5151 /*
5152 * Boot pageset table. One per cpu which is going to be used for all 5152 * Boot pageset table. One per cpu which is going to be used for all
5153 * zones and all nodes. The parameters will be set in such a way 5153 * zones and all nodes. The parameters will be set in such a way
5154 * that an item put on a list will immediately be handed over to 5154 * that an item put on a list will immediately be handed over to
5155 * the buddy list. This is safe since pageset manipulation is done 5155 * the buddy list. This is safe since pageset manipulation is done
5156 * with interrupts disabled. 5156 * with interrupts disabled.
5157 * 5157 *
5158 * The boot_pagesets must be kept even after bootup is complete for 5158 * The boot_pagesets must be kept even after bootup is complete for
5159 * unused processors and/or zones. They do play a role for bootstrapping 5159 * unused processors and/or zones. They do play a role for bootstrapping
5160 * hotplugged processors. 5160 * hotplugged processors.
5161 * 5161 *
5162 * zoneinfo_show() and maybe other functions do 5162 * zoneinfo_show() and maybe other functions do
5163 * not check if the processor is online before following the pageset pointer. 5163 * not check if the processor is online before following the pageset pointer.
5164 * Other parts of the kernel may not check if the zone is available. 5164 * Other parts of the kernel may not check if the zone is available.
5165 */ 5165 */
5166 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 5166 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
5167 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 5167 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
5168 static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); 5168 static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
5169 5169
5170 static void __build_all_zonelists(void *data) 5170 static void __build_all_zonelists(void *data)
5171 { 5171 {
5172 int nid; 5172 int nid;
5173 int __maybe_unused cpu; 5173 int __maybe_unused cpu;
5174 pg_data_t *self = data; 5174 pg_data_t *self = data;
5175 static DEFINE_SPINLOCK(lock); 5175 static DEFINE_SPINLOCK(lock);
5176 5176
5177 spin_lock(&lock); 5177 spin_lock(&lock);
5178 5178
5179 #ifdef CONFIG_NUMA 5179 #ifdef CONFIG_NUMA
5180 memset(node_load, 0, sizeof(node_load)); 5180 memset(node_load, 0, sizeof(node_load));
5181 #endif 5181 #endif
5182 5182
5183 /* 5183 /*
5184 * This node is hotadded and no memory is yet present. So just 5184 * This node is hotadded and no memory is yet present. So just
5185 * building zonelists is fine - no need to touch other nodes. 5185 * building zonelists is fine - no need to touch other nodes.
5186 */ 5186 */
5187 if (self && !node_online(self->node_id)) { 5187 if (self && !node_online(self->node_id)) {
5188 build_zonelists(self); 5188 build_zonelists(self);
5189 } else { 5189 } else {
5190 for_each_online_node(nid) { 5190 for_each_online_node(nid) {
5191 pg_data_t *pgdat = NODE_DATA(nid); 5191 pg_data_t *pgdat = NODE_DATA(nid);
5192 5192
5193 build_zonelists(pgdat); 5193 build_zonelists(pgdat);
5194 } 5194 }
5195 5195
5196 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 5196 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
5197 /* 5197 /*
5198 * We now know the "local memory node" for each node-- 5198 * We now know the "local memory node" for each node--
5199 * i.e., the node of the first zone in the generic zonelist. 5199 * i.e., the node of the first zone in the generic zonelist.
5200 * Set up numa_mem percpu variable for on-line cpus. During 5200 * Set up numa_mem percpu variable for on-line cpus. During
5201 * boot, only the boot cpu should be on-line; we'll init the 5201 * boot, only the boot cpu should be on-line; we'll init the
5202 * secondary cpus' numa_mem as they come on-line. During 5202 * secondary cpus' numa_mem as they come on-line. During
5203 * node/memory hotplug, we'll fixup all on-line cpus. 5203 * node/memory hotplug, we'll fixup all on-line cpus.
5204 */ 5204 */
5205 for_each_online_cpu(cpu) 5205 for_each_online_cpu(cpu)
5206 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 5206 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
5207 #endif 5207 #endif
5208 } 5208 }
5209 5209
5210 spin_unlock(&lock); 5210 spin_unlock(&lock);
5211 } 5211 }
5212 5212
5213 static noinline void __init 5213 static noinline void __init
5214 build_all_zonelists_init(void) 5214 build_all_zonelists_init(void)
5215 { 5215 {
5216 int cpu; 5216 int cpu;
5217 5217
5218 __build_all_zonelists(NULL); 5218 __build_all_zonelists(NULL);
5219 5219
5220 /* 5220 /*
5221 * Initialize the boot_pagesets that are going to be used 5221 * Initialize the boot_pagesets that are going to be used
5222 * for bootstrapping processors. The real pagesets for 5222 * for bootstrapping processors. The real pagesets for
5223 * each zone will be allocated later when the per cpu 5223 * each zone will be allocated later when the per cpu
5224 * allocator is available. 5224 * allocator is available.
5225 * 5225 *
5226 * boot_pagesets are used also for bootstrapping offline 5226 * boot_pagesets are used also for bootstrapping offline
5227 * cpus if the system is already booted because the pagesets 5227 * cpus if the system is already booted because the pagesets
5228 * are needed to initialize allocators on a specific cpu too. 5228 * are needed to initialize allocators on a specific cpu too.
5229 * E.g. the percpu allocator needs the page allocator which 5229 * E.g. the percpu allocator needs the page allocator which
5230 * needs the percpu allocator in order to allocate its pagesets 5230 * needs the percpu allocator in order to allocate its pagesets
5231 * (a chicken-egg dilemma). 5231 * (a chicken-egg dilemma).
5232 */ 5232 */
5233 for_each_possible_cpu(cpu) 5233 for_each_possible_cpu(cpu)
5234 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 5234 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
5235 5235
5236 mminit_verify_zonelist(); 5236 mminit_verify_zonelist();
5237 cpuset_init_current_mems_allowed(); 5237 cpuset_init_current_mems_allowed();
5238 } 5238 }
5239 5239
5240 /* 5240 /*
5241 * unless system_state == SYSTEM_BOOTING. 5241 * unless system_state == SYSTEM_BOOTING.
5242 * 5242 *
5243 * __ref due to call of __init annotated helper build_all_zonelists_init 5243 * __ref due to call of __init annotated helper build_all_zonelists_init
5244 * [protected by SYSTEM_BOOTING]. 5244 * [protected by SYSTEM_BOOTING].
5245 */ 5245 */
5246 void __ref build_all_zonelists(pg_data_t *pgdat) 5246 void __ref build_all_zonelists(pg_data_t *pgdat)
5247 { 5247 {
5248 if (system_state == SYSTEM_BOOTING) { 5248 if (system_state == SYSTEM_BOOTING) {
5249 build_all_zonelists_init(); 5249 build_all_zonelists_init();
5250 } else { 5250 } else {
5251 __build_all_zonelists(pgdat); 5251 __build_all_zonelists(pgdat);
5252 /* cpuset refresh routine should be here */ 5252 /* cpuset refresh routine should be here */
5253 } 5253 }
5254 vm_total_pages = nr_free_pagecache_pages(); 5254 vm_total_pages = nr_free_pagecache_pages();
5255 /* 5255 /*
5256 * Disable grouping by mobility if the number of pages in the 5256 * Disable grouping by mobility if the number of pages in the
5257 * system is too low to allow the mechanism to work. It would be 5257 * system is too low to allow the mechanism to work. It would be
5258 * more accurate, but expensive to check per-zone. This check is 5258 * more accurate, but expensive to check per-zone. This check is
5259 * made on memory-hotadd so a system can start with mobility 5259 * made on memory-hotadd so a system can start with mobility
5260 * disabled and enable it later 5260 * disabled and enable it later
5261 */ 5261 */
5262 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 5262 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
5263 page_group_by_mobility_disabled = 1; 5263 page_group_by_mobility_disabled = 1;
5264 else 5264 else
5265 page_group_by_mobility_disabled = 0; 5265 page_group_by_mobility_disabled = 0;
5266 5266
5267 pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", 5267 pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
5268 nr_online_nodes, 5268 nr_online_nodes,
5269 page_group_by_mobility_disabled ? "off" : "on", 5269 page_group_by_mobility_disabled ? "off" : "on",
5270 vm_total_pages); 5270 vm_total_pages);
5271 #ifdef CONFIG_NUMA 5271 #ifdef CONFIG_NUMA
5272 pr_info("Policy zone: %s\n", zone_names[policy_zone]); 5272 pr_info("Policy zone: %s\n", zone_names[policy_zone]);
5273 #endif 5273 #endif
5274 } 5274 }
5275 5275
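The threshold used above is pageblock_nr_pages * MIGRATE_TYPES. Assuming 512-page pageblocks, 4 KiB pages, and both CONFIG_CMA and CONFIG_MEMORY_ISOLATION enabled (six migrate types), that works out to about 12 MiB; below that, grouping by mobility is disabled. A quick check of the arithmetic:

#include <stdio.h>

#define pageblock_nr_pages 512UL   /* assumed order-9 pageblocks */
#define MIGRATE_TYPES 6UL          /* assumed: CMA and memory isolation enabled */
#define PAGE_SIZE 4096UL           /* assumed 4 KiB pages */

int main(void)
{
        unsigned long threshold = pageblock_nr_pages * MIGRATE_TYPES;

        printf("mobility grouping needs at least %lu pages (%lu KiB)\n",
               threshold, threshold * PAGE_SIZE / 1024);   /* 3072 pages, 12288 KiB */
        return 0;
}
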
5276 /* 5276 /*
5277 * Initially all pages are reserved - free ones are freed 5277 * Initially all pages are reserved - free ones are freed
5278 * up by free_all_bootmem() once the early boot process is 5278 * up by free_all_bootmem() once the early boot process is
5279 * done. Non-atomic initialization, single-pass. 5279 * done. Non-atomic initialization, single-pass.
5280 */ 5280 */
5281 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 5281 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5282 unsigned long start_pfn, enum memmap_context context) 5282 unsigned long start_pfn, enum memmap_context context)
5283 { 5283 {
5284 struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn)); 5284 struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn));
5285 unsigned long end_pfn = start_pfn + size; 5285 unsigned long end_pfn = start_pfn + size;
5286 pg_data_t *pgdat = NODE_DATA(nid); 5286 pg_data_t *pgdat = NODE_DATA(nid);
5287 unsigned long pfn; 5287 unsigned long pfn;
5288 unsigned long nr_initialised = 0; 5288 unsigned long nr_initialised = 0;
5289 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5289 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5290 struct memblock_region *r = NULL, *tmp; 5290 struct memblock_region *r = NULL, *tmp;
5291 #endif 5291 #endif
5292 5292
5293 if (highest_memmap_pfn < end_pfn - 1) 5293 if (highest_memmap_pfn < end_pfn - 1)
5294 highest_memmap_pfn = end_pfn - 1; 5294 highest_memmap_pfn = end_pfn - 1;
5295 5295
5296 /* 5296 /*
5297 * Honor reservation requested by the driver for this ZONE_DEVICE 5297 * Honor reservation requested by the driver for this ZONE_DEVICE
5298 * memory 5298 * memory
5299 */ 5299 */
5300 if (altmap && start_pfn == altmap->base_pfn) 5300 if (altmap && start_pfn == altmap->base_pfn)
5301 start_pfn += altmap->reserve; 5301 start_pfn += altmap->reserve;
5302 5302
5303 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 5303 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
5304 /* 5304 /*
5305 * There can be holes in boot-time mem_map[]s handed to this 5305 * There can be holes in boot-time mem_map[]s handed to this
5306 * function. They do not exist on hotplugged memory. 5306 * function. They do not exist on hotplugged memory.
5307 */ 5307 */
5308 if (context != MEMMAP_EARLY) 5308 if (context != MEMMAP_EARLY)
5309 goto not_early; 5309 goto not_early;
5310 5310
5311 if (!early_pfn_valid(pfn)) 5311 if (!early_pfn_valid(pfn))
5312 continue; 5312 continue;
5313 if (!early_pfn_in_nid(pfn, nid)) 5313 if (!early_pfn_in_nid(pfn, nid))
5314 continue; 5314 continue;
5315 if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) 5315 if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
5316 break; 5316 break;
5317 5317
5318 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5318 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5319 /* 5319 /*
5320 * Check the memblock attributes given by firmware, which can affect 5320 * Check the memblock attributes given by firmware, which can affect
5321 * the kernel memory layout. If zone==ZONE_MOVABLE but the memory is 5321 * the kernel memory layout. If zone==ZONE_MOVABLE but the memory is
5322 * mirrored, this is an overlapping memmap init; skip it. 5322 * mirrored, this is an overlapping memmap init; skip it.
5323 */ 5323 */
5324 if (mirrored_kernelcore && zone == ZONE_MOVABLE) { 5324 if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
5325 if (!r || pfn >= memblock_region_memory_end_pfn(r)) { 5325 if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
5326 for_each_memblock(memory, tmp) 5326 for_each_memblock(memory, tmp)
5327 if (pfn < memblock_region_memory_end_pfn(tmp)) 5327 if (pfn < memblock_region_memory_end_pfn(tmp))
5328 break; 5328 break;
5329 r = tmp; 5329 r = tmp;
5330 } 5330 }
5331 if (pfn >= memblock_region_memory_base_pfn(r) && 5331 if (pfn >= memblock_region_memory_base_pfn(r) &&
5332 memblock_is_mirror(r)) { 5332 memblock_is_mirror(r)) {
5333 /* already initialized as NORMAL */ 5333 /* already initialized as NORMAL */
5334 pfn = memblock_region_memory_end_pfn(r); 5334 pfn = memblock_region_memory_end_pfn(r);
5335 continue; 5335 continue;
5336 } 5336 }
5337 } 5337 }
5338 #endif 5338 #endif
5339 5339
5340 not_early: 5340 not_early:
5341 /* 5341 /*
5342 * Mark the block movable so that blocks are reserved for 5342 * Mark the block movable so that blocks are reserved for
5343 * movable at startup. This will force kernel allocations 5343 * movable at startup. This will force kernel allocations
5344 * to reserve their blocks rather than leaking throughout 5344 * to reserve their blocks rather than leaking throughout
5345 * the address space during boot when many long-lived 5345 * the address space during boot when many long-lived
5346 * kernel allocations are made. 5346 * kernel allocations are made.
5347 * 5347 *
5348 * The pageblock bitmap covers the zone's valid pfn range, but the 5348 * The pageblock bitmap covers the zone's valid pfn range, but the
5349 * memmap may also be created for invalid pages (for alignment), 5349 * memmap may also be created for invalid pages (for alignment),
5350 * so check here that we do not call set_pageblock_migratetype() 5350 * so check here that we do not call set_pageblock_migratetype()
5351 * on a pfn outside the zone. 5351 * on a pfn outside the zone.
5352 */ 5352 */
5353 if (!(pfn & (pageblock_nr_pages - 1))) { 5353 if (!(pfn & (pageblock_nr_pages - 1))) {
5354 struct page *page = pfn_to_page(pfn); 5354 struct page *page = pfn_to_page(pfn);
5355 5355
5356 __init_single_page(page, pfn, zone, nid); 5356 __init_single_page(page, pfn, zone, nid);
5357 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 5357 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
5358 cond_resched(); 5358 cond_resched();
5359 } else { 5359 } else {
5360 __init_single_pfn(pfn, zone, nid); 5360 __init_single_pfn(pfn, zone, nid);
5361 } 5361 }
5362 } 5362 }
5363 } 5363 }
5364 5364
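The !(pfn & (pageblock_nr_pages - 1)) test above is true exactly once per pageblock, which is where the init loop marks the block MIGRATE_MOVABLE. A sketch of that alignment check, assuming order-9 pageblocks of 512 pages:

#include <stdio.h>

#define pageblock_nr_pages 512UL   /* assumed order-9 pageblocks */

int main(void)
{
        unsigned long start_pfn = 0x1000, end_pfn = 0x1800;

        /* Print the pfns where the zone-init loop would start a new pageblock. */
        for (unsigned long pfn = start_pfn; pfn < end_pfn; pfn++)
                if (!(pfn & (pageblock_nr_pages - 1)))
                        printf("pageblock starts at pfn 0x%lx\n", pfn);
        return 0;   /* prints 0x1000, 0x1200, 0x1400, 0x1600 */
}
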
5365 static void __meminit zone_init_free_lists(struct zone *zone) 5365 static void __meminit zone_init_free_lists(struct zone *zone)
5366 { 5366 {
5367 unsigned int order, t; 5367 unsigned int order, t;
5368 for_each_migratetype_order(order, t) { 5368 for_each_migratetype_order(order, t) {
5369 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 5369 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
5370 zone->free_area[order].nr_free = 0; 5370 zone->free_area[order].nr_free = 0;
5371 } 5371 }
5372 } 5372 }
5373 5373
5374 #ifndef __HAVE_ARCH_MEMMAP_INIT 5374 #ifndef __HAVE_ARCH_MEMMAP_INIT
5375 #define memmap_init(size, nid, zone, start_pfn) \ 5375 #define memmap_init(size, nid, zone, start_pfn) \
5376 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 5376 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
5377 #endif 5377 #endif
5378 5378
5379 static int zone_batchsize(struct zone *zone) 5379 static int zone_batchsize(struct zone *zone)
5380 { 5380 {
5381 #ifdef CONFIG_MMU 5381 #ifdef CONFIG_MMU
5382 int batch; 5382 int batch;
5383 5383
5384 /* 5384 /*
5385 * The per-cpu-pages pools are set to around 1000th of the 5385 * The per-cpu-pages pools are set to around 1000th of the
5386 * size of the zone. But no more than 1/2 of a meg. 5386 * size of the zone. But no more than 1/2 of a meg.
5387 * 5387 *
5388 * OK, so we don't know how big the cache is. So guess. 5388 * OK, so we don't know how big the cache is. So guess.
5389 */ 5389 */
5390 batch = zone->managed_pages / 1024; 5390 batch = zone->managed_pages / 1024;
5391 if (batch * PAGE_SIZE > 512 * 1024) 5391 if (batch * PAGE_SIZE > 512 * 1024)
5392 batch = (512 * 1024) / PAGE_SIZE; 5392 batch = (512 * 1024) / PAGE_SIZE;
5393 batch /= 4; /* We effectively *= 4 below */ 5393 batch /= 4; /* We effectively *= 4 below */
5394 if (batch < 1) 5394 if (batch < 1)
5395 batch = 1; 5395 batch = 1;
5396 5396
5397 /* 5397 /*
5398 * Clamp the batch to a 2^n - 1 value. Having a power 5398 * Clamp the batch to a 2^n - 1 value. Having a power
5399 * of 2 value was found to be more likely to have 5399 * of 2 value was found to be more likely to have
5400 * suboptimal cache aliasing properties in some cases. 5400 * suboptimal cache aliasing properties in some cases.
5401 * 5401 *
5402 * For example if 2 tasks are alternately allocating 5402 * For example if 2 tasks are alternately allocating
5403 * batches of pages, one task can end up with a lot 5403 * batches of pages, one task can end up with a lot
5404 * of pages of one half of the possible page colors 5404 * of pages of one half of the possible page colors
5405 * and the other with pages of the other colors. 5405 * and the other with pages of the other colors.
5406 */ 5406 */
5407 batch = rounddown_pow_of_two(batch + batch/2) - 1; 5407 batch = rounddown_pow_of_two(batch + batch/2) - 1;
5408 5408
5409 return batch; 5409 return batch;
5410 5410
5411 #else 5411 #else
5412 /* The deferral and batching of frees should be suppressed under NOMMU 5412 /* The deferral and batching of frees should be suppressed under NOMMU
5413 * conditions. 5413 * conditions.
5414 * 5414 *
5415 * The problem is that NOMMU needs to be able to allocate large chunks 5415 * The problem is that NOMMU needs to be able to allocate large chunks
5416 * of contiguous memory as there's no hardware page translation to 5416 * of contiguous memory as there's no hardware page translation to
5417 * assemble apparent contiguous memory from discontiguous pages. 5417 * assemble apparent contiguous memory from discontiguous pages.
5418 * 5418 *
5419 * Queueing large contiguous runs of pages for batching, however, 5419 * Queueing large contiguous runs of pages for batching, however,
5420 * causes the pages to actually be freed in smaller chunks. As there 5420 * causes the pages to actually be freed in smaller chunks. As there
5421 * can be a significant delay between the individual batches being 5421 * can be a significant delay between the individual batches being
5422 * recycled, this leads to the once large chunks of space being 5422 * recycled, this leads to the once large chunks of space being
5423 * fragmented and becoming unavailable for high-order allocations. 5423 * fragmented and becoming unavailable for high-order allocations.
5424 */ 5424 */
5425 return 0; 5425 return 0;
5426 #endif 5426 #endif
5427 } 5427 }
5428 5428
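Working the zone_batchsize() arithmetic through for a hypothetical 2 GiB zone with 4 KiB pages: 524288 / 1024 = 512 pages, capped to 512 KiB worth (128 pages), divided by 4 (32), then rounded to one below a power of two, giving 31. A standalone sketch (rounddown_pow_of_two() is re-implemented here as a stand-in for the kernel helper):

#include <stdio.h>

#define PAGE_SIZE 4096UL   /* assumed 4 KiB pages */

/* Userspace stand-in for the kernel's rounddown_pow_of_two(). */
static unsigned long rounddown_pow_of_two(unsigned long n)
{
        unsigned long p = 1;
        while (p * 2 <= n)
                p *= 2;
        return p;
}

int main(void)
{
        unsigned long managed_pages = 524288;   /* hypothetical 2 GiB zone */
        unsigned long batch;

        batch = managed_pages / 1024;           /* ~1/1000th of the zone: 512 */
        if (batch * PAGE_SIZE > 512 * 1024)     /* cap at half a megabyte */
                batch = (512 * 1024) / PAGE_SIZE;   /* 128 */
        batch /= 4;                             /* 32; effectively *= 4 again later */
        if (batch < 1)
                batch = 1;
        batch = rounddown_pow_of_two(batch + batch / 2) - 1;   /* 31 */

        printf("pcp batch for a 2 GiB zone: %lu pages\n", batch);
        return 0;
}
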
5429 /* 5429 /*
5430 * pcp->high and pcp->batch values are related and dependent on one another: 5430 * pcp->high and pcp->batch values are related and dependent on one another:
5431 * ->batch must never be higher than ->high. 5431 * ->batch must never be higher than ->high.
5432 * The following function updates them in a safe manner without read side 5432 * The following function updates them in a safe manner without read side
5433 * locking. 5433 * locking.
5434 * 5434 *
5435 * Any new users of pcp->batch and pcp->high should ensure they can cope with 5435 * Any new users of pcp->batch and pcp->high should ensure they can cope with
5436 * those fields changing asynchronously (according to the above rule). 5436 * those fields changing asynchronously (according to the above rule).
5437 * 5437 *
5438 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function 5438 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
5439 * outside of boot time (or some other assurance that no concurrent updaters 5439 * outside of boot time (or some other assurance that no concurrent updaters
5440 * exist). 5440 * exist).
5441 */ 5441 */
5442 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 5442 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
5443 unsigned long batch) 5443 unsigned long batch)
5444 { 5444 {
5445 /* start with a fail safe value for batch */ 5445 /* start with a fail safe value for batch */
5446 pcp->batch = 1; 5446 pcp->batch = 1;
5447 smp_wmb(); 5447 smp_wmb();
5448 5448
5449 /* Update high, then batch, in order */ 5449 /* Update high, then batch, in order */
5450 pcp->high = high; 5450 pcp->high = high;
5451 smp_wmb(); 5451 smp_wmb();
5452 5452
5453 pcp->batch = batch; 5453 pcp->batch = batch;
5454 } 5454 }
5455 5455
5456 /* a companion to pageset_set_high() */ 5456 /* a companion to pageset_set_high() */
5457 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) 5457 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
5458 { 5458 {
5459 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); 5459 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
5460 } 5460 }
5461 5461
5462 static void pageset_init(struct per_cpu_pageset *p) 5462 static void pageset_init(struct per_cpu_pageset *p)
5463 { 5463 {
5464 struct per_cpu_pages *pcp; 5464 struct per_cpu_pages *pcp;
5465 int migratetype; 5465 int migratetype;
5466 5466
5467 memset(p, 0, sizeof(*p)); 5467 memset(p, 0, sizeof(*p));
5468 5468
5469 pcp = &p->pcp; 5469 pcp = &p->pcp;
5470 pcp->count = 0; 5470 pcp->count = 0;
5471 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 5471 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
5472 INIT_LIST_HEAD(&pcp->lists[migratetype]); 5472 INIT_LIST_HEAD(&pcp->lists[migratetype]);
5473 } 5473 }
5474 5474
5475 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 5475 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
5476 { 5476 {
5477 pageset_init(p); 5477 pageset_init(p);
5478 pageset_set_batch(p, batch); 5478 pageset_set_batch(p, batch);
5479 } 5479 }
5480 5480
5481 /* 5481 /*
5482 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist 5482 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
5483 * to the value high for the pageset p. 5483 * to the value high for the pageset p.
5484 */ 5484 */
5485 static void pageset_set_high(struct per_cpu_pageset *p, 5485 static void pageset_set_high(struct per_cpu_pageset *p,
5486 unsigned long high) 5486 unsigned long high)
5487 { 5487 {
5488 unsigned long batch = max(1UL, high / 4); 5488 unsigned long batch = max(1UL, high / 4);
5489 if ((high / 4) > (PAGE_SHIFT * 8)) 5489 if ((high / 4) > (PAGE_SHIFT * 8))
5490 batch = PAGE_SHIFT * 8; 5490 batch = PAGE_SHIFT * 8;
5491 5491
5492 pageset_update(&p->pcp, high, batch); 5492 pageset_update(&p->pcp, high, batch);
5493 } 5493 }
5494 5494
5495 static void pageset_set_high_and_batch(struct zone *zone, 5495 static void pageset_set_high_and_batch(struct zone *zone,
5496 struct per_cpu_pageset *pcp) 5496 struct per_cpu_pageset *pcp)
5497 { 5497 {
5498 if (percpu_pagelist_fraction) 5498 if (percpu_pagelist_fraction)
5499 pageset_set_high(pcp, 5499 pageset_set_high(pcp,
5500 (zone->managed_pages / 5500 (zone->managed_pages /
5501 percpu_pagelist_fraction)); 5501 percpu_pagelist_fraction));
5502 else 5502 else
5503 pageset_set_batch(pcp, zone_batchsize(zone)); 5503 pageset_set_batch(pcp, zone_batchsize(zone));
5504 } 5504 }
5505 5505
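When the percpu_pagelist_fraction sysctl is set, pageset_set_high() derives high as managed_pages / fraction and batch as a quarter of that, capped at PAGE_SHIFT * 8 pages. A sketch with assumed values (a 2 GiB zone, fraction 8, 4 KiB pages):

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumed 4 KiB pages */

static unsigned long max_ul(unsigned long a, unsigned long b)
{
        return a > b ? a : b;
}

int main(void)
{
        /* Hypothetical tunables: vm.percpu_pagelist_fraction = 8, 2 GiB zone. */
        unsigned long managed_pages = 524288;
        unsigned long fraction = 8;

        unsigned long high = managed_pages / fraction;        /* 65536 pages */
        unsigned long batch = max_ul(1UL, high / 4);
        if ((high / 4) > (PAGE_SHIFT * 8))
                batch = PAGE_SHIFT * 8;                       /* capped at 96 pages */

        printf("pcp high = %lu pages, batch = %lu pages\n", high, batch);
        return 0;
}
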
5506 static void __meminit zone_pageset_init(struct zone *zone, int cpu) 5506 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
5507 { 5507 {
5508 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 5508 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
5509 5509
5510 pageset_init(pcp); 5510 pageset_init(pcp);
5511 pageset_set_high_and_batch(zone, pcp); 5511 pageset_set_high_and_batch(zone, pcp);
5512 } 5512 }
5513 5513
5514 void __meminit setup_zone_pageset(struct zone *zone) 5514 void __meminit setup_zone_pageset(struct zone *zone)
5515 { 5515 {
5516 int cpu; 5516 int cpu;
5517 zone->pageset = alloc_percpu(struct per_cpu_pageset); 5517 zone->pageset = alloc_percpu(struct per_cpu_pageset);
5518 for_each_possible_cpu(cpu) 5518 for_each_possible_cpu(cpu)
5519 zone_pageset_init(zone, cpu); 5519 zone_pageset_init(zone, cpu);
5520 } 5520 }
5521 5521
5522 /* 5522 /*
5523 * Allocate per cpu pagesets and initialize them. 5523 * Allocate per cpu pagesets and initialize them.
5524 * Before this call only boot pagesets were available. 5524 * Before this call only boot pagesets were available.
5525 */ 5525 */
5526 void __init setup_per_cpu_pageset(void) 5526 void __init setup_per_cpu_pageset(void)
5527 { 5527 {
5528 struct pglist_data *pgdat; 5528 struct pglist_data *pgdat;
5529 struct zone *zone; 5529 struct zone *zone;
5530 5530
5531 for_each_populated_zone(zone) 5531 for_each_populated_zone(zone)
5532 setup_zone_pageset(zone); 5532 setup_zone_pageset(zone);
5533 5533
5534 for_each_online_pgdat(pgdat) 5534 for_each_online_pgdat(pgdat)
5535 pgdat->per_cpu_nodestats = 5535 pgdat->per_cpu_nodestats =
5536 alloc_percpu(struct per_cpu_nodestat); 5536 alloc_percpu(struct per_cpu_nodestat);
5537 } 5537 }
5538 5538
5539 static __meminit void zone_pcp_init(struct zone *zone) 5539 static __meminit void zone_pcp_init(struct zone *zone)
5540 { 5540 {
5541 /* 5541 /*
5542 * per cpu subsystem is not up at this point. The following code 5542 * per cpu subsystem is not up at this point. The following code
5543 * relies on the ability of the linker to provide the 5543 * relies on the ability of the linker to provide the
5544 * offset of a (static) per cpu variable into the per cpu area. 5544 * offset of a (static) per cpu variable into the per cpu area.
5545 */ 5545 */
5546 zone->pageset = &boot_pageset; 5546 zone->pageset = &boot_pageset;
5547 5547
5548 if (populated_zone(zone)) 5548 if (populated_zone(zone))
5549 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 5549 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
5550 zone->name, zone->present_pages, 5550 zone->name, zone->present_pages,
5551 zone_batchsize(zone)); 5551 zone_batchsize(zone));
5552 } 5552 }
5553 5553
5554 void __meminit init_currently_empty_zone(struct zone *zone, 5554 void __meminit init_currently_empty_zone(struct zone *zone,
5555 unsigned long zone_start_pfn, 5555 unsigned long zone_start_pfn,
5556 unsigned long size) 5556 unsigned long size)
5557 { 5557 {
5558 struct pglist_data *pgdat = zone->zone_pgdat; 5558 struct pglist_data *pgdat = zone->zone_pgdat;
5559 int zone_idx = zone_idx(zone) + 1; 5559 int zone_idx = zone_idx(zone) + 1;
5560 5560
5561 if (zone_idx > pgdat->nr_zones) 5561 if (zone_idx > pgdat->nr_zones)
5562 pgdat->nr_zones = zone_idx; 5562 pgdat->nr_zones = zone_idx;
5563 5563
5564 zone->zone_start_pfn = zone_start_pfn; 5564 zone->zone_start_pfn = zone_start_pfn;
5565 5565
5566 mminit_dprintk(MMINIT_TRACE, "memmap_init", 5566 mminit_dprintk(MMINIT_TRACE, "memmap_init",
5567 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 5567 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
5568 pgdat->node_id, 5568 pgdat->node_id,
5569 (unsigned long)zone_idx(zone), 5569 (unsigned long)zone_idx(zone),
5570 zone_start_pfn, (zone_start_pfn + size)); 5570 zone_start_pfn, (zone_start_pfn + size));
5571 5571
5572 zone_init_free_lists(zone); 5572 zone_init_free_lists(zone);
5573 zone->initialized = 1; 5573 zone->initialized = 1;
5574 } 5574 }
5575 5575
5576 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5576 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5577 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 5577 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
5578 5578
5579 /* 5579 /*
5580 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 5580 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
5581 */ 5581 */
5582 int __meminit __early_pfn_to_nid(unsigned long pfn, 5582 int __meminit __early_pfn_to_nid(unsigned long pfn,
5583 struct mminit_pfnnid_cache *state) 5583 struct mminit_pfnnid_cache *state)
5584 { 5584 {
5585 unsigned long start_pfn, end_pfn; 5585 unsigned long start_pfn, end_pfn;
5586 int nid; 5586 int nid;
5587 5587
5588 if (state->last_start <= pfn && pfn < state->last_end) 5588 if (state->last_start <= pfn && pfn < state->last_end)
5589 return state->last_nid; 5589 return state->last_nid;
5590 5590
5591 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 5591 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
5592 if (nid != -1) { 5592 if (nid != -1) {
5593 state->last_start = start_pfn; 5593 state->last_start = start_pfn;
5594 state->last_end = end_pfn; 5594 state->last_end = end_pfn;
5595 state->last_nid = nid; 5595 state->last_nid = nid;
5596 } 5596 }
5597 5597
5598 return nid; 5598 return nid;
5599 } 5599 }
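
The single-entry cache above is easy to exercise on its own; in the sketch below a hypothetical ranges[] table stands in for memblock_search_pfn_nid(), and every pfn value is made up.

/* Minimal sketch of the one-entry pfn->nid cache pattern used above. */
#include <stdio.h>

struct pfnnid_cache { unsigned long last_start, last_end; int last_nid; };

static const struct { unsigned long start, end; int nid; } ranges[] = {
        { 0x00000, 0x40000, 0 },  /* node 0: pfns [0x00000, 0x40000) */
        { 0x40000, 0x80000, 1 },  /* node 1: pfns [0x40000, 0x80000) */
};

static int pfn_to_nid(unsigned long pfn, struct pfnnid_cache *c)
{
        unsigned int i;

        if (c->last_start <= pfn && pfn < c->last_end)
                return c->last_nid;  /* cache hit, no search needed */

        for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
                if (ranges[i].start <= pfn && pfn < ranges[i].end) {
                        c->last_start = ranges[i].start;
                        c->last_end = ranges[i].end;
                        c->last_nid = ranges[i].nid;
                        return c->last_nid;
                }
        }
        return -1;
}

int main(void)
{
        struct pfnnid_cache c = { 0, 0, -1 };
        int a, b, d;

        a = pfn_to_nid(0x100, &c);    /* searches, then warms the cache */
        b = pfn_to_nid(0x200, &c);    /* served from the cached range */
        d = pfn_to_nid(0x50000, &c);  /* different range, cache refilled */
        printf("%d %d %d\n", a, b, d);  /* prints 0 0 1 */
        return 0;
}
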
5600 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 5600 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
5601 5601
5602 /** 5602 /**
5603 * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range 5603 * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
5604 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 5604 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
5605 * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid 5605 * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
5606 * 5606 *
5607 * If an architecture guarantees that all ranges registered contain no holes 5607 * If an architecture guarantees that all ranges registered contain no holes
5608 * and may be freed, this this function may be used instead of calling 5608 * and may be freed, this this function may be used instead of calling
5609 * memblock_free_early_nid() manually. 5609 * memblock_free_early_nid() manually.
5610 */ 5610 */
5611 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 5611 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
5612 { 5612 {
5613 unsigned long start_pfn, end_pfn; 5613 unsigned long start_pfn, end_pfn;
5614 int i, this_nid; 5614 int i, this_nid;
5615 5615
5616 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 5616 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
5617 start_pfn = min(start_pfn, max_low_pfn); 5617 start_pfn = min(start_pfn, max_low_pfn);
5618 end_pfn = min(end_pfn, max_low_pfn); 5618 end_pfn = min(end_pfn, max_low_pfn);
5619 5619
5620 if (start_pfn < end_pfn) 5620 if (start_pfn < end_pfn)
5621 memblock_free_early_nid(PFN_PHYS(start_pfn), 5621 memblock_free_early_nid(PFN_PHYS(start_pfn),
5622 (end_pfn - start_pfn) << PAGE_SHIFT, 5622 (end_pfn - start_pfn) << PAGE_SHIFT,
5623 this_nid); 5623 this_nid);
5624 } 5624 }
5625 } 5625 }
5626 5626
5627 /** 5627 /**
5628 * sparse_memory_present_with_active_regions - Call memory_present for each active range 5628 * sparse_memory_present_with_active_regions - Call memory_present for each active range
5629 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 5629 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
5630 * 5630 *
5631 * If an architecture guarantees that all ranges registered contain no holes and may 5631 * If an architecture guarantees that all ranges registered contain no holes and may
5632 * be freed, this function may be used instead of calling memory_present() manually. 5632 * be freed, this function may be used instead of calling memory_present() manually.
5633 */ 5633 */
5634 void __init sparse_memory_present_with_active_regions(int nid) 5634 void __init sparse_memory_present_with_active_regions(int nid)
5635 { 5635 {
5636 unsigned long start_pfn, end_pfn; 5636 unsigned long start_pfn, end_pfn;
5637 int i, this_nid; 5637 int i, this_nid;
5638 5638
5639 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 5639 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
5640 memory_present(this_nid, start_pfn, end_pfn); 5640 memory_present(this_nid, start_pfn, end_pfn);
5641 } 5641 }
5642 5642
5643 /** 5643 /**
5644 * get_pfn_range_for_nid - Return the start and end page frames for a node 5644 * get_pfn_range_for_nid - Return the start and end page frames for a node
5645 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 5645 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
5646 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 5646 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
5647 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 5647 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
5648 * 5648 *
5649 * It returns the start and end page frame of a node based on information 5649 * It returns the start and end page frame of a node based on information
5650 * provided by memblock_set_node(). If called for a node 5650 * provided by memblock_set_node(). If called for a node
5651 * with no available memory, a warning is printed and the start and end 5651 * with no available memory, a warning is printed and the start and end
5652 * PFNs will be 0. 5652 * PFNs will be 0.
5653 */ 5653 */
5654 void __meminit get_pfn_range_for_nid(unsigned int nid, 5654 void __meminit get_pfn_range_for_nid(unsigned int nid,
5655 unsigned long *start_pfn, unsigned long *end_pfn) 5655 unsigned long *start_pfn, unsigned long *end_pfn)
5656 { 5656 {
5657 unsigned long this_start_pfn, this_end_pfn; 5657 unsigned long this_start_pfn, this_end_pfn;
5658 int i; 5658 int i;
5659 5659
5660 *start_pfn = -1UL; 5660 *start_pfn = -1UL;
5661 *end_pfn = 0; 5661 *end_pfn = 0;
5662 5662
5663 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 5663 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
5664 *start_pfn = min(*start_pfn, this_start_pfn); 5664 *start_pfn = min(*start_pfn, this_start_pfn);
5665 *end_pfn = max(*end_pfn, this_end_pfn); 5665 *end_pfn = max(*end_pfn, this_end_pfn);
5666 } 5666 }
5667 5667
5668 if (*start_pfn == -1UL) 5668 if (*start_pfn == -1UL)
5669 *start_pfn = 0; 5669 *start_pfn = 0;
5670 } 5670 }
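
As a hypothetical illustration of the loop above: a node registered as two memblock ranges covering pfns [0x1000, 0x2000) and [0x8000, 0x9000) ends up with *start_pfn = 0x1000 and *end_pfn = 0x9000. The span deliberately includes the hole between the ranges, which is why spanned_pages and present_pages are tracked separately later in this file.
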
5671 5671
5672 /* 5672 /*
5673 * This finds a zone that can be used for ZONE_MOVABLE pages. The 5673 * This finds a zone that can be used for ZONE_MOVABLE pages. The
5674 * assumption is made that zones within a node are ordered in monotonically 5674 * assumption is made that zones within a node are ordered in monotonically
5675 * increasing memory addresses so that the "highest" populated zone is used 5675 * increasing memory addresses so that the "highest" populated zone is used
5676 */ 5676 */
5677 static void __init find_usable_zone_for_movable(void) 5677 static void __init find_usable_zone_for_movable(void)
5678 { 5678 {
5679 int zone_index; 5679 int zone_index;
5680 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 5680 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
5681 if (zone_index == ZONE_MOVABLE) 5681 if (zone_index == ZONE_MOVABLE)
5682 continue; 5682 continue;
5683 5683
5684 if (arch_zone_highest_possible_pfn[zone_index] > 5684 if (arch_zone_highest_possible_pfn[zone_index] >
5685 arch_zone_lowest_possible_pfn[zone_index]) 5685 arch_zone_lowest_possible_pfn[zone_index])
5686 break; 5686 break;
5687 } 5687 }
5688 5688
5689 VM_BUG_ON(zone_index == -1); 5689 VM_BUG_ON(zone_index == -1);
5690 movable_zone = zone_index; 5690 movable_zone = zone_index;
5691 } 5691 }
5692 5692
5693 /* 5693 /*
5694 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 5694 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
5695 * because it is sized independent of architecture. Unlike the other zones, 5695 * because it is sized independent of architecture. Unlike the other zones,
5696 * the starting point for ZONE_MOVABLE is not fixed. It may be different 5696 * the starting point for ZONE_MOVABLE is not fixed. It may be different
5697 * in each node depending on the size of each node and how evenly kernelcore 5697 * in each node depending on the size of each node and how evenly kernelcore
5698 * is distributed. This helper function adjusts the zone ranges 5698 * is distributed. This helper function adjusts the zone ranges
5699 * provided by the architecture for a given node by using the end of the 5699 * provided by the architecture for a given node by using the end of the
5700 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 5700 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
5701 * zones within a node are in order of monotonically increasing memory addresses 5701 * zones within a node are in order of monotonically increasing memory addresses
5702 */ 5702 */
5703 static void __meminit adjust_zone_range_for_zone_movable(int nid, 5703 static void __meminit adjust_zone_range_for_zone_movable(int nid,
5704 unsigned long zone_type, 5704 unsigned long zone_type,
5705 unsigned long node_start_pfn, 5705 unsigned long node_start_pfn,
5706 unsigned long node_end_pfn, 5706 unsigned long node_end_pfn,
5707 unsigned long *zone_start_pfn, 5707 unsigned long *zone_start_pfn,
5708 unsigned long *zone_end_pfn) 5708 unsigned long *zone_end_pfn)
5709 { 5709 {
5710 /* Only adjust if ZONE_MOVABLE is on this node */ 5710 /* Only adjust if ZONE_MOVABLE is on this node */
5711 if (zone_movable_pfn[nid]) { 5711 if (zone_movable_pfn[nid]) {
5712 /* Size ZONE_MOVABLE */ 5712 /* Size ZONE_MOVABLE */
5713 if (zone_type == ZONE_MOVABLE) { 5713 if (zone_type == ZONE_MOVABLE) {
5714 *zone_start_pfn = zone_movable_pfn[nid]; 5714 *zone_start_pfn = zone_movable_pfn[nid];
5715 *zone_end_pfn = min(node_end_pfn, 5715 *zone_end_pfn = min(node_end_pfn,
5716 arch_zone_highest_possible_pfn[movable_zone]); 5716 arch_zone_highest_possible_pfn[movable_zone]);
5717 5717
5718 /* Adjust for ZONE_MOVABLE starting within this range */ 5718 /* Adjust for ZONE_MOVABLE starting within this range */
5719 } else if (!mirrored_kernelcore && 5719 } else if (!mirrored_kernelcore &&
5720 *zone_start_pfn < zone_movable_pfn[nid] && 5720 *zone_start_pfn < zone_movable_pfn[nid] &&
5721 *zone_end_pfn > zone_movable_pfn[nid]) { 5721 *zone_end_pfn > zone_movable_pfn[nid]) {
5722 *zone_end_pfn = zone_movable_pfn[nid]; 5722 *zone_end_pfn = zone_movable_pfn[nid];
5723 5723
5724 /* Check if this whole range is within ZONE_MOVABLE */ 5724 /* Check if this whole range is within ZONE_MOVABLE */
5725 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 5725 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
5726 *zone_start_pfn = *zone_end_pfn; 5726 *zone_start_pfn = *zone_end_pfn;
5727 } 5727 }
5728 } 5728 }
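
A hedged worked example of the three branches above, assuming kernelcore=mirror is not in use and all pfns are made up: with zone_movable_pfn[nid] = 0x20000 on a node spanning [0x10000, 0x30000), the ZONE_MOVABLE request becomes [0x20000, 0x30000) (still capped by the highest usable zone), a lower zone straddling the boundary is trimmed to end at 0x20000, and a zone that would start at or above 0x20000 collapses to an empty range.
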
5729 5729
5730 /* 5730 /*
5731 * Return the number of pages a zone spans in a node, including holes 5731 * Return the number of pages a zone spans in a node, including holes
5732 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 5732 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
5733 */ 5733 */
5734 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 5734 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
5735 unsigned long zone_type, 5735 unsigned long zone_type,
5736 unsigned long node_start_pfn, 5736 unsigned long node_start_pfn,
5737 unsigned long node_end_pfn, 5737 unsigned long node_end_pfn,
5738 unsigned long *zone_start_pfn, 5738 unsigned long *zone_start_pfn,
5739 unsigned long *zone_end_pfn, 5739 unsigned long *zone_end_pfn,
5740 unsigned long *ignored) 5740 unsigned long *ignored)
5741 { 5741 {
5742 /* When hotadding a new node from cpu_up(), the node should be empty */ 5742 /* When hotadding a new node from cpu_up(), the node should be empty */
5743 if (!node_start_pfn && !node_end_pfn) 5743 if (!node_start_pfn && !node_end_pfn)
5744 return 0; 5744 return 0;
5745 5745
5746 /* Get the start and end of the zone */ 5746 /* Get the start and end of the zone */
5747 *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 5747 *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
5748 *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 5748 *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
5749 adjust_zone_range_for_zone_movable(nid, zone_type, 5749 adjust_zone_range_for_zone_movable(nid, zone_type,
5750 node_start_pfn, node_end_pfn, 5750 node_start_pfn, node_end_pfn,
5751 zone_start_pfn, zone_end_pfn); 5751 zone_start_pfn, zone_end_pfn);
5752 5752
5753 /* Check that this node has pages within the zone's required range */ 5753 /* Check that this node has pages within the zone's required range */
5754 if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) 5754 if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
5755 return 0; 5755 return 0;
5756 5756
5757 /* Move the zone boundaries inside the node if necessary */ 5757 /* Move the zone boundaries inside the node if necessary */
5758 *zone_end_pfn = min(*zone_end_pfn, node_end_pfn); 5758 *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
5759 *zone_start_pfn = max(*zone_start_pfn, node_start_pfn); 5759 *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
5760 5760
5761 /* Return the spanned pages */ 5761 /* Return the spanned pages */
5762 return *zone_end_pfn - *zone_start_pfn; 5762 return *zone_end_pfn - *zone_start_pfn;
5763 } 5763 }
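
For example (numbers hypothetical, and ignoring any ZONE_MOVABLE adjustment): if the architectural bounds for a zone are pfns [0x8000, 0x100000) and the node covers [0x40000, 0xc0000), the boundaries are clamped to [0x40000, 0xc0000) and the zone spans 0x80000 pages; a zone whose architectural range lies entirely outside the node returns 0 instead.
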
5764 5764
5765 /* 5765 /*
5766 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 5766 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
5767 * then all holes in the requested range will be accounted for. 5767 * then all holes in the requested range will be accounted for.
5768 */ 5768 */
5769 unsigned long __meminit __absent_pages_in_range(int nid, 5769 unsigned long __meminit __absent_pages_in_range(int nid,
5770 unsigned long range_start_pfn, 5770 unsigned long range_start_pfn,
5771 unsigned long range_end_pfn) 5771 unsigned long range_end_pfn)
5772 { 5772 {
5773 unsigned long nr_absent = range_end_pfn - range_start_pfn; 5773 unsigned long nr_absent = range_end_pfn - range_start_pfn;
5774 unsigned long start_pfn, end_pfn; 5774 unsigned long start_pfn, end_pfn;
5775 int i; 5775 int i;
5776 5776
5777 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 5777 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
5778 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 5778 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
5779 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 5779 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
5780 nr_absent -= end_pfn - start_pfn; 5780 nr_absent -= end_pfn - start_pfn;
5781 } 5781 }
5782 return nr_absent; 5782 return nr_absent;
5783 } 5783 }
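
A toy version of the hole accounting above, using two made-up present-memory ranges in place of for_each_mem_pfn_range():

/* Start from the full range and subtract every clamped chunk of present memory. */
#include <stdio.h>

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        /* present memory: [0x0000, 0x8000) and [0xc000, 0x10000) */
        unsigned long starts[] = { 0x0000, 0xc000 };
        unsigned long ends[]   = { 0x8000, 0x10000 };
        unsigned long range_start = 0x0000, range_end = 0x10000;
        unsigned long nr_absent = range_end - range_start;
        unsigned int i;

        for (i = 0; i < 2; i++) {
                unsigned long s = clamp_ul(starts[i], range_start, range_end);
                unsigned long e = clamp_ul(ends[i], range_start, range_end);

                nr_absent -= e - s;
        }

        /* only the hole [0x8000, 0xc000) remains: prints absent = 0x4000 pages */
        printf("absent = %#lx pages\n", nr_absent);
        return 0;
}
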
5784 5784
5785 /** 5785 /**
5786 * absent_pages_in_range - Return number of page frames in holes within a range 5786 * absent_pages_in_range - Return number of page frames in holes within a range
5787 * @start_pfn: The start PFN to start searching for holes 5787 * @start_pfn: The start PFN to start searching for holes
5788 * @end_pfn: The end PFN to stop searching for holes 5788 * @end_pfn: The end PFN to stop searching for holes
5789 * 5789 *
5790 * It returns the number of page frames in memory holes within a range. 5790 * It returns the number of page frames in memory holes within a range.
5791 */ 5791 */
5792 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 5792 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
5793 unsigned long end_pfn) 5793 unsigned long end_pfn)
5794 { 5794 {
5795 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 5795 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
5796 } 5796 }
5797 5797
5798 /* Return the number of page frames in holes in a zone on a node */ 5798 /* Return the number of page frames in holes in a zone on a node */
5799 static unsigned long __meminit zone_absent_pages_in_node(int nid, 5799 static unsigned long __meminit zone_absent_pages_in_node(int nid,
5800 unsigned long zone_type, 5800 unsigned long zone_type,
5801 unsigned long node_start_pfn, 5801 unsigned long node_start_pfn,
5802 unsigned long node_end_pfn, 5802 unsigned long node_end_pfn,
5803 unsigned long *ignored) 5803 unsigned long *ignored)
5804 { 5804 {
5805 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 5805 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
5806 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 5806 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
5807 unsigned long zone_start_pfn, zone_end_pfn; 5807 unsigned long zone_start_pfn, zone_end_pfn;
5808 unsigned long nr_absent; 5808 unsigned long nr_absent;
5809 5809
5810 /* When hotadding a new node from cpu_up(), the node should be empty */ 5810 /* When hotadding a new node from cpu_up(), the node should be empty */
5811 if (!node_start_pfn && !node_end_pfn) 5811 if (!node_start_pfn && !node_end_pfn)
5812 return 0; 5812 return 0;
5813 5813
5814 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 5814 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
5815 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 5815 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
5816 5816
5817 adjust_zone_range_for_zone_movable(nid, zone_type, 5817 adjust_zone_range_for_zone_movable(nid, zone_type,
5818 node_start_pfn, node_end_pfn, 5818 node_start_pfn, node_end_pfn,
5819 &zone_start_pfn, &zone_end_pfn); 5819 &zone_start_pfn, &zone_end_pfn);
5820 nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 5820 nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
5821 5821
5822 /* 5822 /*
5823 * ZONE_MOVABLE handling. 5823 * ZONE_MOVABLE handling.
5824 * Treat pages that will end up in ZONE_MOVABLE but lie in ZONE_NORMAL as absent pages 5824 * Treat pages that will end up in ZONE_MOVABLE but lie in ZONE_NORMAL as absent pages
5825 * and vice versa. 5825 * and vice versa.
5826 */ 5826 */
5827 if (mirrored_kernelcore && zone_movable_pfn[nid]) { 5827 if (mirrored_kernelcore && zone_movable_pfn[nid]) {
5828 unsigned long start_pfn, end_pfn; 5828 unsigned long start_pfn, end_pfn;
5829 struct memblock_region *r; 5829 struct memblock_region *r;
5830 5830
5831 for_each_memblock(memory, r) { 5831 for_each_memblock(memory, r) {
5832 start_pfn = clamp(memblock_region_memory_base_pfn(r), 5832 start_pfn = clamp(memblock_region_memory_base_pfn(r),
5833 zone_start_pfn, zone_end_pfn); 5833 zone_start_pfn, zone_end_pfn);
5834 end_pfn = clamp(memblock_region_memory_end_pfn(r), 5834 end_pfn = clamp(memblock_region_memory_end_pfn(r),
5835 zone_start_pfn, zone_end_pfn); 5835 zone_start_pfn, zone_end_pfn);
5836 5836
5837 if (zone_type == ZONE_MOVABLE && 5837 if (zone_type == ZONE_MOVABLE &&
5838 memblock_is_mirror(r)) 5838 memblock_is_mirror(r))
5839 nr_absent += end_pfn - start_pfn; 5839 nr_absent += end_pfn - start_pfn;
5840 5840
5841 if (zone_type == ZONE_NORMAL && 5841 if (zone_type == ZONE_NORMAL &&
5842 !memblock_is_mirror(r)) 5842 !memblock_is_mirror(r))
5843 nr_absent += end_pfn - start_pfn; 5843 nr_absent += end_pfn - start_pfn;
5844 } 5844 }
5845 } 5845 }
5846 5846
5847 return nr_absent; 5847 return nr_absent;
5848 } 5848 }
5849 5849
5850 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5850 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5851 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 5851 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
5852 unsigned long zone_type, 5852 unsigned long zone_type,
5853 unsigned long node_start_pfn, 5853 unsigned long node_start_pfn,
5854 unsigned long node_end_pfn, 5854 unsigned long node_end_pfn,
5855 unsigned long *zone_start_pfn, 5855 unsigned long *zone_start_pfn,
5856 unsigned long *zone_end_pfn, 5856 unsigned long *zone_end_pfn,
5857 unsigned long *zones_size) 5857 unsigned long *zones_size)
5858 { 5858 {
5859 unsigned int zone; 5859 unsigned int zone;
5860 5860
5861 *zone_start_pfn = node_start_pfn; 5861 *zone_start_pfn = node_start_pfn;
5862 for (zone = 0; zone < zone_type; zone++) 5862 for (zone = 0; zone < zone_type; zone++)
5863 *zone_start_pfn += zones_size[zone]; 5863 *zone_start_pfn += zones_size[zone];
5864 5864
5865 *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; 5865 *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
5866 5866
5867 return zones_size[zone_type]; 5867 return zones_size[zone_type];
5868 } 5868 }
5869 5869
5870 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 5870 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
5871 unsigned long zone_type, 5871 unsigned long zone_type,
5872 unsigned long node_start_pfn, 5872 unsigned long node_start_pfn,
5873 unsigned long node_end_pfn, 5873 unsigned long node_end_pfn,
5874 unsigned long *zholes_size) 5874 unsigned long *zholes_size)
5875 { 5875 {
5876 if (!zholes_size) 5876 if (!zholes_size)
5877 return 0; 5877 return 0;
5878 5878
5879 return zholes_size[zone_type]; 5879 return zholes_size[zone_type];
5880 } 5880 }
5881 5881
5882 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5882 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5883 5883
5884 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 5884 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
5885 unsigned long node_start_pfn, 5885 unsigned long node_start_pfn,
5886 unsigned long node_end_pfn, 5886 unsigned long node_end_pfn,
5887 unsigned long *zones_size, 5887 unsigned long *zones_size,
5888 unsigned long *zholes_size) 5888 unsigned long *zholes_size)
5889 { 5889 {
5890 unsigned long realtotalpages = 0, totalpages = 0; 5890 unsigned long realtotalpages = 0, totalpages = 0;
5891 enum zone_type i; 5891 enum zone_type i;
5892 5892
5893 for (i = 0; i < MAX_NR_ZONES; i++) { 5893 for (i = 0; i < MAX_NR_ZONES; i++) {
5894 struct zone *zone = pgdat->node_zones + i; 5894 struct zone *zone = pgdat->node_zones + i;
5895 unsigned long zone_start_pfn, zone_end_pfn; 5895 unsigned long zone_start_pfn, zone_end_pfn;
5896 unsigned long size, real_size; 5896 unsigned long size, real_size;
5897 5897
5898 size = zone_spanned_pages_in_node(pgdat->node_id, i, 5898 size = zone_spanned_pages_in_node(pgdat->node_id, i,
5899 node_start_pfn, 5899 node_start_pfn,
5900 node_end_pfn, 5900 node_end_pfn,
5901 &zone_start_pfn, 5901 &zone_start_pfn,
5902 &zone_end_pfn, 5902 &zone_end_pfn,
5903 zones_size); 5903 zones_size);
5904 real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, 5904 real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
5905 node_start_pfn, node_end_pfn, 5905 node_start_pfn, node_end_pfn,
5906 zholes_size); 5906 zholes_size);
5907 if (size) 5907 if (size)
5908 zone->zone_start_pfn = zone_start_pfn; 5908 zone->zone_start_pfn = zone_start_pfn;
5909 else 5909 else
5910 zone->zone_start_pfn = 0; 5910 zone->zone_start_pfn = 0;
5911 zone->spanned_pages = size; 5911 zone->spanned_pages = size;
5912 zone->present_pages = real_size; 5912 zone->present_pages = real_size;
5913 5913
5914 totalpages += size; 5914 totalpages += size;
5915 realtotalpages += real_size; 5915 realtotalpages += real_size;
5916 } 5916 }
5917 5917
5918 pgdat->node_spanned_pages = totalpages; 5918 pgdat->node_spanned_pages = totalpages;
5919 pgdat->node_present_pages = realtotalpages; 5919 pgdat->node_present_pages = realtotalpages;
5920 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 5920 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
5921 realtotalpages); 5921 realtotalpages);
5922 } 5922 }
5923 5923
5924 #ifndef CONFIG_SPARSEMEM 5924 #ifndef CONFIG_SPARSEMEM
5925 /* 5925 /*
5926 * Calculate the size of the zone->pageblock_flags bitmap, rounded up to a multiple of sizeof(unsigned long). 5926 * Calculate the size of the zone->pageblock_flags bitmap, rounded up to a multiple of sizeof(unsigned long).
5927 * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding 5927 * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding
5928 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, and finally 5928 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, and finally
5929 * round what is now in bits up to the nearest long in bits, then return it in 5929 * round what is now in bits up to the nearest long in bits, then return it in
5930 * bytes. 5930 * bytes.
5931 */ 5931 */
5932 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 5932 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
5933 { 5933 {
5934 unsigned long usemapsize; 5934 unsigned long usemapsize;
5935 5935
5936 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 5936 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
5937 usemapsize = roundup(zonesize, pageblock_nr_pages); 5937 usemapsize = roundup(zonesize, pageblock_nr_pages);
5938 usemapsize = usemapsize >> pageblock_order; 5938 usemapsize = usemapsize >> pageblock_order;
5939 usemapsize *= NR_PAGEBLOCK_BITS; 5939 usemapsize *= NR_PAGEBLOCK_BITS;
5940 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 5940 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
5941 5941
5942 return usemapsize / 8; 5942 return usemapsize / 8;
5943 } 5943 }
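
The same arithmetic can be checked standalone; the sketch below assumes pageblock_order = 9 (512 pages per pageblock), NR_PAGEBLOCK_BITS = 4 and a 64-bit unsigned long, all of which are configuration dependent.

#include <stdio.h>

#define PAGEBLOCK_ORDER         9
#define PAGEBLOCK_NR_PAGES      (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS_GUESS 4

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
        return ((x + to - 1) / to) * to;
}

static unsigned long usemap_size_sketch(unsigned long zone_start_pfn,
                                        unsigned long zonesize)
{
        unsigned long usemapsize;

        zonesize += zone_start_pfn & (PAGEBLOCK_NR_PAGES - 1);
        usemapsize = roundup_ul(zonesize, PAGEBLOCK_NR_PAGES);
        usemapsize >>= PAGEBLOCK_ORDER;            /* number of pageblocks */
        usemapsize *= NR_PAGEBLOCK_BITS_GUESS;     /* bits of flags */
        usemapsize = roundup_ul(usemapsize, 8 * sizeof(unsigned long));

        return usemapsize / 8;                     /* bytes */
}

int main(void)
{
        /* a 1 GiB zone (262144 4 KiB pages) starting at pfn 256: prints 264 */
        printf("%lu bytes\n", usemap_size_sketch(256, 262144));
        return 0;
}
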
5944 5944
5945 static void __init setup_usemap(struct pglist_data *pgdat, 5945 static void __init setup_usemap(struct pglist_data *pgdat,
5946 struct zone *zone, 5946 struct zone *zone,
5947 unsigned long zone_start_pfn, 5947 unsigned long zone_start_pfn,
5948 unsigned long zonesize) 5948 unsigned long zonesize)
5949 { 5949 {
5950 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 5950 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
5951 zone->pageblock_flags = NULL; 5951 zone->pageblock_flags = NULL;
5952 if (usemapsize) 5952 if (usemapsize)
5953 zone->pageblock_flags = 5953 zone->pageblock_flags =
5954 memblock_virt_alloc_node_nopanic(usemapsize, 5954 memblock_virt_alloc_node_nopanic(usemapsize,
5955 pgdat->node_id); 5955 pgdat->node_id);
5956 } 5956 }
5957 #else 5957 #else
5958 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 5958 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
5959 unsigned long zone_start_pfn, unsigned long zonesize) {} 5959 unsigned long zone_start_pfn, unsigned long zonesize) {}
5960 #endif /* CONFIG_SPARSEMEM */ 5960 #endif /* CONFIG_SPARSEMEM */
5961 5961
5962 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 5962 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
5963 5963
5964 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 5964 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
5965 void __paginginit set_pageblock_order(void) 5965 void __paginginit set_pageblock_order(void)
5966 { 5966 {
5967 unsigned int order; 5967 unsigned int order;
5968 5968
5969 /* Check that pageblock_nr_pages has not already been setup */ 5969 /* Check that pageblock_nr_pages has not already been setup */
5970 if (pageblock_order) 5970 if (pageblock_order)
5971 return; 5971 return;
5972 5972
5973 if (HPAGE_SHIFT > PAGE_SHIFT) 5973 if (HPAGE_SHIFT > PAGE_SHIFT)
5974 order = HUGETLB_PAGE_ORDER; 5974 order = HUGETLB_PAGE_ORDER;
5975 else 5975 else
5976 order = MAX_ORDER - 1; 5976 order = MAX_ORDER - 1;
5977 5977
5978 /* 5978 /*
5979 * Assume the largest contiguous order of interest is a huge page. 5979 * Assume the largest contiguous order of interest is a huge page.
5980 * This value may be variable depending on boot parameters on IA64 and 5980 * This value may be variable depending on boot parameters on IA64 and
5981 * powerpc. 5981 * powerpc.
5982 */ 5982 */
5983 pageblock_order = order; 5983 pageblock_order = order;
5984 } 5984 }
5985 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 5985 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
5986 5986
5987 /* 5987 /*
5988 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 5988 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
5989 * is unused as pageblock_order is set at compile-time. See 5989 * is unused as pageblock_order is set at compile-time. See
5990 * include/linux/pageblock-flags.h for the values of pageblock_order based on 5990 * include/linux/pageblock-flags.h for the values of pageblock_order based on
5991 * the kernel config 5991 * the kernel config
5992 */ 5992 */
5993 void __paginginit set_pageblock_order(void) 5993 void __paginginit set_pageblock_order(void)
5994 { 5994 {
5995 } 5995 }
5996 5996
5997 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 5997 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
5998 5998
5999 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 5999 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
6000 unsigned long present_pages) 6000 unsigned long present_pages)
6001 { 6001 {
6002 unsigned long pages = spanned_pages; 6002 unsigned long pages = spanned_pages;
6003 6003
6004 /* 6004 /*
6005 * Provide a more accurate estimation if there are holes within 6005 * Provide a more accurate estimation if there are holes within
6006 * the zone and SPARSEMEM is in use. If there are holes within the 6006 * the zone and SPARSEMEM is in use. If there are holes within the
6007 * zone, each populated memory region may cost us one or two extra 6007 * zone, each populated memory region may cost us one or two extra
6008 * memmap pages due to alignment because memmap pages for each 6008 * memmap pages due to alignment because memmap pages for each
6009 * populated region may not be naturally aligned on a page boundary. 6009 * populated region may not be naturally aligned on a page boundary.
6010 * So the (present_pages >> 4) heuristic is a tradeoff for that. 6010 * So the (present_pages >> 4) heuristic is a tradeoff for that.
6011 */ 6011 */
6012 if (spanned_pages > present_pages + (present_pages >> 4) && 6012 if (spanned_pages > present_pages + (present_pages >> 4) &&
6013 IS_ENABLED(CONFIG_SPARSEMEM)) 6013 IS_ENABLED(CONFIG_SPARSEMEM))
6014 pages = present_pages; 6014 pages = present_pages;
6015 6015
6016 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 6016 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
6017 } 6017 }
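
A hedged standalone sketch of the sizing above, assuming 4 KiB pages and a 64-byte struct page (both vary by configuration); the IS_ENABLED(CONFIG_SPARSEMEM) check is dropped because the sketch runs in user space.

#include <stdio.h>

#define PAGE_SIZE_ASSUMED       4096UL
#define STRUCT_PAGE_SIZE_GUESS  64UL   /* sizeof(struct page) differs per config */

static unsigned long calc_memmap_size_sketch(unsigned long spanned_pages,
                                             unsigned long present_pages)
{
        unsigned long pages = spanned_pages;

        /* same heuristic as above: fall back to present_pages for sparse zones */
        if (spanned_pages > present_pages + (present_pages >> 4))
                pages = present_pages;

        return (pages * STRUCT_PAGE_SIZE_GUESS + PAGE_SIZE_ASSUMED - 1) /
               PAGE_SIZE_ASSUMED;
}

int main(void)
{
        /* a dense 1 GiB zone: 4096 pages of memmap */
        printf("%lu\n", calc_memmap_size_sketch(262144, 262144));
        /* a sparse zone: also 4096, because present_pages is used (16384 otherwise) */
        printf("%lu\n", calc_memmap_size_sketch(1048576, 262144));
        return 0;
}
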
6018 6018
6019 /* 6019 /*
6020 * Set up the zone data structures: 6020 * Set up the zone data structures:
6021 * - mark all pages reserved 6021 * - mark all pages reserved
6022 * - mark all memory queues empty 6022 * - mark all memory queues empty
6023 * - clear the memory bitmaps 6023 * - clear the memory bitmaps
6024 * 6024 *
6025 * NOTE: pgdat should get zeroed by caller. 6025 * NOTE: pgdat should get zeroed by caller.
6026 */ 6026 */
6027 static void __paginginit free_area_init_core(struct pglist_data *pgdat) 6027 static void __paginginit free_area_init_core(struct pglist_data *pgdat)
6028 { 6028 {
6029 enum zone_type j; 6029 enum zone_type j;
6030 int nid = pgdat->node_id; 6030 int nid = pgdat->node_id;
6031 6031
6032 pgdat_resize_init(pgdat); 6032 pgdat_resize_init(pgdat);
6033 #ifdef CONFIG_NUMA_BALANCING 6033 #ifdef CONFIG_NUMA_BALANCING
6034 spin_lock_init(&pgdat->numabalancing_migrate_lock); 6034 spin_lock_init(&pgdat->numabalancing_migrate_lock);
6035 pgdat->numabalancing_migrate_nr_pages = 0; 6035 pgdat->numabalancing_migrate_nr_pages = 0;
6036 pgdat->numabalancing_migrate_next_window = jiffies; 6036 pgdat->numabalancing_migrate_next_window = jiffies;
6037 #endif 6037 #endif
6038 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6038 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6039 spin_lock_init(&pgdat->split_queue_lock); 6039 spin_lock_init(&pgdat->split_queue_lock);
6040 INIT_LIST_HEAD(&pgdat->split_queue); 6040 INIT_LIST_HEAD(&pgdat->split_queue);
6041 pgdat->split_queue_len = 0; 6041 pgdat->split_queue_len = 0;
6042 #endif 6042 #endif
6043 init_waitqueue_head(&pgdat->kswapd_wait); 6043 init_waitqueue_head(&pgdat->kswapd_wait);
6044 init_waitqueue_head(&pgdat->pfmemalloc_wait); 6044 init_waitqueue_head(&pgdat->pfmemalloc_wait);
6045 #ifdef CONFIG_COMPACTION 6045 #ifdef CONFIG_COMPACTION
6046 init_waitqueue_head(&pgdat->kcompactd_wait); 6046 init_waitqueue_head(&pgdat->kcompactd_wait);
6047 #endif 6047 #endif
6048 pgdat_page_ext_init(pgdat); 6048 pgdat_page_ext_init(pgdat);
6049 spin_lock_init(&pgdat->lru_lock); 6049 spin_lock_init(&pgdat->lru_lock);
6050 lruvec_init(node_lruvec(pgdat)); 6050 lruvec_init(node_lruvec(pgdat));
6051 6051
6052 pgdat->per_cpu_nodestats = &boot_nodestats; 6052 pgdat->per_cpu_nodestats = &boot_nodestats;
6053 6053
6054 for (j = 0; j < MAX_NR_ZONES; j++) { 6054 for (j = 0; j < MAX_NR_ZONES; j++) {
6055 struct zone *zone = pgdat->node_zones + j; 6055 struct zone *zone = pgdat->node_zones + j;
6056 unsigned long size, realsize, freesize, memmap_pages; 6056 unsigned long size, realsize, freesize, memmap_pages;
6057 unsigned long zone_start_pfn = zone->zone_start_pfn; 6057 unsigned long zone_start_pfn = zone->zone_start_pfn;
6058 6058
6059 size = zone->spanned_pages; 6059 size = zone->spanned_pages;
6060 realsize = freesize = zone->present_pages; 6060 realsize = freesize = zone->present_pages;
6061 6061
6062 /* 6062 /*
6063 * Adjust freesize so that it accounts for how much memory 6063 * Adjust freesize so that it accounts for how much memory
6064 * is used by this zone for memmap. This affects the watermark 6064 * is used by this zone for memmap. This affects the watermark
6065 * and per-cpu initialisations 6065 * and per-cpu initialisations
6066 */ 6066 */
6067 memmap_pages = calc_memmap_size(size, realsize); 6067 memmap_pages = calc_memmap_size(size, realsize);
6068 if (!is_highmem_idx(j)) { 6068 if (!is_highmem_idx(j)) {
6069 if (freesize >= memmap_pages) { 6069 if (freesize >= memmap_pages) {
6070 freesize -= memmap_pages; 6070 freesize -= memmap_pages;
6071 if (memmap_pages) 6071 if (memmap_pages)
6072 printk(KERN_DEBUG 6072 printk(KERN_DEBUG
6073 " %s zone: %lu pages used for memmap\n", 6073 " %s zone: %lu pages used for memmap\n",
6074 zone_names[j], memmap_pages); 6074 zone_names[j], memmap_pages);
6075 } else 6075 } else
6076 pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", 6076 pr_warn(" %s zone: %lu pages exceeds freesize %lu\n",
6077 zone_names[j], memmap_pages, freesize); 6077 zone_names[j], memmap_pages, freesize);
6078 } 6078 }
6079 6079
6080 /* Account for reserved pages */ 6080 /* Account for reserved pages */
6081 if (j == 0 && freesize > dma_reserve) { 6081 if (j == 0 && freesize > dma_reserve) {
6082 freesize -= dma_reserve; 6082 freesize -= dma_reserve;
6083 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 6083 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
6084 zone_names[0], dma_reserve); 6084 zone_names[0], dma_reserve);
6085 } 6085 }
6086 6086
6087 if (!is_highmem_idx(j)) 6087 if (!is_highmem_idx(j))
6088 nr_kernel_pages += freesize; 6088 nr_kernel_pages += freesize;
6089 /* Charge for highmem memmap if there are enough kernel pages */ 6089 /* Charge for highmem memmap if there are enough kernel pages */
6090 else if (nr_kernel_pages > memmap_pages * 2) 6090 else if (nr_kernel_pages > memmap_pages * 2)
6091 nr_kernel_pages -= memmap_pages; 6091 nr_kernel_pages -= memmap_pages;
6092 nr_all_pages += freesize; 6092 nr_all_pages += freesize;
6093 6093
6094 /* 6094 /*
6095 * Set an approximate value for lowmem here; it will be adjusted 6095 * Set an approximate value for lowmem here; it will be adjusted
6096 * when the bootmem allocator frees pages into the buddy system. 6096 * when the bootmem allocator frees pages into the buddy system.
6097 * And all highmem pages will be managed by the buddy system. 6097 * And all highmem pages will be managed by the buddy system.
6098 */ 6098 */
6099 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; 6099 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
6100 #ifdef CONFIG_NUMA 6100 #ifdef CONFIG_NUMA
6101 zone->node = nid; 6101 zone->node = nid;
6102 #endif 6102 #endif
6103 zone->name = zone_names[j]; 6103 zone->name = zone_names[j];
6104 zone->zone_pgdat = pgdat; 6104 zone->zone_pgdat = pgdat;
6105 spin_lock_init(&zone->lock); 6105 spin_lock_init(&zone->lock);
6106 zone_seqlock_init(zone); 6106 zone_seqlock_init(zone);
6107 zone_pcp_init(zone); 6107 zone_pcp_init(zone);
6108 6108
6109 if (!size) 6109 if (!size)
6110 continue; 6110 continue;
6111 6111
6112 set_pageblock_order(); 6112 set_pageblock_order();
6113 setup_usemap(pgdat, zone, zone_start_pfn, size); 6113 setup_usemap(pgdat, zone, zone_start_pfn, size);
6114 init_currently_empty_zone(zone, zone_start_pfn, size); 6114 init_currently_empty_zone(zone, zone_start_pfn, size);
6115 memmap_init(size, nid, j, zone_start_pfn); 6115 memmap_init(size, nid, j, zone_start_pfn);
6116 } 6116 }
6117 } 6117 }
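
To make the freesize accounting above concrete (every number assumed): a non-highmem zone with 262144 present pages and 4 KiB pages pays about 262144 * 64 / 4096 = 4096 pages for its memmap if struct page is 64 bytes, leaving freesize at 258048 before any dma_reserve is subtracted; that remainder is what seeds zone->managed_pages and nr_kernel_pages here.
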
6118 6118
6119 static void __ref alloc_node_mem_map(struct pglist_data *pgdat) 6119 static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6120 { 6120 {
6121 unsigned long __maybe_unused start = 0; 6121 unsigned long __maybe_unused start = 0;
6122 unsigned long __maybe_unused offset = 0; 6122 unsigned long __maybe_unused offset = 0;
6123 6123
6124 /* Skip empty nodes */ 6124 /* Skip empty nodes */
6125 if (!pgdat->node_spanned_pages) 6125 if (!pgdat->node_spanned_pages)
6126 return; 6126 return;
6127 6127
6128 #ifdef CONFIG_FLAT_NODE_MEM_MAP 6128 #ifdef CONFIG_FLAT_NODE_MEM_MAP
6129 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 6129 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
6130 offset = pgdat->node_start_pfn - start; 6130 offset = pgdat->node_start_pfn - start;
6131 /* ia64 gets its own node_mem_map, before this, without bootmem */ 6131 /* ia64 gets its own node_mem_map, before this, without bootmem */
6132 if (!pgdat->node_mem_map) { 6132 if (!pgdat->node_mem_map) {
6133 unsigned long size, end; 6133 unsigned long size, end;
6134 struct page *map; 6134 struct page *map;
6135 6135
6136 /* 6136 /*
6137 * The zone's endpoints aren't required to be MAX_ORDER 6137 * The zone's endpoints aren't required to be MAX_ORDER
6138 * aligned, but the node_mem_map endpoints must be, in order 6138 * aligned, but the node_mem_map endpoints must be, in order
6139 * for the buddy allocator to function correctly. 6139 * for the buddy allocator to function correctly.
6140 */ 6140 */
6141 end = pgdat_end_pfn(pgdat); 6141 end = pgdat_end_pfn(pgdat);
6142 end = ALIGN(end, MAX_ORDER_NR_PAGES); 6142 end = ALIGN(end, MAX_ORDER_NR_PAGES);
6143 size = (end - start) * sizeof(struct page); 6143 size = (end - start) * sizeof(struct page);
6144 map = alloc_remap(pgdat->node_id, size); 6144 map = alloc_remap(pgdat->node_id, size);
6145 if (!map) 6145 if (!map)
6146 map = memblock_virt_alloc_node_nopanic(size, 6146 map = memblock_virt_alloc_node_nopanic(size,
6147 pgdat->node_id); 6147 pgdat->node_id);
6148 pgdat->node_mem_map = map + offset; 6148 pgdat->node_mem_map = map + offset;
6149 } 6149 }
6150 #ifndef CONFIG_NEED_MULTIPLE_NODES 6150 #ifndef CONFIG_NEED_MULTIPLE_NODES
6151 /* 6151 /*
6152 * With no DISCONTIG, the global mem_map is just set as node 0's 6152 * With no DISCONTIG, the global mem_map is just set as node 0's
6153 */ 6153 */
6154 if (pgdat == NODE_DATA(0)) { 6154 if (pgdat == NODE_DATA(0)) {
6155 mem_map = NODE_DATA(0)->node_mem_map; 6155 mem_map = NODE_DATA(0)->node_mem_map;
6156 #if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) 6156 #if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
6157 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 6157 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
6158 mem_map -= offset; 6158 mem_map -= offset;
6159 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 6159 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6160 } 6160 }
6161 #endif 6161 #endif
6162 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 6162 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
6163 } 6163 }
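
A small worked example of the alignment above (values assumed): with MAX_ORDER_NR_PAGES = 2048 and node_start_pfn = 0x1234, start is rounded down to 0x1000 and offset becomes 0x234, so node_mem_map ends up pointing offset entries into the MAX_ORDER-aligned map that was actually allocated.
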
6164 6164
6165 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 6165 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
6166 unsigned long node_start_pfn, unsigned long *zholes_size) 6166 unsigned long node_start_pfn, unsigned long *zholes_size)
6167 { 6167 {
6168 pg_data_t *pgdat = NODE_DATA(nid); 6168 pg_data_t *pgdat = NODE_DATA(nid);
6169 unsigned long start_pfn = 0; 6169 unsigned long start_pfn = 0;
6170 unsigned long end_pfn = 0; 6170 unsigned long end_pfn = 0;
6171 6171
6172 /* pg_data_t should be reset to zero when it's allocated */ 6172 /* pg_data_t should be reset to zero when it's allocated */
6173 WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); 6173 WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
6174 6174
6175 pgdat->node_id = nid; 6175 pgdat->node_id = nid;
6176 pgdat->node_start_pfn = node_start_pfn; 6176 pgdat->node_start_pfn = node_start_pfn;
6177 pgdat->per_cpu_nodestats = NULL; 6177 pgdat->per_cpu_nodestats = NULL;
6178 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 6178 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6179 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 6179 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
6180 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, 6180 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
6181 (u64)start_pfn << PAGE_SHIFT, 6181 (u64)start_pfn << PAGE_SHIFT,
6182 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); 6182 end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
6183 #else 6183 #else
6184 start_pfn = node_start_pfn; 6184 start_pfn = node_start_pfn;
6185 #endif 6185 #endif
6186 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 6186 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
6187 zones_size, zholes_size); 6187 zones_size, zholes_size);
6188 6188
6189 alloc_node_mem_map(pgdat); 6189 alloc_node_mem_map(pgdat);
6190 #ifdef CONFIG_FLAT_NODE_MEM_MAP 6190 #ifdef CONFIG_FLAT_NODE_MEM_MAP
6191 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 6191 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
6192 nid, (unsigned long)pgdat, 6192 nid, (unsigned long)pgdat,
6193 (unsigned long)pgdat->node_mem_map); 6193 (unsigned long)pgdat->node_mem_map);
6194 #endif 6194 #endif
6195 6195
6196 reset_deferred_meminit(pgdat); 6196 reset_deferred_meminit(pgdat);
6197 free_area_init_core(pgdat); 6197 free_area_init_core(pgdat);
6198 } 6198 }
6199 6199
6200 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 6200 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6201 6201
6202 #if MAX_NUMNODES > 1 6202 #if MAX_NUMNODES > 1
6203 /* 6203 /*
6204 * Figure out the number of possible node ids. 6204 * Figure out the number of possible node ids.
6205 */ 6205 */
6206 void __init setup_nr_node_ids(void) 6206 void __init setup_nr_node_ids(void)
6207 { 6207 {
6208 unsigned int highest; 6208 unsigned int highest;
6209 6209
6210 highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES); 6210 highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
6211 nr_node_ids = highest + 1; 6211 nr_node_ids = highest + 1;
6212 } 6212 }
6213 #endif 6213 #endif
6214 6214
6215 /** 6215 /**
6216 * node_map_pfn_alignment - determine the maximum internode alignment 6216 * node_map_pfn_alignment - determine the maximum internode alignment
6217 * 6217 *
6218 * This function should be called after node map is populated and sorted. 6218 * This function should be called after node map is populated and sorted.
6219 * It calculates the maximum power of two alignment which can distinguish 6219 * It calculates the maximum power of two alignment which can distinguish
6220 * all the nodes. 6220 * all the nodes.
6221 * 6221 *
6222 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 6222 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
6223 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 6223 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
6224 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 6224 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
6225 * shifted, 1GiB is enough and this function will indicate so. 6225 * shifted, 1GiB is enough and this function will indicate so.
6226 * 6226 *
6227 * This is used to test whether pfn -> nid mapping of the chosen memory 6227 * This is used to test whether pfn -> nid mapping of the chosen memory
6228 * model has fine enough granularity to avoid incorrect mapping for the 6228 * model has fine enough granularity to avoid incorrect mapping for the
6229 * populated node map. 6229 * populated node map.
6230 * 6230 *
6231 * Returns the determined alignment in pfn's. 0 if there is no alignment 6231 * Returns the determined alignment in pfn's. 0 if there is no alignment
6232 * requirement (single node). 6232 * requirement (single node).
6233 */ 6233 */
6234 unsigned long __init node_map_pfn_alignment(void) 6234 unsigned long __init node_map_pfn_alignment(void)
6235 { 6235 {
6236 unsigned long accl_mask = 0, last_end = 0; 6236 unsigned long accl_mask = 0, last_end = 0;
6237 unsigned long start, end, mask; 6237 unsigned long start, end, mask;
6238 int last_nid = -1; 6238 int last_nid = -1;
6239 int i, nid; 6239 int i, nid;
6240 6240
6241 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 6241 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
6242 if (!start || last_nid < 0 || last_nid == nid) { 6242 if (!start || last_nid < 0 || last_nid == nid) {
6243 last_nid = nid; 6243 last_nid = nid;
6244 last_end = end; 6244 last_end = end;
6245 continue; 6245 continue;
6246 } 6246 }
6247 6247
6248 /* 6248 /*
6249 * Start with a mask granular enough to pin-point to the 6249 * Start with a mask granular enough to pin-point to the
6250 * start pfn and tick off bits one-by-one until it becomes 6250 * start pfn and tick off bits one-by-one until it becomes
6251 * too coarse to separate the current node from the last. 6251 * too coarse to separate the current node from the last.
6252 */ 6252 */
6253 mask = ~((1 << __ffs(start)) - 1); 6253 mask = ~((1 << __ffs(start)) - 1);
6254 while (mask && last_end <= (start & (mask << 1))) 6254 while (mask && last_end <= (start & (mask << 1)))
6255 mask <<= 1; 6255 mask <<= 1;
6256 6256
6257 /* accumulate all internode masks */ 6257 /* accumulate all internode masks */
6258 accl_mask |= mask; 6258 accl_mask |= mask;
6259 } 6259 }
6260 6260
6261 /* convert mask to number of pages */ 6261 /* convert mask to number of pages */
6262 return ~accl_mask + 1; 6262 return ~accl_mask + 1;
6263 } 6263 }
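
The bit manipulation above is easiest to follow with concrete numbers; the sketch below replays it for two hypothetical nodes, with __builtin_ctzl() standing in for __ffs().

#include <stdio.h>

int main(void)
{
        /* node 0: pfns [0, 0x40000), node 1: pfns [0x40000, 0x80000) */
        unsigned long starts[] = { 0x00000, 0x40000 };
        unsigned long ends[]   = { 0x40000, 0x80000 };
        int nids[]             = { 0, 1 };
        unsigned long accl_mask = 0, last_end = 0, mask;
        int last_nid = -1;
        unsigned int i;

        for (i = 0; i < 2; i++) {
                unsigned long start = starts[i];

                if (!start || last_nid < 0 || last_nid == nids[i]) {
                        last_nid = nids[i];
                        last_end = ends[i];
                        continue;
                }

                /* narrowest mask that pin-points start, widened while it
                 * still separates this node from the previous one */
                mask = ~((1UL << __builtin_ctzl(start)) - 1);
                while (mask && last_end <= (start & (mask << 1)))
                        mask <<= 1;

                accl_mask |= mask;
        }

        /* prints 0x40000 pfns: 1 GiB with 4 KiB pages, matching the comment above */
        printf("alignment = %#lx pfns\n", ~accl_mask + 1);
        return 0;
}
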
6264 6264
6265 /* Find the lowest pfn for a node */ 6265 /* Find the lowest pfn for a node */
6266 static unsigned long __init find_min_pfn_for_node(int nid) 6266 static unsigned long __init find_min_pfn_for_node(int nid)
6267 { 6267 {
6268 unsigned long min_pfn = ULONG_MAX; 6268 unsigned long min_pfn = ULONG_MAX;
6269 unsigned long start_pfn; 6269 unsigned long start_pfn;
6270 int i; 6270 int i;
6271 6271
6272 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 6272 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
6273 min_pfn = min(min_pfn, start_pfn); 6273 min_pfn = min(min_pfn, start_pfn);
6274 6274
6275 if (min_pfn == ULONG_MAX) { 6275 if (min_pfn == ULONG_MAX) {
6276 pr_warn("Could not find start_pfn for node %d\n", nid); 6276 pr_warn("Could not find start_pfn for node %d\n", nid);
6277 return 0; 6277 return 0;
6278 } 6278 }
6279 6279
6280 return min_pfn; 6280 return min_pfn;
6281 } 6281 }
6282 6282
6283 /** 6283 /**
6284 * find_min_pfn_with_active_regions - Find the minimum PFN registered 6284 * find_min_pfn_with_active_regions - Find the minimum PFN registered
6285 * 6285 *
6286 * It returns the minimum PFN based on information provided via 6286 * It returns the minimum PFN based on information provided via
6287 * memblock_set_node(). 6287 * memblock_set_node().
6288 */ 6288 */
6289 unsigned long __init find_min_pfn_with_active_regions(void) 6289 unsigned long __init find_min_pfn_with_active_regions(void)
6290 { 6290 {
6291 return find_min_pfn_for_node(MAX_NUMNODES); 6291 return find_min_pfn_for_node(MAX_NUMNODES);
6292 } 6292 }
6293 6293
6294 /* 6294 /*
6295 * early_calculate_totalpages() 6295 * early_calculate_totalpages()
6296 * Sum pages in active regions for movable zone. 6296 * Sum pages in active regions for movable zone.
6297 * Populate N_MEMORY for calculating usable_nodes. 6297 * Populate N_MEMORY for calculating usable_nodes.
6298 */ 6298 */
6299 static unsigned long __init early_calculate_totalpages(void) 6299 static unsigned long __init early_calculate_totalpages(void)
6300 { 6300 {
6301 unsigned long totalpages = 0; 6301 unsigned long totalpages = 0;
6302 unsigned long start_pfn, end_pfn; 6302 unsigned long start_pfn, end_pfn;
6303 int i, nid; 6303 int i, nid;
6304 6304
6305 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 6305 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
6306 unsigned long pages = end_pfn - start_pfn; 6306 unsigned long pages = end_pfn - start_pfn;
6307 6307
6308 totalpages += pages; 6308 totalpages += pages;
6309 if (pages) 6309 if (pages)
6310 node_set_state(nid, N_MEMORY); 6310 node_set_state(nid, N_MEMORY);
6311 } 6311 }
6312 return totalpages; 6312 return totalpages;
6313 } 6313 }
6314 6314
6315 /* 6315 /*
6316 * Find the PFN the Movable zone begins in each node. Kernel memory 6316 * Find the PFN the Movable zone begins in each node. Kernel memory
6317 * is spread evenly between nodes as long as the nodes have enough 6317 * is spread evenly between nodes as long as the nodes have enough
6318 * memory. When they don't, some nodes will have more kernelcore than 6318 * memory. When they don't, some nodes will have more kernelcore than
6319 * others 6319 * others
6320 */ 6320 */
6321 static void __init find_zone_movable_pfns_for_nodes(void) 6321 static void __init find_zone_movable_pfns_for_nodes(void)
6322 { 6322 {
6323 int i, nid; 6323 int i, nid;
6324 unsigned long usable_startpfn; 6324 unsigned long usable_startpfn;
6325 unsigned long kernelcore_node, kernelcore_remaining; 6325 unsigned long kernelcore_node, kernelcore_remaining;
6326 /* save the state before borrow the nodemask */ 6326 /* save the state before borrow the nodemask */
6327 nodemask_t saved_node_state = node_states[N_MEMORY]; 6327 nodemask_t saved_node_state = node_states[N_MEMORY];
6328 unsigned long totalpages = early_calculate_totalpages(); 6328 unsigned long totalpages = early_calculate_totalpages();
6329 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 6329 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
6330 struct memblock_region *r; 6330 struct memblock_region *r;
6331 6331
6332 /* Need to find movable_zone earlier when movable_node is specified. */ 6332 /* Need to find movable_zone earlier when movable_node is specified. */
6333 find_usable_zone_for_movable(); 6333 find_usable_zone_for_movable();
6334 6334
6335 /* 6335 /*
6336 * If movable_node is specified, ignore kernelcore and movablecore 6336 * If movable_node is specified, ignore kernelcore and movablecore
6337 * options. 6337 * options.
6338 */ 6338 */
6339 if (movable_node_is_enabled()) { 6339 if (movable_node_is_enabled()) {
6340 for_each_memblock(memory, r) { 6340 for_each_memblock(memory, r) {
6341 if (!memblock_is_hotpluggable(r)) 6341 if (!memblock_is_hotpluggable(r))
6342 continue; 6342 continue;
6343 6343
6344 nid = r->nid; 6344 nid = r->nid;
6345 6345
6346 usable_startpfn = PFN_DOWN(r->base); 6346 usable_startpfn = PFN_DOWN(r->base);
6347 zone_movable_pfn[nid] = zone_movable_pfn[nid] ? 6347 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
6348 min(usable_startpfn, zone_movable_pfn[nid]) : 6348 min(usable_startpfn, zone_movable_pfn[nid]) :
6349 usable_startpfn; 6349 usable_startpfn;
6350 } 6350 }
6351 6351
6352 goto out2; 6352 goto out2;
6353 } 6353 }
6354 6354
6355 /* 6355 /*
6356 * If kernelcore=mirror is specified, ignore movablecore option 6356 * If kernelcore=mirror is specified, ignore movablecore option
6357 */ 6357 */
6358 if (mirrored_kernelcore) { 6358 if (mirrored_kernelcore) {
6359 bool mem_below_4gb_not_mirrored = false; 6359 bool mem_below_4gb_not_mirrored = false;
6360 6360
6361 for_each_memblock(memory, r) { 6361 for_each_memblock(memory, r) {
6362 if (memblock_is_mirror(r)) 6362 if (memblock_is_mirror(r))
6363 continue; 6363 continue;
6364 6364
6365 nid = r->nid; 6365 nid = r->nid;
6366 6366
6367 usable_startpfn = memblock_region_memory_base_pfn(r); 6367 usable_startpfn = memblock_region_memory_base_pfn(r);
6368 6368
6369 if (usable_startpfn < 0x100000) { 6369 if (usable_startpfn < 0x100000) {
6370 mem_below_4gb_not_mirrored = true; 6370 mem_below_4gb_not_mirrored = true;
6371 continue; 6371 continue;
6372 } 6372 }
6373 6373
6374 zone_movable_pfn[nid] = zone_movable_pfn[nid] ? 6374 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
6375 min(usable_startpfn, zone_movable_pfn[nid]) : 6375 min(usable_startpfn, zone_movable_pfn[nid]) :
6376 usable_startpfn; 6376 usable_startpfn;
6377 } 6377 }
6378 6378
6379 if (mem_below_4gb_not_mirrored) 6379 if (mem_below_4gb_not_mirrored)
6380 pr_warn("This configuration results in unmirrored kernel memory.\n"); 6380 pr_warn("This configuration results in unmirrored kernel memory.\n");
6381 6381
6382 goto out2; 6382 goto out2;
6383 } 6383 }
6384 6384
6385 /* 6385 /*
6386 * If movablecore=nn[KMG] was specified, calculate what size of 6386 * If movablecore=nn[KMG] was specified, calculate what size of
6387 * kernelcore that corresponds so that memory usable for 6387 * kernelcore that corresponds so that memory usable for
6388 * any allocation type is evenly spread. If both kernelcore 6388 * any allocation type is evenly spread. If both kernelcore
6389 * and movablecore are specified, then the value of kernelcore 6389 * and movablecore are specified, then the value of kernelcore
6390 * will be used for required_kernelcore if it's greater than 6390 * will be used for required_kernelcore if it's greater than
6391 * what movablecore would have allowed. 6391 * what movablecore would have allowed.
6392 */ 6392 */
6393 if (required_movablecore) { 6393 if (required_movablecore) {
6394 unsigned long corepages; 6394 unsigned long corepages;
6395 6395
6396 /* 6396 /*
6397 * Round up so that ZONE_MOVABLE is at least as large as what 6397 * Round up so that ZONE_MOVABLE is at least as large as what
6398 * was requested by the user. 6398 * was requested by the user.
6399 */ 6399 */
6400 required_movablecore = 6400 required_movablecore =
6401 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 6401 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
6402 required_movablecore = min(totalpages, required_movablecore); 6402 required_movablecore = min(totalpages, required_movablecore);
6403 corepages = totalpages - required_movablecore; 6403 corepages = totalpages - required_movablecore;
6404 6404
6405 required_kernelcore = max(required_kernelcore, corepages); 6405 required_kernelcore = max(required_kernelcore, corepages);
6406 } 6406 }
6407 6407
6408 /* 6408 /*
6409 * If kernelcore was not specified or kernelcore size is larger 6409 * If kernelcore was not specified or kernelcore size is larger
6410 * than totalpages, there is no ZONE_MOVABLE. 6410 * than totalpages, there is no ZONE_MOVABLE.
6411 */ 6411 */
6412 if (!required_kernelcore || required_kernelcore >= totalpages) 6412 if (!required_kernelcore || required_kernelcore >= totalpages)
6413 goto out; 6413 goto out;
6414 6414
6415 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 6415 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
6416 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 6416 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
6417 6417
6418 restart: 6418 restart:
6419 /* Spread kernelcore memory as evenly as possible throughout nodes */ 6419 /* Spread kernelcore memory as evenly as possible throughout nodes */
6420 kernelcore_node = required_kernelcore / usable_nodes; 6420 kernelcore_node = required_kernelcore / usable_nodes;
6421 for_each_node_state(nid, N_MEMORY) { 6421 for_each_node_state(nid, N_MEMORY) {
6422 unsigned long start_pfn, end_pfn; 6422 unsigned long start_pfn, end_pfn;
6423 6423
6424 /* 6424 /*
6425 * Recalculate kernelcore_node if the division per node 6425 * Recalculate kernelcore_node if the division per node
6426 * now exceeds what is necessary to satisfy the requested 6426 * now exceeds what is necessary to satisfy the requested
6427 * amount of memory for the kernel 6427 * amount of memory for the kernel
6428 */ 6428 */
6429 if (required_kernelcore < kernelcore_node) 6429 if (required_kernelcore < kernelcore_node)
6430 kernelcore_node = required_kernelcore / usable_nodes; 6430 kernelcore_node = required_kernelcore / usable_nodes;
6431 6431
6432 /* 6432 /*
6433 * As the map is walked, we track how much memory is usable 6433 * As the map is walked, we track how much memory is usable
6434 * by the kernel using kernelcore_remaining. When it is 6434 * by the kernel using kernelcore_remaining. When it is
6435 * 0, the rest of the node is usable by ZONE_MOVABLE 6435 * 0, the rest of the node is usable by ZONE_MOVABLE
6436 */ 6436 */
6437 kernelcore_remaining = kernelcore_node; 6437 kernelcore_remaining = kernelcore_node;
6438 6438
6439 /* Go through each range of PFNs within this node */ 6439 /* Go through each range of PFNs within this node */
6440 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 6440 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
6441 unsigned long size_pages; 6441 unsigned long size_pages;
6442 6442
6443 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 6443 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
6444 if (start_pfn >= end_pfn) 6444 if (start_pfn >= end_pfn)
6445 continue; 6445 continue;
6446 6446
6447 /* Account for what is only usable for kernelcore */ 6447 /* Account for what is only usable for kernelcore */
6448 if (start_pfn < usable_startpfn) { 6448 if (start_pfn < usable_startpfn) {
6449 unsigned long kernel_pages; 6449 unsigned long kernel_pages;
6450 kernel_pages = min(end_pfn, usable_startpfn) 6450 kernel_pages = min(end_pfn, usable_startpfn)
6451 - start_pfn; 6451 - start_pfn;
6452 6452
6453 kernelcore_remaining -= min(kernel_pages, 6453 kernelcore_remaining -= min(kernel_pages,
6454 kernelcore_remaining); 6454 kernelcore_remaining);
6455 required_kernelcore -= min(kernel_pages, 6455 required_kernelcore -= min(kernel_pages,
6456 required_kernelcore); 6456 required_kernelcore);
6457 6457
6458 /* Continue if range is now fully accounted */ 6458 /* Continue if range is now fully accounted */
6459 if (end_pfn <= usable_startpfn) { 6459 if (end_pfn <= usable_startpfn) {
6460 6460
6461 /* 6461 /*
6462 * Push zone_movable_pfn to the end so 6462 * Push zone_movable_pfn to the end so
6463 * that if we have to rebalance 6463 * that if we have to rebalance
6464 * kernelcore across nodes, we will 6464 * kernelcore across nodes, we will
6465 * not double account here 6465 * not double account here
6466 */ 6466 */
6467 zone_movable_pfn[nid] = end_pfn; 6467 zone_movable_pfn[nid] = end_pfn;
6468 continue; 6468 continue;
6469 } 6469 }
6470 start_pfn = usable_startpfn; 6470 start_pfn = usable_startpfn;
6471 } 6471 }
6472 6472
6473 /* 6473 /*
6474 * The usable PFN range for ZONE_MOVABLE is from 6474 * The usable PFN range for ZONE_MOVABLE is from
6475 * start_pfn->end_pfn. Calculate size_pages as the 6475 * start_pfn->end_pfn. Calculate size_pages as the
6476 * number of pages used as kernelcore 6476 * number of pages used as kernelcore
6477 */ 6477 */
6478 size_pages = end_pfn - start_pfn; 6478 size_pages = end_pfn - start_pfn;
6479 if (size_pages > kernelcore_remaining) 6479 if (size_pages > kernelcore_remaining)
6480 size_pages = kernelcore_remaining; 6480 size_pages = kernelcore_remaining;
6481 zone_movable_pfn[nid] = start_pfn + size_pages; 6481 zone_movable_pfn[nid] = start_pfn + size_pages;
6482 6482
6483 /* 6483 /*
6484 * Some kernelcore has been met, update counts and 6484 * Some kernelcore has been met, update counts and
6485 * break if the kernelcore for this node has been 6485 * break if the kernelcore for this node has been
6486 * satisfied 6486 * satisfied
6487 */ 6487 */
6488 required_kernelcore -= min(required_kernelcore, 6488 required_kernelcore -= min(required_kernelcore,
6489 size_pages); 6489 size_pages);
6490 kernelcore_remaining -= size_pages; 6490 kernelcore_remaining -= size_pages;
6491 if (!kernelcore_remaining) 6491 if (!kernelcore_remaining)
6492 break; 6492 break;
6493 } 6493 }
6494 } 6494 }
6495 6495
6496 /* 6496 /*
6497 * If there is still required_kernelcore, we do another pass with one 6497 * If there is still required_kernelcore, we do another pass with one
6498 * less node in the count. This will push zone_movable_pfn[nid] further 6498 * less node in the count. This will push zone_movable_pfn[nid] further
6499 * along on the nodes that still have memory until kernelcore is 6499 * along on the nodes that still have memory until kernelcore is
6500 * satisfied 6500 * satisfied
6501 */ 6501 */
6502 usable_nodes--; 6502 usable_nodes--;
6503 if (usable_nodes && required_kernelcore > usable_nodes) 6503 if (usable_nodes && required_kernelcore > usable_nodes)
6504 goto restart; 6504 goto restart;
6505 6505
6506 out2: 6506 out2:
6507 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 6507 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
6508 for (nid = 0; nid < MAX_NUMNODES; nid++) 6508 for (nid = 0; nid < MAX_NUMNODES; nid++)
6509 zone_movable_pfn[nid] = 6509 zone_movable_pfn[nid] =
6510 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 6510 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
6511 6511
6512 out: 6512 out:
6513 /* restore the node_state */ 6513 /* restore the node_state */
6514 node_states[N_MEMORY] = saved_node_state; 6514 node_states[N_MEMORY] = saved_node_state;
6515 } 6515 }
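/*
 * Illustrative sketch (editor's example, not part of the file): booting a
 * 2-node machine with 8G per node and kernelcore=4G gives, with 4K pages,
 * required_kernelcore = 1048576 pages, so the first pass hands each node
 * kernelcore_node = 1048576 / 2 = 524288 pages of kernelcore.  The
 * remaining ~6G per node becomes ZONE_MOVABLE, with zone_movable_pfn[nid]
 * then rounded up to MAX_ORDER_NR_PAGES at out2.  If one node were too
 * small to take its share, the restart: pass would spread the shortfall
 * across the remaining nodes with usable_nodes reduced by one.
 */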
6516 6516
6517 /* Any regular or high memory on that node? */ 6517 /* Any regular or high memory on that node? */
6518 static void check_for_memory(pg_data_t *pgdat, int nid) 6518 static void check_for_memory(pg_data_t *pgdat, int nid)
6519 { 6519 {
6520 enum zone_type zone_type; 6520 enum zone_type zone_type;
6521 6521
6522 if (N_MEMORY == N_NORMAL_MEMORY) 6522 if (N_MEMORY == N_NORMAL_MEMORY)
6523 return; 6523 return;
6524 6524
6525 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 6525 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
6526 struct zone *zone = &pgdat->node_zones[zone_type]; 6526 struct zone *zone = &pgdat->node_zones[zone_type];
6527 if (populated_zone(zone)) { 6527 if (populated_zone(zone)) {
6528 node_set_state(nid, N_HIGH_MEMORY); 6528 node_set_state(nid, N_HIGH_MEMORY);
6529 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 6529 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
6530 zone_type <= ZONE_NORMAL) 6530 zone_type <= ZONE_NORMAL)
6531 node_set_state(nid, N_NORMAL_MEMORY); 6531 node_set_state(nid, N_NORMAL_MEMORY);
6532 break; 6532 break;
6533 } 6533 }
6534 } 6534 }
6535 } 6535 }
6536 6536
6537 /** 6537 /**
6538 * free_area_init_nodes - Initialise all pg_data_t and zone data 6538 * free_area_init_nodes - Initialise all pg_data_t and zone data
6539 * @max_zone_pfn: an array of max PFNs for each zone 6539 * @max_zone_pfn: an array of max PFNs for each zone
6540 * 6540 *
6541 * This will call free_area_init_node() for each active node in the system. 6541 * This will call free_area_init_node() for each active node in the system.
6542 * Using the page ranges provided by memblock_set_node(), the size of each 6542 * Using the page ranges provided by memblock_set_node(), the size of each
6543 * zone in each node and of its holes is calculated. If the maximum PFNs 6543 * zone in each node and of its holes is calculated. If the maximum PFNs
6544 * of two adjacent zones match, the higher zone is assumed to be empty. 6544 * of two adjacent zones match, the higher zone is assumed to be empty.
6545 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 6545 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
6546 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 6546 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
6547 * starts where the previous one ended. For example, ZONE_DMA32 starts 6547 * starts where the previous one ended. For example, ZONE_DMA32 starts
6548 * at arch_max_dma_pfn. 6548 * at arch_max_dma_pfn.
6549 */ 6549 */
6550 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 6550 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
6551 { 6551 {
6552 unsigned long start_pfn, end_pfn; 6552 unsigned long start_pfn, end_pfn;
6553 int i, nid; 6553 int i, nid;
6554 6554
6555 /* Record where the zone boundaries are */ 6555 /* Record where the zone boundaries are */
6556 memset(arch_zone_lowest_possible_pfn, 0, 6556 memset(arch_zone_lowest_possible_pfn, 0,
6557 sizeof(arch_zone_lowest_possible_pfn)); 6557 sizeof(arch_zone_lowest_possible_pfn));
6558 memset(arch_zone_highest_possible_pfn, 0, 6558 memset(arch_zone_highest_possible_pfn, 0,
6559 sizeof(arch_zone_highest_possible_pfn)); 6559 sizeof(arch_zone_highest_possible_pfn));
6560 6560
6561 start_pfn = find_min_pfn_with_active_regions(); 6561 start_pfn = find_min_pfn_with_active_regions();
6562 6562
6563 for (i = 0; i < MAX_NR_ZONES; i++) { 6563 for (i = 0; i < MAX_NR_ZONES; i++) {
6564 if (i == ZONE_MOVABLE) 6564 if (i == ZONE_MOVABLE)
6565 continue; 6565 continue;
6566 6566
6567 end_pfn = max(max_zone_pfn[i], start_pfn); 6567 end_pfn = max(max_zone_pfn[i], start_pfn);
6568 arch_zone_lowest_possible_pfn[i] = start_pfn; 6568 arch_zone_lowest_possible_pfn[i] = start_pfn;
6569 arch_zone_highest_possible_pfn[i] = end_pfn; 6569 arch_zone_highest_possible_pfn[i] = end_pfn;
6570 6570
6571 start_pfn = end_pfn; 6571 start_pfn = end_pfn;
6572 } 6572 }
6573 6573
6574 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 6574 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
6575 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 6575 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
6576 find_zone_movable_pfns_for_nodes(); 6576 find_zone_movable_pfns_for_nodes();
6577 6577
6578 /* Print out the zone ranges */ 6578 /* Print out the zone ranges */
6579 pr_info("Zone ranges:\n"); 6579 pr_info("Zone ranges:\n");
6580 for (i = 0; i < MAX_NR_ZONES; i++) { 6580 for (i = 0; i < MAX_NR_ZONES; i++) {
6581 if (i == ZONE_MOVABLE) 6581 if (i == ZONE_MOVABLE)
6582 continue; 6582 continue;
6583 pr_info(" %-8s ", zone_names[i]); 6583 pr_info(" %-8s ", zone_names[i]);
6584 if (arch_zone_lowest_possible_pfn[i] == 6584 if (arch_zone_lowest_possible_pfn[i] ==
6585 arch_zone_highest_possible_pfn[i]) 6585 arch_zone_highest_possible_pfn[i])
6586 pr_cont("empty\n"); 6586 pr_cont("empty\n");
6587 else 6587 else
6588 pr_cont("[mem %#018Lx-%#018Lx]\n", 6588 pr_cont("[mem %#018Lx-%#018Lx]\n",
6589 (u64)arch_zone_lowest_possible_pfn[i] 6589 (u64)arch_zone_lowest_possible_pfn[i]
6590 << PAGE_SHIFT, 6590 << PAGE_SHIFT,
6591 ((u64)arch_zone_highest_possible_pfn[i] 6591 ((u64)arch_zone_highest_possible_pfn[i]
6592 << PAGE_SHIFT) - 1); 6592 << PAGE_SHIFT) - 1);
6593 } 6593 }
6594 6594
6595 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 6595 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
6596 pr_info("Movable zone start for each node\n"); 6596 pr_info("Movable zone start for each node\n");
6597 for (i = 0; i < MAX_NUMNODES; i++) { 6597 for (i = 0; i < MAX_NUMNODES; i++) {
6598 if (zone_movable_pfn[i]) 6598 if (zone_movable_pfn[i])
6599 pr_info(" Node %d: %#018Lx\n", i, 6599 pr_info(" Node %d: %#018Lx\n", i,
6600 (u64)zone_movable_pfn[i] << PAGE_SHIFT); 6600 (u64)zone_movable_pfn[i] << PAGE_SHIFT);
6601 } 6601 }
6602 6602
6603 /* Print out the early node map */ 6603 /* Print out the early node map */
6604 pr_info("Early memory node ranges\n"); 6604 pr_info("Early memory node ranges\n");
6605 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 6605 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
6606 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, 6606 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
6607 (u64)start_pfn << PAGE_SHIFT, 6607 (u64)start_pfn << PAGE_SHIFT,
6608 ((u64)end_pfn << PAGE_SHIFT) - 1); 6608 ((u64)end_pfn << PAGE_SHIFT) - 1);
6609 6609
6610 /* Initialise every node */ 6610 /* Initialise every node */
6611 mminit_verify_pageflags_layout(); 6611 mminit_verify_pageflags_layout();
6612 setup_nr_node_ids(); 6612 setup_nr_node_ids();
6613 for_each_online_node(nid) { 6613 for_each_online_node(nid) {
6614 pg_data_t *pgdat = NODE_DATA(nid); 6614 pg_data_t *pgdat = NODE_DATA(nid);
6615 free_area_init_node(nid, NULL, 6615 free_area_init_node(nid, NULL,
6616 find_min_pfn_for_node(nid), NULL); 6616 find_min_pfn_for_node(nid), NULL);
6617 6617
6618 /* Any memory on that node */ 6618 /* Any memory on that node */
6619 if (pgdat->node_present_pages) 6619 if (pgdat->node_present_pages)
6620 node_set_state(nid, N_MEMORY); 6620 node_set_state(nid, N_MEMORY);
6621 check_for_memory(pgdat, nid); 6621 check_for_memory(pgdat, nid);
6622 } 6622 }
6623 } 6623 }
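/*
 * Hedged usage sketch (editor's example; the zone indices and pfn limits
 * used below are assumptions standing in for whatever the architecture
 * actually provides): a caller fills max_zone_pfn[] with the highest PFN
 * each zone may reach and passes the array in, typically from its
 * zone_sizes_init()-style setup code.
 */
#if 0	/* illustrative only, not compiled */
static void __init example_zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
#ifdef CONFIG_ZONE_DMA
	max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn);
#endif
	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;

	free_area_init_nodes(max_zone_pfns);
}
#endif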
6624 6624
6625 static int __init cmdline_parse_core(char *p, unsigned long *core) 6625 static int __init cmdline_parse_core(char *p, unsigned long *core)
6626 { 6626 {
6627 unsigned long long coremem; 6627 unsigned long long coremem;
6628 if (!p) 6628 if (!p)
6629 return -EINVAL; 6629 return -EINVAL;
6630 6630
6631 coremem = memparse(p, &p); 6631 coremem = memparse(p, &p);
6632 *core = coremem >> PAGE_SHIFT; 6632 *core = coremem >> PAGE_SHIFT;
6633 6633
6634 /* Paranoid check that UL is enough for the coremem value */ 6634 /* Paranoid check that UL is enough for the coremem value */
6635 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 6635 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
6636 6636
6637 return 0; 6637 return 0;
6638 } 6638 }
6639 6639
6640 /* 6640 /*
6641 * kernelcore=size sets the amount of memory usable for allocations that 6641 * kernelcore=size sets the amount of memory usable for allocations that
6642 * cannot be reclaimed or migrated. 6642 * cannot be reclaimed or migrated.
6643 */ 6643 */
6644 static int __init cmdline_parse_kernelcore(char *p) 6644 static int __init cmdline_parse_kernelcore(char *p)
6645 { 6645 {
6646 /* parse kernelcore=mirror */ 6646 /* parse kernelcore=mirror */
6647 if (parse_option_str(p, "mirror")) { 6647 if (parse_option_str(p, "mirror")) {
6648 mirrored_kernelcore = true; 6648 mirrored_kernelcore = true;
6649 return 0; 6649 return 0;
6650 } 6650 }
6651 6651
6652 return cmdline_parse_core(p, &required_kernelcore); 6652 return cmdline_parse_core(p, &required_kernelcore);
6653 } 6653 }
6654 6654
6655 /* 6655 /*
6656 * movablecore=size sets the amount of memory usable for allocations that 6656 * movablecore=size sets the amount of memory usable for allocations that
6657 * can be reclaimed or migrated. 6657 * can be reclaimed or migrated.
6658 */ 6658 */
6659 static int __init cmdline_parse_movablecore(char *p) 6659 static int __init cmdline_parse_movablecore(char *p)
6660 { 6660 {
6661 return cmdline_parse_core(p, &required_movablecore); 6661 return cmdline_parse_core(p, &required_movablecore);
6662 } 6662 }
6663 6663
6664 early_param("kernelcore", cmdline_parse_kernelcore); 6664 early_param("kernelcore", cmdline_parse_kernelcore);
6665 early_param("movablecore", cmdline_parse_movablecore); 6665 early_param("movablecore", cmdline_parse_movablecore);
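/*
 * Worked example (editor's note): with 4K pages, booting with
 * "kernelcore=512M movablecore=2G" reaches cmdline_parse_core(), where
 * memparse() turns the size suffix into bytes and the PAGE_SHIFT shift
 * converts bytes into pages:
 *
 *	kernelcore=512M  -> required_kernelcore  = 0x20000000 >> 12 = 131072 pages
 *	movablecore=2G   -> required_movablecore = 0x80000000 >> 12 = 524288 pages
 *
 * "kernelcore=mirror" instead just sets mirrored_kernelcore and skips the
 * size parsing.
 */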
6666 6666
6667 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 6667 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6668 6668
6669 void adjust_managed_page_count(struct page *page, long count) 6669 void adjust_managed_page_count(struct page *page, long count)
6670 { 6670 {
6671 spin_lock(&managed_page_count_lock); 6671 spin_lock(&managed_page_count_lock);
6672 page_zone(page)->managed_pages += count; 6672 page_zone(page)->managed_pages += count;
6673 totalram_pages += count; 6673 totalram_pages += count;
6674 #ifdef CONFIG_HIGHMEM 6674 #ifdef CONFIG_HIGHMEM
6675 if (PageHighMem(page)) 6675 if (PageHighMem(page))
6676 totalhigh_pages += count; 6676 totalhigh_pages += count;
6677 #endif 6677 #endif
6678 spin_unlock(&managed_page_count_lock); 6678 spin_unlock(&managed_page_count_lock);
6679 } 6679 }
6680 EXPORT_SYMBOL(adjust_managed_page_count); 6680 EXPORT_SYMBOL(adjust_managed_page_count);
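/*
 * Hedged usage note (editor's example): a caller that temporarily gives a
 * page back to a hypervisor, a memory balloon driver say, is expected to
 * pair
 *	adjust_managed_page_count(page, -1);
 * with a matching
 *	adjust_managed_page_count(page, 1);
 * when the page is returned, so managed_pages and totalram_pages keep
 * tracking what the allocator can actually hand out.
 */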
6681 6681
6682 unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 6682 unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
6683 { 6683 {
6684 void *pos; 6684 void *pos;
6685 unsigned long pages = 0; 6685 unsigned long pages = 0;
6686 6686
6687 start = (void *)PAGE_ALIGN((unsigned long)start); 6687 start = (void *)PAGE_ALIGN((unsigned long)start);
6688 end = (void *)((unsigned long)end & PAGE_MASK); 6688 end = (void *)((unsigned long)end & PAGE_MASK);
6689 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 6689 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
6690 if ((unsigned int)poison <= 0xFF) 6690 if ((unsigned int)poison <= 0xFF)
6691 memset(pos, poison, PAGE_SIZE); 6691 memset(pos, poison, PAGE_SIZE);
6692 free_reserved_page(virt_to_page(pos)); 6692 free_reserved_page(virt_to_page(pos));
6693 } 6693 }
6694 6694
6695 if (pages && s) 6695 if (pages && s)
6696 pr_info("Freeing %s memory: %ldK\n", 6696 pr_info("Freeing %s memory: %ldK\n",
6697 s, pages << (PAGE_SHIFT - 10)); 6697 s, pages << (PAGE_SHIFT - 10));
6698 6698
6699 return pages; 6699 return pages;
6700 } 6700 }
6701 EXPORT_SYMBOL(free_reserved_area); 6701 EXPORT_SYMBOL(free_reserved_area);
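/*
 * Hedged usage sketch (editor's example; __init_begin/__init_end are the
 * usual linker-script symbols, assumed here rather than defined in this
 * file): a caller releasing the init sections back to the page allocator
 * could do roughly the following.  A poison value above 0xFF (or negative)
 * would skip the memset in free_reserved_area().
 */
#if 0	/* illustrative only, not compiled */
void example_free_initmem(void)
{
	/* poison freed init memory with 0xcc so stale references stand out */
	free_reserved_area(&__init_begin, &__init_end, 0xcc, "unused kernel");
}
#endif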
6702 6702
6703 #ifdef CONFIG_HIGHMEM 6703 #ifdef CONFIG_HIGHMEM
6704 void free_highmem_page(struct page *page) 6704 void free_highmem_page(struct page *page)
6705 { 6705 {
6706 __free_reserved_page(page); 6706 __free_reserved_page(page);
6707 totalram_pages++; 6707 totalram_pages++;
6708 page_zone(page)->managed_pages++; 6708 page_zone(page)->managed_pages++;
6709 totalhigh_pages++; 6709 totalhigh_pages++;
6710 } 6710 }
6711 #endif 6711 #endif
6712 6712
6713 6713
6714 void __init mem_init_print_info(const char *str) 6714 void __init mem_init_print_info(const char *str)
6715 { 6715 {
6716 unsigned long physpages, codesize, datasize, rosize, bss_size; 6716 unsigned long physpages, codesize, datasize, rosize, bss_size;
6717 unsigned long init_code_size, init_data_size; 6717 unsigned long init_code_size, init_data_size;
6718 6718
6719 physpages = get_num_physpages(); 6719 physpages = get_num_physpages();
6720 codesize = _etext - _stext; 6720 codesize = _etext - _stext;
6721 datasize = _edata - _sdata; 6721 datasize = _edata - _sdata;
6722 rosize = __end_rodata - __start_rodata; 6722 rosize = __end_rodata - __start_rodata;
6723 bss_size = __bss_stop - __bss_start; 6723 bss_size = __bss_stop - __bss_start;
6724 init_data_size = __init_end - __init_begin; 6724 init_data_size = __init_end - __init_begin;
6725 init_code_size = _einittext - _sinittext; 6725 init_code_size = _einittext - _sinittext;
6726 6726
6727 /* 6727 /*
6728 * Detect special cases and adjust section sizes accordingly: 6728 * Detect special cases and adjust section sizes accordingly:
6729 * 1) .init.* may be embedded into .data sections 6729 * 1) .init.* may be embedded into .data sections
6730 * 2) .init.text.* may be out of [__init_begin, __init_end], 6730 * 2) .init.text.* may be out of [__init_begin, __init_end],
6731 * please refer to arch/tile/kernel/vmlinux.lds.S. 6731 * please refer to arch/tile/kernel/vmlinux.lds.S.
6732 * 3) .rodata.* may be embedded into .text or .data sections. 6732 * 3) .rodata.* may be embedded into .text or .data sections.
6733 */ 6733 */
6734 #define adj_init_size(start, end, size, pos, adj) \ 6734 #define adj_init_size(start, end, size, pos, adj) \
6735 do { \ 6735 do { \
6736 if (start <= pos && pos < end && size > adj) \ 6736 if (start <= pos && pos < end && size > adj) \
6737 size -= adj; \ 6737 size -= adj; \
6738 } while (0) 6738 } while (0)
6739 6739
6740 adj_init_size(__init_begin, __init_end, init_data_size, 6740 adj_init_size(__init_begin, __init_end, init_data_size,
6741 _sinittext, init_code_size); 6741 _sinittext, init_code_size);
6742 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 6742 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
6743 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 6743 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
6744 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 6744 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
6745 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 6745 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
6746 6746
6747 #undef adj_init_size 6747 #undef adj_init_size
6748 6748
6749 pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved" 6749 pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
6750 #ifdef CONFIG_HIGHMEM 6750 #ifdef CONFIG_HIGHMEM
6751 ", %luK highmem" 6751 ", %luK highmem"
6752 #endif 6752 #endif
6753 "%s%s)\n", 6753 "%s%s)\n",
6754 nr_free_pages() << (PAGE_SHIFT - 10), 6754 nr_free_pages() << (PAGE_SHIFT - 10),
6755 physpages << (PAGE_SHIFT - 10), 6755 physpages << (PAGE_SHIFT - 10),
6756 codesize >> 10, datasize >> 10, rosize >> 10, 6756 codesize >> 10, datasize >> 10, rosize >> 10,
6757 (init_data_size + init_code_size) >> 10, bss_size >> 10, 6757 (init_data_size + init_code_size) >> 10, bss_size >> 10,
6758 (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), 6758 (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
6759 totalcma_pages << (PAGE_SHIFT - 10), 6759 totalcma_pages << (PAGE_SHIFT - 10),
6760 #ifdef CONFIG_HIGHMEM 6760 #ifdef CONFIG_HIGHMEM
6761 totalhigh_pages << (PAGE_SHIFT - 10), 6761 totalhigh_pages << (PAGE_SHIFT - 10),
6762 #endif 6762 #endif
6763 str ? ", " : "", str ? str : ""); 6763 str ? ", " : "", str ? str : "");
6764 } 6764 }
6765 6765
6766 /** 6766 /**
6767 * set_dma_reserve - set the specified number of pages reserved in the first zone 6767 * set_dma_reserve - set the specified number of pages reserved in the first zone
6768 * @new_dma_reserve: The number of pages to mark reserved 6768 * @new_dma_reserve: The number of pages to mark reserved
6769 * 6769 *
6770 * The per-cpu batchsize and zone watermarks are determined by managed_pages. 6770 * The per-cpu batchsize and zone watermarks are determined by managed_pages.
6771 * In the DMA zone, a significant percentage may be consumed by kernel image 6771 * In the DMA zone, a significant percentage may be consumed by kernel image
6772 * and other unfreeable allocations which can skew the watermarks badly. This 6772 * and other unfreeable allocations which can skew the watermarks badly. This
6773 * function may optionally be used to account for unfreeable pages in the 6773 * function may optionally be used to account for unfreeable pages in the
6774 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 6774 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
6775 * smaller per-cpu batchsize. 6775 * smaller per-cpu batchsize.
6776 */ 6776 */
6777 void __init set_dma_reserve(unsigned long new_dma_reserve) 6777 void __init set_dma_reserve(unsigned long new_dma_reserve)
6778 { 6778 {
6779 dma_reserve = new_dma_reserve; 6779 dma_reserve = new_dma_reserve;
6780 } 6780 }
6781 6781
6782 void __init free_area_init(unsigned long *zones_size) 6782 void __init free_area_init(unsigned long *zones_size)
6783 { 6783 {
6784 free_area_init_node(0, zones_size, 6784 free_area_init_node(0, zones_size,
6785 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 6785 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
6786 } 6786 }
6787 6787
6788 static int page_alloc_cpu_dead(unsigned int cpu) 6788 static int page_alloc_cpu_dead(unsigned int cpu)
6789 { 6789 {
6790 6790
6791 lru_add_drain_cpu(cpu); 6791 lru_add_drain_cpu(cpu);
6792 drain_pages(cpu); 6792 drain_pages(cpu);
6793 6793
6794 /* 6794 /*
6795 * Spill the event counters of the dead processor 6795 * Spill the event counters of the dead processor
6796 * into the current processor's event counters. 6796 * into the current processor's event counters.
6797 * This artificially elevates the count of the current 6797 * This artificially elevates the count of the current
6798 * processor. 6798 * processor.
6799 */ 6799 */
6800 vm_events_fold_cpu(cpu); 6800 vm_events_fold_cpu(cpu);
6801 6801
6802 /* 6802 /*
6803 * Zero the differential counters of the dead processor 6803 * Zero the differential counters of the dead processor
6804 * so that the vm statistics are consistent. 6804 * so that the vm statistics are consistent.
6805 * 6805 *
6806 * This is only okay since the processor is dead and cannot 6806 * This is only okay since the processor is dead and cannot
6807 * race with what we are doing. 6807 * race with what we are doing.
6808 */ 6808 */
6809 cpu_vm_stats_fold(cpu); 6809 cpu_vm_stats_fold(cpu);
6810 return 0; 6810 return 0;
6811 } 6811 }
6812 6812
6813 void __init page_alloc_init(void) 6813 void __init page_alloc_init(void)
6814 { 6814 {
6815 int ret; 6815 int ret;
6816 6816
6817 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD, 6817 ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
6818 "mm/page_alloc:dead", NULL, 6818 "mm/page_alloc:dead", NULL,
6819 page_alloc_cpu_dead); 6819 page_alloc_cpu_dead);
6820 WARN_ON(ret < 0); 6820 WARN_ON(ret < 0);
6821 } 6821 }
6822 6822
6823 /* 6823 /*
6824 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 6824 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
6825 * or min_free_kbytes changes. 6825 * or min_free_kbytes changes.
6826 */ 6826 */
6827 static void calculate_totalreserve_pages(void) 6827 static void calculate_totalreserve_pages(void)
6828 { 6828 {
6829 struct pglist_data *pgdat; 6829 struct pglist_data *pgdat;
6830 unsigned long reserve_pages = 0; 6830 unsigned long reserve_pages = 0;
6831 enum zone_type i, j; 6831 enum zone_type i, j;
6832 6832
6833 for_each_online_pgdat(pgdat) { 6833 for_each_online_pgdat(pgdat) {
6834 6834
6835 pgdat->totalreserve_pages = 0; 6835 pgdat->totalreserve_pages = 0;
6836 6836
6837 for (i = 0; i < MAX_NR_ZONES; i++) { 6837 for (i = 0; i < MAX_NR_ZONES; i++) {
6838 struct zone *zone = pgdat->node_zones + i; 6838 struct zone *zone = pgdat->node_zones + i;
6839 long max = 0; 6839 long max = 0;
6840 6840
6841 /* Find valid and maximum lowmem_reserve in the zone */ 6841 /* Find valid and maximum lowmem_reserve in the zone */
6842 for (j = i; j < MAX_NR_ZONES; j++) { 6842 for (j = i; j < MAX_NR_ZONES; j++) {
6843 if (zone->lowmem_reserve[j] > max) 6843 if (zone->lowmem_reserve[j] > max)
6844 max = zone->lowmem_reserve[j]; 6844 max = zone->lowmem_reserve[j];
6845 } 6845 }
6846 6846
6847 /* we treat the high watermark as reserved pages. */ 6847 /* we treat the high watermark as reserved pages. */
6848 max += high_wmark_pages(zone); 6848 max += high_wmark_pages(zone);
6849 6849
6850 if (max > zone->managed_pages) 6850 if (max > zone->managed_pages)
6851 max = zone->managed_pages; 6851 max = zone->managed_pages;
6852 6852
6853 pgdat->totalreserve_pages += max; 6853 pgdat->totalreserve_pages += max;
6854 6854
6855 reserve_pages += max; 6855 reserve_pages += max;
6856 } 6856 }
6857 } 6857 }
6858 totalreserve_pages = reserve_pages; 6858 totalreserve_pages = reserve_pages;
6859 } 6859 }
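/*
 * Worked example (editor's note, illustrative numbers): a zone with
 * lowmem_reserve[] = { 0, 1984, 3952 } and high_wmark_pages() of 128
 * contributes max(1984, 3952) + 128 = 4080 pages to totalreserve_pages,
 * capped at the zone's managed_pages if that is smaller.
 */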
6860 6860
6861 /* 6861 /*
6862 * setup_per_zone_lowmem_reserve - called whenever 6862 * setup_per_zone_lowmem_reserve - called whenever
6863 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 6863 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
6864 * has a correct amount of reserved pages, so that an adequate number of 6864 * has a correct amount of reserved pages, so that an adequate number of
6865 * pages are left in the zone after a successful __alloc_pages(). 6865 * pages are left in the zone after a successful __alloc_pages().
6866 */ 6866 */
6867 static void setup_per_zone_lowmem_reserve(void) 6867 static void setup_per_zone_lowmem_reserve(void)
6868 { 6868 {
6869 struct pglist_data *pgdat; 6869 struct pglist_data *pgdat;
6870 enum zone_type j, idx; 6870 enum zone_type j, idx;
6871 6871
6872 for_each_online_pgdat(pgdat) { 6872 for_each_online_pgdat(pgdat) {
6873 for (j = 0; j < MAX_NR_ZONES; j++) { 6873 for (j = 0; j < MAX_NR_ZONES; j++) {
6874 struct zone *zone = pgdat->node_zones + j; 6874 struct zone *zone = pgdat->node_zones + j;
6875 unsigned long managed_pages = zone->managed_pages; 6875 unsigned long managed_pages = zone->managed_pages;
6876 6876
6877 zone->lowmem_reserve[j] = 0; 6877 zone->lowmem_reserve[j] = 0;
6878 6878
6879 idx = j; 6879 idx = j;
6880 while (idx) { 6880 while (idx) {
6881 struct zone *lower_zone; 6881 struct zone *lower_zone;
6882 6882
6883 idx--; 6883 idx--;
6884 6884
6885 if (sysctl_lowmem_reserve_ratio[idx] < 1) 6885 if (sysctl_lowmem_reserve_ratio[idx] < 1)
6886 sysctl_lowmem_reserve_ratio[idx] = 1; 6886 sysctl_lowmem_reserve_ratio[idx] = 1;
6887 6887
6888 lower_zone = pgdat->node_zones + idx; 6888 lower_zone = pgdat->node_zones + idx;
6889 lower_zone->lowmem_reserve[j] = managed_pages / 6889 lower_zone->lowmem_reserve[j] = managed_pages /
6890 sysctl_lowmem_reserve_ratio[idx]; 6890 sysctl_lowmem_reserve_ratio[idx];
6891 managed_pages += lower_zone->managed_pages; 6891 managed_pages += lower_zone->managed_pages;
6892 } 6892 }
6893 } 6893 }
6894 } 6894 }
6895 6895
6896 /* update totalreserve_pages */ 6896 /* update totalreserve_pages */
6897 calculate_totalreserve_pages(); 6897 calculate_totalreserve_pages();
6898 } 6898 }
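/*
 * Worked example (editor's note, assuming the common default
 * sysctl_lowmem_reserve_ratio of 256 for ZONE_DMA): with a ZONE_NORMAL of
 * 1048576 managed pages above it, the downward walk leaves ZONE_DMA with
 *	lowmem_reserve[ZONE_NORMAL] = 1048576 / 256 = 4096 pages
 * held back from allocations that could have been satisfied from
 * ZONE_NORMAL instead.
 */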
6899 6899
6900 static void __setup_per_zone_wmarks(void) 6900 static void __setup_per_zone_wmarks(void)
6901 { 6901 {
6902 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 6902 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
6903 unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10); 6903 unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
6904 unsigned long lowmem_pages = 0; 6904 unsigned long lowmem_pages = 0;
6905 struct zone *zone; 6905 struct zone *zone;
6906 unsigned long flags; 6906 unsigned long flags;
6907 6907
6908 /* Calculate total number of !ZONE_HIGHMEM pages */ 6908 /* Calculate total number of !ZONE_HIGHMEM pages */
6909 for_each_zone(zone) { 6909 for_each_zone(zone) {
6910 if (!is_highmem(zone)) 6910 if (!is_highmem(zone))
6911 lowmem_pages += zone->managed_pages; 6911 lowmem_pages += zone->managed_pages;
6912 } 6912 }
6913 6913
6914 for_each_zone(zone) { 6914 for_each_zone(zone) {
6915 u64 min, low; 6915 u64 min, low;
6916 6916
6917 spin_lock_irqsave(&zone->lock, flags); 6917 spin_lock_irqsave(&zone->lock, flags);
6918 min = (u64)pages_min * zone->managed_pages; 6918 min = (u64)pages_min * zone->managed_pages;
6919 do_div(min, lowmem_pages); 6919 do_div(min, lowmem_pages);
6920 low = (u64)pages_low * zone->managed_pages; 6920 low = (u64)pages_low * zone->managed_pages;
6921 do_div(low, vm_total_pages); 6921 do_div(low, vm_total_pages);
6922 6922
6923 if (is_highmem(zone)) { 6923 if (is_highmem(zone)) {
6924 /* 6924 /*
6925 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 6925 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
6926 * need highmem pages, so cap pages_min to a small 6926 * need highmem pages, so cap pages_min to a small
6927 * value here. 6927 * value here.
6928 * 6928 *
6929 * The (WMARK_HIGH - WMARK_LOW) and (WMARK_LOW - WMARK_MIN) 6929 * The (WMARK_HIGH - WMARK_LOW) and (WMARK_LOW - WMARK_MIN)
6930 * deltas control async page reclaim, and so should 6930 * deltas control async page reclaim, and so should
6931 * not be capped for highmem. 6931 * not be capped for highmem.
6932 */ 6932 */
6933 unsigned long min_pages; 6933 unsigned long min_pages;
6934 6934
6935 min_pages = zone->managed_pages / 1024; 6935 min_pages = zone->managed_pages / 1024;
6936 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 6936 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
6937 zone->watermark[WMARK_MIN] = min_pages; 6937 zone->watermark[WMARK_MIN] = min_pages;
6938 } else { 6938 } else {
6939 /* 6939 /*
6940 * If it's a lowmem zone, reserve a number of pages 6940 * If it's a lowmem zone, reserve a number of pages
6941 * proportionate to the zone's size. 6941 * proportionate to the zone's size.
6942 */ 6942 */
6943 zone->watermark[WMARK_MIN] = min; 6943 zone->watermark[WMARK_MIN] = min;
6944 } 6944 }
6945 6945
6946 /* 6946 /*
6947 * Set the kswapd watermarks distance according to the 6947 * Set the kswapd watermarks distance according to the
6948 * scale factor in proportion to available memory, but 6948 * scale factor in proportion to available memory, but
6949 * ensure a minimum size on small systems. 6949 * ensure a minimum size on small systems.
6950 */ 6950 */
6951 min = max_t(u64, min >> 2, 6951 min = max_t(u64, min >> 2,
6952 mult_frac(zone->managed_pages, 6952 mult_frac(zone->managed_pages,
6953 watermark_scale_factor, 10000)); 6953 watermark_scale_factor, 10000));
6954 6954
6955 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + 6955 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) +
6956 low + min; 6956 low + min;
6957 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + 6957 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
6958 low + min * 2; 6958 low + min * 2;
6959 6959
6960 spin_unlock_irqrestore(&zone->lock, flags); 6960 spin_unlock_irqrestore(&zone->lock, flags);
6961 } 6961 }
6962 6962
6963 /* update totalreserve_pages */ 6963 /* update totalreserve_pages */
6964 calculate_totalreserve_pages(); 6964 calculate_totalreserve_pages();
6965 } 6965 }
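/*
 * Worked example (editor's note, assuming 4K pages, min_free_kbytes=4096,
 * extra_free_kbytes=0 and the default watermark_scale_factor of 10): for a
 * single lowmem zone of 1048576 managed pages,
 *	pages_min  = 4096 kB / 4 kB                        = 1024 pages -> WMARK_MIN
 *	scaled min = max(1024 >> 2, 1048576 * 10 / 10000)  = 1048 pages
 *	WMARK_LOW  = 1024 + 1048                           = 2072 pages
 *	WMARK_HIGH = 1024 + 2 * 1048                       = 3120 pages
 */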
6966 6966
6967 /** 6967 /**
6968 * setup_per_zone_wmarks - called when min_free_kbytes changes 6968 * setup_per_zone_wmarks - called when min_free_kbytes changes
6969 * or when memory is hot-{added|removed} 6969 * or when memory is hot-{added|removed}
6970 * 6970 *
6971 * Ensures that the watermark[min,low,high] values for each zone are set 6971 * Ensures that the watermark[min,low,high] values for each zone are set
6972 * correctly with respect to min_free_kbytes. 6972 * correctly with respect to min_free_kbytes.
6973 */ 6973 */
6974 void setup_per_zone_wmarks(void) 6974 void setup_per_zone_wmarks(void)
6975 { 6975 {
6976 static DEFINE_SPINLOCK(lock); 6976 static DEFINE_SPINLOCK(lock);
6977 6977
6978 spin_lock(&lock); 6978 spin_lock(&lock);
6979 __setup_per_zone_wmarks(); 6979 __setup_per_zone_wmarks();
6980 spin_unlock(&lock); 6980 spin_unlock(&lock);
6981 } 6981 }
6982 6982
6983 /* 6983 /*
6984 * Initialise min_free_kbytes. 6984 * Initialise min_free_kbytes.
6985 * 6985 *
6986 * For small machines we want it small (128k min). For large machines 6986 * For small machines we want it small (128k min). For large machines
6987 * we want it large (64MB max). But it is not linear, because network 6987 * we want it large (64MB max). But it is not linear, because network
6988 * bandwidth does not increase linearly with machine size. We use 6988 * bandwidth does not increase linearly with machine size. We use
6989 * 6989 *
6990 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 6990 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
6991 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 6991 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
6992 * 6992 *
6993 * which yields 6993 * which yields
6994 * 6994 *
6995 * 16MB: 512k 6995 * 16MB: 512k
6996 * 32MB: 724k 6996 * 32MB: 724k
6997 * 64MB: 1024k 6997 * 64MB: 1024k
6998 * 128MB: 1448k 6998 * 128MB: 1448k
6999 * 256MB: 2048k 6999 * 256MB: 2048k
7000 * 512MB: 2896k 7000 * 512MB: 2896k
7001 * 1024MB: 4096k 7001 * 1024MB: 4096k
7002 * 2048MB: 5792k 7002 * 2048MB: 5792k
7003 * 4096MB: 8192k 7003 * 4096MB: 8192k
7004 * 8192MB: 11584k 7004 * 8192MB: 11584k
7005 * 16384MB: 16384k 7005 * 16384MB: 16384k
7006 */ 7006 */
7007 int __meminit init_per_zone_wmark_min(void) 7007 int __meminit init_per_zone_wmark_min(void)
7008 { 7008 {
7009 unsigned long lowmem_kbytes; 7009 unsigned long lowmem_kbytes;
7010 int new_min_free_kbytes; 7010 int new_min_free_kbytes;
7011 7011
7012 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 7012 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
7013 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 7013 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
7014 7014
7015 if (new_min_free_kbytes > user_min_free_kbytes) { 7015 if (new_min_free_kbytes > user_min_free_kbytes) {
7016 min_free_kbytes = new_min_free_kbytes; 7016 min_free_kbytes = new_min_free_kbytes;
7017 if (min_free_kbytes < 128) 7017 if (min_free_kbytes < 128)
7018 min_free_kbytes = 128; 7018 min_free_kbytes = 128;
7019 if (min_free_kbytes > 65536) 7019 if (min_free_kbytes > 65536)
7020 min_free_kbytes = 65536; 7020 min_free_kbytes = 65536;
7021 } else { 7021 } else {
7022 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 7022 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
7023 new_min_free_kbytes, user_min_free_kbytes); 7023 new_min_free_kbytes, user_min_free_kbytes);
7024 } 7024 }
7025 setup_per_zone_wmarks(); 7025 setup_per_zone_wmarks();
7026 refresh_zone_stat_thresholds(); 7026 refresh_zone_stat_thresholds();
7027 setup_per_zone_lowmem_reserve(); 7027 setup_per_zone_lowmem_reserve();
7028 7028
7029 #ifdef CONFIG_NUMA 7029 #ifdef CONFIG_NUMA
7030 setup_min_unmapped_ratio(); 7030 setup_min_unmapped_ratio();
7031 setup_min_slab_ratio(); 7031 setup_min_slab_ratio();
7032 #endif 7032 #endif
7033 7033
7034 return 0; 7034 return 0;
7035 } 7035 }
7036 core_initcall(init_per_zone_wmark_min) 7036 core_initcall(init_per_zone_wmark_min)
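/*
 * Worked example (editor's note): with roughly 4GB of lowmem,
 * lowmem_kbytes is about 4194304, so
 *	new_min_free_kbytes = int_sqrt(4194304 * 16) = int_sqrt(67108864) = 8192
 * matching the "4096MB: 8192k" row in the table above and sitting well
 * inside the [128, 65536] clamp.
 */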
7037 7037
7038 /* 7038 /*
7039 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 7039 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
7040 * that we can call two helper functions whenever min_free_kbytes 7040 * that we can call two helper functions whenever min_free_kbytes
7041 * or extra_free_kbytes changes. 7041 * or extra_free_kbytes changes.
7042 */ 7042 */
7043 int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, 7043 int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
7044 void __user *buffer, size_t *length, loff_t *ppos) 7044 void __user *buffer, size_t *length, loff_t *ppos)
7045 { 7045 {
7046 int rc; 7046 int rc;
7047 7047
7048 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 7048 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7049 if (rc) 7049 if (rc)
7050 return rc; 7050 return rc;
7051 7051
7052 if (write) { 7052 if (write) {
7053 user_min_free_kbytes = min_free_kbytes; 7053 user_min_free_kbytes = min_free_kbytes;
7054 setup_per_zone_wmarks(); 7054 setup_per_zone_wmarks();
7055 } 7055 }
7056 return 0; 7056 return 0;
7057 } 7057 }
7058 7058
7059 int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, 7059 int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
7060 void __user *buffer, size_t *length, loff_t *ppos) 7060 void __user *buffer, size_t *length, loff_t *ppos)
7061 { 7061 {
7062 int rc; 7062 int rc;
7063 7063
7064 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 7064 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7065 if (rc) 7065 if (rc)
7066 return rc; 7066 return rc;
7067 7067
7068 if (write) 7068 if (write)
7069 setup_per_zone_wmarks(); 7069 setup_per_zone_wmarks();
7070 7070
7071 return 0; 7071 return 0;
7072 } 7072 }
7073 7073
7074 #ifdef CONFIG_NUMA 7074 #ifdef CONFIG_NUMA
7075 static void setup_min_unmapped_ratio(void) 7075 static void setup_min_unmapped_ratio(void)
7076 { 7076 {
7077 pg_data_t *pgdat; 7077 pg_data_t *pgdat;
7078 struct zone *zone; 7078 struct zone *zone;
7079 7079
7080 for_each_online_pgdat(pgdat) 7080 for_each_online_pgdat(pgdat)
7081 pgdat->min_unmapped_pages = 0; 7081 pgdat->min_unmapped_pages = 0;
7082 7082
7083 for_each_zone(zone) 7083 for_each_zone(zone)
7084 zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages * 7084 zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
7085 sysctl_min_unmapped_ratio) / 100; 7085 sysctl_min_unmapped_ratio) / 100;
7086 } 7086 }
7087 7087
7088 7088
7089 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, 7089 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
7090 void __user *buffer, size_t *length, loff_t *ppos) 7090 void __user *buffer, size_t *length, loff_t *ppos)
7091 { 7091 {
7092 int rc; 7092 int rc;
7093 7093
7094 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 7094 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7095 if (rc) 7095 if (rc)
7096 return rc; 7096 return rc;
7097 7097
7098 setup_min_unmapped_ratio(); 7098 setup_min_unmapped_ratio();
7099 7099
7100 return 0; 7100 return 0;
7101 } 7101 }
7102 7102
7103 static void setup_min_slab_ratio(void) 7103 static void setup_min_slab_ratio(void)
7104 { 7104 {
7105 pg_data_t *pgdat; 7105 pg_data_t *pgdat;
7106 struct zone *zone; 7106 struct zone *zone;
7107 7107
7108 for_each_online_pgdat(pgdat) 7108 for_each_online_pgdat(pgdat)
7109 pgdat->min_slab_pages = 0; 7109 pgdat->min_slab_pages = 0;
7110 7110
7111 for_each_zone(zone) 7111 for_each_zone(zone)
7112 zone->zone_pgdat->min_slab_pages += (zone->managed_pages * 7112 zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
7113 sysctl_min_slab_ratio) / 100; 7113 sysctl_min_slab_ratio) / 100;
7114 } 7114 }
7115 7115
7116 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, 7116 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
7117 void __user *buffer, size_t *length, loff_t *ppos) 7117 void __user *buffer, size_t *length, loff_t *ppos)
7118 { 7118 {
7119 int rc; 7119 int rc;
7120 7120
7121 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 7121 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7122 if (rc) 7122 if (rc)
7123 return rc; 7123 return rc;
7124 7124
7125 setup_min_slab_ratio(); 7125 setup_min_slab_ratio();
7126 7126
7127 return 0; 7127 return 0;
7128 } 7128 }
7129 #endif 7129 #endif
7130 7130
7131 /* 7131 /*
7132 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 7132 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
7133 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 7133 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
7134 * whenever sysctl_lowmem_reserve_ratio changes. 7134 * whenever sysctl_lowmem_reserve_ratio changes.
7135 * 7135 *
7136 * The reserve ratio has no relation to the minimum watermarks. The 7136 * The reserve ratio has no relation to the minimum watermarks. The
7137 * lowmem reserve ratio only makes sense as a function of the 7137 * lowmem reserve ratio only makes sense as a function of the
7138 * boot-time zone sizes. 7138 * boot-time zone sizes.
7139 */ 7139 */
7140 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, 7140 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
7141 void __user *buffer, size_t *length, loff_t *ppos) 7141 void __user *buffer, size_t *length, loff_t *ppos)
7142 { 7142 {
7143 proc_dointvec_minmax(table, write, buffer, length, ppos); 7143 proc_dointvec_minmax(table, write, buffer, length, ppos);
7144 setup_per_zone_lowmem_reserve(); 7144 setup_per_zone_lowmem_reserve();
7145 return 0; 7145 return 0;
7146 } 7146 }
7147 7147
7148 /* 7148 /*
7149 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 7149 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
7150 * cpu. It is the fraction of total pages in each zone that a hot per-cpu 7150 * cpu. It is the fraction of total pages in each zone that a hot per-cpu
7151 * pagelist can have before it gets flushed back to the buddy allocator. 7151 * pagelist can have before it gets flushed back to the buddy allocator.
7152 */ 7152 */
7153 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, 7153 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
7154 void __user *buffer, size_t *length, loff_t *ppos) 7154 void __user *buffer, size_t *length, loff_t *ppos)
7155 { 7155 {
7156 struct zone *zone; 7156 struct zone *zone;
7157 int old_percpu_pagelist_fraction; 7157 int old_percpu_pagelist_fraction;
7158 int ret; 7158 int ret;
7159 7159
7160 mutex_lock(&pcp_batch_high_lock); 7160 mutex_lock(&pcp_batch_high_lock);
7161 old_percpu_pagelist_fraction = percpu_pagelist_fraction; 7161 old_percpu_pagelist_fraction = percpu_pagelist_fraction;
7162 7162
7163 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 7163 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
7164 if (!write || ret < 0) 7164 if (!write || ret < 0)
7165 goto out; 7165 goto out;
7166 7166
7167 /* Sanity checking to avoid pcp imbalance */ 7167 /* Sanity checking to avoid pcp imbalance */
7168 if (percpu_pagelist_fraction && 7168 if (percpu_pagelist_fraction &&
7169 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { 7169 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
7170 percpu_pagelist_fraction = old_percpu_pagelist_fraction; 7170 percpu_pagelist_fraction = old_percpu_pagelist_fraction;
7171 ret = -EINVAL; 7171 ret = -EINVAL;
7172 goto out; 7172 goto out;
7173 } 7173 }
7174 7174
7175 /* No change? */ 7175 /* No change? */
7176 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) 7176 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
7177 goto out; 7177 goto out;
7178 7178
7179 for_each_populated_zone(zone) { 7179 for_each_populated_zone(zone) {
7180 unsigned int cpu; 7180 unsigned int cpu;
7181 7181
7182 for_each_possible_cpu(cpu) 7182 for_each_possible_cpu(cpu)
7183 pageset_set_high_and_batch(zone, 7183 pageset_set_high_and_batch(zone,
7184 per_cpu_ptr(zone->pageset, cpu)); 7184 per_cpu_ptr(zone->pageset, cpu));
7185 } 7185 }
7186 out: 7186 out:
7187 mutex_unlock(&pcp_batch_high_lock); 7187 mutex_unlock(&pcp_batch_high_lock);
7188 return ret; 7188 return ret;
7189 } 7189 }
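/*
 * Hedged example (editor's note, illustrative numbers): writing 8 to
 * /proc/sys/vm/percpu_pagelist_fraction asks for a per-cpu high mark of
 * roughly zone->managed_pages / 8, so a 1048576-page zone would let about
 * 131072 pages sit on one CPU's pcp list before being drained back to the
 * buddy allocator.  Non-zero values below MIN_PERCPU_PAGELIST_FRACTION are
 * rejected with -EINVAL by the sanity check above.
 */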
7190 7190
7191 #ifdef CONFIG_NUMA 7191 #ifdef CONFIG_NUMA
7192 int hashdist = HASHDIST_DEFAULT; 7192 int hashdist = HASHDIST_DEFAULT;
7193 7193
7194 static int __init set_hashdist(char *str) 7194 static int __init set_hashdist(char *str)
7195 { 7195 {
7196 if (!str) 7196 if (!str)
7197 return 0; 7197 return 0;
7198 hashdist = simple_strtoul(str, &str, 0); 7198 hashdist = simple_strtoul(str, &str, 0);
7199 return 1; 7199 return 1;
7200 } 7200 }
7201 __setup("hashdist=", set_hashdist); 7201 __setup("hashdist=", set_hashdist);
7202 #endif 7202 #endif
7203 7203
7204 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES 7204 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
7205 /* 7205 /*
7206 * Returns the number of pages that the arch has reserved but 7206 * Returns the number of pages that the arch has reserved but
7207 * which are not known to alloc_large_system_hash(). 7207 * which are not known to alloc_large_system_hash().
7208 */ 7208 */
7209 static unsigned long __init arch_reserved_kernel_pages(void) 7209 static unsigned long __init arch_reserved_kernel_pages(void)
7210 { 7210 {
7211 return 0; 7211 return 0;
7212 } 7212 }
7213 #endif 7213 #endif
7214 7214
7215 /* 7215 /*
7216 * Adaptive scale is meant to reduce sizes of hash tables on large memory 7216 * Adaptive scale is meant to reduce sizes of hash tables on large memory
7217 * machines. As memory size increases, the scale also increases, but at a 7217 * machines. As memory size increases, the scale also increases, but at a
7218 * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory 7218 * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
7219 * quadruples the scale is increased by one, which means the size of the hash 7219 * quadruples the scale is increased by one, which means the size of the hash
7220 * table only doubles, instead of quadrupling as well. 7220 * table only doubles, instead of quadrupling as well.
7221 * Because 32-bit systems cannot have large physical memory, where this scaling 7221 * Because 32-bit systems cannot have large physical memory, where this scaling
7222 * makes sense, it is disabled on such platforms. 7222 * makes sense, it is disabled on such platforms.
7223 */ 7223 */
7224 #if __BITS_PER_LONG > 32 7224 #if __BITS_PER_LONG > 32
7225 #define ADAPT_SCALE_BASE (64ul << 30) 7225 #define ADAPT_SCALE_BASE (64ul << 30)
7226 #define ADAPT_SCALE_SHIFT 2 7226 #define ADAPT_SCALE_SHIFT 2
7227 #define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT) 7227 #define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
7228 #endif 7228 #endif
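/*
 * Worked example (editor's note): going from 64GB to 256GB of memory
 * quadruples numentries (with 4K pages, 16777216 -> 67108864), and the
 * adapt loop below bumps the scale by exactly one, since ADAPT_SCALE_NPAGES
 * fits under numentries only once.  That one extra unit of scale halves the
 * final bucket count, so the table doubles rather than quadruples.
 */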
7229 7229
7230 /* 7230 /*
7231 * allocate a large system hash table from bootmem 7231 * allocate a large system hash table from bootmem
7232 * - it is assumed that the hash table must contain an exact power-of-2 7232 * - it is assumed that the hash table must contain an exact power-of-2
7233 * quantity of entries 7233 * quantity of entries
7234 * - limit is the number of hash buckets, not the total allocation size 7234 * - limit is the number of hash buckets, not the total allocation size
7235 */ 7235 */
7236 void *__init alloc_large_system_hash(const char *tablename, 7236 void *__init alloc_large_system_hash(const char *tablename,
7237 unsigned long bucketsize, 7237 unsigned long bucketsize,
7238 unsigned long numentries, 7238 unsigned long numentries,
7239 int scale, 7239 int scale,
7240 int flags, 7240 int flags,
7241 unsigned int *_hash_shift, 7241 unsigned int *_hash_shift,
7242 unsigned int *_hash_mask, 7242 unsigned int *_hash_mask,
7243 unsigned long low_limit, 7243 unsigned long low_limit,
7244 unsigned long high_limit) 7244 unsigned long high_limit)
7245 { 7245 {
7246 unsigned long long max = high_limit; 7246 unsigned long long max = high_limit;
7247 unsigned long log2qty, size; 7247 unsigned long log2qty, size;
7248 void *table = NULL; 7248 void *table = NULL;
7249 gfp_t gfp_flags; 7249 gfp_t gfp_flags;
7250 7250
7251 /* allow the kernel cmdline to have a say */ 7251 /* allow the kernel cmdline to have a say */
7252 if (!numentries) { 7252 if (!numentries) {
7253 /* round applicable memory size up to nearest megabyte */ 7253 /* round applicable memory size up to nearest megabyte */
7254 numentries = nr_kernel_pages; 7254 numentries = nr_kernel_pages;
7255 numentries -= arch_reserved_kernel_pages(); 7255 numentries -= arch_reserved_kernel_pages();
7256 7256
7257 /* It isn't necessary when PAGE_SIZE >= 1MB */ 7257 /* It isn't necessary when PAGE_SIZE >= 1MB */
7258 if (PAGE_SHIFT < 20) 7258 if (PAGE_SHIFT < 20)
7259 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 7259 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
7260 7260
7261 #if __BITS_PER_LONG > 32 7261 #if __BITS_PER_LONG > 32
7262 if (!high_limit) { 7262 if (!high_limit) {
7263 unsigned long adapt; 7263 unsigned long adapt;
7264 7264
7265 for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries; 7265 for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
7266 adapt <<= ADAPT_SCALE_SHIFT) 7266 adapt <<= ADAPT_SCALE_SHIFT)
7267 scale++; 7267 scale++;
7268 } 7268 }
7269 #endif 7269 #endif
7270 7270
7271 /* limit to 1 bucket per 2^scale bytes of low memory */ 7271 /* limit to 1 bucket per 2^scale bytes of low memory */
7272 if (scale > PAGE_SHIFT) 7272 if (scale > PAGE_SHIFT)
7273 numentries >>= (scale - PAGE_SHIFT); 7273 numentries >>= (scale - PAGE_SHIFT);
7274 else 7274 else
7275 numentries <<= (PAGE_SHIFT - scale); 7275 numentries <<= (PAGE_SHIFT - scale);
7276 7276
7277 /* Make sure we've got at least a 0-order allocation. */ 7277 /* Make sure we've got at least a 0-order allocation. */
7278 if (unlikely(flags & HASH_SMALL)) { 7278 if (unlikely(flags & HASH_SMALL)) {
7279 /* Makes no sense without HASH_EARLY */ 7279 /* Makes no sense without HASH_EARLY */
7280 WARN_ON(!(flags & HASH_EARLY)); 7280 WARN_ON(!(flags & HASH_EARLY));
7281 if (!(numentries >> *_hash_shift)) { 7281 if (!(numentries >> *_hash_shift)) {
7282 numentries = 1UL << *_hash_shift; 7282 numentries = 1UL << *_hash_shift;
7283 BUG_ON(!numentries); 7283 BUG_ON(!numentries);
7284 } 7284 }
7285 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 7285 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
7286 numentries = PAGE_SIZE / bucketsize; 7286 numentries = PAGE_SIZE / bucketsize;
7287 } 7287 }
7288 numentries = roundup_pow_of_two(numentries); 7288 numentries = roundup_pow_of_two(numentries);
7289 7289
7290 /* limit allocation size to 1/16 total memory by default */ 7290 /* limit allocation size to 1/16 total memory by default */
7291 if (max == 0) { 7291 if (max == 0) {
7292 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 7292 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
7293 do_div(max, bucketsize); 7293 do_div(max, bucketsize);
7294 } 7294 }
7295 max = min(max, 0x80000000ULL); 7295 max = min(max, 0x80000000ULL);
7296 7296
7297 if (numentries < low_limit) 7297 if (numentries < low_limit)
7298 numentries = low_limit; 7298 numentries = low_limit;
7299 if (numentries > max) 7299 if (numentries > max)
7300 numentries = max; 7300 numentries = max;
7301 7301
7302 log2qty = ilog2(numentries); 7302 log2qty = ilog2(numentries);
7303 7303
7304 /* 7304 /*
7305 * The memblock allocator already returns zeroed memory, so HASH_ZERO is 7305 * The memblock allocator already returns zeroed memory, so HASH_ZERO is
7306 * currently not used when HASH_EARLY is specified. 7306 * currently not used when HASH_EARLY is specified.
7307 */ 7307 */
7308 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; 7308 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
7309 do { 7309 do {
7310 size = bucketsize << log2qty; 7310 size = bucketsize << log2qty;
7311 if (flags & HASH_EARLY) 7311 if (flags & HASH_EARLY)
7312 table = memblock_virt_alloc_nopanic(size, 0); 7312 table = memblock_virt_alloc_nopanic(size, 0);
7313 else if (hashdist) 7313 else if (hashdist)
7314 table = __vmalloc(size, gfp_flags, PAGE_KERNEL); 7314 table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
7315 else { 7315 else {
7316 /* 7316 /*
7317 * If bucketsize is not a power-of-two, we may free 7317 * If bucketsize is not a power-of-two, we may free
7318 * some pages at the end of the hash table, which 7318 * some pages at the end of the hash table, which
7319 * alloc_pages_exact() does automatically. 7319 * alloc_pages_exact() does automatically.
7320 */ 7320 */
7321 if (get_order(size) < MAX_ORDER) { 7321 if (get_order(size) < MAX_ORDER) {
7322 table = alloc_pages_exact(size, gfp_flags); 7322 table = alloc_pages_exact(size, gfp_flags);
7323 kmemleak_alloc(table, size, 1, gfp_flags); 7323 kmemleak_alloc(table, size, 1, gfp_flags);
7324 } 7324 }
7325 } 7325 }
7326 } while (!table && size > PAGE_SIZE && --log2qty); 7326 } while (!table && size > PAGE_SIZE && --log2qty);
7327 7327
7328 if (!table) 7328 if (!table)
7329 panic("Failed to allocate %s hash table\n", tablename); 7329 panic("Failed to allocate %s hash table\n", tablename);
7330 7330
7331 pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n", 7331 pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
7332 tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size); 7332 tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
7333 7333
7334 if (_hash_shift) 7334 if (_hash_shift)
7335 *_hash_shift = log2qty; 7335 *_hash_shift = log2qty;
7336 if (_hash_mask) 7336 if (_hash_mask)
7337 *_hash_mask = (1 << log2qty) - 1; 7337 *_hash_mask = (1 << log2qty) - 1;
7338 7338
7339 return table; 7339 return table;
7340 } 7340 }
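
For illustration, a minimal sketch (not part of this diff) of how a boot-time caller might size a table with alloc_large_system_hash(); the table name, variable names, and scale value below are made-up examples, not an existing in-tree user.

static struct hlist_head *example_table __read_mostly;
static unsigned int example_shift __read_mostly;
static unsigned int example_mask __read_mostly;

static void __init example_hash_init(void)
{
	/* One bucket per 2^14 bytes of low memory, zero-filled, sized at boot. */
	example_table = alloc_large_system_hash("Example-hash",
					sizeof(struct hlist_head),
					0,		/* numentries: let memory size decide */
					14,		/* scale */
					HASH_ZERO,	/* flags */
					&example_shift,
					&example_mask,
					0,		/* low_limit */
					0);		/* high_limit: default cap of 1/16 of memory */
}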
7341 7341
7342 /* 7342 /*
7343 * This function checks whether the pageblock includes unmovable pages or not. 7343 * This function checks whether the pageblock includes unmovable pages or not.
7344 * If @count is not zero, it is okay to include up to @count unmovable pages. 7344 * If @count is not zero, it is okay to include up to @count unmovable pages.
7345 * 7345 *
7346 * PageLRU check without isolation or lru_lock could race so that 7346 * PageLRU check without isolation or lru_lock could race so that
7347 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable 7347 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
7348 * check without lock_page may also miss some movable non-LRU pages in 7348 * check without lock_page may also miss some movable non-LRU pages in
7349 * a race. So this function cannot be expected to be exact. 7349 * a race. So this function cannot be expected to be exact.
7350 */ 7350 */
7351 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 7351 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7352 bool skip_hwpoisoned_pages) 7352 bool skip_hwpoisoned_pages)
7353 { 7353 {
7354 unsigned long pfn, iter, found; 7354 unsigned long pfn, iter, found;
7355 int mt; 7355 int mt;
7356 7356
7357 /* 7357 /*
7358 * To avoid noisy data, lru_add_drain_all() should be called. 7358 * To avoid noisy data, lru_add_drain_all() should be called.
7359 * A ZONE_MOVABLE zone never contains unmovable pages. 7359 * A ZONE_MOVABLE zone never contains unmovable pages.
7360 */ 7360 */
7361 if (zone_idx(zone) == ZONE_MOVABLE) 7361 if (zone_idx(zone) == ZONE_MOVABLE)
7362 return false; 7362 return false;
7363 mt = get_pageblock_migratetype(page); 7363 mt = get_pageblock_migratetype(page);
7364 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 7364 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
7365 return false; 7365 return false;
7366 7366
7367 pfn = page_to_pfn(page); 7367 pfn = page_to_pfn(page);
7368 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 7368 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
7369 unsigned long check = pfn + iter; 7369 unsigned long check = pfn + iter;
7370 7370
7371 if (!pfn_valid_within(check)) 7371 if (!pfn_valid_within(check))
7372 continue; 7372 continue;
7373 7373
7374 page = pfn_to_page(check); 7374 page = pfn_to_page(check);
7375 7375
7376 /* 7376 /*
7377 * Hugepages are not in LRU lists, but they're movable. 7377 * Hugepages are not in LRU lists, but they're movable.
7378 * We need not scan over tail pages because we don't 7378 * We need not scan over tail pages because we don't
7379 * handle each tail page individually in migration. 7379 * handle each tail page individually in migration.
7380 */ 7380 */
7381 if (PageHuge(page)) { 7381 if (PageHuge(page)) {
7382 iter = round_up(iter + 1, 1<<compound_order(page)) - 1; 7382 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
7383 continue; 7383 continue;
7384 } 7384 }
7385 7385
7386 /* 7386 /*
7387 * We can't use page_count without pinning the page 7387 * We can't use page_count without pinning the page
7388 * because another CPU can free the compound page. 7388 * because another CPU can free the compound page.
7389 * This check already skips compound tails of THP 7389 * This check already skips compound tails of THP
7390 * because their page->_refcount is zero at all times. 7390 * because their page->_refcount is zero at all times.
7391 */ 7391 */
7392 if (!page_ref_count(page)) { 7392 if (!page_ref_count(page)) {
7393 if (PageBuddy(page)) 7393 if (PageBuddy(page))
7394 iter += (1 << page_order(page)) - 1; 7394 iter += (1 << page_order(page)) - 1;
7395 continue; 7395 continue;
7396 } 7396 }
7397 7397
7398 /* 7398 /*
7399 * A HWPoisoned page may not be in the buddy system, and 7399 * A HWPoisoned page may not be in the buddy system, and
7400 * its page_count() is not 0. 7400 * its page_count() is not 0.
7401 */ 7401 */
7402 if (skip_hwpoisoned_pages && PageHWPoison(page)) 7402 if (skip_hwpoisoned_pages && PageHWPoison(page))
7403 continue; 7403 continue;
7404 7404
7405 if (__PageMovable(page)) 7405 if (__PageMovable(page))
7406 continue; 7406 continue;
7407 7407
7408 if (!PageLRU(page)) 7408 if (!PageLRU(page))
7409 found++; 7409 found++;
7410 /* 7410 /*
7411 * If there are RECLAIMABLE pages, we need to check 7411 * If there are RECLAIMABLE pages, we need to check
7412 * them. But for now, memory offline itself doesn't call 7412 * them. But for now, memory offline itself doesn't call
7413 * shrink_node_slabs(), and this still needs to be fixed. 7413 * shrink_node_slabs(), and this still needs to be fixed.
7414 */ 7414 */
7415 /* 7415 /*
7416 * If the page is not RAM, page_count() should be 0 and 7416 * If the page is not RAM, page_count() should be 0 and
7417 * we don't need further checks. This is a _used_, non-movable page. 7417 * we don't need further checks. This is a _used_, non-movable page.
7418 * 7418 *
7419 * The problematic thing here is PG_reserved pages. PG_reserved 7419 * The problematic thing here is PG_reserved pages. PG_reserved
7420 * is set on both memory hole pages and _used_ kernel 7420 * is set on both memory hole pages and _used_ kernel
7421 * pages at boot. 7421 * pages at boot.
7422 */ 7422 */
7423 if (found > count) 7423 if (found > count)
7424 return true; 7424 return true;
7425 } 7425 }
7426 return false; 7426 return false;
7427 } 7427 }
7428 7428
7429 bool is_pageblock_removable_nolock(struct page *page) 7429 bool is_pageblock_removable_nolock(struct page *page)
7430 { 7430 {
7431 struct zone *zone; 7431 struct zone *zone;
7432 unsigned long pfn; 7432 unsigned long pfn;
7433 7433
7434 /* 7434 /*
7435 * We have to be careful here because we are iterating over memory 7435 * We have to be careful here because we are iterating over memory
7436 * sections which are not zone aware so we might end up outside of 7436 * sections which are not zone aware so we might end up outside of
7437 * the zone but still within the section. 7437 * the zone but still within the section.
7438 * We have to take the node into account as well. If the node is offline, 7438 * We have to take the node into account as well. If the node is offline,
7439 * its NODE_DATA will be NULL - see page_zone. 7439 * its NODE_DATA will be NULL - see page_zone.
7440 */ 7440 */
7441 if (!node_online(page_to_nid(page))) 7441 if (!node_online(page_to_nid(page)))
7442 return false; 7442 return false;
7443 7443
7444 zone = page_zone(page); 7444 zone = page_zone(page);
7445 pfn = page_to_pfn(page); 7445 pfn = page_to_pfn(page);
7446 if (!zone_spans_pfn(zone, pfn)) 7446 if (!zone_spans_pfn(zone, pfn))
7447 return false; 7447 return false;
7448 7448
7449 return !has_unmovable_pages(zone, page, 0, true); 7449 return !has_unmovable_pages(zone, page, 0, true);
7450 } 7450 }
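
A rough sketch of how this check is typically consumed (the helper below is hypothetical, loosely modelled on the memory hot-remove path, which probes a range pageblock by pageblock):

/* Hypothetical: true iff every valid pageblock in the range looks removable. */
static bool example_range_removable(unsigned long start_pfn,
				    unsigned long nr_pages)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
	     pfn += pageblock_nr_pages) {
		if (!pfn_valid(pfn))
			continue;
		if (!is_pageblock_removable_nolock(pfn_to_page(pfn)))
			return false;
	}
	return true;
}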
7451 7451
7452 #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) 7452 #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
7453 7453
7454 static unsigned long pfn_max_align_down(unsigned long pfn) 7454 static unsigned long pfn_max_align_down(unsigned long pfn)
7455 { 7455 {
7456 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 7456 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
7457 pageblock_nr_pages) - 1); 7457 pageblock_nr_pages) - 1);
7458 } 7458 }
7459 7459
7460 static unsigned long pfn_max_align_up(unsigned long pfn) 7460 static unsigned long pfn_max_align_up(unsigned long pfn)
7461 { 7461 {
7462 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 7462 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
7463 pageblock_nr_pages)); 7463 pageblock_nr_pages));
7464 } 7464 }
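
As a worked example, assuming 4 KiB pages with MAX_ORDER_NR_PAGES = 1024 and pageblock_nr_pages = 512 (common x86_64 defaults), the effective alignment is 1024 (0x400) PFNs:

/*
 * pfn_max_align_down(0x12345) == 0x12345 & ~0x3ff        == 0x12000
 * pfn_max_align_up(0x12388)   == ALIGN(0x12388, 0x400)   == 0x12400
 * so a request for [0x12345, 0x12388) is isolated over [0x12000, 0x12400).
 */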
7465 7465
7466 /* [start, end) must belong to a single zone. */ 7466 /* [start, end) must belong to a single zone. */
7467 static int __alloc_contig_migrate_range(struct compact_control *cc, 7467 static int __alloc_contig_migrate_range(struct compact_control *cc,
7468 unsigned long start, unsigned long end) 7468 unsigned long start, unsigned long end)
7469 { 7469 {
7470 /* This function is based on compact_zone() from compaction.c. */ 7470 /* This function is based on compact_zone() from compaction.c. */
7471 unsigned long nr_reclaimed; 7471 unsigned long nr_reclaimed;
7472 unsigned long pfn = start; 7472 unsigned long pfn = start;
7473 unsigned int tries = 0; 7473 unsigned int tries = 0;
7474 int ret = 0; 7474 int ret = 0;
7475 7475
7476 migrate_prep(); 7476 migrate_prep();
7477 7477
7478 while (pfn < end || !list_empty(&cc->migratepages)) { 7478 while (pfn < end || !list_empty(&cc->migratepages)) {
7479 if (fatal_signal_pending(current)) { 7479 if (fatal_signal_pending(current)) {
7480 ret = -EINTR; 7480 ret = -EINTR;
7481 break; 7481 break;
7482 } 7482 }
7483 7483
7484 if (list_empty(&cc->migratepages)) { 7484 if (list_empty(&cc->migratepages)) {
7485 cc->nr_migratepages = 0; 7485 cc->nr_migratepages = 0;
7486 pfn = isolate_migratepages_range(cc, pfn, end); 7486 pfn = isolate_migratepages_range(cc, pfn, end);
7487 if (!pfn) { 7487 if (!pfn) {
7488 ret = -EINTR; 7488 ret = -EINTR;
7489 break; 7489 break;
7490 } 7490 }
7491 tries = 0; 7491 tries = 0;
7492 } else if (++tries == 5) { 7492 } else if (++tries == 5) {
7493 ret = ret < 0 ? ret : -EBUSY; 7493 ret = ret < 0 ? ret : -EBUSY;
7494 break; 7494 break;
7495 } 7495 }
7496 7496
7497 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 7497 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
7498 &cc->migratepages); 7498 &cc->migratepages);
7499 cc->nr_migratepages -= nr_reclaimed; 7499 cc->nr_migratepages -= nr_reclaimed;
7500 7500
7501 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 7501 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
7502 NULL, 0, cc->mode, MR_CMA); 7502 NULL, 0, cc->mode, MR_CMA);
7503 } 7503 }
7504 if (ret < 0) { 7504 if (ret < 0) {
7505 putback_movable_pages(&cc->migratepages); 7505 putback_movable_pages(&cc->migratepages);
7506 return ret; 7506 return ret;
7507 } 7507 }
7508 return 0; 7508 return 0;
7509 } 7509 }
7510 7510
7511 /** 7511 /**
7512 * alloc_contig_range() -- tries to allocate given range of pages 7512 * alloc_contig_range() -- tries to allocate given range of pages
7513 * @start: start PFN to allocate 7513 * @start: start PFN to allocate
7514 * @end: one-past-the-last PFN to allocate 7514 * @end: one-past-the-last PFN to allocate
7515 * @migratetype: migratetype of the underlying pageblocks (either 7515 * @migratetype: migratetype of the underlying pageblocks (either
7516 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 7516 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
7517 * in range must have the same migratetype and it must 7517 * in range must have the same migratetype and it must
7518 * be either of the two. 7518 * be either of the two.
7519 * @gfp_mask: GFP mask to use during compaction 7519 * @gfp_mask: GFP mask to use during compaction
7520 * 7520 *
7521 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 7521 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
7522 * aligned, however it's the caller's responsibility to guarantee that 7522 * aligned, however it's the caller's responsibility to guarantee that
7523 * we are the only thread that changes migrate type of pageblocks the 7523 * we are the only thread that changes migrate type of pageblocks the
7524 * pages fall in. 7524 * pages fall in.
7525 * 7525 *
7526 * The PFN range must belong to a single zone. 7526 * The PFN range must belong to a single zone.
7527 * 7527 *
7528 * Returns zero on success or a negative error code. On success, all 7528 * Returns zero on success or a negative error code. On success, all
7529 * pages whose PFN is in [start, end) are allocated for the caller and 7529 * pages whose PFN is in [start, end) are allocated for the caller and
7530 * need to be freed with free_contig_range(). 7530 * need to be freed with free_contig_range().
7531 */ 7531 */
7532 int alloc_contig_range(unsigned long start, unsigned long end, 7532 int alloc_contig_range(unsigned long start, unsigned long end,
7533 unsigned migratetype, gfp_t gfp_mask) 7533 unsigned migratetype, gfp_t gfp_mask)
7534 { 7534 {
7535 unsigned long outer_start, outer_end; 7535 unsigned long outer_start, outer_end;
7536 unsigned int order; 7536 unsigned int order;
7537 int ret = 0; 7537 int ret = 0;
7538 7538
7539 struct compact_control cc = { 7539 struct compact_control cc = {
7540 .nr_migratepages = 0, 7540 .nr_migratepages = 0,
7541 .order = -1, 7541 .order = -1,
7542 .zone = page_zone(pfn_to_page(start)), 7542 .zone = page_zone(pfn_to_page(start)),
7543 .mode = MIGRATE_SYNC, 7543 .mode = MIGRATE_SYNC,
7544 .ignore_skip_hint = true, 7544 .ignore_skip_hint = true,
7545 .gfp_mask = current_gfp_context(gfp_mask), 7545 .gfp_mask = current_gfp_context(gfp_mask),
7546 }; 7546 };
7547 INIT_LIST_HEAD(&cc.migratepages); 7547 INIT_LIST_HEAD(&cc.migratepages);
7548 7548
7549 /* 7549 /*
7550 * What we do here is we mark all pageblocks in range as 7550 * What we do here is we mark all pageblocks in range as
7551 * MIGRATE_ISOLATE. Because pageblock and max order pages may 7551 * MIGRATE_ISOLATE. Because pageblock and max order pages may
7552 * have different sizes, and due to the way the page allocator 7552 * have different sizes, and due to the way the page allocator
7553 * works, we align the range to the bigger of the two sizes so 7553 * works, we align the range to the bigger of the two sizes so
7554 * that the page allocator won't try to merge buddies from 7554 * that the page allocator won't try to merge buddies from
7555 * different pageblocks and change MIGRATE_ISOLATE to some 7555 * different pageblocks and change MIGRATE_ISOLATE to some
7556 * other migration type. 7556 * other migration type.
7557 * 7557 *
7558 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 7558 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
7559 * migrate the pages from an unaligned range (i.e. pages that 7559 * migrate the pages from an unaligned range (i.e. pages that
7560 * we are interested in). This will put all the pages in 7560 * we are interested in). This will put all the pages in
7561 * range back to page allocator as MIGRATE_ISOLATE. 7561 * range back to page allocator as MIGRATE_ISOLATE.
7562 * 7562 *
7563 * When this is done, we take the pages in range from the page 7563 * When this is done, we take the pages in range from the page
7564 * allocator, removing them from the buddy system. This way the 7564 * allocator, removing them from the buddy system. This way the
7565 * page allocator will never consider using them. 7565 * page allocator will never consider using them.
7566 * 7566 *
7567 * This lets us mark the pageblocks back as 7567 * This lets us mark the pageblocks back as
7568 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 7568 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
7569 * aligned range but not in the unaligned, original range are 7569 * aligned range but not in the unaligned, original range are
7570 * put back to page allocator so that buddy can use them. 7570 * put back to page allocator so that buddy can use them.
7571 */ 7571 */
7572 7572
7573 ret = start_isolate_page_range(pfn_max_align_down(start), 7573 ret = start_isolate_page_range(pfn_max_align_down(start),
7574 pfn_max_align_up(end), migratetype, 7574 pfn_max_align_up(end), migratetype,
7575 false); 7575 false);
7576 if (ret) 7576 if (ret)
7577 return ret; 7577 return ret;
7578 7578
7579 /* 7579 /*
7580 * In case of -EBUSY, we'd like to know which page causes the problem. 7580 * In case of -EBUSY, we'd like to know which page causes the problem.
7581 * So, just fall through. test_pages_isolated() has a tracepoint 7581 * So, just fall through. test_pages_isolated() has a tracepoint
7582 * which will report the busy page. 7582 * which will report the busy page.
7583 * 7583 *
7584 * It is possible that busy pages could become available before 7584 * It is possible that busy pages could become available before
7585 * the call to test_pages_isolated, and the range will actually be 7585 * the call to test_pages_isolated, and the range will actually be
7586 * allocated. So, if we fall through, be sure to clear ret so that 7586 * allocated. So, if we fall through, be sure to clear ret so that
7587 * -EBUSY is not accidentally used or returned to the caller. 7587 * -EBUSY is not accidentally used or returned to the caller.
7588 */ 7588 */
7589 ret = __alloc_contig_migrate_range(&cc, start, end); 7589 ret = __alloc_contig_migrate_range(&cc, start, end);
7590 if (ret && ret != -EBUSY) 7590 if (ret && ret != -EBUSY)
7591 goto done; 7591 goto done;
7592 ret = 0; 7592 ret = 0;
7593 7593
7594 /* 7594 /*
7595 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 7595 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
7596 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 7596 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
7597 * more, all pages in [start, end) are free in page allocator. 7597 * more, all pages in [start, end) are free in page allocator.
7598 * What we are going to do is to allocate all pages from 7598 * What we are going to do is to allocate all pages from
7599 * [start, end) (that is remove them from page allocator). 7599 * [start, end) (that is remove them from page allocator).
7600 * 7600 *
7601 * The only problem is that pages at the beginning and at the 7601 * The only problem is that pages at the beginning and at the
7602 * end of the interesting range may not be aligned with pages that 7602 * end of the interesting range may not be aligned with pages that
7603 * the page allocator holds, i.e. they can be part of higher order 7603 * the page allocator holds, i.e. they can be part of higher order
7604 * pages. Because of this, we reserve the bigger range and 7604 * pages. Because of this, we reserve the bigger range and
7605 * once this is done free the pages we are not interested in. 7605 * once this is done free the pages we are not interested in.
7606 * 7606 *
7607 * We don't have to hold zone->lock here because the pages are 7607 * We don't have to hold zone->lock here because the pages are
7608 * isolated and thus won't get removed from the buddy allocator. 7608 * isolated and thus won't get removed from the buddy allocator.
7609 */ 7609 */
7610 7610
7611 lru_add_drain_all(); 7611 lru_add_drain_all();
7612 drain_all_pages(cc.zone); 7612 drain_all_pages(cc.zone);
7613 7613
7614 order = 0; 7614 order = 0;
7615 outer_start = start; 7615 outer_start = start;
7616 while (!PageBuddy(pfn_to_page(outer_start))) { 7616 while (!PageBuddy(pfn_to_page(outer_start))) {
7617 if (++order >= MAX_ORDER) { 7617 if (++order >= MAX_ORDER) {
7618 outer_start = start; 7618 outer_start = start;
7619 break; 7619 break;
7620 } 7620 }
7621 outer_start &= ~0UL << order; 7621 outer_start &= ~0UL << order;
7622 } 7622 }
7623 7623
7624 if (outer_start != start) { 7624 if (outer_start != start) {
7625 order = page_order(pfn_to_page(outer_start)); 7625 order = page_order(pfn_to_page(outer_start));
7626 7626
7627 /* 7627 /*
7628 * The outer_start page could be a small-order buddy page that 7628 * The outer_start page could be a small-order buddy page that
7629 * doesn't include the start page. Adjust outer_start 7629 * doesn't include the start page. Adjust outer_start
7630 * in this case so the failed page is reported properly 7630 * in this case so the failed page is reported properly
7631 * by the tracepoint in test_pages_isolated(). 7631 * by the tracepoint in test_pages_isolated().
7632 */ 7632 */
7633 if (outer_start + (1UL << order) <= start) 7633 if (outer_start + (1UL << order) <= start)
7634 outer_start = start; 7634 outer_start = start;
7635 } 7635 }
7636 7636
7637 /* Make sure the range is really isolated. */ 7637 /* Make sure the range is really isolated. */
7638 if (test_pages_isolated(outer_start, end, false)) { 7638 ret = test_pages_isolated(outer_start, end, false);
7639 pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n", 7639 if (ret) {
7640 __func__, outer_start, end);
7641 ret = -EBUSY; 7640 ret = -EBUSY;
7642 goto done; 7641 goto done;
7643 } 7642 }
7644 7643
7645 /* Grab isolated pages from freelists. */ 7644 /* Grab isolated pages from freelists. */
7646 outer_end = isolate_freepages_range(&cc, outer_start, end); 7645 outer_end = isolate_freepages_range(&cc, outer_start, end);
7647 if (!outer_end) { 7646 if (!outer_end) {
7648 ret = -EBUSY; 7647 ret = -EBUSY;
7649 goto done; 7648 goto done;
7650 } 7649 }
7651 7650
7652 /* Free head and tail (if any) */ 7651 /* Free head and tail (if any) */
7653 if (start != outer_start) 7652 if (start != outer_start)
7654 free_contig_range(outer_start, start - outer_start); 7653 free_contig_range(outer_start, start - outer_start);
7655 if (end != outer_end) 7654 if (end != outer_end)
7656 free_contig_range(end, outer_end - end); 7655 free_contig_range(end, outer_end - end);
7657 7656
7658 done: 7657 done:
7659 undo_isolate_page_range(pfn_max_align_down(start), 7658 undo_isolate_page_range(pfn_max_align_down(start),
7660 pfn_max_align_up(end), migratetype); 7659 pfn_max_align_up(end), migratetype);
7661 return ret; 7660 return ret;
7662 } 7661 }
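
A minimal, hypothetical caller sketch showing the intended pairing of alloc_contig_range() with free_contig_range(); real CMA users normally go through cma_alloc()/cma_release() instead, and the base PFN and page count here are placeholders.

static struct page *example_alloc_contig(unsigned long base_pfn,
					 unsigned long nr_pages)
{
	/*
	 * [base_pfn, base_pfn + nr_pages) must lie within one zone and in
	 * pageblocks that are all MIGRATE_CMA (or all MIGRATE_MOVABLE).
	 */
	if (alloc_contig_range(base_pfn, base_pfn + nr_pages,
			       MIGRATE_CMA, GFP_KERNEL))
		return NULL;
	return pfn_to_page(base_pfn);
}

static void example_free_contig(struct page *page, unsigned long nr_pages)
{
	free_contig_range(page_to_pfn(page), nr_pages);
}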
7663 7662
7664 void free_contig_range(unsigned long pfn, unsigned nr_pages) 7663 void free_contig_range(unsigned long pfn, unsigned nr_pages)
7665 { 7664 {
7666 unsigned int count = 0; 7665 unsigned int count = 0;
7667 7666
7668 for (; nr_pages--; pfn++) { 7667 for (; nr_pages--; pfn++) {
7669 struct page *page = pfn_to_page(pfn); 7668 struct page *page = pfn_to_page(pfn);
7670 7669
7671 count += page_count(page) != 1; 7670 count += page_count(page) != 1;
7672 __free_page(page); 7671 __free_page(page);
7673 } 7672 }
7674 WARN(count != 0, "%d pages are still in use!\n", count); 7673 WARN(count != 0, "%d pages are still in use!\n", count);
7675 } 7674 }
7676 #endif 7675 #endif
7677 7676
7678 #ifdef CONFIG_MEMORY_HOTPLUG 7677 #ifdef CONFIG_MEMORY_HOTPLUG
7679 /* 7678 /*
7680 * The zone indicated has a new number of managed_pages; batch sizes and percpu 7679 * The zone indicated has a new number of managed_pages; batch sizes and percpu
7681 * page high values need to be recalculated. 7680 * page high values need to be recalculated.
7682 */ 7681 */
7683 void __meminit zone_pcp_update(struct zone *zone) 7682 void __meminit zone_pcp_update(struct zone *zone)
7684 { 7683 {
7685 unsigned cpu; 7684 unsigned cpu;
7686 mutex_lock(&pcp_batch_high_lock); 7685 mutex_lock(&pcp_batch_high_lock);
7687 for_each_possible_cpu(cpu) 7686 for_each_possible_cpu(cpu)
7688 pageset_set_high_and_batch(zone, 7687 pageset_set_high_and_batch(zone,
7689 per_cpu_ptr(zone->pageset, cpu)); 7688 per_cpu_ptr(zone->pageset, cpu));
7690 mutex_unlock(&pcp_batch_high_lock); 7689 mutex_unlock(&pcp_batch_high_lock);
7691 } 7690 }
7692 #endif 7691 #endif
7693 7692
7694 void zone_pcp_reset(struct zone *zone) 7693 void zone_pcp_reset(struct zone *zone)
7695 { 7694 {
7696 unsigned long flags; 7695 unsigned long flags;
7697 int cpu; 7696 int cpu;
7698 struct per_cpu_pageset *pset; 7697 struct per_cpu_pageset *pset;
7699 7698
7700 /* avoid races with drain_pages() */ 7699 /* avoid races with drain_pages() */
7701 local_irq_save(flags); 7700 local_irq_save(flags);
7702 if (zone->pageset != &boot_pageset) { 7701 if (zone->pageset != &boot_pageset) {
7703 for_each_online_cpu(cpu) { 7702 for_each_online_cpu(cpu) {
7704 pset = per_cpu_ptr(zone->pageset, cpu); 7703 pset = per_cpu_ptr(zone->pageset, cpu);
7705 drain_zonestat(zone, pset); 7704 drain_zonestat(zone, pset);
7706 } 7705 }
7707 free_percpu(zone->pageset); 7706 free_percpu(zone->pageset);
7708 zone->pageset = &boot_pageset; 7707 zone->pageset = &boot_pageset;
7709 } 7708 }
7710 local_irq_restore(flags); 7709 local_irq_restore(flags);
7711 } 7710 }
7712 7711
7713 #ifdef CONFIG_MEMORY_HOTREMOVE 7712 #ifdef CONFIG_MEMORY_HOTREMOVE
7714 /* 7713 /*
7715 * All pages in the range must be in a single zone and isolated 7714 * All pages in the range must be in a single zone and isolated
7716 * before calling this. 7715 * before calling this.
7717 */ 7716 */
7718 void 7717 void
7719 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 7718 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
7720 { 7719 {
7721 struct page *page; 7720 struct page *page;
7722 struct zone *zone; 7721 struct zone *zone;
7723 unsigned int order, i; 7722 unsigned int order, i;
7724 unsigned long pfn; 7723 unsigned long pfn;
7725 unsigned long flags; 7724 unsigned long flags;
7726 /* find the first valid pfn */ 7725 /* find the first valid pfn */
7727 for (pfn = start_pfn; pfn < end_pfn; pfn++) 7726 for (pfn = start_pfn; pfn < end_pfn; pfn++)
7728 if (pfn_valid(pfn)) 7727 if (pfn_valid(pfn))
7729 break; 7728 break;
7730 if (pfn == end_pfn) 7729 if (pfn == end_pfn)
7731 return; 7730 return;
7732 offline_mem_sections(pfn, end_pfn); 7731 offline_mem_sections(pfn, end_pfn);
7733 zone = page_zone(pfn_to_page(pfn)); 7732 zone = page_zone(pfn_to_page(pfn));
7734 spin_lock_irqsave(&zone->lock, flags); 7733 spin_lock_irqsave(&zone->lock, flags);
7735 pfn = start_pfn; 7734 pfn = start_pfn;
7736 while (pfn < end_pfn) { 7735 while (pfn < end_pfn) {
7737 if (!pfn_valid(pfn)) { 7736 if (!pfn_valid(pfn)) {
7738 pfn++; 7737 pfn++;
7739 continue; 7738 continue;
7740 } 7739 }
7741 page = pfn_to_page(pfn); 7740 page = pfn_to_page(pfn);
7742 /* 7741 /*
7743 * A HWPoisoned page may not be in the buddy system, and 7742 * A HWPoisoned page may not be in the buddy system, and
7744 * its page_count() is not 0. 7743 * its page_count() is not 0.
7745 */ 7744 */
7746 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 7745 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
7747 pfn++; 7746 pfn++;
7748 SetPageReserved(page); 7747 SetPageReserved(page);
7749 continue; 7748 continue;
7750 } 7749 }
7751 7750
7752 BUG_ON(page_count(page)); 7751 BUG_ON(page_count(page));
7753 BUG_ON(!PageBuddy(page)); 7752 BUG_ON(!PageBuddy(page));
7754 order = page_order(page); 7753 order = page_order(page);
7755 #ifdef CONFIG_DEBUG_VM 7754 #ifdef CONFIG_DEBUG_VM
7756 pr_info("remove from free list %lx %d %lx\n", 7755 pr_info("remove from free list %lx %d %lx\n",
7757 pfn, 1 << order, end_pfn); 7756 pfn, 1 << order, end_pfn);
7758 #endif 7757 #endif
7759 list_del(&page->lru); 7758 list_del(&page->lru);
7760 rmv_page_order(page); 7759 rmv_page_order(page);
7761 zone->free_area[order].nr_free--; 7760 zone->free_area[order].nr_free--;
7762 for (i = 0; i < (1 << order); i++) 7761 for (i = 0; i < (1 << order); i++)
7763 SetPageReserved((page+i)); 7762 SetPageReserved((page+i));
7764 pfn += (1 << order); 7763 pfn += (1 << order);
7765 } 7764 }
7766 spin_unlock_irqrestore(&zone->lock, flags); 7765 spin_unlock_irqrestore(&zone->lock, flags);
7767 } 7766 }
7768 #endif 7767 #endif
7769 7768
7770 bool is_free_buddy_page(struct page *page) 7769 bool is_free_buddy_page(struct page *page)
7771 { 7770 {
7772 struct zone *zone = page_zone(page); 7771 struct zone *zone = page_zone(page);
7773 unsigned long pfn = page_to_pfn(page); 7772 unsigned long pfn = page_to_pfn(page);
7774 unsigned long flags; 7773 unsigned long flags;
7775 unsigned int order; 7774 unsigned int order;
7776 7775
7777 spin_lock_irqsave(&zone->lock, flags); 7776 spin_lock_irqsave(&zone->lock, flags);
7778 for (order = 0; order < MAX_ORDER; order++) { 7777 for (order = 0; order < MAX_ORDER; order++) {
7779 struct page *page_head = page - (pfn & ((1 << order) - 1)); 7778 struct page *page_head = page - (pfn & ((1 << order) - 1));
7780 7779
7781 if (PageBuddy(page_head) && page_order(page_head) >= order) 7780 if (PageBuddy(page_head) && page_order(page_head) >= order)
7782 break; 7781 break;
7783 } 7782 }
7784 spin_unlock_irqrestore(&zone->lock, flags); 7783 spin_unlock_irqrestore(&zone->lock, flags);
7785 7784
7786 return order < MAX_ORDER; 7785 return order < MAX_ORDER;
7787 } 7786 }
7788 7787