Commit 51aad0a51582e4147380137ba34785663a1b5f93

Authored by Mel Gorman
Committed by Jiri Slaby
1 parent fc915114c8

mm: page_alloc: abort fair zone allocation policy when remotes nodes are encountered

commit f7b5d647946aae1647bf5cd26c16b3a793c1ac49 upstream.

The purpose of numa_zonelist_order=zone is to preserve lower zones for
use with 32-bit devices.  If locality is preferred then the
numa_zonelist_order=node policy should be used.

Unfortunately, the fair zone allocation policy overrides this by
skipping zones on remote nodes until the lower one is found.  While this
makes sense from a page aging and performance perspective, it breaks the
expected zonelist policy.  This patch restores the expected behaviour
for zone-list ordering.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>

Showing 1 changed file with 1 addition and 1 deletion (inline diff)
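The changed hunk itself sits further down the file than the excerpt reproduced below. As a rough sketch only, not the literal hunk, and assuming the 3.12-era fair-policy check in get_page_from_freelist() built around ALLOC_FAIR, zone_local() and NR_ALLOC_BATCH, the restored behaviour amounts to aborting the fair-policy pass at the first zone on a remote node rather than skipping that zone:

	/*
	 * Illustrative sketch, not the upstream diff: once the zonelist
	 * walk reaches a zone on a remote node, stop applying the fair
	 * zone allocation policy so the configured zonelist order
	 * (e.g. numa_zonelist_order=zone) is honoured.
	 */
	if (alloc_flags & ALLOC_FAIR) {
		if (!zone_local(preferred_zone, zone))
			break;		/* previously: continue */
		if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
			continue;
	}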

1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/compaction.h> 54 #include <linux/compaction.h>
55 #include <trace/events/kmem.h> 55 #include <trace/events/kmem.h>
56 #include <linux/ftrace_event.h> 56 #include <linux/ftrace_event.h>
57 #include <linux/memcontrol.h> 57 #include <linux/memcontrol.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/mm_inline.h> 59 #include <linux/mm_inline.h>
60 #include <linux/migrate.h> 60 #include <linux/migrate.h>
61 #include <linux/page-debug-flags.h> 61 #include <linux/page-debug-flags.h>
62 #include <linux/hugetlb.h> 62 #include <linux/hugetlb.h>
63 #include <linux/sched/rt.h> 63 #include <linux/sched/rt.h>
64 64
65 #include <asm/sections.h> 65 #include <asm/sections.h>
66 #include <asm/tlbflush.h> 66 #include <asm/tlbflush.h>
67 #include <asm/div64.h> 67 #include <asm/div64.h>
68 #include "internal.h" 68 #include "internal.h"
69 69
70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 70 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
71 static DEFINE_MUTEX(pcp_batch_high_lock); 71 static DEFINE_MUTEX(pcp_batch_high_lock);
72 #define MIN_PERCPU_PAGELIST_FRACTION (8) 72 #define MIN_PERCPU_PAGELIST_FRACTION (8)
73 73
74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 74 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
75 DEFINE_PER_CPU(int, numa_node); 75 DEFINE_PER_CPU(int, numa_node);
76 EXPORT_PER_CPU_SYMBOL(numa_node); 76 EXPORT_PER_CPU_SYMBOL(numa_node);
77 #endif 77 #endif
78 78
79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 79 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
80 /* 80 /*
81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 81 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 82 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 83 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
84 * defined in <linux/topology.h>. 84 * defined in <linux/topology.h>.
85 */ 85 */
86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 86 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
87 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 87 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
88 #endif 88 #endif
89 89
90 /* 90 /*
91 * Array of node states. 91 * Array of node states.
92 */ 92 */
93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 93 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
94 [N_POSSIBLE] = NODE_MASK_ALL, 94 [N_POSSIBLE] = NODE_MASK_ALL,
95 [N_ONLINE] = { { [0] = 1UL } }, 95 [N_ONLINE] = { { [0] = 1UL } },
96 #ifndef CONFIG_NUMA 96 #ifndef CONFIG_NUMA
97 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 97 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
98 #ifdef CONFIG_HIGHMEM 98 #ifdef CONFIG_HIGHMEM
99 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 99 [N_HIGH_MEMORY] = { { [0] = 1UL } },
100 #endif 100 #endif
101 #ifdef CONFIG_MOVABLE_NODE 101 #ifdef CONFIG_MOVABLE_NODE
102 [N_MEMORY] = { { [0] = 1UL } }, 102 [N_MEMORY] = { { [0] = 1UL } },
103 #endif 103 #endif
104 [N_CPU] = { { [0] = 1UL } }, 104 [N_CPU] = { { [0] = 1UL } },
105 #endif /* NUMA */ 105 #endif /* NUMA */
106 }; 106 };
107 EXPORT_SYMBOL(node_states); 107 EXPORT_SYMBOL(node_states);
108 108
109 /* Protect totalram_pages and zone->managed_pages */ 109 /* Protect totalram_pages and zone->managed_pages */
110 static DEFINE_SPINLOCK(managed_page_count_lock); 110 static DEFINE_SPINLOCK(managed_page_count_lock);
111 111
112 unsigned long totalram_pages __read_mostly; 112 unsigned long totalram_pages __read_mostly;
113 unsigned long totalreserve_pages __read_mostly; 113 unsigned long totalreserve_pages __read_mostly;
114 /* 114 /*
115 * When calculating the number of globally allowed dirty pages, there 115 * When calculating the number of globally allowed dirty pages, there
116 * is a certain number of per-zone reserves that should not be 116 * is a certain number of per-zone reserves that should not be
117 * considered dirtyable memory. This is the sum of those reserves 117 * considered dirtyable memory. This is the sum of those reserves
118 * over all existing zones that contribute dirtyable memory. 118 * over all existing zones that contribute dirtyable memory.
119 */ 119 */
120 unsigned long dirty_balance_reserve __read_mostly; 120 unsigned long dirty_balance_reserve __read_mostly;
121 121
122 int percpu_pagelist_fraction; 122 int percpu_pagelist_fraction;
123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 123 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
124 124
125 #ifdef CONFIG_PM_SLEEP 125 #ifdef CONFIG_PM_SLEEP
126 /* 126 /*
127 * The following functions are used by the suspend/hibernate code to temporarily 127 * The following functions are used by the suspend/hibernate code to temporarily
128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 128 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
129 * while devices are suspended. To avoid races with the suspend/hibernate code, 129 * while devices are suspended. To avoid races with the suspend/hibernate code,
130 * they should always be called with pm_mutex held (gfp_allowed_mask also should 130 * they should always be called with pm_mutex held (gfp_allowed_mask also should
131 * only be modified with pm_mutex held, unless the suspend/hibernate code is 131 * only be modified with pm_mutex held, unless the suspend/hibernate code is
132 * guaranteed not to run in parallel with that modification). 132 * guaranteed not to run in parallel with that modification).
133 */ 133 */
134 134
135 static gfp_t saved_gfp_mask; 135 static gfp_t saved_gfp_mask;
136 136
137 void pm_restore_gfp_mask(void) 137 void pm_restore_gfp_mask(void)
138 { 138 {
139 WARN_ON(!mutex_is_locked(&pm_mutex)); 139 WARN_ON(!mutex_is_locked(&pm_mutex));
140 if (saved_gfp_mask) { 140 if (saved_gfp_mask) {
141 gfp_allowed_mask = saved_gfp_mask; 141 gfp_allowed_mask = saved_gfp_mask;
142 saved_gfp_mask = 0; 142 saved_gfp_mask = 0;
143 } 143 }
144 } 144 }
145 145
146 void pm_restrict_gfp_mask(void) 146 void pm_restrict_gfp_mask(void)
147 { 147 {
148 WARN_ON(!mutex_is_locked(&pm_mutex)); 148 WARN_ON(!mutex_is_locked(&pm_mutex));
149 WARN_ON(saved_gfp_mask); 149 WARN_ON(saved_gfp_mask);
150 saved_gfp_mask = gfp_allowed_mask; 150 saved_gfp_mask = gfp_allowed_mask;
151 gfp_allowed_mask &= ~GFP_IOFS; 151 gfp_allowed_mask &= ~GFP_IOFS;
152 } 152 }
153 153
154 bool pm_suspended_storage(void) 154 bool pm_suspended_storage(void)
155 { 155 {
156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 156 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
157 return false; 157 return false;
158 return true; 158 return true;
159 } 159 }
160 #endif /* CONFIG_PM_SLEEP */ 160 #endif /* CONFIG_PM_SLEEP */
161 161
162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 162 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
163 int pageblock_order __read_mostly; 163 int pageblock_order __read_mostly;
164 #endif 164 #endif
165 165
166 static void __free_pages_ok(struct page *page, unsigned int order); 166 static void __free_pages_ok(struct page *page, unsigned int order);
167 167
168 /* 168 /*
169 * results with 256, 32 in the lowmem_reserve sysctl: 169 * results with 256, 32 in the lowmem_reserve sysctl:
170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 170 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
171 * 1G machine -> (16M dma, 784M normal, 224M high) 171 * 1G machine -> (16M dma, 784M normal, 224M high)
172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 172 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 173 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 174 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
175 * 175 *
176 * TBD: should special case ZONE_DMA32 machines here - in those we normally 176 * TBD: should special case ZONE_DMA32 machines here - in those we normally
177 * don't need any ZONE_NORMAL reservation 177 * don't need any ZONE_NORMAL reservation
178 */ 178 */
179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 179 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
180 #ifdef CONFIG_ZONE_DMA 180 #ifdef CONFIG_ZONE_DMA
181 256, 181 256,
182 #endif 182 #endif
183 #ifdef CONFIG_ZONE_DMA32 183 #ifdef CONFIG_ZONE_DMA32
184 256, 184 256,
185 #endif 185 #endif
186 #ifdef CONFIG_HIGHMEM 186 #ifdef CONFIG_HIGHMEM
187 32, 187 32,
188 #endif 188 #endif
189 32, 189 32,
190 }; 190 };
191 191
192 EXPORT_SYMBOL(totalram_pages); 192 EXPORT_SYMBOL(totalram_pages);
193 193
194 static char * const zone_names[MAX_NR_ZONES] = { 194 static char * const zone_names[MAX_NR_ZONES] = {
195 #ifdef CONFIG_ZONE_DMA 195 #ifdef CONFIG_ZONE_DMA
196 "DMA", 196 "DMA",
197 #endif 197 #endif
198 #ifdef CONFIG_ZONE_DMA32 198 #ifdef CONFIG_ZONE_DMA32
199 "DMA32", 199 "DMA32",
200 #endif 200 #endif
201 "Normal", 201 "Normal",
202 #ifdef CONFIG_HIGHMEM 202 #ifdef CONFIG_HIGHMEM
203 "HighMem", 203 "HighMem",
204 #endif 204 #endif
205 "Movable", 205 "Movable",
206 }; 206 };
207 207
208 int min_free_kbytes = 1024; 208 int min_free_kbytes = 1024;
209 int user_min_free_kbytes; 209 int user_min_free_kbytes;
210 210
211 static unsigned long __meminitdata nr_kernel_pages; 211 static unsigned long __meminitdata nr_kernel_pages;
212 static unsigned long __meminitdata nr_all_pages; 212 static unsigned long __meminitdata nr_all_pages;
213 static unsigned long __meminitdata dma_reserve; 213 static unsigned long __meminitdata dma_reserve;
214 214
215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 215 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 216 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 217 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
218 static unsigned long __initdata required_kernelcore; 218 static unsigned long __initdata required_kernelcore;
219 static unsigned long __initdata required_movablecore; 219 static unsigned long __initdata required_movablecore;
220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 220 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
221 221
222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 222 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
223 int movable_zone; 223 int movable_zone;
224 EXPORT_SYMBOL(movable_zone); 224 EXPORT_SYMBOL(movable_zone);
225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 225 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
226 226
227 #if MAX_NUMNODES > 1 227 #if MAX_NUMNODES > 1
228 int nr_node_ids __read_mostly = MAX_NUMNODES; 228 int nr_node_ids __read_mostly = MAX_NUMNODES;
229 int nr_online_nodes __read_mostly = 1; 229 int nr_online_nodes __read_mostly = 1;
230 EXPORT_SYMBOL(nr_node_ids); 230 EXPORT_SYMBOL(nr_node_ids);
231 EXPORT_SYMBOL(nr_online_nodes); 231 EXPORT_SYMBOL(nr_online_nodes);
232 #endif 232 #endif
233 233
234 int page_group_by_mobility_disabled __read_mostly; 234 int page_group_by_mobility_disabled __read_mostly;
235 235
236 void set_pageblock_migratetype(struct page *page, int migratetype) 236 void set_pageblock_migratetype(struct page *page, int migratetype)
237 { 237 {
238 238
239 if (unlikely(page_group_by_mobility_disabled)) 239 if (unlikely(page_group_by_mobility_disabled))
240 migratetype = MIGRATE_UNMOVABLE; 240 migratetype = MIGRATE_UNMOVABLE;
241 241
242 set_pageblock_flags_group(page, (unsigned long)migratetype, 242 set_pageblock_flags_group(page, (unsigned long)migratetype,
243 PB_migrate, PB_migrate_end); 243 PB_migrate, PB_migrate_end);
244 } 244 }
245 245
246 bool oom_killer_disabled __read_mostly; 246 bool oom_killer_disabled __read_mostly;
247 247
248 #ifdef CONFIG_DEBUG_VM 248 #ifdef CONFIG_DEBUG_VM
249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 249 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
250 { 250 {
251 int ret = 0; 251 int ret = 0;
252 unsigned seq; 252 unsigned seq;
253 unsigned long pfn = page_to_pfn(page); 253 unsigned long pfn = page_to_pfn(page);
254 unsigned long sp, start_pfn; 254 unsigned long sp, start_pfn;
255 255
256 do { 256 do {
257 seq = zone_span_seqbegin(zone); 257 seq = zone_span_seqbegin(zone);
258 start_pfn = zone->zone_start_pfn; 258 start_pfn = zone->zone_start_pfn;
259 sp = zone->spanned_pages; 259 sp = zone->spanned_pages;
260 if (!zone_spans_pfn(zone, pfn)) 260 if (!zone_spans_pfn(zone, pfn))
261 ret = 1; 261 ret = 1;
262 } while (zone_span_seqretry(zone, seq)); 262 } while (zone_span_seqretry(zone, seq));
263 263
264 if (ret) 264 if (ret)
265 pr_err("page %lu outside zone [ %lu - %lu ]\n", 265 pr_err("page %lu outside zone [ %lu - %lu ]\n",
266 pfn, start_pfn, start_pfn + sp); 266 pfn, start_pfn, start_pfn + sp);
267 267
268 return ret; 268 return ret;
269 } 269 }
270 270
271 static int page_is_consistent(struct zone *zone, struct page *page) 271 static int page_is_consistent(struct zone *zone, struct page *page)
272 { 272 {
273 if (!pfn_valid_within(page_to_pfn(page))) 273 if (!pfn_valid_within(page_to_pfn(page)))
274 return 0; 274 return 0;
275 if (zone != page_zone(page)) 275 if (zone != page_zone(page))
276 return 0; 276 return 0;
277 277
278 return 1; 278 return 1;
279 } 279 }
280 /* 280 /*
281 * Temporary debugging check for pages not lying within a given zone. 281 * Temporary debugging check for pages not lying within a given zone.
282 */ 282 */
283 static int bad_range(struct zone *zone, struct page *page) 283 static int bad_range(struct zone *zone, struct page *page)
284 { 284 {
285 if (page_outside_zone_boundaries(zone, page)) 285 if (page_outside_zone_boundaries(zone, page))
286 return 1; 286 return 1;
287 if (!page_is_consistent(zone, page)) 287 if (!page_is_consistent(zone, page))
288 return 1; 288 return 1;
289 289
290 return 0; 290 return 0;
291 } 291 }
292 #else 292 #else
293 static inline int bad_range(struct zone *zone, struct page *page) 293 static inline int bad_range(struct zone *zone, struct page *page)
294 { 294 {
295 return 0; 295 return 0;
296 } 296 }
297 #endif 297 #endif
298 298
299 static void bad_page(struct page *page) 299 static void bad_page(struct page *page)
300 { 300 {
301 static unsigned long resume; 301 static unsigned long resume;
302 static unsigned long nr_shown; 302 static unsigned long nr_shown;
303 static unsigned long nr_unshown; 303 static unsigned long nr_unshown;
304 304
305 /* Don't complain about poisoned pages */ 305 /* Don't complain about poisoned pages */
306 if (PageHWPoison(page)) { 306 if (PageHWPoison(page)) {
307 page_mapcount_reset(page); /* remove PageBuddy */ 307 page_mapcount_reset(page); /* remove PageBuddy */
308 return; 308 return;
309 } 309 }
310 310
311 /* 311 /*
312 * Allow a burst of 60 reports, then keep quiet for that minute; 312 * Allow a burst of 60 reports, then keep quiet for that minute;
313 * or allow a steady drip of one report per second. 313 * or allow a steady drip of one report per second.
314 */ 314 */
315 if (nr_shown == 60) { 315 if (nr_shown == 60) {
316 if (time_before(jiffies, resume)) { 316 if (time_before(jiffies, resume)) {
317 nr_unshown++; 317 nr_unshown++;
318 goto out; 318 goto out;
319 } 319 }
320 if (nr_unshown) { 320 if (nr_unshown) {
321 printk(KERN_ALERT 321 printk(KERN_ALERT
322 "BUG: Bad page state: %lu messages suppressed\n", 322 "BUG: Bad page state: %lu messages suppressed\n",
323 nr_unshown); 323 nr_unshown);
324 nr_unshown = 0; 324 nr_unshown = 0;
325 } 325 }
326 nr_shown = 0; 326 nr_shown = 0;
327 } 327 }
328 if (nr_shown++ == 0) 328 if (nr_shown++ == 0)
329 resume = jiffies + 60 * HZ; 329 resume = jiffies + 60 * HZ;
330 330
331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 331 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
332 current->comm, page_to_pfn(page)); 332 current->comm, page_to_pfn(page));
333 dump_page(page); 333 dump_page(page);
334 334
335 print_modules(); 335 print_modules();
336 dump_stack(); 336 dump_stack();
337 out: 337 out:
338 /* Leave bad fields for debug, except PageBuddy could make trouble */ 338 /* Leave bad fields for debug, except PageBuddy could make trouble */
339 page_mapcount_reset(page); /* remove PageBuddy */ 339 page_mapcount_reset(page); /* remove PageBuddy */
340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
341 } 341 }
342 342
343 /* 343 /*
344 * Higher-order pages are called "compound pages". They are structured thusly: 344 * Higher-order pages are called "compound pages". They are structured thusly:
345 * 345 *
346 * The first PAGE_SIZE page is called the "head page". 346 * The first PAGE_SIZE page is called the "head page".
347 * 347 *
348 * The remaining PAGE_SIZE pages are called "tail pages". 348 * The remaining PAGE_SIZE pages are called "tail pages".
349 * 349 *
350 * All pages have PG_compound set. All tail pages have their ->first_page 350 * All pages have PG_compound set. All tail pages have their ->first_page
351 * pointing at the head page. 351 * pointing at the head page.
352 * 352 *
353 * The first tail page's ->lru.next holds the address of the compound page's 353 * The first tail page's ->lru.next holds the address of the compound page's
354 * put_page() function. Its ->lru.prev holds the order of allocation. 354 * put_page() function. Its ->lru.prev holds the order of allocation.
355 * This usage means that zero-order pages may not be compound. 355 * This usage means that zero-order pages may not be compound.
356 */ 356 */
357 357
358 static void free_compound_page(struct page *page) 358 static void free_compound_page(struct page *page)
359 { 359 {
360 __free_pages_ok(page, compound_order(page)); 360 __free_pages_ok(page, compound_order(page));
361 } 361 }
362 362
363 void prep_compound_page(struct page *page, unsigned long order) 363 void prep_compound_page(struct page *page, unsigned long order)
364 { 364 {
365 int i; 365 int i;
366 int nr_pages = 1 << order; 366 int nr_pages = 1 << order;
367 367
368 set_compound_page_dtor(page, free_compound_page); 368 set_compound_page_dtor(page, free_compound_page);
369 set_compound_order(page, order); 369 set_compound_order(page, order);
370 __SetPageHead(page); 370 __SetPageHead(page);
371 for (i = 1; i < nr_pages; i++) { 371 for (i = 1; i < nr_pages; i++) {
372 struct page *p = page + i; 372 struct page *p = page + i;
373 set_page_count(p, 0); 373 set_page_count(p, 0);
374 p->first_page = page; 374 p->first_page = page;
375 /* Make sure p->first_page is always valid for PageTail() */ 375 /* Make sure p->first_page is always valid for PageTail() */
376 smp_wmb(); 376 smp_wmb();
377 __SetPageTail(p); 377 __SetPageTail(p);
378 } 378 }
379 } 379 }
380 380
381 /* update __split_huge_page_refcount if you change this function */ 381 /* update __split_huge_page_refcount if you change this function */
382 static int destroy_compound_page(struct page *page, unsigned long order) 382 static int destroy_compound_page(struct page *page, unsigned long order)
383 { 383 {
384 int i; 384 int i;
385 int nr_pages = 1 << order; 385 int nr_pages = 1 << order;
386 int bad = 0; 386 int bad = 0;
387 387
388 if (unlikely(compound_order(page) != order)) { 388 if (unlikely(compound_order(page) != order)) {
389 bad_page(page); 389 bad_page(page);
390 bad++; 390 bad++;
391 } 391 }
392 392
393 __ClearPageHead(page); 393 __ClearPageHead(page);
394 394
395 for (i = 1; i < nr_pages; i++) { 395 for (i = 1; i < nr_pages; i++) {
396 struct page *p = page + i; 396 struct page *p = page + i;
397 397
398 if (unlikely(!PageTail(p) || (p->first_page != page))) { 398 if (unlikely(!PageTail(p) || (p->first_page != page))) {
399 bad_page(page); 399 bad_page(page);
400 bad++; 400 bad++;
401 } 401 }
402 __ClearPageTail(p); 402 __ClearPageTail(p);
403 } 403 }
404 404
405 return bad; 405 return bad;
406 } 406 }
407 407
408 static inline void prep_zero_page(struct page *page, unsigned int order, 408 static inline void prep_zero_page(struct page *page, unsigned int order,
409 gfp_t gfp_flags) 409 gfp_t gfp_flags)
410 { 410 {
411 int i; 411 int i;
412 412
413 /* 413 /*
414 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 414 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
415 * and __GFP_HIGHMEM from hard or soft interrupt context. 415 * and __GFP_HIGHMEM from hard or soft interrupt context.
416 */ 416 */
417 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 417 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
418 for (i = 0; i < (1 << order); i++) 418 for (i = 0; i < (1 << order); i++)
419 clear_highpage(page + i); 419 clear_highpage(page + i);
420 } 420 }
421 421
422 #ifdef CONFIG_DEBUG_PAGEALLOC 422 #ifdef CONFIG_DEBUG_PAGEALLOC
423 unsigned int _debug_guardpage_minorder; 423 unsigned int _debug_guardpage_minorder;
424 424
425 static int __init debug_guardpage_minorder_setup(char *buf) 425 static int __init debug_guardpage_minorder_setup(char *buf)
426 { 426 {
427 unsigned long res; 427 unsigned long res;
428 428
429 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 429 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
430 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 430 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
431 return 0; 431 return 0;
432 } 432 }
433 _debug_guardpage_minorder = res; 433 _debug_guardpage_minorder = res;
434 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 434 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
435 return 0; 435 return 0;
436 } 436 }
437 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 437 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
438 438
439 static inline void set_page_guard_flag(struct page *page) 439 static inline void set_page_guard_flag(struct page *page)
440 { 440 {
441 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 441 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
442 } 442 }
443 443
444 static inline void clear_page_guard_flag(struct page *page) 444 static inline void clear_page_guard_flag(struct page *page)
445 { 445 {
446 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 446 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
447 } 447 }
448 #else 448 #else
449 static inline void set_page_guard_flag(struct page *page) { } 449 static inline void set_page_guard_flag(struct page *page) { }
450 static inline void clear_page_guard_flag(struct page *page) { } 450 static inline void clear_page_guard_flag(struct page *page) { }
451 #endif 451 #endif
452 452
453 static inline void set_page_order(struct page *page, unsigned int order) 453 static inline void set_page_order(struct page *page, unsigned int order)
454 { 454 {
455 set_page_private(page, order); 455 set_page_private(page, order);
456 __SetPageBuddy(page); 456 __SetPageBuddy(page);
457 } 457 }
458 458
459 static inline void rmv_page_order(struct page *page) 459 static inline void rmv_page_order(struct page *page)
460 { 460 {
461 __ClearPageBuddy(page); 461 __ClearPageBuddy(page);
462 set_page_private(page, 0); 462 set_page_private(page, 0);
463 } 463 }
464 464
465 /* 465 /*
466 * Locate the struct page for both the matching buddy in our 466 * Locate the struct page for both the matching buddy in our
467 * pair (buddy1) and the combined O(n+1) page they form (page). 467 * pair (buddy1) and the combined O(n+1) page they form (page).
468 * 468 *
469 * 1) Any buddy B1 will have an order O twin B2 which satisfies 469 * 1) Any buddy B1 will have an order O twin B2 which satisfies
470 * the following equation: 470 * the following equation:
471 * B2 = B1 ^ (1 << O) 471 * B2 = B1 ^ (1 << O)
472 * For example, if the starting buddy (buddy2) is #8 its order 472 * For example, if the starting buddy (buddy2) is #8 its order
473 * 1 buddy is #10: 473 * 1 buddy is #10:
474 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 474 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
475 * 475 *
476 * 2) Any buddy B will have an order O+1 parent P which 476 * 2) Any buddy B will have an order O+1 parent P which
477 * satisfies the following equation: 477 * satisfies the following equation:
478 * P = B & ~(1 << O) 478 * P = B & ~(1 << O)
479 * 479 *
480 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 480 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
481 */ 481 */
482 static inline unsigned long 482 static inline unsigned long
483 __find_buddy_index(unsigned long page_idx, unsigned int order) 483 __find_buddy_index(unsigned long page_idx, unsigned int order)
484 { 484 {
485 return page_idx ^ (1 << order); 485 return page_idx ^ (1 << order);
486 } 486 }
487 487
488 /* 488 /*
489 * This function checks whether a page is free && is the buddy 489 * This function checks whether a page is free && is the buddy
490 * we can do coalesce a page and its buddy if 490 * we can do coalesce a page and its buddy if
491 * (a) the buddy is not in a hole && 491 * (a) the buddy is not in a hole &&
492 * (b) the buddy is in the buddy system && 492 * (b) the buddy is in the buddy system &&
493 * (c) a page and its buddy have the same order && 493 * (c) a page and its buddy have the same order &&
494 * (d) a page and its buddy are in the same zone. 494 * (d) a page and its buddy are in the same zone.
495 * 495 *
496 * For recording whether a page is in the buddy system, we set ->_mapcount 496 * For recording whether a page is in the buddy system, we set ->_mapcount
497 * PAGE_BUDDY_MAPCOUNT_VALUE. 497 * PAGE_BUDDY_MAPCOUNT_VALUE.
498 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is 498 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
499 * serialized by zone->lock. 499 * serialized by zone->lock.
500 * 500 *
501 * For recording page's order, we use page_private(page). 501 * For recording page's order, we use page_private(page).
502 */ 502 */
503 static inline int page_is_buddy(struct page *page, struct page *buddy, 503 static inline int page_is_buddy(struct page *page, struct page *buddy,
504 unsigned int order) 504 unsigned int order)
505 { 505 {
506 if (!pfn_valid_within(page_to_pfn(buddy))) 506 if (!pfn_valid_within(page_to_pfn(buddy)))
507 return 0; 507 return 0;
508 508
509 if (page_is_guard(buddy) && page_order(buddy) == order) { 509 if (page_is_guard(buddy) && page_order(buddy) == order) {
510 VM_BUG_ON(page_count(buddy) != 0); 510 VM_BUG_ON(page_count(buddy) != 0);
511 511
512 if (page_zone_id(page) != page_zone_id(buddy)) 512 if (page_zone_id(page) != page_zone_id(buddy))
513 return 0; 513 return 0;
514 514
515 return 1; 515 return 1;
516 } 516 }
517 517
518 if (PageBuddy(buddy) && page_order(buddy) == order) { 518 if (PageBuddy(buddy) && page_order(buddy) == order) {
519 VM_BUG_ON(page_count(buddy) != 0); 519 VM_BUG_ON(page_count(buddy) != 0);
520 520
521 /* 521 /*
522 * zone check is done late to avoid uselessly 522 * zone check is done late to avoid uselessly
523 * calculating zone/node ids for pages that could 523 * calculating zone/node ids for pages that could
524 * never merge. 524 * never merge.
525 */ 525 */
526 if (page_zone_id(page) != page_zone_id(buddy)) 526 if (page_zone_id(page) != page_zone_id(buddy))
527 return 0; 527 return 0;
528 528
529 return 1; 529 return 1;
530 } 530 }
531 return 0; 531 return 0;
532 } 532 }
533 533
534 /* 534 /*
535 * Freeing function for a buddy system allocator. 535 * Freeing function for a buddy system allocator.
536 * 536 *
537 * The concept of a buddy system is to maintain direct-mapped table 537 * The concept of a buddy system is to maintain direct-mapped table
538 * (containing bit values) for memory blocks of various "orders". 538 * (containing bit values) for memory blocks of various "orders".
539 * The bottom level table contains the map for the smallest allocatable 539 * The bottom level table contains the map for the smallest allocatable
540 * units of memory (here, pages), and each level above it describes 540 * units of memory (here, pages), and each level above it describes
541 * pairs of units from the levels below, hence, "buddies". 541 * pairs of units from the levels below, hence, "buddies".
542 * At a high level, all that happens here is marking the table entry 542 * At a high level, all that happens here is marking the table entry
543 * at the bottom level available, and propagating the changes upward 543 * at the bottom level available, and propagating the changes upward
544 * as necessary, plus some accounting needed to play nicely with other 544 * as necessary, plus some accounting needed to play nicely with other
545 * parts of the VM system. 545 * parts of the VM system.
546 * At each level, we keep a list of pages, which are heads of continuous 546 * At each level, we keep a list of pages, which are heads of continuous
547 * free pages of length of (1 << order) and marked with _mapcount 547 * free pages of length of (1 << order) and marked with _mapcount
548 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) 548 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
549 * field. 549 * field.
550 * So when we are allocating or freeing one, we can derive the state of the 550 * So when we are allocating or freeing one, we can derive the state of the
551 * other. That is, if we allocate a small block, and both were 551 * other. That is, if we allocate a small block, and both were
552 * free, the remainder of the region must be split into blocks. 552 * free, the remainder of the region must be split into blocks.
553 * If a block is freed, and its buddy is also free, then this 553 * If a block is freed, and its buddy is also free, then this
554 * triggers coalescing into a block of larger size. 554 * triggers coalescing into a block of larger size.
555 * 555 *
556 * -- nyc 556 * -- nyc
557 */ 557 */
558 558
559 static inline void __free_one_page(struct page *page, 559 static inline void __free_one_page(struct page *page,
560 unsigned long pfn, 560 unsigned long pfn,
561 struct zone *zone, unsigned int order, 561 struct zone *zone, unsigned int order,
562 int migratetype) 562 int migratetype)
563 { 563 {
564 unsigned long page_idx; 564 unsigned long page_idx;
565 unsigned long combined_idx; 565 unsigned long combined_idx;
566 unsigned long uninitialized_var(buddy_idx); 566 unsigned long uninitialized_var(buddy_idx);
567 struct page *buddy; 567 struct page *buddy;
568 568
569 VM_BUG_ON(!zone_is_initialized(zone)); 569 VM_BUG_ON(!zone_is_initialized(zone));
570 570
571 if (unlikely(PageCompound(page))) 571 if (unlikely(PageCompound(page)))
572 if (unlikely(destroy_compound_page(page, order))) 572 if (unlikely(destroy_compound_page(page, order)))
573 return; 573 return;
574 574
575 VM_BUG_ON(migratetype == -1); 575 VM_BUG_ON(migratetype == -1);
576 576
577 page_idx = pfn & ((1 << MAX_ORDER) - 1); 577 page_idx = pfn & ((1 << MAX_ORDER) - 1);
578 578
579 VM_BUG_ON(page_idx & ((1 << order) - 1)); 579 VM_BUG_ON(page_idx & ((1 << order) - 1));
580 VM_BUG_ON(bad_range(zone, page)); 580 VM_BUG_ON(bad_range(zone, page));
581 581
582 while (order < MAX_ORDER-1) { 582 while (order < MAX_ORDER-1) {
583 buddy_idx = __find_buddy_index(page_idx, order); 583 buddy_idx = __find_buddy_index(page_idx, order);
584 buddy = page + (buddy_idx - page_idx); 584 buddy = page + (buddy_idx - page_idx);
585 if (!page_is_buddy(page, buddy, order)) 585 if (!page_is_buddy(page, buddy, order))
586 break; 586 break;
587 /* 587 /*
588 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 588 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
589 * merge with it and move up one order. 589 * merge with it and move up one order.
590 */ 590 */
591 if (page_is_guard(buddy)) { 591 if (page_is_guard(buddy)) {
592 clear_page_guard_flag(buddy); 592 clear_page_guard_flag(buddy);
593 set_page_private(page, 0); 593 set_page_private(page, 0);
594 __mod_zone_freepage_state(zone, 1 << order, 594 __mod_zone_freepage_state(zone, 1 << order,
595 migratetype); 595 migratetype);
596 } else { 596 } else {
597 list_del(&buddy->lru); 597 list_del(&buddy->lru);
598 zone->free_area[order].nr_free--; 598 zone->free_area[order].nr_free--;
599 rmv_page_order(buddy); 599 rmv_page_order(buddy);
600 } 600 }
601 combined_idx = buddy_idx & page_idx; 601 combined_idx = buddy_idx & page_idx;
602 page = page + (combined_idx - page_idx); 602 page = page + (combined_idx - page_idx);
603 page_idx = combined_idx; 603 page_idx = combined_idx;
604 order++; 604 order++;
605 } 605 }
606 set_page_order(page, order); 606 set_page_order(page, order);
607 607
608 /* 608 /*
609 * If this is not the largest possible page, check if the buddy 609 * If this is not the largest possible page, check if the buddy
610 * of the next-highest order is free. If it is, it's possible 610 * of the next-highest order is free. If it is, it's possible
611 * that pages are being freed that will coalesce soon. In case, 611 * that pages are being freed that will coalesce soon. In case,
612 * that is happening, add the free page to the tail of the list 612 * that is happening, add the free page to the tail of the list
613 * so it's less likely to be used soon and more likely to be merged 613 * so it's less likely to be used soon and more likely to be merged
614 * as a higher order page 614 * as a higher order page
615 */ 615 */
616 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 616 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
617 struct page *higher_page, *higher_buddy; 617 struct page *higher_page, *higher_buddy;
618 combined_idx = buddy_idx & page_idx; 618 combined_idx = buddy_idx & page_idx;
619 higher_page = page + (combined_idx - page_idx); 619 higher_page = page + (combined_idx - page_idx);
620 buddy_idx = __find_buddy_index(combined_idx, order + 1); 620 buddy_idx = __find_buddy_index(combined_idx, order + 1);
621 higher_buddy = higher_page + (buddy_idx - combined_idx); 621 higher_buddy = higher_page + (buddy_idx - combined_idx);
622 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 622 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
623 list_add_tail(&page->lru, 623 list_add_tail(&page->lru,
624 &zone->free_area[order].free_list[migratetype]); 624 &zone->free_area[order].free_list[migratetype]);
625 goto out; 625 goto out;
626 } 626 }
627 } 627 }
628 628
629 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 629 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
630 out: 630 out:
631 zone->free_area[order].nr_free++; 631 zone->free_area[order].nr_free++;
632 } 632 }
633 633
634 static inline int free_pages_check(struct page *page) 634 static inline int free_pages_check(struct page *page)
635 { 635 {
636 if (unlikely(page_mapcount(page) | 636 if (unlikely(page_mapcount(page) |
637 (page->mapping != NULL) | 637 (page->mapping != NULL) |
638 (atomic_read(&page->_count) != 0) | 638 (atomic_read(&page->_count) != 0) |
639 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 639 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
640 (mem_cgroup_bad_page_check(page)))) { 640 (mem_cgroup_bad_page_check(page)))) {
641 bad_page(page); 641 bad_page(page);
642 return 1; 642 return 1;
643 } 643 }
644 page_nid_reset_last(page); 644 page_nid_reset_last(page);
645 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 645 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
646 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 646 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
647 return 0; 647 return 0;
648 } 648 }
649 649
650 /* 650 /*
651 * Frees a number of pages from the PCP lists 651 * Frees a number of pages from the PCP lists
652 * Assumes all pages on list are in same zone, and of same order. 652 * Assumes all pages on list are in same zone, and of same order.
653 * count is the number of pages to free. 653 * count is the number of pages to free.
654 * 654 *
655 * If the zone was previously in an "all pages pinned" state then look to 655 * If the zone was previously in an "all pages pinned" state then look to
656 * see if this freeing clears that state. 656 * see if this freeing clears that state.
657 * 657 *
658 * And clear the zone's pages_scanned counter, to hold off the "all pages are 658 * And clear the zone's pages_scanned counter, to hold off the "all pages are
659 * pinned" detection logic. 659 * pinned" detection logic.
660 */ 660 */
661 static void free_pcppages_bulk(struct zone *zone, int count, 661 static void free_pcppages_bulk(struct zone *zone, int count,
662 struct per_cpu_pages *pcp) 662 struct per_cpu_pages *pcp)
663 { 663 {
664 int migratetype = 0; 664 int migratetype = 0;
665 int batch_free = 0; 665 int batch_free = 0;
666 int to_free = count; 666 int to_free = count;
667 unsigned long nr_scanned; 667 unsigned long nr_scanned;
668 668
669 spin_lock(&zone->lock); 669 spin_lock(&zone->lock);
670 nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); 670 nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
671 if (nr_scanned) 671 if (nr_scanned)
672 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); 672 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
673 673
674 while (to_free) { 674 while (to_free) {
675 struct page *page; 675 struct page *page;
676 struct list_head *list; 676 struct list_head *list;
677 677
678 /* 678 /*
679 * Remove pages from lists in a round-robin fashion. A 679 * Remove pages from lists in a round-robin fashion. A
680 * batch_free count is maintained that is incremented when an 680 * batch_free count is maintained that is incremented when an
681 * empty list is encountered. This is so more pages are freed 681 * empty list is encountered. This is so more pages are freed
682 * off fuller lists instead of spinning excessively around empty 682 * off fuller lists instead of spinning excessively around empty
683 * lists 683 * lists
684 */ 684 */
685 do { 685 do {
686 batch_free++; 686 batch_free++;
687 if (++migratetype == MIGRATE_PCPTYPES) 687 if (++migratetype == MIGRATE_PCPTYPES)
688 migratetype = 0; 688 migratetype = 0;
689 list = &pcp->lists[migratetype]; 689 list = &pcp->lists[migratetype];
690 } while (list_empty(list)); 690 } while (list_empty(list));
691 691
692 /* This is the only non-empty list. Free them all. */ 692 /* This is the only non-empty list. Free them all. */
693 if (batch_free == MIGRATE_PCPTYPES) 693 if (batch_free == MIGRATE_PCPTYPES)
694 batch_free = to_free; 694 batch_free = to_free;
695 695
696 do { 696 do {
697 int mt; /* migratetype of the to-be-freed page */ 697 int mt; /* migratetype of the to-be-freed page */
698 698
699 page = list_entry(list->prev, struct page, lru); 699 page = list_entry(list->prev, struct page, lru);
700 /* must delete as __free_one_page list manipulates */ 700 /* must delete as __free_one_page list manipulates */
701 list_del(&page->lru); 701 list_del(&page->lru);
702 mt = get_freepage_migratetype(page); 702 mt = get_freepage_migratetype(page);
703 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 703 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
704 __free_one_page(page, page_to_pfn(page), zone, 0, mt); 704 __free_one_page(page, page_to_pfn(page), zone, 0, mt);
705 trace_mm_page_pcpu_drain(page, 0, mt); 705 trace_mm_page_pcpu_drain(page, 0, mt);
706 if (likely(!is_migrate_isolate_page(page))) { 706 if (likely(!is_migrate_isolate_page(page))) {
707 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 707 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
708 if (is_migrate_cma(mt)) 708 if (is_migrate_cma(mt))
709 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 709 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
710 } 710 }
711 } while (--to_free && --batch_free && !list_empty(list)); 711 } while (--to_free && --batch_free && !list_empty(list));
712 } 712 }
713 spin_unlock(&zone->lock); 713 spin_unlock(&zone->lock);
714 } 714 }
715 715
716 static void free_one_page(struct zone *zone, 716 static void free_one_page(struct zone *zone,
717 struct page *page, unsigned long pfn, 717 struct page *page, unsigned long pfn,
718 unsigned int order, 718 unsigned int order,
719 int migratetype) 719 int migratetype)
720 { 720 {
721 unsigned long nr_scanned; 721 unsigned long nr_scanned;
722 spin_lock(&zone->lock); 722 spin_lock(&zone->lock);
723 nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); 723 nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
724 if (nr_scanned) 724 if (nr_scanned)
725 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); 725 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
726 726
727 __free_one_page(page, pfn, zone, order, migratetype); 727 __free_one_page(page, pfn, zone, order, migratetype);
728 if (unlikely(!is_migrate_isolate(migratetype))) 728 if (unlikely(!is_migrate_isolate(migratetype)))
729 __mod_zone_freepage_state(zone, 1 << order, migratetype); 729 __mod_zone_freepage_state(zone, 1 << order, migratetype);
730 spin_unlock(&zone->lock); 730 spin_unlock(&zone->lock);
731 } 731 }
732 732
733 static bool free_pages_prepare(struct page *page, unsigned int order) 733 static bool free_pages_prepare(struct page *page, unsigned int order)
734 { 734 {
735 int i; 735 int i;
736 int bad = 0; 736 int bad = 0;
737 737
738 trace_mm_page_free(page, order); 738 trace_mm_page_free(page, order);
739 kmemcheck_free_shadow(page, order); 739 kmemcheck_free_shadow(page, order);
740 740
741 if (PageAnon(page)) 741 if (PageAnon(page))
742 page->mapping = NULL; 742 page->mapping = NULL;
743 for (i = 0; i < (1 << order); i++) 743 for (i = 0; i < (1 << order); i++)
744 bad += free_pages_check(page + i); 744 bad += free_pages_check(page + i);
745 if (bad) 745 if (bad)
746 return false; 746 return false;
747 747
748 if (!PageHighMem(page)) { 748 if (!PageHighMem(page)) {
749 debug_check_no_locks_freed(page_address(page), 749 debug_check_no_locks_freed(page_address(page),
750 PAGE_SIZE << order); 750 PAGE_SIZE << order);
751 debug_check_no_obj_freed(page_address(page), 751 debug_check_no_obj_freed(page_address(page),
752 PAGE_SIZE << order); 752 PAGE_SIZE << order);
753 } 753 }
754 arch_free_page(page, order); 754 arch_free_page(page, order);
755 kernel_map_pages(page, 1 << order, 0); 755 kernel_map_pages(page, 1 << order, 0);
756 756
757 return true; 757 return true;
758 } 758 }
759 759
760 static void __free_pages_ok(struct page *page, unsigned int order) 760 static void __free_pages_ok(struct page *page, unsigned int order)
761 { 761 {
762 unsigned long flags; 762 unsigned long flags;
763 int migratetype; 763 int migratetype;
764 unsigned long pfn = page_to_pfn(page); 764 unsigned long pfn = page_to_pfn(page);
765 765
766 if (!free_pages_prepare(page, order)) 766 if (!free_pages_prepare(page, order))
767 return; 767 return;
768 768
769 migratetype = get_pfnblock_migratetype(page, pfn); 769 migratetype = get_pfnblock_migratetype(page, pfn);
770 local_irq_save(flags); 770 local_irq_save(flags);
771 __count_vm_events(PGFREE, 1 << order); 771 __count_vm_events(PGFREE, 1 << order);
772 set_freepage_migratetype(page, migratetype); 772 set_freepage_migratetype(page, migratetype);
773 free_one_page(page_zone(page), page, pfn, order, migratetype); 773 free_one_page(page_zone(page), page, pfn, order, migratetype);
774 local_irq_restore(flags); 774 local_irq_restore(flags);
775 } 775 }
776 776
777 void __init __free_pages_bootmem(struct page *page, unsigned int order) 777 void __init __free_pages_bootmem(struct page *page, unsigned int order)
778 { 778 {
779 unsigned int nr_pages = 1 << order; 779 unsigned int nr_pages = 1 << order;
780 struct page *p = page; 780 struct page *p = page;
781 unsigned int loop; 781 unsigned int loop;
782 782
783 prefetchw(p); 783 prefetchw(p);
784 for (loop = 0; loop < (nr_pages - 1); loop++, p++) { 784 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
785 prefetchw(p + 1); 785 prefetchw(p + 1);
786 __ClearPageReserved(p); 786 __ClearPageReserved(p);
787 set_page_count(p, 0); 787 set_page_count(p, 0);
788 } 788 }
789 __ClearPageReserved(p); 789 __ClearPageReserved(p);
790 set_page_count(p, 0); 790 set_page_count(p, 0);
791 791
792 page_zone(page)->managed_pages += nr_pages; 792 page_zone(page)->managed_pages += nr_pages;
793 set_page_refcounted(page); 793 set_page_refcounted(page);
794 __free_pages(page, order); 794 __free_pages(page, order);
795 } 795 }
796 796
797 #ifdef CONFIG_CMA 797 #ifdef CONFIG_CMA
798 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 798 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
799 void __init init_cma_reserved_pageblock(struct page *page) 799 void __init init_cma_reserved_pageblock(struct page *page)
800 { 800 {
801 unsigned i = pageblock_nr_pages; 801 unsigned i = pageblock_nr_pages;
802 struct page *p = page; 802 struct page *p = page;
803 803
804 do { 804 do {
805 __ClearPageReserved(p); 805 __ClearPageReserved(p);
806 set_page_count(p, 0); 806 set_page_count(p, 0);
807 } while (++p, --i); 807 } while (++p, --i);
808 808
809 set_pageblock_migratetype(page, MIGRATE_CMA); 809 set_pageblock_migratetype(page, MIGRATE_CMA);
810 810
811 if (pageblock_order >= MAX_ORDER) { 811 if (pageblock_order >= MAX_ORDER) {
812 i = pageblock_nr_pages; 812 i = pageblock_nr_pages;
813 p = page; 813 p = page;
814 do { 814 do {
815 set_page_refcounted(p); 815 set_page_refcounted(p);
816 __free_pages(p, MAX_ORDER - 1); 816 __free_pages(p, MAX_ORDER - 1);
817 p += MAX_ORDER_NR_PAGES; 817 p += MAX_ORDER_NR_PAGES;
818 } while (i -= MAX_ORDER_NR_PAGES); 818 } while (i -= MAX_ORDER_NR_PAGES);
819 } else { 819 } else {
820 set_page_refcounted(page); 820 set_page_refcounted(page);
821 __free_pages(page, pageblock_order); 821 __free_pages(page, pageblock_order);
822 } 822 }
823 823
824 adjust_managed_page_count(page, pageblock_nr_pages); 824 adjust_managed_page_count(page, pageblock_nr_pages);
825 } 825 }
826 #endif 826 #endif
827 827
828 /* 828 /*
829 * The order of subdivision here is critical for the IO subsystem. 829 * The order of subdivision here is critical for the IO subsystem.
830 * Please do not alter this order without good reasons and regression 830 * Please do not alter this order without good reasons and regression
831 * testing. Specifically, as large blocks of memory are subdivided, 831 * testing. Specifically, as large blocks of memory are subdivided,
832 * the order in which smaller blocks are delivered depends on the order 832 * the order in which smaller blocks are delivered depends on the order
833 * they're subdivided in this function. This is the primary factor 833 * they're subdivided in this function. This is the primary factor
834 * influencing the order in which pages are delivered to the IO 834 * influencing the order in which pages are delivered to the IO
835 * subsystem according to empirical testing, and this is also justified 835 * subsystem according to empirical testing, and this is also justified
836 * by considering the behavior of a buddy system containing a single 836 * by considering the behavior of a buddy system containing a single
837 * large block of memory acted on by a series of small allocations. 837 * large block of memory acted on by a series of small allocations.
838 * This behavior is a critical factor in sglist merging's success. 838 * This behavior is a critical factor in sglist merging's success.
839 * 839 *
840 * -- nyc 840 * -- nyc
841 */ 841 */
842 static inline void expand(struct zone *zone, struct page *page, 842 static inline void expand(struct zone *zone, struct page *page,
843 int low, int high, struct free_area *area, 843 int low, int high, struct free_area *area,
844 int migratetype) 844 int migratetype)
845 { 845 {
846 unsigned long size = 1 << high; 846 unsigned long size = 1 << high;
847 847
848 while (high > low) { 848 while (high > low) {
849 area--; 849 area--;
850 high--; 850 high--;
851 size >>= 1; 851 size >>= 1;
852 VM_BUG_ON(bad_range(zone, &page[size])); 852 VM_BUG_ON(bad_range(zone, &page[size]));
853 853
854 #ifdef CONFIG_DEBUG_PAGEALLOC 854 #ifdef CONFIG_DEBUG_PAGEALLOC
855 if (high < debug_guardpage_minorder()) { 855 if (high < debug_guardpage_minorder()) {
856 /* 856 /*
857 * Mark as guard pages (or page), that will allow to 857 * Mark as guard pages (or page), that will allow to
858 * merge back to allocator when buddy will be freed. 858 * merge back to allocator when buddy will be freed.
859 * Corresponding page table entries will not be touched, 859 * Corresponding page table entries will not be touched,
860 * pages will stay not present in virtual address space 860 * pages will stay not present in virtual address space
861 */ 861 */
862 INIT_LIST_HEAD(&page[size].lru); 862 INIT_LIST_HEAD(&page[size].lru);
863 set_page_guard_flag(&page[size]); 863 set_page_guard_flag(&page[size]);
864 set_page_private(&page[size], high); 864 set_page_private(&page[size], high);
865 /* Guard pages are not available for any usage */ 865 /* Guard pages are not available for any usage */
866 __mod_zone_freepage_state(zone, -(1 << high), 866 __mod_zone_freepage_state(zone, -(1 << high),
867 migratetype); 867 migratetype);
868 continue; 868 continue;
869 } 869 }
870 #endif 870 #endif
871 list_add(&page[size].lru, &area->free_list[migratetype]); 871 list_add(&page[size].lru, &area->free_list[migratetype]);
872 area->nr_free++; 872 area->nr_free++;
873 set_page_order(&page[size], high); 873 set_page_order(&page[size], high);
874 } 874 }
875 } 875 }
876 876
877 /* 877 /*
878 * This page is about to be returned from the page allocator 878 * This page is about to be returned from the page allocator
879 */ 879 */
880 static inline int check_new_page(struct page *page) 880 static inline int check_new_page(struct page *page)
881 { 881 {
882 if (unlikely(page_mapcount(page) | 882 if (unlikely(page_mapcount(page) |
883 (page->mapping != NULL) | 883 (page->mapping != NULL) |
884 (atomic_read(&page->_count) != 0) | 884 (atomic_read(&page->_count) != 0) |
885 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 885 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
886 (mem_cgroup_bad_page_check(page)))) { 886 (mem_cgroup_bad_page_check(page)))) {
887 bad_page(page); 887 bad_page(page);
888 return 1; 888 return 1;
889 } 889 }
890 return 0; 890 return 0;
891 } 891 }
892 892
893 static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) 893 static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
894 { 894 {
895 int i; 895 int i;
896 896
897 for (i = 0; i < (1 << order); i++) { 897 for (i = 0; i < (1 << order); i++) {
898 struct page *p = page + i; 898 struct page *p = page + i;
899 if (unlikely(check_new_page(p))) 899 if (unlikely(check_new_page(p)))
900 return 1; 900 return 1;
901 } 901 }
902 902
903 set_page_private(page, 0); 903 set_page_private(page, 0);
904 set_page_refcounted(page); 904 set_page_refcounted(page);
905 905
906 arch_alloc_page(page, order); 906 arch_alloc_page(page, order);
907 kernel_map_pages(page, 1 << order, 1); 907 kernel_map_pages(page, 1 << order, 1);
908 908
909 if (gfp_flags & __GFP_ZERO) 909 if (gfp_flags & __GFP_ZERO)
910 prep_zero_page(page, order, gfp_flags); 910 prep_zero_page(page, order, gfp_flags);
911 911
912 if (order && (gfp_flags & __GFP_COMP)) 912 if (order && (gfp_flags & __GFP_COMP))
913 prep_compound_page(page, order); 913 prep_compound_page(page, order);
914 914
915 return 0; 915 return 0;
916 } 916 }
917 917
918 /* 918 /*
919 * Go through the free lists for the given migratetype and remove 919 * Go through the free lists for the given migratetype and remove
920 * the smallest available page from the freelists 920 * the smallest available page from the freelists
921 */ 921 */
922 static inline 922 static inline
923 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 923 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
924 int migratetype) 924 int migratetype)
925 { 925 {
926 unsigned int current_order; 926 unsigned int current_order;
927 struct free_area *area; 927 struct free_area *area;
928 struct page *page; 928 struct page *page;
929 929
930 /* Find a page of the appropriate size in the preferred list */ 930 /* Find a page of the appropriate size in the preferred list */
931 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 931 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
932 area = &(zone->free_area[current_order]); 932 area = &(zone->free_area[current_order]);
933 if (list_empty(&area->free_list[migratetype])) 933 if (list_empty(&area->free_list[migratetype]))
934 continue; 934 continue;
935 935
936 page = list_entry(area->free_list[migratetype].next, 936 page = list_entry(area->free_list[migratetype].next,
937 struct page, lru); 937 struct page, lru);
938 list_del(&page->lru); 938 list_del(&page->lru);
939 rmv_page_order(page); 939 rmv_page_order(page);
940 area->nr_free--; 940 area->nr_free--;
941 expand(zone, page, order, current_order, area, migratetype); 941 expand(zone, page, order, current_order, area, migratetype);
942 set_freepage_migratetype(page, migratetype); 942 set_freepage_migratetype(page, migratetype);
943 return page; 943 return page;
944 } 944 }
945 945
946 return NULL; 946 return NULL;
947 } 947 }
948 948
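/*
 * Worked example (editorial): an order-2 request served from an order-4
 * free block. expand(), called by __rmqueue_smallest() above, peels off
 * the upper half of the block at each step until the remainder matches
 * the requested order.
 */
static void example_buddy_split(void)
{
	unsigned int high = 4, low = 2;
	unsigned long size = 1UL << high;	/* 16 pages to start with */

	while (high > low) {
		high--;
		size >>= 1;
		/*
		 * Pages [size .. 2*size-1] of the block go back to
		 * free_area[high]: first an order-3 buddy (8 pages),
		 * then an order-2 buddy (4 pages).
		 */
	}
	/* the remaining 1 << low = 4 pages are handed to the caller */
}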
949 949
950 /* 950 /*
951 * This array describes the order in which free lists are fallen back on 951 * This array describes the order in which free lists are fallen back on
952 * when the free lists for the desired migrate type are depleted 952 * when the free lists for the desired migrate type are depleted
953 */ 953 */
954 static int fallbacks[MIGRATE_TYPES][4] = { 954 static int fallbacks[MIGRATE_TYPES][4] = {
955 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 955 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
956 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 956 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
957 #ifdef CONFIG_CMA 957 #ifdef CONFIG_CMA
958 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 958 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
959 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 959 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
960 #else 960 #else
961 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 961 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
962 #endif 962 #endif
963 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 963 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
964 #ifdef CONFIG_MEMORY_ISOLATION 964 #ifdef CONFIG_MEMORY_ISOLATION
965 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 965 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
966 #endif 966 #endif
967 }; 967 };
968 968
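/*
 * Editorial illustration: the order in which __rmqueue_fallback() below
 * consults the table above for a MIGRATE_UNMOVABLE request. The walk
 * stops at MIGRATE_RESERVE, which is handled separately by __rmqueue().
 */
static void example_fallback_walk(void)
{
	int i, migratetype;

	for (i = 0;; i++) {
		migratetype = fallbacks[MIGRATE_UNMOVABLE][i];
		if (migratetype == MIGRATE_RESERVE)
			break;	/* tried MIGRATE_RECLAIMABLE, then MIGRATE_MOVABLE */
		/* ... search free_area[order].free_list[migratetype] ... */
	}
}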
969 /* 969 /*
970 * Move the free pages in a range to the free lists of the requested type. 970 * Move the free pages in a range to the free lists of the requested type.
971 * Note that start_page and end_page are not aligned on a pageblock 971 * Note that start_page and end_page are not aligned on a pageblock
972 * boundary. If alignment is required, use move_freepages_block() 972 * boundary. If alignment is required, use move_freepages_block()
973 */ 973 */
974 int move_freepages(struct zone *zone, 974 int move_freepages(struct zone *zone,
975 struct page *start_page, struct page *end_page, 975 struct page *start_page, struct page *end_page,
976 int migratetype) 976 int migratetype)
977 { 977 {
978 struct page *page; 978 struct page *page;
979 unsigned long order; 979 unsigned long order;
980 int pages_moved = 0; 980 int pages_moved = 0;
981 981
982 #ifndef CONFIG_HOLES_IN_ZONE 982 #ifndef CONFIG_HOLES_IN_ZONE
983 /* 983 /*
984 * page_zone is not safe to call in this context when 984 * page_zone is not safe to call in this context when
985 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 985 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
986 * anyway as we check zone boundaries in move_freepages_block(). 986 * anyway as we check zone boundaries in move_freepages_block().
987 * Remove at a later date when no bug reports exist related to 987 * Remove at a later date when no bug reports exist related to
988 * grouping pages by mobility 988 * grouping pages by mobility
989 */ 989 */
990 BUG_ON(page_zone(start_page) != page_zone(end_page)); 990 BUG_ON(page_zone(start_page) != page_zone(end_page));
991 #endif 991 #endif
992 992
993 for (page = start_page; page <= end_page;) { 993 for (page = start_page; page <= end_page;) {
994 /* Make sure we are not inadvertently changing nodes */ 994 /* Make sure we are not inadvertently changing nodes */
995 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 995 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
996 996
997 if (!pfn_valid_within(page_to_pfn(page))) { 997 if (!pfn_valid_within(page_to_pfn(page))) {
998 page++; 998 page++;
999 continue; 999 continue;
1000 } 1000 }
1001 1001
1002 if (!PageBuddy(page)) { 1002 if (!PageBuddy(page)) {
1003 page++; 1003 page++;
1004 continue; 1004 continue;
1005 } 1005 }
1006 1006
1007 order = page_order(page); 1007 order = page_order(page);
1008 list_move(&page->lru, 1008 list_move(&page->lru,
1009 &zone->free_area[order].free_list[migratetype]); 1009 &zone->free_area[order].free_list[migratetype]);
1010 set_freepage_migratetype(page, migratetype); 1010 set_freepage_migratetype(page, migratetype);
1011 page += 1 << order; 1011 page += 1 << order;
1012 pages_moved += 1 << order; 1012 pages_moved += 1 << order;
1013 } 1013 }
1014 1014
1015 return pages_moved; 1015 return pages_moved;
1016 } 1016 }
1017 1017
1018 int move_freepages_block(struct zone *zone, struct page *page, 1018 int move_freepages_block(struct zone *zone, struct page *page,
1019 int migratetype) 1019 int migratetype)
1020 { 1020 {
1021 unsigned long start_pfn, end_pfn; 1021 unsigned long start_pfn, end_pfn;
1022 struct page *start_page, *end_page; 1022 struct page *start_page, *end_page;
1023 1023
1024 start_pfn = page_to_pfn(page); 1024 start_pfn = page_to_pfn(page);
1025 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 1025 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
1026 start_page = pfn_to_page(start_pfn); 1026 start_page = pfn_to_page(start_pfn);
1027 end_page = start_page + pageblock_nr_pages - 1; 1027 end_page = start_page + pageblock_nr_pages - 1;
1028 end_pfn = start_pfn + pageblock_nr_pages - 1; 1028 end_pfn = start_pfn + pageblock_nr_pages - 1;
1029 1029
1030 /* Do not cross zone boundaries */ 1030 /* Do not cross zone boundaries */
1031 if (!zone_spans_pfn(zone, start_pfn)) 1031 if (!zone_spans_pfn(zone, start_pfn))
1032 start_page = page; 1032 start_page = page;
1033 if (!zone_spans_pfn(zone, end_pfn)) 1033 if (!zone_spans_pfn(zone, end_pfn))
1034 return 0; 1034 return 0;
1035 1035
1036 return move_freepages(zone, start_page, end_page, migratetype); 1036 return move_freepages(zone, start_page, end_page, migratetype);
1037 } 1037 }
1038 1038
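/*
 * Worked example (editorial, assuming pageblock_nr_pages == 512, i.e.
 * 2MB pageblocks with 4K pages): how move_freepages_block() above
 * derives a pageblock-aligned range from an arbitrary page.
 */
static void example_pageblock_align(void)
{
	unsigned long nr = 512;				/* pageblock_nr_pages */
	unsigned long pfn = 1000;
	unsigned long start_pfn = pfn & ~(nr - 1);	/* 512 */
	unsigned long end_pfn = start_pfn + nr - 1;	/* 1023 */

	(void)start_pfn;
	(void)end_pfn;
}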
1039 static void change_pageblock_range(struct page *pageblock_page, 1039 static void change_pageblock_range(struct page *pageblock_page,
1040 int start_order, int migratetype) 1040 int start_order, int migratetype)
1041 { 1041 {
1042 int nr_pageblocks = 1 << (start_order - pageblock_order); 1042 int nr_pageblocks = 1 << (start_order - pageblock_order);
1043 1043
1044 while (nr_pageblocks--) { 1044 while (nr_pageblocks--) {
1045 set_pageblock_migratetype(pageblock_page, migratetype); 1045 set_pageblock_migratetype(pageblock_page, migratetype);
1046 pageblock_page += pageblock_nr_pages; 1046 pageblock_page += pageblock_nr_pages;
1047 } 1047 }
1048 } 1048 }
1049 1049
1050 /* 1050 /*
1051 * If breaking a large block of pages, move all free pages to the preferred 1051 * If breaking a large block of pages, move all free pages to the preferred
1052 * allocation list. If falling back for a reclaimable kernel allocation, be 1052 * allocation list. If falling back for a reclaimable kernel allocation, be
1053 * more aggressive about taking ownership of free pages. 1053 * more aggressive about taking ownership of free pages.
1054 * 1054 *
1055 * On the other hand, never change migration type of MIGRATE_CMA pageblocks 1055 * On the other hand, never change migration type of MIGRATE_CMA pageblocks
1056 * nor move CMA pages to different free lists. We don't want unmovable pages 1056 * nor move CMA pages to different free lists. We don't want unmovable pages
1057 * to be allocated from MIGRATE_CMA areas. 1057 * to be allocated from MIGRATE_CMA areas.
1058 * 1058 *
1059 * Returns the new migratetype of the pageblock (or the same old migratetype 1059 * Returns the new migratetype of the pageblock (or the same old migratetype
1060 * if it was unchanged). 1060 * if it was unchanged).
1061 */ 1061 */
1062 static int try_to_steal_freepages(struct zone *zone, struct page *page, 1062 static int try_to_steal_freepages(struct zone *zone, struct page *page,
1063 int start_type, int fallback_type) 1063 int start_type, int fallback_type)
1064 { 1064 {
1065 int current_order = page_order(page); 1065 int current_order = page_order(page);
1066 1066
1067 /* 1067 /*
1068 * When borrowing from MIGRATE_CMA, we need to release the excess 1068 * When borrowing from MIGRATE_CMA, we need to release the excess
1069 * buddy pages to CMA itself. We also ensure the freepage_migratetype 1069 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1070 * is set to CMA so it is returned to the correct freelist in case 1070 * is set to CMA so it is returned to the correct freelist in case
1071 * the page ends up not actually being allocated from the pcp lists. 1071 * the page ends up not actually being allocated from the pcp lists.
1072 */ 1072 */
1073 if (is_migrate_cma(fallback_type)) 1073 if (is_migrate_cma(fallback_type))
1074 return fallback_type; 1074 return fallback_type;
1075 1075
1076 /* Take ownership for orders >= pageblock_order */ 1076 /* Take ownership for orders >= pageblock_order */
1077 if (current_order >= pageblock_order) { 1077 if (current_order >= pageblock_order) {
1078 change_pageblock_range(page, current_order, start_type); 1078 change_pageblock_range(page, current_order, start_type);
1079 return start_type; 1079 return start_type;
1080 } 1080 }
1081 1081
1082 if (current_order >= pageblock_order / 2 || 1082 if (current_order >= pageblock_order / 2 ||
1083 start_type == MIGRATE_RECLAIMABLE || 1083 start_type == MIGRATE_RECLAIMABLE ||
1084 page_group_by_mobility_disabled) { 1084 page_group_by_mobility_disabled) {
1085 int pages; 1085 int pages;
1086 1086
1087 pages = move_freepages_block(zone, page, start_type); 1087 pages = move_freepages_block(zone, page, start_type);
1088 1088
1089 /* Claim the whole block if over half of it is free */ 1089 /* Claim the whole block if over half of it is free */
1090 if (pages >= (1 << (pageblock_order-1)) || 1090 if (pages >= (1 << (pageblock_order-1)) ||
1091 page_group_by_mobility_disabled) { 1091 page_group_by_mobility_disabled) {
1092 1092
1093 set_pageblock_migratetype(page, start_type); 1093 set_pageblock_migratetype(page, start_type);
1094 return start_type; 1094 return start_type;
1095 } 1095 }
1096 1096
1097 } 1097 }
1098 1098
1099 return fallback_type; 1099 return fallback_type;
1100 } 1100 }
1101 1101
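/*
 * Worked example (editorial, assuming pageblock_order == 9): the "over
 * half of it is free" test in try_to_steal_freepages() above. With a
 * 512-page pageblock, move_freepages_block() must have moved at least
 * 1 << (9 - 1) == 256 pages for the whole pageblock to change type.
 */
static void example_steal_threshold(void)
{
	int threshold = 1 << (9 - 1);	/* 256 pages */

	/* e.g. 300 moved pages >= 256, so the whole pageblock is claimed;
	 * 100 moved pages would leave the pageblock's type unchanged. */
	(void)threshold;
}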
1102 /* Remove an element from the buddy allocator from the fallback list */ 1102 /* Remove an element from the buddy allocator from the fallback list */
1103 static inline struct page * 1103 static inline struct page *
1104 __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) 1104 __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1105 { 1105 {
1106 struct free_area *area; 1106 struct free_area *area;
1107 unsigned int current_order; 1107 unsigned int current_order;
1108 struct page *page; 1108 struct page *page;
1109 int migratetype, new_type, i; 1109 int migratetype, new_type, i;
1110 1110
1111 /* Find the largest possible block of pages in the other list */ 1111 /* Find the largest possible block of pages in the other list */
1112 for (current_order = MAX_ORDER-1; 1112 for (current_order = MAX_ORDER-1;
1113 current_order >= order && current_order <= MAX_ORDER-1; 1113 current_order >= order && current_order <= MAX_ORDER-1;
1114 --current_order) { 1114 --current_order) {
1115 for (i = 0;; i++) { 1115 for (i = 0;; i++) {
1116 migratetype = fallbacks[start_migratetype][i]; 1116 migratetype = fallbacks[start_migratetype][i];
1117 1117
1118 /* MIGRATE_RESERVE handled later if necessary */ 1118 /* MIGRATE_RESERVE handled later if necessary */
1119 if (migratetype == MIGRATE_RESERVE) 1119 if (migratetype == MIGRATE_RESERVE)
1120 break; 1120 break;
1121 1121
1122 area = &(zone->free_area[current_order]); 1122 area = &(zone->free_area[current_order]);
1123 if (list_empty(&area->free_list[migratetype])) 1123 if (list_empty(&area->free_list[migratetype]))
1124 continue; 1124 continue;
1125 1125
1126 page = list_entry(area->free_list[migratetype].next, 1126 page = list_entry(area->free_list[migratetype].next,
1127 struct page, lru); 1127 struct page, lru);
1128 area->nr_free--; 1128 area->nr_free--;
1129 1129
1130 new_type = try_to_steal_freepages(zone, page, 1130 new_type = try_to_steal_freepages(zone, page,
1131 start_migratetype, 1131 start_migratetype,
1132 migratetype); 1132 migratetype);
1133 1133
1134 /* Remove the page from the freelists */ 1134 /* Remove the page from the freelists */
1135 list_del(&page->lru); 1135 list_del(&page->lru);
1136 rmv_page_order(page); 1136 rmv_page_order(page);
1137 1137
1138 expand(zone, page, order, current_order, area, 1138 expand(zone, page, order, current_order, area,
1139 new_type); 1139 new_type);
1140 /* The freepage_migratetype may differ from pageblock's 1140 /* The freepage_migratetype may differ from pageblock's
1141 * migratetype depending on the decisions in 1141 * migratetype depending on the decisions in
1142 * try_to_steal_freepages. This is OK as long as it does 1142 * try_to_steal_freepages. This is OK as long as it does
1143 * not differ for MIGRATE_CMA type. 1143 * not differ for MIGRATE_CMA type.
1144 */ 1144 */
1145 set_freepage_migratetype(page, new_type); 1145 set_freepage_migratetype(page, new_type);
1146 1146
1147 trace_mm_page_alloc_extfrag(page, order, current_order, 1147 trace_mm_page_alloc_extfrag(page, order, current_order,
1148 start_migratetype, migratetype, new_type); 1148 start_migratetype, migratetype, new_type);
1149 1149
1150 return page; 1150 return page;
1151 } 1151 }
1152 } 1152 }
1153 1153
1154 return NULL; 1154 return NULL;
1155 } 1155 }
1156 1156
1157 /* 1157 /*
1158 * Do the hard work of removing an element from the buddy allocator. 1158 * Do the hard work of removing an element from the buddy allocator.
1159 * Call me with the zone->lock already held. 1159 * Call me with the zone->lock already held.
1160 */ 1160 */
1161 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1161 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1162 int migratetype) 1162 int migratetype)
1163 { 1163 {
1164 struct page *page; 1164 struct page *page;
1165 1165
1166 retry_reserve: 1166 retry_reserve:
1167 page = __rmqueue_smallest(zone, order, migratetype); 1167 page = __rmqueue_smallest(zone, order, migratetype);
1168 1168
1169 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1169 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1170 page = __rmqueue_fallback(zone, order, migratetype); 1170 page = __rmqueue_fallback(zone, order, migratetype);
1171 1171
1172 /* 1172 /*
1173 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1173 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1174 * is used because __rmqueue_smallest is an inline function 1174 * is used because __rmqueue_smallest is an inline function
1175 * and we want just one call site 1175 * and we want just one call site
1176 */ 1176 */
1177 if (!page) { 1177 if (!page) {
1178 migratetype = MIGRATE_RESERVE; 1178 migratetype = MIGRATE_RESERVE;
1179 goto retry_reserve; 1179 goto retry_reserve;
1180 } 1180 }
1181 } 1181 }
1182 1182
1183 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1183 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1184 return page; 1184 return page;
1185 } 1185 }
1186 1186
1187 /* 1187 /*
1188 * Obtain a specified number of elements from the buddy allocator, all under 1188 * Obtain a specified number of elements from the buddy allocator, all under
1189 * a single hold of the lock, for efficiency. Add them to the supplied list. 1189 * a single hold of the lock, for efficiency. Add them to the supplied list.
1190 * Returns the number of new pages which were placed at *list. 1190 * Returns the number of new pages which were placed at *list.
1191 */ 1191 */
1192 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1192 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1193 unsigned long count, struct list_head *list, 1193 unsigned long count, struct list_head *list,
1194 int migratetype, bool cold) 1194 int migratetype, bool cold)
1195 { 1195 {
1196 int i; 1196 int i;
1197 1197
1198 spin_lock(&zone->lock); 1198 spin_lock(&zone->lock);
1199 for (i = 0; i < count; ++i) { 1199 for (i = 0; i < count; ++i) {
1200 struct page *page = __rmqueue(zone, order, migratetype); 1200 struct page *page = __rmqueue(zone, order, migratetype);
1201 if (unlikely(page == NULL)) 1201 if (unlikely(page == NULL))
1202 break; 1202 break;
1203 1203
1204 /* 1204 /*
1205 * Split buddy pages returned by expand() are received here 1205 * Split buddy pages returned by expand() are received here
1206 * in physical page order. The page is added to the caller's 1206 * in physical page order. The page is added to the caller's
1207 * list and the list head then moves forward. From the caller's 1207 * list and the list head then moves forward. From the caller's
1208 * perspective, the linked list is ordered by page number in 1208 * perspective, the linked list is ordered by page number in
1209 * some conditions. This is useful for IO devices that can 1209 * some conditions. This is useful for IO devices that can
1210 * merge IO requests if the physical pages are ordered 1210 * merge IO requests if the physical pages are ordered
1211 * properly. 1211 * properly.
1212 */ 1212 */
1213 if (likely(!cold)) 1213 if (likely(!cold))
1214 list_add(&page->lru, list); 1214 list_add(&page->lru, list);
1215 else 1215 else
1216 list_add_tail(&page->lru, list); 1216 list_add_tail(&page->lru, list);
1217 list = &page->lru; 1217 list = &page->lru;
1218 if (is_migrate_cma(get_freepage_migratetype(page))) 1218 if (is_migrate_cma(get_freepage_migratetype(page)))
1219 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1219 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1220 -(1 << order)); 1220 -(1 << order));
1221 } 1221 }
1222 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1222 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1223 spin_unlock(&zone->lock); 1223 spin_unlock(&zone->lock);
1224 return i; 1224 return i;
1225 } 1225 }
1226 1226
1227 #ifdef CONFIG_NUMA 1227 #ifdef CONFIG_NUMA
1228 /* 1228 /*
1229 * Called from the vmstat counter updater to drain pagesets of this 1229 * Called from the vmstat counter updater to drain pagesets of this
1230 * currently executing processor on remote nodes after they have 1230 * currently executing processor on remote nodes after they have
1231 * expired. 1231 * expired.
1232 * 1232 *
1233 * Note that this function must be called with the thread pinned to 1233 * Note that this function must be called with the thread pinned to
1234 * a single processor. 1234 * a single processor.
1235 */ 1235 */
1236 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1236 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1237 { 1237 {
1238 unsigned long flags; 1238 unsigned long flags;
1239 int to_drain; 1239 int to_drain;
1240 unsigned long batch; 1240 unsigned long batch;
1241 1241
1242 local_irq_save(flags); 1242 local_irq_save(flags);
1243 batch = ACCESS_ONCE(pcp->batch); 1243 batch = ACCESS_ONCE(pcp->batch);
1244 if (pcp->count >= batch) 1244 if (pcp->count >= batch)
1245 to_drain = batch; 1245 to_drain = batch;
1246 else 1246 else
1247 to_drain = pcp->count; 1247 to_drain = pcp->count;
1248 if (to_drain > 0) { 1248 if (to_drain > 0) {
1249 free_pcppages_bulk(zone, to_drain, pcp); 1249 free_pcppages_bulk(zone, to_drain, pcp);
1250 pcp->count -= to_drain; 1250 pcp->count -= to_drain;
1251 } 1251 }
1252 local_irq_restore(flags); 1252 local_irq_restore(flags);
1253 } 1253 }
1254 #endif 1254 #endif
1255 1255
1256 /* 1256 /*
1257 * Drain pages of the indicated processor. 1257 * Drain pages of the indicated processor.
1258 * 1258 *
1259 * The processor must either be the current processor and the 1259 * The processor must either be the current processor and the
1260 * thread pinned to the current processor or a processor that 1260 * thread pinned to the current processor or a processor that
1261 * is not online. 1261 * is not online.
1262 */ 1262 */
1263 static void drain_pages(unsigned int cpu) 1263 static void drain_pages(unsigned int cpu)
1264 { 1264 {
1265 unsigned long flags; 1265 unsigned long flags;
1266 struct zone *zone; 1266 struct zone *zone;
1267 1267
1268 for_each_populated_zone(zone) { 1268 for_each_populated_zone(zone) {
1269 struct per_cpu_pageset *pset; 1269 struct per_cpu_pageset *pset;
1270 struct per_cpu_pages *pcp; 1270 struct per_cpu_pages *pcp;
1271 1271
1272 local_irq_save(flags); 1272 local_irq_save(flags);
1273 pset = per_cpu_ptr(zone->pageset, cpu); 1273 pset = per_cpu_ptr(zone->pageset, cpu);
1274 1274
1275 pcp = &pset->pcp; 1275 pcp = &pset->pcp;
1276 if (pcp->count) { 1276 if (pcp->count) {
1277 free_pcppages_bulk(zone, pcp->count, pcp); 1277 free_pcppages_bulk(zone, pcp->count, pcp);
1278 pcp->count = 0; 1278 pcp->count = 0;
1279 } 1279 }
1280 local_irq_restore(flags); 1280 local_irq_restore(flags);
1281 } 1281 }
1282 } 1282 }
1283 1283
1284 /* 1284 /*
1285 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1285 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1286 */ 1286 */
1287 void drain_local_pages(void *arg) 1287 void drain_local_pages(void *arg)
1288 { 1288 {
1289 drain_pages(smp_processor_id()); 1289 drain_pages(smp_processor_id());
1290 } 1290 }
1291 1291
1292 /* 1292 /*
1293 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1293 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1294 * 1294 *
1295 * Note that this code is protected against sending an IPI to an offline 1295 * Note that this code is protected against sending an IPI to an offline
1296 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1296 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1297 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1297 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1298 * nothing keeps CPUs from showing up after we populated the cpumask and 1298 * nothing keeps CPUs from showing up after we populated the cpumask and
1299 * before the call to on_each_cpu_mask(). 1299 * before the call to on_each_cpu_mask().
1300 */ 1300 */
1301 void drain_all_pages(void) 1301 void drain_all_pages(void)
1302 { 1302 {
1303 int cpu; 1303 int cpu;
1304 struct per_cpu_pageset *pcp; 1304 struct per_cpu_pageset *pcp;
1305 struct zone *zone; 1305 struct zone *zone;
1306 1306
1307 /* 1307 /*
1308 * Allocate in the BSS so we won't require allocation in 1308 * Allocate in the BSS so we won't require allocation in
1309 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1309 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1310 */ 1310 */
1311 static cpumask_t cpus_with_pcps; 1311 static cpumask_t cpus_with_pcps;
1312 1312
1313 /* 1313 /*
1314 * We don't care about racing with a CPU hotplug event 1314 * We don't care about racing with a CPU hotplug event
1315 * as the offline notification will cause the notified 1315 * as the offline notification will cause the notified
1316 * CPU to drain that CPU's pcps, and on_each_cpu_mask 1316 * CPU to drain that CPU's pcps, and on_each_cpu_mask
1317 * disables preemption as part of its processing 1317 * disables preemption as part of its processing
1318 */ 1318 */
1319 for_each_online_cpu(cpu) { 1319 for_each_online_cpu(cpu) {
1320 bool has_pcps = false; 1320 bool has_pcps = false;
1321 for_each_populated_zone(zone) { 1321 for_each_populated_zone(zone) {
1322 pcp = per_cpu_ptr(zone->pageset, cpu); 1322 pcp = per_cpu_ptr(zone->pageset, cpu);
1323 if (pcp->pcp.count) { 1323 if (pcp->pcp.count) {
1324 has_pcps = true; 1324 has_pcps = true;
1325 break; 1325 break;
1326 } 1326 }
1327 } 1327 }
1328 if (has_pcps) 1328 if (has_pcps)
1329 cpumask_set_cpu(cpu, &cpus_with_pcps); 1329 cpumask_set_cpu(cpu, &cpus_with_pcps);
1330 else 1330 else
1331 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1331 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1332 } 1332 }
1333 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1333 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1334 } 1334 }
1335 1335
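/*
 * Editorial sketch of a typical caller pattern (not from this commit):
 * paths that are about to re-check free memory flush the per-cpu lists
 * first so that recently freed pages become visible to the buddy
 * allocator before watermarks are re-evaluated.
 */
static void example_flush_before_retry(void)
{
	drain_all_pages();	/* IPIs only the CPUs that hold pcp pages */
	/* ... retry the allocation or re-check zone watermarks ... */
}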
1336 #ifdef CONFIG_HIBERNATION 1336 #ifdef CONFIG_HIBERNATION
1337 1337
1338 void mark_free_pages(struct zone *zone) 1338 void mark_free_pages(struct zone *zone)
1339 { 1339 {
1340 unsigned long pfn, max_zone_pfn; 1340 unsigned long pfn, max_zone_pfn;
1341 unsigned long flags; 1341 unsigned long flags;
1342 unsigned int order, t; 1342 unsigned int order, t;
1343 struct list_head *curr; 1343 struct list_head *curr;
1344 1344
1345 if (zone_is_empty(zone)) 1345 if (zone_is_empty(zone))
1346 return; 1346 return;
1347 1347
1348 spin_lock_irqsave(&zone->lock, flags); 1348 spin_lock_irqsave(&zone->lock, flags);
1349 1349
1350 max_zone_pfn = zone_end_pfn(zone); 1350 max_zone_pfn = zone_end_pfn(zone);
1351 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1351 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1352 if (pfn_valid(pfn)) { 1352 if (pfn_valid(pfn)) {
1353 struct page *page = pfn_to_page(pfn); 1353 struct page *page = pfn_to_page(pfn);
1354 1354
1355 if (!swsusp_page_is_forbidden(page)) 1355 if (!swsusp_page_is_forbidden(page))
1356 swsusp_unset_page_free(page); 1356 swsusp_unset_page_free(page);
1357 } 1357 }
1358 1358
1359 for_each_migratetype_order(order, t) { 1359 for_each_migratetype_order(order, t) {
1360 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1360 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1361 unsigned long i; 1361 unsigned long i;
1362 1362
1363 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1363 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1364 for (i = 0; i < (1UL << order); i++) 1364 for (i = 0; i < (1UL << order); i++)
1365 swsusp_set_page_free(pfn_to_page(pfn + i)); 1365 swsusp_set_page_free(pfn_to_page(pfn + i));
1366 } 1366 }
1367 } 1367 }
1368 spin_unlock_irqrestore(&zone->lock, flags); 1368 spin_unlock_irqrestore(&zone->lock, flags);
1369 } 1369 }
1370 #endif /* CONFIG_PM */ 1370 #endif /* CONFIG_PM */
1371 1371
1372 /* 1372 /*
1373 * Free a 0-order page 1373 * Free a 0-order page
1374 * cold == true ? free a cold page : free a hot page 1374 * cold == true ? free a cold page : free a hot page
1375 */ 1375 */
1376 void free_hot_cold_page(struct page *page, bool cold) 1376 void free_hot_cold_page(struct page *page, bool cold)
1377 { 1377 {
1378 struct zone *zone = page_zone(page); 1378 struct zone *zone = page_zone(page);
1379 struct per_cpu_pages *pcp; 1379 struct per_cpu_pages *pcp;
1380 unsigned long flags; 1380 unsigned long flags;
1381 unsigned long pfn = page_to_pfn(page); 1381 unsigned long pfn = page_to_pfn(page);
1382 int migratetype; 1382 int migratetype;
1383 1383
1384 if (!free_pages_prepare(page, 0)) 1384 if (!free_pages_prepare(page, 0))
1385 return; 1385 return;
1386 1386
1387 migratetype = get_pfnblock_migratetype(page, pfn); 1387 migratetype = get_pfnblock_migratetype(page, pfn);
1388 set_freepage_migratetype(page, migratetype); 1388 set_freepage_migratetype(page, migratetype);
1389 local_irq_save(flags); 1389 local_irq_save(flags);
1390 __count_vm_event(PGFREE); 1390 __count_vm_event(PGFREE);
1391 1391
1392 /* 1392 /*
1393 * We only track unmovable, reclaimable and movable on pcp lists. 1393 * We only track unmovable, reclaimable and movable on pcp lists.
1394 * Free ISOLATE pages back to the allocator because they are being 1394 * Free ISOLATE pages back to the allocator because they are being
1395 * offlined but treat RESERVE as movable pages so we can get those 1395 * offlined but treat RESERVE as movable pages so we can get those
1396 * areas back if necessary. Otherwise, we may have to free 1396 * areas back if necessary. Otherwise, we may have to free
1397 * excessively into the page allocator 1397 * excessively into the page allocator
1398 */ 1398 */
1399 if (migratetype >= MIGRATE_PCPTYPES) { 1399 if (migratetype >= MIGRATE_PCPTYPES) {
1400 if (unlikely(is_migrate_isolate(migratetype))) { 1400 if (unlikely(is_migrate_isolate(migratetype))) {
1401 free_one_page(zone, page, pfn, 0, migratetype); 1401 free_one_page(zone, page, pfn, 0, migratetype);
1402 goto out; 1402 goto out;
1403 } 1403 }
1404 migratetype = MIGRATE_MOVABLE; 1404 migratetype = MIGRATE_MOVABLE;
1405 } 1405 }
1406 1406
1407 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1407 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1408 if (!cold) 1408 if (!cold)
1409 list_add(&page->lru, &pcp->lists[migratetype]); 1409 list_add(&page->lru, &pcp->lists[migratetype]);
1410 else 1410 else
1411 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1411 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1412 pcp->count++; 1412 pcp->count++;
1413 if (pcp->count >= pcp->high) { 1413 if (pcp->count >= pcp->high) {
1414 unsigned long batch = ACCESS_ONCE(pcp->batch); 1414 unsigned long batch = ACCESS_ONCE(pcp->batch);
1415 free_pcppages_bulk(zone, batch, pcp); 1415 free_pcppages_bulk(zone, batch, pcp);
1416 pcp->count -= batch; 1416 pcp->count -= batch;
1417 } 1417 }
1418 1418
1419 out: 1419 out:
1420 local_irq_restore(flags); 1420 local_irq_restore(flags);
1421 } 1421 }
1422 1422
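/*
 * Worked example (editorial; batch == 31 and high == 6 * batch == 186
 * are illustrative per-cpu defaults of this era): once pcp->count
 * reaches pcp->high, free_hot_cold_page() above returns one batch to
 * the buddy allocator rather than draining the whole list.
 */
static void example_pcp_trim(void)
{
	unsigned long count = 186, high = 186, batch = 31;

	if (count >= high)
		count -= batch;		/* 155 pages stay on the pcp list */
	(void)count;
}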
1423 /* 1423 /*
1424 * Free a list of 0-order pages 1424 * Free a list of 0-order pages
1425 */ 1425 */
1426 void free_hot_cold_page_list(struct list_head *list, bool cold) 1426 void free_hot_cold_page_list(struct list_head *list, bool cold)
1427 { 1427 {
1428 struct page *page, *next; 1428 struct page *page, *next;
1429 1429
1430 list_for_each_entry_safe(page, next, list, lru) { 1430 list_for_each_entry_safe(page, next, list, lru) {
1431 trace_mm_page_free_batched(page, cold); 1431 trace_mm_page_free_batched(page, cold);
1432 free_hot_cold_page(page, cold); 1432 free_hot_cold_page(page, cold);
1433 } 1433 }
1434 } 1434 }
1435 1435
1436 /* 1436 /*
1437 * split_page takes a non-compound higher-order page, and splits it into 1437 * split_page takes a non-compound higher-order page, and splits it into
1438 * n (1<<order) sub-pages: page[0..n-1] 1438 * n (1<<order) sub-pages: page[0..n-1]
1439 * Each sub-page must be freed individually. 1439 * Each sub-page must be freed individually.
1440 * 1440 *
1441 * Note: this is probably too low level an operation for use in drivers. 1441 * Note: this is probably too low level an operation for use in drivers.
1442 * Please consult with lkml before using this in your driver. 1442 * Please consult with lkml before using this in your driver.
1443 */ 1443 */
1444 void split_page(struct page *page, unsigned int order) 1444 void split_page(struct page *page, unsigned int order)
1445 { 1445 {
1446 int i; 1446 int i;
1447 1447
1448 VM_BUG_ON(PageCompound(page)); 1448 VM_BUG_ON(PageCompound(page));
1449 VM_BUG_ON(!page_count(page)); 1449 VM_BUG_ON(!page_count(page));
1450 1450
1451 #ifdef CONFIG_KMEMCHECK 1451 #ifdef CONFIG_KMEMCHECK
1452 /* 1452 /*
1453 * Split shadow pages too, because free(page[0]) would 1453 * Split shadow pages too, because free(page[0]) would
1454 * otherwise free the whole shadow. 1454 * otherwise free the whole shadow.
1455 */ 1455 */
1456 if (kmemcheck_page_is_tracked(page)) 1456 if (kmemcheck_page_is_tracked(page))
1457 split_page(virt_to_page(page[0].shadow), order); 1457 split_page(virt_to_page(page[0].shadow), order);
1458 #endif 1458 #endif
1459 1459
1460 for (i = 1; i < (1 << order); i++) 1460 for (i = 1; i < (1 << order); i++)
1461 set_page_refcounted(page + i); 1461 set_page_refcounted(page + i);
1462 } 1462 }
1463 EXPORT_SYMBOL_GPL(split_page); 1463 EXPORT_SYMBOL_GPL(split_page);
1464 1464
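/*
 * Editorial usage sketch: split_page() lets a caller hand out the
 * sub-pages of a non-compound higher-order allocation individually.
 */
static void example_split_and_free(void)
{
	unsigned int order = 2;
	struct page *page = alloc_pages(GFP_KERNEL, order);	/* no __GFP_COMP */
	int i;

	if (!page)
		return;

	split_page(page, order);		/* page[0..3] are now independent */
	for (i = 0; i < (1 << order); i++)
		__free_page(page + i);		/* each sub-page is freed on its own */
}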
1465 static int __isolate_free_page(struct page *page, unsigned int order) 1465 static int __isolate_free_page(struct page *page, unsigned int order)
1466 { 1466 {
1467 unsigned long watermark; 1467 unsigned long watermark;
1468 struct zone *zone; 1468 struct zone *zone;
1469 int mt; 1469 int mt;
1470 1470
1471 BUG_ON(!PageBuddy(page)); 1471 BUG_ON(!PageBuddy(page));
1472 1472
1473 zone = page_zone(page); 1473 zone = page_zone(page);
1474 mt = get_pageblock_migratetype(page); 1474 mt = get_pageblock_migratetype(page);
1475 1475
1476 if (!is_migrate_isolate(mt)) { 1476 if (!is_migrate_isolate(mt)) {
1477 /* Obey watermarks as if the page was being allocated */ 1477 /* Obey watermarks as if the page was being allocated */
1478 watermark = low_wmark_pages(zone) + (1 << order); 1478 watermark = low_wmark_pages(zone) + (1 << order);
1479 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1479 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1480 return 0; 1480 return 0;
1481 1481
1482 __mod_zone_freepage_state(zone, -(1UL << order), mt); 1482 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1483 } 1483 }
1484 1484
1485 /* Remove page from free list */ 1485 /* Remove page from free list */
1486 list_del(&page->lru); 1486 list_del(&page->lru);
1487 zone->free_area[order].nr_free--; 1487 zone->free_area[order].nr_free--;
1488 rmv_page_order(page); 1488 rmv_page_order(page);
1489 1489
1490 /* Set the pageblock if the isolated page is at least a pageblock */ 1490 /* Set the pageblock if the isolated page is at least a pageblock */
1491 if (order >= pageblock_order - 1) { 1491 if (order >= pageblock_order - 1) {
1492 struct page *endpage = page + (1 << order) - 1; 1492 struct page *endpage = page + (1 << order) - 1;
1493 for (; page < endpage; page += pageblock_nr_pages) { 1493 for (; page < endpage; page += pageblock_nr_pages) {
1494 int mt = get_pageblock_migratetype(page); 1494 int mt = get_pageblock_migratetype(page);
1495 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) 1495 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
1496 set_pageblock_migratetype(page, 1496 set_pageblock_migratetype(page,
1497 MIGRATE_MOVABLE); 1497 MIGRATE_MOVABLE);
1498 } 1498 }
1499 } 1499 }
1500 1500
1501 return 1UL << order; 1501 return 1UL << order;
1502 } 1502 }
1503 1503
1504 /* 1504 /*
1505 * Similar to split_page except the page is already free. As this is only 1505 * Similar to split_page except the page is already free. As this is only
1506 * being used for migration, the migratetype of the block also changes. 1506 * being used for migration, the migratetype of the block also changes.
1507 * As this is called with interrupts disabled, the caller is responsible 1507 * As this is called with interrupts disabled, the caller is responsible
1508 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1508 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1509 * are enabled. 1509 * are enabled.
1510 * 1510 *
1511 * Note: this is probably too low level an operation for use in drivers. 1511 * Note: this is probably too low level an operation for use in drivers.
1512 * Please consult with lkml before using this in your driver. 1512 * Please consult with lkml before using this in your driver.
1513 */ 1513 */
1514 int split_free_page(struct page *page) 1514 int split_free_page(struct page *page)
1515 { 1515 {
1516 unsigned int order; 1516 unsigned int order;
1517 int nr_pages; 1517 int nr_pages;
1518 1518
1519 order = page_order(page); 1519 order = page_order(page);
1520 1520
1521 nr_pages = __isolate_free_page(page, order); 1521 nr_pages = __isolate_free_page(page, order);
1522 if (!nr_pages) 1522 if (!nr_pages)
1523 return 0; 1523 return 0;
1524 1524
1525 /* Split into individual pages */ 1525 /* Split into individual pages */
1526 set_page_refcounted(page); 1526 set_page_refcounted(page);
1527 split_page(page, order); 1527 split_page(page, order);
1528 return nr_pages; 1528 return nr_pages;
1529 } 1529 }
1530 1530
1531 /* 1531 /*
1532 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1532 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1533 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1533 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1534 * or two. 1534 * or two.
1535 */ 1535 */
1536 static inline 1536 static inline
1537 struct page *buffered_rmqueue(struct zone *preferred_zone, 1537 struct page *buffered_rmqueue(struct zone *preferred_zone,
1538 struct zone *zone, unsigned int order, 1538 struct zone *zone, unsigned int order,
1539 gfp_t gfp_flags, int migratetype) 1539 gfp_t gfp_flags, int migratetype)
1540 { 1540 {
1541 unsigned long flags; 1541 unsigned long flags;
1542 struct page *page; 1542 struct page *page;
1543 bool cold = ((gfp_flags & __GFP_COLD) != 0); 1543 bool cold = ((gfp_flags & __GFP_COLD) != 0);
1544 1544
1545 again: 1545 again:
1546 if (likely(order == 0)) { 1546 if (likely(order == 0)) {
1547 struct per_cpu_pages *pcp; 1547 struct per_cpu_pages *pcp;
1548 struct list_head *list; 1548 struct list_head *list;
1549 1549
1550 local_irq_save(flags); 1550 local_irq_save(flags);
1551 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1551 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1552 list = &pcp->lists[migratetype]; 1552 list = &pcp->lists[migratetype];
1553 if (list_empty(list)) { 1553 if (list_empty(list)) {
1554 pcp->count += rmqueue_bulk(zone, 0, 1554 pcp->count += rmqueue_bulk(zone, 0,
1555 pcp->batch, list, 1555 pcp->batch, list,
1556 migratetype, cold); 1556 migratetype, cold);
1557 if (unlikely(list_empty(list))) 1557 if (unlikely(list_empty(list)))
1558 goto failed; 1558 goto failed;
1559 } 1559 }
1560 1560
1561 if (cold) 1561 if (cold)
1562 page = list_entry(list->prev, struct page, lru); 1562 page = list_entry(list->prev, struct page, lru);
1563 else 1563 else
1564 page = list_entry(list->next, struct page, lru); 1564 page = list_entry(list->next, struct page, lru);
1565 1565
1566 list_del(&page->lru); 1566 list_del(&page->lru);
1567 pcp->count--; 1567 pcp->count--;
1568 } else { 1568 } else {
1569 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1569 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1570 /* 1570 /*
1571 * __GFP_NOFAIL is not to be used in new code. 1571 * __GFP_NOFAIL is not to be used in new code.
1572 * 1572 *
1573 * All __GFP_NOFAIL callers should be fixed so that they 1573 * All __GFP_NOFAIL callers should be fixed so that they
1574 * properly detect and handle allocation failures. 1574 * properly detect and handle allocation failures.
1575 * 1575 *
1576 * We most definitely don't want callers attempting to 1576 * We most definitely don't want callers attempting to
1577 * allocate greater than order-1 page units with 1577 * allocate greater than order-1 page units with
1578 * __GFP_NOFAIL. 1578 * __GFP_NOFAIL.
1579 */ 1579 */
1580 WARN_ON_ONCE(order > 1); 1580 WARN_ON_ONCE(order > 1);
1581 } 1581 }
1582 spin_lock_irqsave(&zone->lock, flags); 1582 spin_lock_irqsave(&zone->lock, flags);
1583 page = __rmqueue(zone, order, migratetype); 1583 page = __rmqueue(zone, order, migratetype);
1584 spin_unlock(&zone->lock); 1584 spin_unlock(&zone->lock);
1585 if (!page) 1585 if (!page)
1586 goto failed; 1586 goto failed;
1587 __mod_zone_freepage_state(zone, -(1 << order), 1587 __mod_zone_freepage_state(zone, -(1 << order),
1588 get_freepage_migratetype(page)); 1588 get_freepage_migratetype(page));
1589 } 1589 }
1590 1590
1591 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1591 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1592 1592
1593 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1593 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1594 zone_statistics(preferred_zone, zone, gfp_flags); 1594 zone_statistics(preferred_zone, zone, gfp_flags);
1595 local_irq_restore(flags); 1595 local_irq_restore(flags);
1596 1596
1597 VM_BUG_ON(bad_range(zone, page)); 1597 VM_BUG_ON(bad_range(zone, page));
1598 if (prep_new_page(page, order, gfp_flags)) 1598 if (prep_new_page(page, order, gfp_flags))
1599 goto again; 1599 goto again;
1600 return page; 1600 return page;
1601 1601
1602 failed: 1602 failed:
1603 local_irq_restore(flags); 1603 local_irq_restore(flags);
1604 return NULL; 1604 return NULL;
1605 } 1605 }
1606 1606
1607 #ifdef CONFIG_FAIL_PAGE_ALLOC 1607 #ifdef CONFIG_FAIL_PAGE_ALLOC
1608 1608
1609 static struct { 1609 static struct {
1610 struct fault_attr attr; 1610 struct fault_attr attr;
1611 1611
1612 u32 ignore_gfp_highmem; 1612 u32 ignore_gfp_highmem;
1613 u32 ignore_gfp_wait; 1613 u32 ignore_gfp_wait;
1614 u32 min_order; 1614 u32 min_order;
1615 } fail_page_alloc = { 1615 } fail_page_alloc = {
1616 .attr = FAULT_ATTR_INITIALIZER, 1616 .attr = FAULT_ATTR_INITIALIZER,
1617 .ignore_gfp_wait = 1, 1617 .ignore_gfp_wait = 1,
1618 .ignore_gfp_highmem = 1, 1618 .ignore_gfp_highmem = 1,
1619 .min_order = 1, 1619 .min_order = 1,
1620 }; 1620 };
1621 1621
1622 static int __init setup_fail_page_alloc(char *str) 1622 static int __init setup_fail_page_alloc(char *str)
1623 { 1623 {
1624 return setup_fault_attr(&fail_page_alloc.attr, str); 1624 return setup_fault_attr(&fail_page_alloc.attr, str);
1625 } 1625 }
1626 __setup("fail_page_alloc=", setup_fail_page_alloc); 1626 __setup("fail_page_alloc=", setup_fail_page_alloc);
1627 1627
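/*
 * Editorial usage note: the boot parameter registered above takes the
 * common fault_attr string, i.e.
 *
 *	fail_page_alloc=<interval>,<probability>,<space>,<times>
 *
 * so, assuming the standard setup_fault_attr() semantics, something
 * like "fail_page_alloc=1,10,0,-1" would fail roughly 10% of eligible
 * allocations with no limit on the number of failures.
 */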
1628 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1628 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1629 { 1629 {
1630 if (order < fail_page_alloc.min_order) 1630 if (order < fail_page_alloc.min_order)
1631 return false; 1631 return false;
1632 if (gfp_mask & __GFP_NOFAIL) 1632 if (gfp_mask & __GFP_NOFAIL)
1633 return false; 1633 return false;
1634 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1634 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1635 return false; 1635 return false;
1636 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1636 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1637 return false; 1637 return false;
1638 1638
1639 return should_fail(&fail_page_alloc.attr, 1 << order); 1639 return should_fail(&fail_page_alloc.attr, 1 << order);
1640 } 1640 }
1641 1641
1642 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1642 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1643 1643
1644 static int __init fail_page_alloc_debugfs(void) 1644 static int __init fail_page_alloc_debugfs(void)
1645 { 1645 {
1646 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1646 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1647 struct dentry *dir; 1647 struct dentry *dir;
1648 1648
1649 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1649 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1650 &fail_page_alloc.attr); 1650 &fail_page_alloc.attr);
1651 if (IS_ERR(dir)) 1651 if (IS_ERR(dir))
1652 return PTR_ERR(dir); 1652 return PTR_ERR(dir);
1653 1653
1654 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1654 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1655 &fail_page_alloc.ignore_gfp_wait)) 1655 &fail_page_alloc.ignore_gfp_wait))
1656 goto fail; 1656 goto fail;
1657 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1657 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1658 &fail_page_alloc.ignore_gfp_highmem)) 1658 &fail_page_alloc.ignore_gfp_highmem))
1659 goto fail; 1659 goto fail;
1660 if (!debugfs_create_u32("min-order", mode, dir, 1660 if (!debugfs_create_u32("min-order", mode, dir,
1661 &fail_page_alloc.min_order)) 1661 &fail_page_alloc.min_order))
1662 goto fail; 1662 goto fail;
1663 1663
1664 return 0; 1664 return 0;
1665 fail: 1665 fail:
1666 debugfs_remove_recursive(dir); 1666 debugfs_remove_recursive(dir);
1667 1667
1668 return -ENOMEM; 1668 return -ENOMEM;
1669 } 1669 }
1670 1670
1671 late_initcall(fail_page_alloc_debugfs); 1671 late_initcall(fail_page_alloc_debugfs);
1672 1672
1673 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1673 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1674 1674
1675 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1675 #else /* CONFIG_FAIL_PAGE_ALLOC */
1676 1676
1677 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1677 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1678 { 1678 {
1679 return false; 1679 return false;
1680 } 1680 }
1681 1681
1682 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1682 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1683 1683
1684 /* 1684 /*
1685 * Return true if free pages are above 'mark'. This takes into account the order 1685 * Return true if free pages are above 'mark'. This takes into account the order
1686 * of the allocation. 1686 * of the allocation.
1687 */ 1687 */
1688 static bool __zone_watermark_ok(struct zone *z, unsigned int order, 1688 static bool __zone_watermark_ok(struct zone *z, unsigned int order,
1689 unsigned long mark, int classzone_idx, int alloc_flags, 1689 unsigned long mark, int classzone_idx, int alloc_flags,
1690 long free_pages) 1690 long free_pages)
1691 { 1691 {
1692 /* free_pages may go negative - that's OK */ 1692 /* free_pages may go negative - that's OK */
1693 long min = mark; 1693 long min = mark;
1694 int o; 1694 int o;
1695 long free_cma = 0; 1695 long free_cma = 0;
1696 1696
1697 free_pages -= (1 << order) - 1; 1697 free_pages -= (1 << order) - 1;
1698 if (alloc_flags & ALLOC_HIGH) 1698 if (alloc_flags & ALLOC_HIGH)
1699 min -= min / 2; 1699 min -= min / 2;
1700 if (alloc_flags & ALLOC_HARDER) 1700 if (alloc_flags & ALLOC_HARDER)
1701 min -= min / 4; 1701 min -= min / 4;
1702 #ifdef CONFIG_CMA 1702 #ifdef CONFIG_CMA
1703 /* If allocation can't use CMA areas don't use free CMA pages */ 1703 /* If allocation can't use CMA areas don't use free CMA pages */
1704 if (!(alloc_flags & ALLOC_CMA)) 1704 if (!(alloc_flags & ALLOC_CMA))
1705 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); 1705 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
1706 #endif 1706 #endif
1707 1707
1708 if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) 1708 if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
1709 return false; 1709 return false;
1710 for (o = 0; o < order; o++) { 1710 for (o = 0; o < order; o++) {
1711 /* At the next order, this order's pages become unavailable */ 1711 /* At the next order, this order's pages become unavailable */
1712 free_pages -= z->free_area[o].nr_free << o; 1712 free_pages -= z->free_area[o].nr_free << o;
1713 1713
1714 /* Require fewer higher order pages to be free */ 1714 /* Require fewer higher order pages to be free */
1715 min >>= 1; 1715 min >>= 1;
1716 1716
1717 if (free_pages <= min) 1717 if (free_pages <= min)
1718 return false; 1718 return false;
1719 } 1719 }
1720 return true; 1720 return true;
1721 } 1721 }
1722 1722
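/*
 * Worked example (editorial): an order-2 request checked against a mark
 * of 128 pages, assuming lowmem_reserve and free CMA pages are zero and
 * no ALLOC_HIGH/ALLOC_HARDER boost. The zone has 200 free pages: 100 at
 * order 0, 30 order-1 blocks (60 pages) and the rest at order >= 2.
 */
static void example_watermark_check(void)
{
	long free_pages = 200, min = 128;

	free_pages -= (1 << 2) - 1;	/* 197: discount the request itself */
	/* 197 > 128, so the base check passes; now walk the lower orders */

	free_pages -= 100;		/* o = 0: 97 pages usable for order >= 1 */
	min >>= 1;			/* 64 */

	free_pages -= 60;		/* o = 1: 37 pages usable for order >= 2 */
	min >>= 1;			/* 32 */

	/* 37 > 32, so __zone_watermark_ok() would return true */
	(void)free_pages;
	(void)min;
}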
1723 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 1723 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
1724 int classzone_idx, int alloc_flags) 1724 int classzone_idx, int alloc_flags)
1725 { 1725 {
1726 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1726 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1727 zone_page_state(z, NR_FREE_PAGES)); 1727 zone_page_state(z, NR_FREE_PAGES));
1728 } 1728 }
1729 1729
1730 bool zone_watermark_ok_safe(struct zone *z, unsigned int order, 1730 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
1731 unsigned long mark, int classzone_idx, int alloc_flags) 1731 unsigned long mark, int classzone_idx, int alloc_flags)
1732 { 1732 {
1733 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1733 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1734 1734
1735 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1735 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1736 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1736 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1737 1737
1738 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1738 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1739 free_pages); 1739 free_pages);
1740 } 1740 }
1741 1741
1742 #ifdef CONFIG_NUMA 1742 #ifdef CONFIG_NUMA
1743 /* 1743 /*
1744 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1744 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1745 * skip over zones that are not allowed by the cpuset, or that have 1745 * skip over zones that are not allowed by the cpuset, or that have
1746 * been recently (in the last second) found to be nearly full. See further 1746 * been recently (in the last second) found to be nearly full. See further
1747 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1747 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1748 * that have to skip over a lot of full or unallowed zones. 1748 * that have to skip over a lot of full or unallowed zones.
1749 * 1749 *
1750 * If the zonelist cache is present in the passed in zonelist, then 1750 * If the zonelist cache is present in the passed in zonelist, then
1751 * returns a pointer to the allowed node mask (either the current 1751 * returns a pointer to the allowed node mask (either the current
1752 * task's mems_allowed, or node_states[N_MEMORY].) 1752 * task's mems_allowed, or node_states[N_MEMORY].)
1753 * 1753 *
1754 * If the zonelist cache is not available for this zonelist, does 1754 * If the zonelist cache is not available for this zonelist, does
1755 * nothing and returns NULL. 1755 * nothing and returns NULL.
1756 * 1756 *
1757 * If the fullzones BITMAP in the zonelist cache is stale (more than 1757 * If the fullzones BITMAP in the zonelist cache is stale (more than
1758 * a second since last zap'd) then we zap it out (clear its bits.) 1758 * a second since last zap'd) then we zap it out (clear its bits.)
1759 * 1759 *
1760 * We hold off even calling zlc_setup, until after we've checked the 1760 * We hold off even calling zlc_setup, until after we've checked the
1761 * first zone in the zonelist, on the theory that most allocations will 1761 * first zone in the zonelist, on the theory that most allocations will
1762 * be satisfied from that first zone, so best to examine that zone as 1762 * be satisfied from that first zone, so best to examine that zone as
1763 * quickly as we can. 1763 * quickly as we can.
1764 */ 1764 */
1765 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1765 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1766 { 1766 {
1767 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1767 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1768 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1768 nodemask_t *allowednodes; /* zonelist_cache approximation */
1769 1769
1770 zlc = zonelist->zlcache_ptr; 1770 zlc = zonelist->zlcache_ptr;
1771 if (!zlc) 1771 if (!zlc)
1772 return NULL; 1772 return NULL;
1773 1773
1774 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1774 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1775 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1775 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1776 zlc->last_full_zap = jiffies; 1776 zlc->last_full_zap = jiffies;
1777 } 1777 }
1778 1778
1779 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1779 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1780 &cpuset_current_mems_allowed : 1780 &cpuset_current_mems_allowed :
1781 &node_states[N_MEMORY]; 1781 &node_states[N_MEMORY];
1782 return allowednodes; 1782 return allowednodes;
1783 } 1783 }
1784 1784
1785 /* 1785 /*
1786 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1786 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1787 * if it is worth looking at further for free memory: 1787 * if it is worth looking at further for free memory:
1788 * 1) Check that the zone isn't thought to be full (doesn't have its 1788 * 1) Check that the zone isn't thought to be full (doesn't have its
1789 * bit set in the zonelist_cache fullzones BITMAP). 1789 * bit set in the zonelist_cache fullzones BITMAP).
1790 * 2) Check that the zone's node (obtained from the zonelist_cache 1790 * 2) Check that the zone's node (obtained from the zonelist_cache
1791 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1791 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1792 * Return true (non-zero) if zone is worth looking at further, or 1792 * Return true (non-zero) if zone is worth looking at further, or
1793 * else return false (zero) if it is not. 1793 * else return false (zero) if it is not.
1794 * 1794 *
1795 * This check -ignores- the distinction between various watermarks, 1795 * This check -ignores- the distinction between various watermarks,
1796 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1796 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1797 * found to be full for any variation of these watermarks, it will 1797 * found to be full for any variation of these watermarks, it will
1798 * be considered full for up to one second by all requests, unless 1798 * be considered full for up to one second by all requests, unless
1799 * we are so low on memory on all allowed nodes that we are forced 1799 * we are so low on memory on all allowed nodes that we are forced
1800 * into the second scan of the zonelist. 1800 * into the second scan of the zonelist.
1801 * 1801 *
1802 * In the second scan we ignore this zonelist cache and exactly 1802 * In the second scan we ignore this zonelist cache and exactly
1803 * apply the watermarks to all zones, even if it is slower to do so. 1803 * apply the watermarks to all zones, even if it is slower to do so.
1804 * We are low on memory in the second scan, and should leave no stone 1804 * We are low on memory in the second scan, and should leave no stone
1805 * unturned looking for a free page. 1805 * unturned looking for a free page.
1806 */ 1806 */
1807 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1807 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1808 nodemask_t *allowednodes) 1808 nodemask_t *allowednodes)
1809 { 1809 {
1810 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1810 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1811 int i; /* index of *z in zonelist zones */ 1811 int i; /* index of *z in zonelist zones */
1812 int n; /* node that zone *z is on */ 1812 int n; /* node that zone *z is on */
1813 1813
1814 zlc = zonelist->zlcache_ptr; 1814 zlc = zonelist->zlcache_ptr;
1815 if (!zlc) 1815 if (!zlc)
1816 return 1; 1816 return 1;
1817 1817
1818 i = z - zonelist->_zonerefs; 1818 i = z - zonelist->_zonerefs;
1819 n = zlc->z_to_n[i]; 1819 n = zlc->z_to_n[i];
1820 1820
1821 /* This zone is worth trying if it is allowed but not full */ 1821 /* This zone is worth trying if it is allowed but not full */
1822 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1822 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1823 } 1823 }
1824 1824
1825 /* 1825 /*
1826 * Given 'z' scanning a zonelist, set the corresponding bit in 1826 * Given 'z' scanning a zonelist, set the corresponding bit in
1827 * zlc->fullzones, so that subsequent attempts to allocate a page 1827 * zlc->fullzones, so that subsequent attempts to allocate a page
1828 * from that zone don't waste time re-examining it. 1828 * from that zone don't waste time re-examining it.
1829 */ 1829 */
1830 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1830 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1831 { 1831 {
1832 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1832 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1833 int i; /* index of *z in zonelist zones */ 1833 int i; /* index of *z in zonelist zones */
1834 1834
1835 zlc = zonelist->zlcache_ptr; 1835 zlc = zonelist->zlcache_ptr;
1836 if (!zlc) 1836 if (!zlc)
1837 return; 1837 return;
1838 1838
1839 i = z - zonelist->_zonerefs; 1839 i = z - zonelist->_zonerefs;
1840 1840
1841 set_bit(i, zlc->fullzones); 1841 set_bit(i, zlc->fullzones);
1842 } 1842 }
1843 1843
1844 /* 1844 /*
1845 * clear all zones full, called after direct reclaim makes progress so that 1845 * clear all zones full, called after direct reclaim makes progress so that
1846 * a zone that was recently full is not skipped over for up to a second 1846 * a zone that was recently full is not skipped over for up to a second
1847 */ 1847 */
1848 static void zlc_clear_zones_full(struct zonelist *zonelist) 1848 static void zlc_clear_zones_full(struct zonelist *zonelist)
1849 { 1849 {
1850 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1850 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1851 1851
1852 zlc = zonelist->zlcache_ptr; 1852 zlc = zonelist->zlcache_ptr;
1853 if (!zlc) 1853 if (!zlc)
1854 return; 1854 return;
1855 1855
1856 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1856 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1857 } 1857 }
1858 1858
1859 static bool zone_local(struct zone *local_zone, struct zone *zone) 1859 static bool zone_local(struct zone *local_zone, struct zone *zone)
1860 { 1860 {
1861 return local_zone->node == zone->node; 1861 return local_zone->node == zone->node;
1862 } 1862 }
1863 1863
1864 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1864 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1865 { 1865 {
1866 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1866 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1867 } 1867 }
1868 1868
1869 static void __paginginit init_zone_allows_reclaim(int nid) 1869 static void __paginginit init_zone_allows_reclaim(int nid)
1870 { 1870 {
1871 int i; 1871 int i;
1872 1872
1873 for_each_node_state(i, N_MEMORY) 1873 for_each_node_state(i, N_MEMORY)
1874 if (node_distance(nid, i) <= RECLAIM_DISTANCE) 1874 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1875 node_set(i, NODE_DATA(nid)->reclaim_nodes); 1875 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1876 else 1876 else
1877 zone_reclaim_mode = 1; 1877 zone_reclaim_mode = 1;
1878 } 1878 }
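
A minimal userspace sketch of the loop above, assuming an invented three-node distance table and a RECLAIM_DISTANCE of 30: each memory node within the threshold is added to nid's reclaim_nodes mask, and meeting any farther node switches zone_reclaim_mode on. This is an illustration, not kernel code.

#include <stdio.h>

#define NR_NODES	 3
#define RECLAIM_DISTANCE 30

/* invented NUMA distance table for demonstration only */
static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40 },
	{ 20, 10, 40 },
	{ 40, 40, 10 },
};

int main(void)
{
	int zone_reclaim_mode = 0;

	for (int nid = 0; nid < NR_NODES; nid++) {
		unsigned int reclaim_nodes = 0;

		for (int i = 0; i < NR_NODES; i++) {
			if (node_distance[nid][i] <= RECLAIM_DISTANCE)
				reclaim_nodes |= 1u << i;	/* near enough */
			else
				zone_reclaim_mode = 1;		/* a far node exists */
		}
		printf("node %d reclaim_nodes mask: 0x%x\n", nid, reclaim_nodes);
	}
	printf("zone_reclaim_mode: %d\n", zone_reclaim_mode);
	return 0;
}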
1879 1879
1880 #else /* CONFIG_NUMA */ 1880 #else /* CONFIG_NUMA */
1881 1881
1882 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1882 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1883 { 1883 {
1884 return NULL; 1884 return NULL;
1885 } 1885 }
1886 1886
1887 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1887 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1888 nodemask_t *allowednodes) 1888 nodemask_t *allowednodes)
1889 { 1889 {
1890 return 1; 1890 return 1;
1891 } 1891 }
1892 1892
1893 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1893 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1894 { 1894 {
1895 } 1895 }
1896 1896
1897 static void zlc_clear_zones_full(struct zonelist *zonelist) 1897 static void zlc_clear_zones_full(struct zonelist *zonelist)
1898 { 1898 {
1899 } 1899 }
1900 1900
1901 static bool zone_local(struct zone *local_zone, struct zone *zone) 1901 static bool zone_local(struct zone *local_zone, struct zone *zone)
1902 { 1902 {
1903 return true; 1903 return true;
1904 } 1904 }
1905 1905
1906 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1906 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1907 { 1907 {
1908 return true; 1908 return true;
1909 } 1909 }
1910 1910
1911 static inline void init_zone_allows_reclaim(int nid) 1911 static inline void init_zone_allows_reclaim(int nid)
1912 { 1912 {
1913 } 1913 }
1914 #endif /* CONFIG_NUMA */ 1914 #endif /* CONFIG_NUMA */
1915 1915
1916 /* 1916 /*
1917 * get_page_from_freelist goes through the zonelist trying to allocate 1917 * get_page_from_freelist goes through the zonelist trying to allocate
1918 * a page. 1918 * a page.
1919 */ 1919 */
1920 static struct page * 1920 static struct page *
1921 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1921 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1922 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1922 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1923 struct zone *preferred_zone, int classzone_idx, int migratetype) 1923 struct zone *preferred_zone, int classzone_idx, int migratetype)
1924 { 1924 {
1925 struct zoneref *z; 1925 struct zoneref *z;
1926 struct page *page = NULL; 1926 struct page *page = NULL;
1927 struct zone *zone; 1927 struct zone *zone;
1928 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1928 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1929 int zlc_active = 0; /* set if using zonelist_cache */ 1929 int zlc_active = 0; /* set if using zonelist_cache */
1930 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1930 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1931 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && 1931 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
1932 (gfp_mask & __GFP_WRITE); 1932 (gfp_mask & __GFP_WRITE);
1933 1933
1934 zonelist_scan: 1934 zonelist_scan:
1935 /* 1935 /*
1936 * Scan zonelist, looking for a zone with enough free. 1936 * Scan zonelist, looking for a zone with enough free.
1937 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 1937 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
1938 */ 1938 */
1939 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1939 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1940 high_zoneidx, nodemask) { 1940 high_zoneidx, nodemask) {
1941 unsigned long mark; 1941 unsigned long mark;
1942 1942
1943 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1943 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1944 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1944 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1945 continue; 1945 continue;
1946 if (cpusets_enabled() && 1946 if (cpusets_enabled() &&
1947 (alloc_flags & ALLOC_CPUSET) && 1947 (alloc_flags & ALLOC_CPUSET) &&
1948 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1948 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1949 continue; 1949 continue;
1950 /* 1950 /*
1951 * Distribute pages in proportion to the individual 1951 * Distribute pages in proportion to the individual
1952 * zone size to ensure fair page aging. The zone a 1952 * zone size to ensure fair page aging. The zone a
1953 * page was allocated in should have no effect on the 1953 * page was allocated in should have no effect on the
1954 * time the page has in memory before being reclaimed. 1954 * time the page has in memory before being reclaimed.
1955 */ 1955 */
1956 if (alloc_flags & ALLOC_FAIR) { 1956 if (alloc_flags & ALLOC_FAIR) {
1957 if (!zone_local(preferred_zone, zone)) 1957 if (!zone_local(preferred_zone, zone))
1958 continue; 1958 break;
1959 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) 1959 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1960 continue; 1960 continue;
1961 } 1961 }
1962 /* 1962 /*
1963 * When allocating a page cache page for writing, we 1963 * When allocating a page cache page for writing, we
1964 * want to get it from a zone that is within its dirty 1964 * want to get it from a zone that is within its dirty
1965 * limit, such that no single zone holds more than its 1965 * limit, such that no single zone holds more than its
1966 * proportional share of globally allowed dirty pages. 1966 * proportional share of globally allowed dirty pages.
1967 * The dirty limits take into account the zone's 1967 * The dirty limits take into account the zone's
1968 * lowmem reserves and high watermark so that kswapd 1968 * lowmem reserves and high watermark so that kswapd
1969 * should be able to balance it without having to 1969 * should be able to balance it without having to
1970 * write pages from its LRU list. 1970 * write pages from its LRU list.
1971 * 1971 *
1972 * This may look like it could increase pressure on 1972 * This may look like it could increase pressure on
1973 * lower zones by failing allocations in higher zones 1973 * lower zones by failing allocations in higher zones
1974 * before they are full. But the pages that do spill 1974 * before they are full. But the pages that do spill
1975 * over are limited as the lower zones are protected 1975 * over are limited as the lower zones are protected
1976 * by this very same mechanism. It should not become 1976 * by this very same mechanism. It should not become
1977 * a practical burden to them. 1977 * a practical burden to them.
1978 * 1978 *
1979 * XXX: For now, allow allocations to potentially 1979 * XXX: For now, allow allocations to potentially
1980 * exceed the per-zone dirty limit in the slowpath 1980 * exceed the per-zone dirty limit in the slowpath
1981 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1981 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1982 * which is important when on a NUMA setup the allowed 1982 * which is important when on a NUMA setup the allowed
1983 * zones are together not big enough to reach the 1983 * zones are together not big enough to reach the
1984 * global limit. The proper fix for these situations 1984 * global limit. The proper fix for these situations
1985 * will require awareness of zones in the 1985 * will require awareness of zones in the
1986 * dirty-throttling and the flusher threads. 1986 * dirty-throttling and the flusher threads.
1987 */ 1987 */
1988 if (consider_zone_dirty && !zone_dirty_ok(zone)) 1988 if (consider_zone_dirty && !zone_dirty_ok(zone))
1989 continue; 1989 continue;
1990 1990
1991 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1991 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1992 if (!zone_watermark_ok(zone, order, mark, 1992 if (!zone_watermark_ok(zone, order, mark,
1993 classzone_idx, alloc_flags)) { 1993 classzone_idx, alloc_flags)) {
1994 int ret; 1994 int ret;
1995 1995
1996 /* Checked here to keep the fast path fast */ 1996 /* Checked here to keep the fast path fast */
1997 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1997 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1998 if (alloc_flags & ALLOC_NO_WATERMARKS) 1998 if (alloc_flags & ALLOC_NO_WATERMARKS)
1999 goto try_this_zone; 1999 goto try_this_zone;
2000 2000
2001 if (IS_ENABLED(CONFIG_NUMA) && 2001 if (IS_ENABLED(CONFIG_NUMA) &&
2002 !did_zlc_setup && nr_online_nodes > 1) { 2002 !did_zlc_setup && nr_online_nodes > 1) {
2003 /* 2003 /*
2004 * we do zlc_setup if there are multiple nodes 2004 * we do zlc_setup if there are multiple nodes
2005 * and before considering the first zone allowed 2005 * and before considering the first zone allowed
2006 * by the cpuset. 2006 * by the cpuset.
2007 */ 2007 */
2008 allowednodes = zlc_setup(zonelist, alloc_flags); 2008 allowednodes = zlc_setup(zonelist, alloc_flags);
2009 zlc_active = 1; 2009 zlc_active = 1;
2010 did_zlc_setup = 1; 2010 did_zlc_setup = 1;
2011 } 2011 }
2012 2012
2013 if (zone_reclaim_mode == 0 || 2013 if (zone_reclaim_mode == 0 ||
2014 !zone_allows_reclaim(preferred_zone, zone)) 2014 !zone_allows_reclaim(preferred_zone, zone))
2015 goto this_zone_full; 2015 goto this_zone_full;
2016 2016
2017 /* 2017 /*
2018 * As we may have just activated ZLC, check if the first 2018 * As we may have just activated ZLC, check if the first
2019 * eligible zone has failed zone_reclaim recently. 2019 * eligible zone has failed zone_reclaim recently.
2020 */ 2020 */
2021 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 2021 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
2022 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 2022 !zlc_zone_worth_trying(zonelist, z, allowednodes))
2023 continue; 2023 continue;
2024 2024
2025 ret = zone_reclaim(zone, gfp_mask, order); 2025 ret = zone_reclaim(zone, gfp_mask, order);
2026 switch (ret) { 2026 switch (ret) {
2027 case ZONE_RECLAIM_NOSCAN: 2027 case ZONE_RECLAIM_NOSCAN:
2028 /* did not scan */ 2028 /* did not scan */
2029 continue; 2029 continue;
2030 case ZONE_RECLAIM_FULL: 2030 case ZONE_RECLAIM_FULL:
2031 /* scanned but unreclaimable */ 2031 /* scanned but unreclaimable */
2032 continue; 2032 continue;
2033 default: 2033 default:
2034 /* did we reclaim enough */ 2034 /* did we reclaim enough */
2035 if (zone_watermark_ok(zone, order, mark, 2035 if (zone_watermark_ok(zone, order, mark,
2036 classzone_idx, alloc_flags)) 2036 classzone_idx, alloc_flags))
2037 goto try_this_zone; 2037 goto try_this_zone;
2038 2038
2039 /* 2039 /*
2040 * Failed to reclaim enough to meet watermark. 2040 * Failed to reclaim enough to meet watermark.
2041 * Only mark the zone full if checking the min 2041 * Only mark the zone full if checking the min
2042 * watermark or if we failed to reclaim just 2042 * watermark or if we failed to reclaim just
2043 * 1<<order pages or else the page allocator 2043 * 1<<order pages or else the page allocator
2044 * fastpath will prematurely mark zones full 2044 * fastpath will prematurely mark zones full
2045 * when the watermark is between the low and 2045 * when the watermark is between the low and
2046 * min watermarks. 2046 * min watermarks.
2047 */ 2047 */
2048 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || 2048 if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
2049 ret == ZONE_RECLAIM_SOME) 2049 ret == ZONE_RECLAIM_SOME)
2050 goto this_zone_full; 2050 goto this_zone_full;
2051 2051
2052 continue; 2052 continue;
2053 } 2053 }
2054 } 2054 }
2055 2055
2056 try_this_zone: 2056 try_this_zone:
2057 page = buffered_rmqueue(preferred_zone, zone, order, 2057 page = buffered_rmqueue(preferred_zone, zone, order,
2058 gfp_mask, migratetype); 2058 gfp_mask, migratetype);
2059 if (page) 2059 if (page)
2060 break; 2060 break;
2061 this_zone_full: 2061 this_zone_full:
2062 if (IS_ENABLED(CONFIG_NUMA) && zlc_active) 2062 if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
2063 zlc_mark_zone_full(zonelist, z); 2063 zlc_mark_zone_full(zonelist, z);
2064 } 2064 }
2065 2065
2066 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { 2066 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
2067 /* Disable zlc cache for second zonelist scan */ 2067 /* Disable zlc cache for second zonelist scan */
2068 zlc_active = 0; 2068 zlc_active = 0;
2069 goto zonelist_scan; 2069 goto zonelist_scan;
2070 } 2070 }
2071 2071
2072 if (page) 2072 if (page)
2073 /* 2073 /*
2074 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was 2074 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2075 * necessary to allocate the page. The expectation is 2075 * necessary to allocate the page. The expectation is
2076 * that the caller is taking steps that will free more 2076 * that the caller is taking steps that will free more
2077 * memory. The caller should avoid the page being used 2077 * memory. The caller should avoid the page being used
2078 * for !PFMEMALLOC purposes. 2078 * for !PFMEMALLOC purposes.
2079 */ 2079 */
2080 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 2080 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2081 2081
2082 return page; 2082 return page;
2083 } 2083 }
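
The one functional change in this hunk is the ALLOC_FAIR test near the top of the loop above: a zone on a remote node now ends the fair pass with break rather than being skipped with continue, so the pass never walks past remote entries to reach later zones in the list. A minimal userspace sketch of that control-flow difference follows; the three-entry zonelist and batch counts are invented, and none of this is kernel code.

#include <stdio.h>
#include <stdbool.h>

struct zone { const char *name; int node; int alloc_batch; };

static bool zone_local(const struct zone *preferred, const struct zone *z)
{
	return preferred->node == z->node;
}

/* Fair pass: returns the chosen zone, or NULL when the pass gives up. */
static const struct zone *fair_pass(const struct zone *zl, int nr,
				    const struct zone *preferred,
				    bool break_on_remote)
{
	for (int i = 0; i < nr; i++) {
		if (!zone_local(preferred, &zl[i])) {
			if (break_on_remote)
				break;		/* new behaviour: abort pass */
			continue;		/* old behaviour: skip zone  */
		}
		if (zl[i].alloc_batch <= 0)
			continue;		/* fair share used up */
		return &zl[i];
	}
	return NULL;	/* caller retries the zonelist without ALLOC_FAIR */
}

int main(void)
{
	/* invented zonelist: local Normal exhausted, remote Normal, local DMA32 */
	const struct zone zl[] = {
		{ "node0/Normal", 0, 0  },
		{ "node1/Normal", 1, 64 },
		{ "node0/DMA32",  0, 64 },
	};
	const struct zone *old = fair_pass(zl, 3, &zl[0], false);
	const struct zone *new = fair_pass(zl, 3, &zl[0], true);

	printf("continue: fair pass picks %s\n", old ? old->name : "(nothing)");
	printf("break:    fair pass picks %s\n", new ? new->name : "(nothing)");
	return 0;
}

With continue, the pass skips the remote Normal zone and lands on the later local DMA32 entry; with break it bails out, leaving the choice to the subsequent non-fair scan over the full zonelist.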
2084 2084
2085 /* 2085 /*
2086 * Large machines with many possible nodes should not always dump per-node 2086 * Large machines with many possible nodes should not always dump per-node
2087 * meminfo in irq context. 2087 * meminfo in irq context.
2088 */ 2088 */
2089 static inline bool should_suppress_show_mem(void) 2089 static inline bool should_suppress_show_mem(void)
2090 { 2090 {
2091 bool ret = false; 2091 bool ret = false;
2092 2092
2093 #if NODES_SHIFT > 8 2093 #if NODES_SHIFT > 8
2094 ret = in_interrupt(); 2094 ret = in_interrupt();
2095 #endif 2095 #endif
2096 return ret; 2096 return ret;
2097 } 2097 }
2098 2098
2099 static DEFINE_RATELIMIT_STATE(nopage_rs, 2099 static DEFINE_RATELIMIT_STATE(nopage_rs,
2100 DEFAULT_RATELIMIT_INTERVAL, 2100 DEFAULT_RATELIMIT_INTERVAL,
2101 DEFAULT_RATELIMIT_BURST); 2101 DEFAULT_RATELIMIT_BURST);
2102 2102
2103 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 2103 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
2104 { 2104 {
2105 unsigned int filter = SHOW_MEM_FILTER_NODES; 2105 unsigned int filter = SHOW_MEM_FILTER_NODES;
2106 2106
2107 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 2107 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
2108 debug_guardpage_minorder() > 0) 2108 debug_guardpage_minorder() > 0)
2109 return; 2109 return;
2110 2110
2111 /* 2111 /*
2112 * Walking all memory to count page types is very expensive and should 2112 * Walking all memory to count page types is very expensive and should
2113 * be inhibited in non-blockable contexts. 2113 * be inhibited in non-blockable contexts.
2114 */ 2114 */
2115 if (!(gfp_mask & __GFP_WAIT)) 2115 if (!(gfp_mask & __GFP_WAIT))
2116 filter |= SHOW_MEM_FILTER_PAGE_COUNT; 2116 filter |= SHOW_MEM_FILTER_PAGE_COUNT;
2117 2117
2118 /* 2118 /*
2119 * This documents exceptions given to allocations in certain 2119 * This documents exceptions given to allocations in certain
2120 * contexts that are allowed to allocate outside current's set 2120 * contexts that are allowed to allocate outside current's set
2121 * of allowed nodes. 2121 * of allowed nodes.
2122 */ 2122 */
2123 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2123 if (!(gfp_mask & __GFP_NOMEMALLOC))
2124 if (test_thread_flag(TIF_MEMDIE) || 2124 if (test_thread_flag(TIF_MEMDIE) ||
2125 (current->flags & (PF_MEMALLOC | PF_EXITING))) 2125 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2126 filter &= ~SHOW_MEM_FILTER_NODES; 2126 filter &= ~SHOW_MEM_FILTER_NODES;
2127 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 2127 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
2128 filter &= ~SHOW_MEM_FILTER_NODES; 2128 filter &= ~SHOW_MEM_FILTER_NODES;
2129 2129
2130 if (fmt) { 2130 if (fmt) {
2131 struct va_format vaf; 2131 struct va_format vaf;
2132 va_list args; 2132 va_list args;
2133 2133
2134 va_start(args, fmt); 2134 va_start(args, fmt);
2135 2135
2136 vaf.fmt = fmt; 2136 vaf.fmt = fmt;
2137 vaf.va = &args; 2137 vaf.va = &args;
2138 2138
2139 pr_warn("%pV", &vaf); 2139 pr_warn("%pV", &vaf);
2140 2140
2141 va_end(args); 2141 va_end(args);
2142 } 2142 }
2143 2143
2144 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 2144 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
2145 current->comm, order, gfp_mask); 2145 current->comm, order, gfp_mask);
2146 2146
2147 dump_stack(); 2147 dump_stack();
2148 if (!should_suppress_show_mem()) 2148 if (!should_suppress_show_mem())
2149 show_mem(filter); 2149 show_mem(filter);
2150 } 2150 }
2151 2151
2152 static inline int 2152 static inline int
2153 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 2153 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2154 unsigned long did_some_progress, 2154 unsigned long did_some_progress,
2155 unsigned long pages_reclaimed) 2155 unsigned long pages_reclaimed)
2156 { 2156 {
2157 /* Do not loop if specifically requested */ 2157 /* Do not loop if specifically requested */
2158 if (gfp_mask & __GFP_NORETRY) 2158 if (gfp_mask & __GFP_NORETRY)
2159 return 0; 2159 return 0;
2160 2160
2161 /* Always retry if specifically requested */ 2161 /* Always retry if specifically requested */
2162 if (gfp_mask & __GFP_NOFAIL) 2162 if (gfp_mask & __GFP_NOFAIL)
2163 return 1; 2163 return 1;
2164 2164
2165 /* 2165 /*
2166 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 2166 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2167 * making forward progress without invoking OOM. Suspend also disables 2167 * making forward progress without invoking OOM. Suspend also disables
2168 * storage devices so kswapd will not help. Bail if we are suspending. 2168 * storage devices so kswapd will not help. Bail if we are suspending.
2169 */ 2169 */
2170 if (!did_some_progress && pm_suspended_storage()) 2170 if (!did_some_progress && pm_suspended_storage())
2171 return 0; 2171 return 0;
2172 2172
2173 /* 2173 /*
2174 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 2174 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2175 * means __GFP_NOFAIL, but that may not be true in other 2175 * means __GFP_NOFAIL, but that may not be true in other
2176 * implementations. 2176 * implementations.
2177 */ 2177 */
2178 if (order <= PAGE_ALLOC_COSTLY_ORDER) 2178 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2179 return 1; 2179 return 1;
2180 2180
2181 /* 2181 /*
2182 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 2182 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2183 * specified, then we retry until we no longer reclaim any pages 2183 * specified, then we retry until we no longer reclaim any pages
2184 * (above), or we've reclaimed an order of pages at least as 2184 * (above), or we've reclaimed an order of pages at least as
2185 * large as the allocation's order. In both cases, if the 2185 * large as the allocation's order. In both cases, if the
2186 * allocation still fails, we stop retrying. 2186 * allocation still fails, we stop retrying.
2187 */ 2187 */
2188 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2188 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2189 return 1; 2189 return 1;
2190 2190
2191 return 0; 2191 return 0;
2192 } 2192 }
2193 2193
2194 static inline struct page * 2194 static inline struct page *
2195 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2195 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2196 struct zonelist *zonelist, enum zone_type high_zoneidx, 2196 struct zonelist *zonelist, enum zone_type high_zoneidx,
2197 nodemask_t *nodemask, struct zone *preferred_zone, 2197 nodemask_t *nodemask, struct zone *preferred_zone,
2198 int classzone_idx, int migratetype) 2198 int classzone_idx, int migratetype)
2199 { 2199 {
2200 struct page *page; 2200 struct page *page;
2201 2201
2202 /* Acquire the OOM killer lock for the zones in zonelist */ 2202 /* Acquire the OOM killer lock for the zones in zonelist */
2203 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2203 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2204 schedule_timeout_uninterruptible(1); 2204 schedule_timeout_uninterruptible(1);
2205 return NULL; 2205 return NULL;
2206 } 2206 }
2207 2207
2208 /* 2208 /*
2209 * Go through the zonelist yet one more time, keep very high watermark 2209 * Go through the zonelist yet one more time, keep very high watermark
2210 * here, this is only to catch a parallel oom killing, we must fail if 2210 * here, this is only to catch a parallel oom killing, we must fail if
2211 * we're still under heavy pressure. 2211 * we're still under heavy pressure.
2212 */ 2212 */
2213 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2213 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2214 order, zonelist, high_zoneidx, 2214 order, zonelist, high_zoneidx,
2215 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2215 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2216 preferred_zone, classzone_idx, migratetype); 2216 preferred_zone, classzone_idx, migratetype);
2217 if (page) 2217 if (page)
2218 goto out; 2218 goto out;
2219 2219
2220 if (!(gfp_mask & __GFP_NOFAIL)) { 2220 if (!(gfp_mask & __GFP_NOFAIL)) {
2221 /* The OOM killer will not help higher order allocs */ 2221 /* The OOM killer will not help higher order allocs */
2222 if (order > PAGE_ALLOC_COSTLY_ORDER) 2222 if (order > PAGE_ALLOC_COSTLY_ORDER)
2223 goto out; 2223 goto out;
2224 /* The OOM killer does not needlessly kill tasks for lowmem */ 2224 /* The OOM killer does not needlessly kill tasks for lowmem */
2225 if (high_zoneidx < ZONE_NORMAL) 2225 if (high_zoneidx < ZONE_NORMAL)
2226 goto out; 2226 goto out;
2227 /* 2227 /*
2228 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2228 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2229 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2229 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2230 * The caller should handle page allocation failure by itself if 2230 * The caller should handle page allocation failure by itself if
2231 * it specifies __GFP_THISNODE. 2231 * it specifies __GFP_THISNODE.
2232 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2232 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2233 */ 2233 */
2234 if (gfp_mask & __GFP_THISNODE) 2234 if (gfp_mask & __GFP_THISNODE)
2235 goto out; 2235 goto out;
2236 } 2236 }
2237 /* Exhausted what can be done so it's blamo time */ 2237 /* Exhausted what can be done so it's blamo time */
2238 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2238 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2239 2239
2240 out: 2240 out:
2241 clear_zonelist_oom(zonelist, gfp_mask); 2241 clear_zonelist_oom(zonelist, gfp_mask);
2242 return page; 2242 return page;
2243 } 2243 }
2244 2244
2245 #ifdef CONFIG_COMPACTION 2245 #ifdef CONFIG_COMPACTION
2246 /* Try memory compaction for high-order allocations before reclaim */ 2246 /* Try memory compaction for high-order allocations before reclaim */
2247 static struct page * 2247 static struct page *
2248 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2248 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2249 struct zonelist *zonelist, enum zone_type high_zoneidx, 2249 struct zonelist *zonelist, enum zone_type high_zoneidx,
2250 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2250 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2251 int classzone_idx, int migratetype, enum migrate_mode mode, 2251 int classzone_idx, int migratetype, enum migrate_mode mode,
2252 bool *contended_compaction, bool *deferred_compaction, 2252 bool *contended_compaction, bool *deferred_compaction,
2253 unsigned long *did_some_progress) 2253 unsigned long *did_some_progress)
2254 { 2254 {
2255 if (!order) 2255 if (!order)
2256 return NULL; 2256 return NULL;
2257 2257
2258 if (compaction_deferred(preferred_zone, order)) { 2258 if (compaction_deferred(preferred_zone, order)) {
2259 *deferred_compaction = true; 2259 *deferred_compaction = true;
2260 return NULL; 2260 return NULL;
2261 } 2261 }
2262 2262
2263 current->flags |= PF_MEMALLOC; 2263 current->flags |= PF_MEMALLOC;
2264 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2264 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2265 nodemask, mode, 2265 nodemask, mode,
2266 contended_compaction); 2266 contended_compaction);
2267 current->flags &= ~PF_MEMALLOC; 2267 current->flags &= ~PF_MEMALLOC;
2268 2268
2269 if (*did_some_progress != COMPACT_SKIPPED) { 2269 if (*did_some_progress != COMPACT_SKIPPED) {
2270 struct page *page; 2270 struct page *page;
2271 2271
2272 /* Page migration frees to the PCP lists but we want merging */ 2272 /* Page migration frees to the PCP lists but we want merging */
2273 drain_pages(get_cpu()); 2273 drain_pages(get_cpu());
2274 put_cpu(); 2274 put_cpu();
2275 2275
2276 page = get_page_from_freelist(gfp_mask, nodemask, 2276 page = get_page_from_freelist(gfp_mask, nodemask,
2277 order, zonelist, high_zoneidx, 2277 order, zonelist, high_zoneidx,
2278 alloc_flags & ~ALLOC_NO_WATERMARKS, 2278 alloc_flags & ~ALLOC_NO_WATERMARKS,
2279 preferred_zone, classzone_idx, migratetype); 2279 preferred_zone, classzone_idx, migratetype);
2280 if (page) { 2280 if (page) {
2281 preferred_zone->compact_blockskip_flush = false; 2281 preferred_zone->compact_blockskip_flush = false;
2282 compaction_defer_reset(preferred_zone, order, true); 2282 compaction_defer_reset(preferred_zone, order, true);
2283 count_vm_event(COMPACTSUCCESS); 2283 count_vm_event(COMPACTSUCCESS);
2284 return page; 2284 return page;
2285 } 2285 }
2286 2286
2287 /* 2287 /*
2288 * It's bad if a compaction run occurs and fails. 2288 * It's bad if a compaction run occurs and fails.
2289 * The most likely reason is that pages exist, 2289 * The most likely reason is that pages exist,
2290 * but not enough to satisfy watermarks. 2290 * but not enough to satisfy watermarks.
2291 */ 2291 */
2292 count_vm_event(COMPACTFAIL); 2292 count_vm_event(COMPACTFAIL);
2293 2293
2294 /* 2294 /*
2295 * As async compaction considers a subset of pageblocks, only 2295 * As async compaction considers a subset of pageblocks, only
2296 * defer if the failure was a sync compaction failure. 2296 * defer if the failure was a sync compaction failure.
2297 */ 2297 */
2298 if (mode != MIGRATE_ASYNC) 2298 if (mode != MIGRATE_ASYNC)
2299 defer_compaction(preferred_zone, order); 2299 defer_compaction(preferred_zone, order);
2300 2300
2301 cond_resched(); 2301 cond_resched();
2302 } 2302 }
2303 2303
2304 return NULL; 2304 return NULL;
2305 } 2305 }
2306 #else 2306 #else
2307 static inline struct page * 2307 static inline struct page *
2308 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2308 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2309 struct zonelist *zonelist, enum zone_type high_zoneidx, 2309 struct zonelist *zonelist, enum zone_type high_zoneidx,
2310 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2310 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2311 int classzone_idx, int migratetype, 2311 int classzone_idx, int migratetype,
2312 enum migrate_mode mode, bool *contended_compaction, 2312 enum migrate_mode mode, bool *contended_compaction,
2313 bool *deferred_compaction, unsigned long *did_some_progress) 2313 bool *deferred_compaction, unsigned long *did_some_progress)
2314 { 2314 {
2315 return NULL; 2315 return NULL;
2316 } 2316 }
2317 #endif /* CONFIG_COMPACTION */ 2317 #endif /* CONFIG_COMPACTION */
2318 2318
2319 /* Perform direct synchronous page reclaim */ 2319 /* Perform direct synchronous page reclaim */
2320 static int 2320 static int
2321 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2321 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2322 nodemask_t *nodemask) 2322 nodemask_t *nodemask)
2323 { 2323 {
2324 struct reclaim_state reclaim_state; 2324 struct reclaim_state reclaim_state;
2325 int progress; 2325 int progress;
2326 2326
2327 cond_resched(); 2327 cond_resched();
2328 2328
2329 /* We now go into synchronous reclaim */ 2329 /* We now go into synchronous reclaim */
2330 cpuset_memory_pressure_bump(); 2330 cpuset_memory_pressure_bump();
2331 current->flags |= PF_MEMALLOC; 2331 current->flags |= PF_MEMALLOC;
2332 lockdep_set_current_reclaim_state(gfp_mask); 2332 lockdep_set_current_reclaim_state(gfp_mask);
2333 reclaim_state.reclaimed_slab = 0; 2333 reclaim_state.reclaimed_slab = 0;
2334 current->reclaim_state = &reclaim_state; 2334 current->reclaim_state = &reclaim_state;
2335 2335
2336 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2336 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2337 2337
2338 current->reclaim_state = NULL; 2338 current->reclaim_state = NULL;
2339 lockdep_clear_current_reclaim_state(); 2339 lockdep_clear_current_reclaim_state();
2340 current->flags &= ~PF_MEMALLOC; 2340 current->flags &= ~PF_MEMALLOC;
2341 2341
2342 cond_resched(); 2342 cond_resched();
2343 2343
2344 return progress; 2344 return progress;
2345 } 2345 }
2346 2346
2347 /* The really slow allocator path where we enter direct reclaim */ 2347 /* The really slow allocator path where we enter direct reclaim */
2348 static inline struct page * 2348 static inline struct page *
2349 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2349 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2350 struct zonelist *zonelist, enum zone_type high_zoneidx, 2350 struct zonelist *zonelist, enum zone_type high_zoneidx,
2351 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2351 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2352 int classzone_idx, int migratetype, unsigned long *did_some_progress) 2352 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2353 { 2353 {
2354 struct page *page = NULL; 2354 struct page *page = NULL;
2355 bool drained = false; 2355 bool drained = false;
2356 2356
2357 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2357 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2358 nodemask); 2358 nodemask);
2359 if (unlikely(!(*did_some_progress))) 2359 if (unlikely(!(*did_some_progress)))
2360 return NULL; 2360 return NULL;
2361 2361
2362 /* After successful reclaim, reconsider all zones for allocation */ 2362 /* After successful reclaim, reconsider all zones for allocation */
2363 if (IS_ENABLED(CONFIG_NUMA)) 2363 if (IS_ENABLED(CONFIG_NUMA))
2364 zlc_clear_zones_full(zonelist); 2364 zlc_clear_zones_full(zonelist);
2365 2365
2366 retry: 2366 retry:
2367 page = get_page_from_freelist(gfp_mask, nodemask, order, 2367 page = get_page_from_freelist(gfp_mask, nodemask, order,
2368 zonelist, high_zoneidx, 2368 zonelist, high_zoneidx,
2369 alloc_flags & ~ALLOC_NO_WATERMARKS, 2369 alloc_flags & ~ALLOC_NO_WATERMARKS,
2370 preferred_zone, classzone_idx, 2370 preferred_zone, classzone_idx,
2371 migratetype); 2371 migratetype);
2372 2372
2373 /* 2373 /*
2374 * If an allocation failed after direct reclaim, it could be because 2374 * If an allocation failed after direct reclaim, it could be because
2375 * pages are pinned on the per-cpu lists. Drain them and try again 2375 * pages are pinned on the per-cpu lists. Drain them and try again
2376 */ 2376 */
2377 if (!page && !drained) { 2377 if (!page && !drained) {
2378 drain_all_pages(); 2378 drain_all_pages();
2379 drained = true; 2379 drained = true;
2380 goto retry; 2380 goto retry;
2381 } 2381 }
2382 2382
2383 return page; 2383 return page;
2384 } 2384 }
2385 2385
2386 /* 2386 /*
2387 * This is called in the allocator slow-path if the allocation request is of 2387 * This is called in the allocator slow-path if the allocation request is of
2388 * sufficient urgency to ignore watermarks and take other desperate measures 2388 * sufficient urgency to ignore watermarks and take other desperate measures
2389 */ 2389 */
2390 static inline struct page * 2390 static inline struct page *
2391 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2391 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2392 struct zonelist *zonelist, enum zone_type high_zoneidx, 2392 struct zonelist *zonelist, enum zone_type high_zoneidx,
2393 nodemask_t *nodemask, struct zone *preferred_zone, 2393 nodemask_t *nodemask, struct zone *preferred_zone,
2394 int classzone_idx, int migratetype) 2394 int classzone_idx, int migratetype)
2395 { 2395 {
2396 struct page *page; 2396 struct page *page;
2397 2397
2398 do { 2398 do {
2399 page = get_page_from_freelist(gfp_mask, nodemask, order, 2399 page = get_page_from_freelist(gfp_mask, nodemask, order,
2400 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2400 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2401 preferred_zone, classzone_idx, migratetype); 2401 preferred_zone, classzone_idx, migratetype);
2402 2402
2403 if (!page && gfp_mask & __GFP_NOFAIL) 2403 if (!page && gfp_mask & __GFP_NOFAIL)
2404 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2404 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2405 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2405 } while (!page && (gfp_mask & __GFP_NOFAIL));
2406 2406
2407 return page; 2407 return page;
2408 } 2408 }
2409 2409
2410 static void reset_alloc_batches(struct zonelist *zonelist, 2410 static void reset_alloc_batches(struct zonelist *zonelist,
2411 enum zone_type high_zoneidx, 2411 enum zone_type high_zoneidx,
2412 struct zone *preferred_zone) 2412 struct zone *preferred_zone)
2413 { 2413 {
2414 struct zoneref *z; 2414 struct zoneref *z;
2415 struct zone *zone; 2415 struct zone *zone;
2416 2416
2417 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 2417 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2418 /* 2418 /*
2419 * Only reset the batches of zones that were actually 2419 * Only reset the batches of zones that were actually
2420 * considered in the fairness pass, we don't want to 2420 * considered in the fairness pass, we don't want to
2421 * trash fairness information for zones that are not 2421 * trash fairness information for zones that are not
2422 * actually part of this zonelist's round-robin cycle. 2422 * actually part of this zonelist's round-robin cycle.
2423 */ 2423 */
2424 if (!zone_local(preferred_zone, zone)) 2424 if (!zone_local(preferred_zone, zone))
2425 continue; 2425 continue;
2426 mod_zone_page_state(zone, NR_ALLOC_BATCH, 2426 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2427 high_wmark_pages(zone) - low_wmark_pages(zone) - 2427 high_wmark_pages(zone) - low_wmark_pages(zone) -
2428 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); 2428 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
2429 } 2429 }
2430 } 2430 }
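
The delta passed to mod_zone_page_state() above is chosen so that, whatever NR_ALLOC_BATCH currently holds, the counter lands back at high_wmark - low_wmark. A minimal arithmetic sketch with invented watermark values (not kernel code):

#include <stdio.h>

int main(void)
{
	long high_wmark = 1200, low_wmark = 1000;
	long nr_alloc_batch = -37;	/* zone overshot its fair share */

	/* same shape as the mod_zone_page_state() call above */
	nr_alloc_batch += high_wmark - low_wmark - nr_alloc_batch;

	printf("batch after reset: %ld\n", nr_alloc_batch);	/* prints 200 */
	return 0;
}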
2431 2431
2432 static void wake_all_kswapds(unsigned int order, 2432 static void wake_all_kswapds(unsigned int order,
2433 struct zonelist *zonelist, 2433 struct zonelist *zonelist,
2434 enum zone_type high_zoneidx, 2434 enum zone_type high_zoneidx,
2435 struct zone *preferred_zone) 2435 struct zone *preferred_zone)
2436 { 2436 {
2437 struct zoneref *z; 2437 struct zoneref *z;
2438 struct zone *zone; 2438 struct zone *zone;
2439 2439
2440 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2440 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2441 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2441 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2442 } 2442 }
2443 2443
2444 static inline int 2444 static inline int
2445 gfp_to_alloc_flags(gfp_t gfp_mask) 2445 gfp_to_alloc_flags(gfp_t gfp_mask)
2446 { 2446 {
2447 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2447 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2448 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); 2448 const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
2449 2449
2450 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2450 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2451 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2451 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2452 2452
2453 /* 2453 /*
2454 * The caller may dip into page reserves a bit more if the caller 2454 * The caller may dip into page reserves a bit more if the caller
2455 * cannot run direct reclaim, or if the caller has realtime scheduling 2455 * cannot run direct reclaim, or if the caller has realtime scheduling
2456 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2456 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2457 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). 2457 * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
2458 */ 2458 */
2459 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2459 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2460 2460
2461 if (atomic) { 2461 if (atomic) {
2462 /* 2462 /*
2463 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even 2463 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
2464 * if it can't schedule. 2464 * if it can't schedule.
2465 */ 2465 */
2466 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2466 if (!(gfp_mask & __GFP_NOMEMALLOC))
2467 alloc_flags |= ALLOC_HARDER; 2467 alloc_flags |= ALLOC_HARDER;
2468 /* 2468 /*
2469 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 2469 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
2470 * comment for __cpuset_node_allowed_softwall(). 2470 * comment for __cpuset_node_allowed_softwall().
2471 */ 2471 */
2472 alloc_flags &= ~ALLOC_CPUSET; 2472 alloc_flags &= ~ALLOC_CPUSET;
2473 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2473 } else if (unlikely(rt_task(current)) && !in_interrupt())
2474 alloc_flags |= ALLOC_HARDER; 2474 alloc_flags |= ALLOC_HARDER;
2475 2475
2476 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2476 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2477 if (gfp_mask & __GFP_MEMALLOC) 2477 if (gfp_mask & __GFP_MEMALLOC)
2478 alloc_flags |= ALLOC_NO_WATERMARKS; 2478 alloc_flags |= ALLOC_NO_WATERMARKS;
2479 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 2479 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2480 alloc_flags |= ALLOC_NO_WATERMARKS; 2480 alloc_flags |= ALLOC_NO_WATERMARKS;
2481 else if (!in_interrupt() && 2481 else if (!in_interrupt() &&
2482 ((current->flags & PF_MEMALLOC) || 2482 ((current->flags & PF_MEMALLOC) ||
2483 unlikely(test_thread_flag(TIF_MEMDIE)))) 2483 unlikely(test_thread_flag(TIF_MEMDIE))))
2484 alloc_flags |= ALLOC_NO_WATERMARKS; 2484 alloc_flags |= ALLOC_NO_WATERMARKS;
2485 } 2485 }
2486 #ifdef CONFIG_CMA 2486 #ifdef CONFIG_CMA
2487 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2487 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2488 alloc_flags |= ALLOC_CMA; 2488 alloc_flags |= ALLOC_CMA;
2489 #endif 2489 #endif
2490 return alloc_flags; 2490 return alloc_flags;
2491 } 2491 }
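
A minimal userspace sketch of the decisions above for two common cases, with invented stand-in bit values; only the branch structure mirrors gfp_to_alloc_flags(), and the real flag values, the rt_task() case and the ALLOC_NO_WATERMARKS block live in the function itself.

#include <stdio.h>
#include <stdbool.h>

enum { WMARK_MIN = 1, HIGH = 2, HARDER = 4, CPUSET = 8 };	/* stand-ins */

static int to_alloc_flags(bool can_wait, bool gfp_high, bool nomemalloc)
{
	int flags = WMARK_MIN | CPUSET;
	bool atomic = !can_wait;	/* __GFP_NO_KSWAPD ignored in this sketch */

	if (gfp_high)
		flags |= HIGH;		/* mirrors __GFP_HIGH -> ALLOC_HIGH */
	if (atomic) {
		if (!nomemalloc)
			flags |= HARDER;
		flags &= ~CPUSET;	/* don't fail GFP_ATOMIC on cpusets */
	}
	return flags;
}

int main(void)
{
	/* GFP_KERNEL-like: can wait, __GFP_HIGH clear */
	printf("GFP_KERNEL-like -> 0x%x\n", to_alloc_flags(true, false, false));
	/* GFP_ATOMIC-like: cannot wait, __GFP_HIGH set */
	printf("GFP_ATOMIC-like -> 0x%x\n", to_alloc_flags(false, true, false));
	return 0;
}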
2492 2492
2493 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 2493 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2494 { 2494 {
2495 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2495 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2496 } 2496 }
2497 2497
2498 static inline struct page * 2498 static inline struct page *
2499 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2499 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2500 struct zonelist *zonelist, enum zone_type high_zoneidx, 2500 struct zonelist *zonelist, enum zone_type high_zoneidx,
2501 nodemask_t *nodemask, struct zone *preferred_zone, 2501 nodemask_t *nodemask, struct zone *preferred_zone,
2502 int classzone_idx, int migratetype) 2502 int classzone_idx, int migratetype)
2503 { 2503 {
2504 const gfp_t wait = gfp_mask & __GFP_WAIT; 2504 const gfp_t wait = gfp_mask & __GFP_WAIT;
2505 struct page *page = NULL; 2505 struct page *page = NULL;
2506 int alloc_flags; 2506 int alloc_flags;
2507 unsigned long pages_reclaimed = 0; 2507 unsigned long pages_reclaimed = 0;
2508 unsigned long did_some_progress; 2508 unsigned long did_some_progress;
2509 enum migrate_mode migration_mode = MIGRATE_ASYNC; 2509 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2510 bool deferred_compaction = false; 2510 bool deferred_compaction = false;
2511 bool contended_compaction = false; 2511 bool contended_compaction = false;
2512 2512
2513 /* 2513 /*
2514 * In the slowpath, we sanity check order to avoid ever trying to 2514 * In the slowpath, we sanity check order to avoid ever trying to
2515 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2515 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2516 * be using allocators in order of preference for an area that is 2516 * be using allocators in order of preference for an area that is
2517 * too large. 2517 * too large.
2518 */ 2518 */
2519 if (order >= MAX_ORDER) { 2519 if (order >= MAX_ORDER) {
2520 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2520 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2521 return NULL; 2521 return NULL;
2522 } 2522 }
2523 2523
2524 /* 2524 /*
2525 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2525 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2526 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2526 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2527 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2527 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
2528 * using a larger set of nodes after it has established that the 2528 * using a larger set of nodes after it has established that the
2529 * allowed per node queues are empty and that nodes are 2529 * allowed per node queues are empty and that nodes are
2530 * over allocated. 2530 * over allocated.
2531 */ 2531 */
2532 if (IS_ENABLED(CONFIG_NUMA) && 2532 if (IS_ENABLED(CONFIG_NUMA) &&
2533 (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2533 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2534 goto nopage; 2534 goto nopage;
2535 2535
2536 restart: 2536 restart:
2537 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2537 if (!(gfp_mask & __GFP_NO_KSWAPD))
2538 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); 2538 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
2539 2539
2540 /* 2540 /*
2541 * OK, we're below the kswapd watermark and have kicked background 2541 * OK, we're below the kswapd watermark and have kicked background
2542 * reclaim. Now things get more complex, so set up alloc_flags according 2542 * reclaim. Now things get more complex, so set up alloc_flags according
2543 * to how we want to proceed. 2543 * to how we want to proceed.
2544 */ 2544 */
2545 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2545 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2546 2546
2547 /* 2547 /*
2548 * Find the true preferred zone if the allocation is unconstrained by 2548 * Find the true preferred zone if the allocation is unconstrained by
2549 * cpusets. 2549 * cpusets.
2550 */ 2550 */
2551 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { 2551 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
2552 struct zoneref *preferred_zoneref; 2552 struct zoneref *preferred_zoneref;
2553 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2553 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2554 NULL, 2554 NULL,
2555 &preferred_zone); 2555 &preferred_zone);
2556 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2556 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2557 } 2557 }
2558 2558
2559 rebalance: 2559 rebalance:
2560 /* This is the last chance, in general, before the goto nopage. */ 2560 /* This is the last chance, in general, before the goto nopage. */
2561 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2561 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2562 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2562 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2563 preferred_zone, classzone_idx, migratetype); 2563 preferred_zone, classzone_idx, migratetype);
2564 if (page) 2564 if (page)
2565 goto got_pg; 2565 goto got_pg;
2566 2566
2567 /* Allocate without watermarks if the context allows */ 2567 /* Allocate without watermarks if the context allows */
2568 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2568 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2569 /* 2569 /*
2570 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds 2570 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2571 * the allocation is high priority and these types of 2571 * the allocation is high priority and these types of
2572 * allocations are system rather than user oriented 2572 * allocations are system rather than user oriented
2573 */ 2573 */
2574 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2574 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2575 2575
2576 page = __alloc_pages_high_priority(gfp_mask, order, 2576 page = __alloc_pages_high_priority(gfp_mask, order,
2577 zonelist, high_zoneidx, nodemask, 2577 zonelist, high_zoneidx, nodemask,
2578 preferred_zone, classzone_idx, migratetype); 2578 preferred_zone, classzone_idx, migratetype);
2579 if (page) { 2579 if (page) {
2580 goto got_pg; 2580 goto got_pg;
2581 } 2581 }
2582 } 2582 }
2583 2583
2584 /* Atomic allocations - we can't balance anything */ 2584 /* Atomic allocations - we can't balance anything */
2585 if (!wait) 2585 if (!wait)
2586 goto nopage; 2586 goto nopage;
2587 2587
2588 /* Avoid recursion of direct reclaim */ 2588 /* Avoid recursion of direct reclaim */
2589 if (current->flags & PF_MEMALLOC) 2589 if (current->flags & PF_MEMALLOC)
2590 goto nopage; 2590 goto nopage;
2591 2591
2592 /* Avoid allocations with no watermarks from looping endlessly */ 2592 /* Avoid allocations with no watermarks from looping endlessly */
2593 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2593 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2594 goto nopage; 2594 goto nopage;
2595 2595
2596 /* 2596 /*
2597 * Try direct compaction. The first pass is asynchronous. Subsequent 2597 * Try direct compaction. The first pass is asynchronous. Subsequent
2598 * attempts after direct reclaim are synchronous 2598 * attempts after direct reclaim are synchronous
2599 */ 2599 */
2600 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2600 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2601 high_zoneidx, nodemask, alloc_flags, 2601 high_zoneidx, nodemask, alloc_flags,
2602 preferred_zone, 2602 preferred_zone,
2603 classzone_idx, migratetype, 2603 classzone_idx, migratetype,
2604 migration_mode, &contended_compaction, 2604 migration_mode, &contended_compaction,
2605 &deferred_compaction, 2605 &deferred_compaction,
2606 &did_some_progress); 2606 &did_some_progress);
2607 if (page) 2607 if (page)
2608 goto got_pg; 2608 goto got_pg;
2609 migration_mode = MIGRATE_SYNC_LIGHT; 2609 migration_mode = MIGRATE_SYNC_LIGHT;
2610 2610
2611 /* 2611 /*
2612 * If compaction is deferred for high-order allocations, it is because 2612 * If compaction is deferred for high-order allocations, it is because
2613 * sync compaction recently failed. If this is the case and the caller 2613 * sync compaction recently failed. If this is the case and the caller
2614 * requested a movable allocation that does not heavily disrupt the 2614 * requested a movable allocation that does not heavily disrupt the
2615 * system then fail the allocation instead of entering direct reclaim. 2615 * system then fail the allocation instead of entering direct reclaim.
2616 */ 2616 */
2617 if ((deferred_compaction || contended_compaction) && 2617 if ((deferred_compaction || contended_compaction) &&
2618 (gfp_mask & __GFP_NO_KSWAPD)) 2618 (gfp_mask & __GFP_NO_KSWAPD))
2619 goto nopage; 2619 goto nopage;
2620 2620
2621 /* Try direct reclaim and then allocating */ 2621 /* Try direct reclaim and then allocating */
2622 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2622 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2623 zonelist, high_zoneidx, 2623 zonelist, high_zoneidx,
2624 nodemask, 2624 nodemask,
2625 alloc_flags, preferred_zone, 2625 alloc_flags, preferred_zone,
2626 classzone_idx, migratetype, 2626 classzone_idx, migratetype,
2627 &did_some_progress); 2627 &did_some_progress);
2628 if (page) 2628 if (page)
2629 goto got_pg; 2629 goto got_pg;
2630 2630
2631 /* 2631 /*
2632 * If we failed to make any progress reclaiming, then we are 2632 * If we failed to make any progress reclaiming, then we are
2633 * running out of options and have to consider going OOM 2633 * running out of options and have to consider going OOM
2634 */ 2634 */
2635 if (!did_some_progress) { 2635 if (!did_some_progress) {
2636 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2636 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2637 if (oom_killer_disabled) 2637 if (oom_killer_disabled)
2638 goto nopage; 2638 goto nopage;
2639 /* Coredumps can quickly deplete all memory reserves */ 2639 /* Coredumps can quickly deplete all memory reserves */
2640 if ((current->flags & PF_DUMPCORE) && 2640 if ((current->flags & PF_DUMPCORE) &&
2641 !(gfp_mask & __GFP_NOFAIL)) 2641 !(gfp_mask & __GFP_NOFAIL))
2642 goto nopage; 2642 goto nopage;
2643 page = __alloc_pages_may_oom(gfp_mask, order, 2643 page = __alloc_pages_may_oom(gfp_mask, order,
2644 zonelist, high_zoneidx, 2644 zonelist, high_zoneidx,
2645 nodemask, preferred_zone, 2645 nodemask, preferred_zone,
2646 classzone_idx, migratetype); 2646 classzone_idx, migratetype);
2647 if (page) 2647 if (page)
2648 goto got_pg; 2648 goto got_pg;
2649 2649
2650 if (!(gfp_mask & __GFP_NOFAIL)) { 2650 if (!(gfp_mask & __GFP_NOFAIL)) {
2651 /* 2651 /*
2652 * The oom killer is not called for high-order 2652 * The oom killer is not called for high-order
2653 * allocations that may fail, so if no progress 2653 * allocations that may fail, so if no progress
2654 * is being made, there are no other options and 2654 * is being made, there are no other options and
2655 * retrying is unlikely to help. 2655 * retrying is unlikely to help.
2656 */ 2656 */
2657 if (order > PAGE_ALLOC_COSTLY_ORDER) 2657 if (order > PAGE_ALLOC_COSTLY_ORDER)
2658 goto nopage; 2658 goto nopage;
2659 /* 2659 /*
2660 * The oom killer is not called for lowmem 2660 * The oom killer is not called for lowmem
2661 * allocations to prevent needlessly killing 2661 * allocations to prevent needlessly killing
2662 * innocent tasks. 2662 * innocent tasks.
2663 */ 2663 */
2664 if (high_zoneidx < ZONE_NORMAL) 2664 if (high_zoneidx < ZONE_NORMAL)
2665 goto nopage; 2665 goto nopage;
2666 } 2666 }
2667 2667
2668 goto restart; 2668 goto restart;
2669 } 2669 }
2670 } 2670 }
2671 2671
2672 /* Check if we should retry the allocation */ 2672 /* Check if we should retry the allocation */
2673 pages_reclaimed += did_some_progress; 2673 pages_reclaimed += did_some_progress;
2674 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2674 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2675 pages_reclaimed)) { 2675 pages_reclaimed)) {
2676 /* Wait for some write requests to complete then retry */ 2676 /* Wait for some write requests to complete then retry */
2677 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2677 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2678 goto rebalance; 2678 goto rebalance;
2679 } else { 2679 } else {
2680 /* 2680 /*
2681 * High-order allocations do not necessarily loop after 2681 * High-order allocations do not necessarily loop after
2682 * direct reclaim and reclaim/compaction depends on compaction 2682 * direct reclaim and reclaim/compaction depends on compaction
2683 * being called after reclaim so call directly if necessary 2683 * being called after reclaim so call directly if necessary
2684 */ 2684 */
2685 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2685 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2686 high_zoneidx, nodemask, alloc_flags, 2686 high_zoneidx, nodemask, alloc_flags,
2687 preferred_zone, 2687 preferred_zone,
2688 classzone_idx, migratetype, 2688 classzone_idx, migratetype,
2689 migration_mode, &contended_compaction, 2689 migration_mode, &contended_compaction,
2690 &deferred_compaction, 2690 &deferred_compaction,
2691 &did_some_progress); 2691 &did_some_progress);
2692 if (page) 2692 if (page)
2693 goto got_pg; 2693 goto got_pg;
2694 } 2694 }
2695 2695
2696 nopage: 2696 nopage:
2697 warn_alloc_failed(gfp_mask, order, NULL); 2697 warn_alloc_failed(gfp_mask, order, NULL);
2698 return page; 2698 return page;
2699 got_pg: 2699 got_pg:
2700 if (kmemcheck_enabled) 2700 if (kmemcheck_enabled)
2701 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2701 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2702 2702
2703 return page; 2703 return page;
2704 } 2704 }
2705 2705
2706 /* 2706 /*
2707 * This is the 'heart' of the zoned buddy allocator. 2707 * This is the 'heart' of the zoned buddy allocator.
2708 */ 2708 */
2709 struct page * 2709 struct page *
2710 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2710 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2711 struct zonelist *zonelist, nodemask_t *nodemask) 2711 struct zonelist *zonelist, nodemask_t *nodemask)
2712 { 2712 {
2713 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2713 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2714 struct zone *preferred_zone; 2714 struct zone *preferred_zone;
2715 struct zoneref *preferred_zoneref; 2715 struct zoneref *preferred_zoneref;
2716 struct page *page = NULL; 2716 struct page *page = NULL;
2717 int migratetype = allocflags_to_migratetype(gfp_mask); 2717 int migratetype = allocflags_to_migratetype(gfp_mask);
2718 unsigned int cpuset_mems_cookie; 2718 unsigned int cpuset_mems_cookie;
2719 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2719 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2720 struct mem_cgroup *memcg = NULL; 2720 struct mem_cgroup *memcg = NULL;
2721 int classzone_idx; 2721 int classzone_idx;
2722 2722
2723 gfp_mask &= gfp_allowed_mask; 2723 gfp_mask &= gfp_allowed_mask;
2724 2724
2725 lockdep_trace_alloc(gfp_mask); 2725 lockdep_trace_alloc(gfp_mask);
2726 2726
2727 might_sleep_if(gfp_mask & __GFP_WAIT); 2727 might_sleep_if(gfp_mask & __GFP_WAIT);
2728 2728
2729 if (should_fail_alloc_page(gfp_mask, order)) 2729 if (should_fail_alloc_page(gfp_mask, order))
2730 return NULL; 2730 return NULL;
2731 2731
2732 /* 2732 /*
2733 * Check the zones suitable for the gfp_mask contain at least one 2733 * Check the zones suitable for the gfp_mask contain at least one
2734 * valid zone. It's possible to have an empty zonelist as a result 2734 * valid zone. It's possible to have an empty zonelist as a result
2735 * of GFP_THISNODE and a memoryless node 2735 * of GFP_THISNODE and a memoryless node
2736 */ 2736 */
2737 if (unlikely(!zonelist->_zonerefs->zone)) 2737 if (unlikely(!zonelist->_zonerefs->zone))
2738 return NULL; 2738 return NULL;
2739 2739
2740 /* 2740 /*
2741 * Will only have any effect when __GFP_KMEMCG is set. This is 2741 * Will only have any effect when __GFP_KMEMCG is set. This is
2742 * verified in the (always inline) callee 2742 * verified in the (always inline) callee
2743 */ 2743 */
2744 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) 2744 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2745 return NULL; 2745 return NULL;
2746 2746
2747 retry_cpuset: 2747 retry_cpuset:
2748 cpuset_mems_cookie = read_mems_allowed_begin(); 2748 cpuset_mems_cookie = read_mems_allowed_begin();
2749 2749
2750 /* The preferred zone is used for statistics later */ 2750 /* The preferred zone is used for statistics later */
2751 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2751 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2752 nodemask ? : &cpuset_current_mems_allowed, 2752 nodemask ? : &cpuset_current_mems_allowed,
2753 &preferred_zone); 2753 &preferred_zone);
2754 if (!preferred_zone) 2754 if (!preferred_zone)
2755 goto out; 2755 goto out;
2756 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2756 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2757 2757
2758 #ifdef CONFIG_CMA 2758 #ifdef CONFIG_CMA
2759 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2759 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2760 alloc_flags |= ALLOC_CMA; 2760 alloc_flags |= ALLOC_CMA;
2761 #endif 2761 #endif
2762 retry: 2762 retry:
2763 /* First allocation attempt */ 2763 /* First allocation attempt */
2764 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2764 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2765 zonelist, high_zoneidx, alloc_flags, 2765 zonelist, high_zoneidx, alloc_flags,
2766 preferred_zone, classzone_idx, migratetype); 2766 preferred_zone, classzone_idx, migratetype);
2767 if (unlikely(!page)) { 2767 if (unlikely(!page)) {
2768 /* 2768 /*
2769 * The first pass makes sure allocations are spread 2769 * The first pass makes sure allocations are spread
2770 * fairly within the local node. However, the local 2770 * fairly within the local node. However, the local
2771 * node might have free pages left after the fairness 2771 * node might have free pages left after the fairness
2772 * batches are exhausted, and remote zones haven't 2772 * batches are exhausted, and remote zones haven't
2773 * even been considered yet. Try once more without 2773 * even been considered yet. Try once more without
2774 * fairness, and include remote zones now, before 2774 * fairness, and include remote zones now, before
2775 * entering the slowpath and waking kswapd: prefer 2775 * entering the slowpath and waking kswapd: prefer
2776 * spilling to a remote zone over swapping locally. 2776 * spilling to a remote zone over swapping locally.
2777 */ 2777 */
2778 if (alloc_flags & ALLOC_FAIR) { 2778 if (alloc_flags & ALLOC_FAIR) {
2779 reset_alloc_batches(zonelist, high_zoneidx, 2779 reset_alloc_batches(zonelist, high_zoneidx,
2780 preferred_zone); 2780 preferred_zone);
2781 alloc_flags &= ~ALLOC_FAIR; 2781 alloc_flags &= ~ALLOC_FAIR;
2782 goto retry; 2782 goto retry;
2783 } 2783 }
2784 /* 2784 /*
2785 * Runtime PM, block IO and its error handling path 2785 * Runtime PM, block IO and its error handling path
2786 * can deadlock because I/O on the device might not 2786 * can deadlock because I/O on the device might not
2787 * complete. 2787 * complete.
2788 */ 2788 */
2789 gfp_mask = memalloc_noio_flags(gfp_mask); 2789 gfp_mask = memalloc_noio_flags(gfp_mask);
2790 page = __alloc_pages_slowpath(gfp_mask, order, 2790 page = __alloc_pages_slowpath(gfp_mask, order,
2791 zonelist, high_zoneidx, nodemask, 2791 zonelist, high_zoneidx, nodemask,
2792 preferred_zone, classzone_idx, migratetype); 2792 preferred_zone, classzone_idx, migratetype);
2793 } 2793 }
2794 2794
2795 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2795 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2796 2796
2797 out: 2797 out:
2798 /* 2798 /*
2799 * When updating a task's mems_allowed, it is possible to race with 2799 * When updating a task's mems_allowed, it is possible to race with
2800 * parallel threads in such a way that an allocation can fail while 2800 * parallel threads in such a way that an allocation can fail while
2801 * the mask is being updated. If a page allocation is about to fail, 2801 * the mask is being updated. If a page allocation is about to fail,
2802 * check if the cpuset changed during allocation and if so, retry. 2802 * check if the cpuset changed during allocation and if so, retry.
2803 */ 2803 */
2804 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2804 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2805 goto retry_cpuset; 2805 goto retry_cpuset;
2806 2806
2807 memcg_kmem_commit_charge(page, memcg, order); 2807 memcg_kmem_commit_charge(page, memcg, order);
2808 2808
2809 return page; 2809 return page;
2810 } 2810 }
2811 EXPORT_SYMBOL(__alloc_pages_nodemask); 2811 EXPORT_SYMBOL(__alloc_pages_nodemask);
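The retry in __alloc_pages_nodemask() above resets the per-zone fairness batches and clears ALLOC_FAIR before trying the freelists again, so remote zones get considered before kswapd is woken. Below is a minimal userspace sketch of that two-pass idea (hypothetical struct and function names, not kernel code): pass one only takes from zones that still have batch credit, pass two ignores the batches.

    #include <stdio.h>
    #include <stdbool.h>

    /* Hypothetical model: each "zone" has free pages and a fairness batch. */
    struct zone_model { const char *name; long free_pages; long alloc_batch; };

    /* Pass 1 honours the batch; pass 2 (fair == false) ignores it. */
    static struct zone_model *pick_zone(struct zone_model *z, int nr, bool fair)
    {
        for (int i = 0; i < nr; i++) {
            if (z[i].free_pages <= 0)
                continue;
            if (fair && z[i].alloc_batch <= 0)
                continue;
            return &z[i];
        }
        return NULL;
    }

    int main(void)
    {
        struct zone_model zones[] = {
            { "Normal", 100, 0 },   /* batch exhausted but pages left */
            { "DMA32",   50, 0 },
        };
        struct zone_model *z = pick_zone(zones, 2, true);
        if (!z) {
            /* Mirrors clearing ALLOC_FAIR and retrying before the slowpath. */
            for (int i = 0; i < 2; i++)
                zones[i].alloc_batch = zones[i].free_pages; /* reset_alloc_batches analogue */
            z = pick_zone(zones, 2, false);
        }
        printf("allocated from %s\n", z ? z->name : "nowhere");
        return 0;
    }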
2812 2812
2813 /* 2813 /*
2814 * Common helper functions. 2814 * Common helper functions.
2815 */ 2815 */
2816 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 2816 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2817 { 2817 {
2818 struct page *page; 2818 struct page *page;
2819 2819
2820 /* 2820 /*
2821 * __get_free_pages() returns a 32-bit address, which cannot represent 2821 * __get_free_pages() returns a 32-bit address, which cannot represent
2822 * a highmem page 2822 * a highmem page
2823 */ 2823 */
2824 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 2824 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2825 2825
2826 page = alloc_pages(gfp_mask, order); 2826 page = alloc_pages(gfp_mask, order);
2827 if (!page) 2827 if (!page)
2828 return 0; 2828 return 0;
2829 return (unsigned long) page_address(page); 2829 return (unsigned long) page_address(page);
2830 } 2830 }
2831 EXPORT_SYMBOL(__get_free_pages); 2831 EXPORT_SYMBOL(__get_free_pages);
2832 2832
2833 unsigned long get_zeroed_page(gfp_t gfp_mask) 2833 unsigned long get_zeroed_page(gfp_t gfp_mask)
2834 { 2834 {
2835 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 2835 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2836 } 2836 }
2837 EXPORT_SYMBOL(get_zeroed_page); 2837 EXPORT_SYMBOL(get_zeroed_page);
2838 2838
2839 void __free_pages(struct page *page, unsigned int order) 2839 void __free_pages(struct page *page, unsigned int order)
2840 { 2840 {
2841 if (put_page_testzero(page)) { 2841 if (put_page_testzero(page)) {
2842 if (order == 0) 2842 if (order == 0)
2843 free_hot_cold_page(page, false); 2843 free_hot_cold_page(page, false);
2844 else 2844 else
2845 __free_pages_ok(page, order); 2845 __free_pages_ok(page, order);
2846 } 2846 }
2847 } 2847 }
2848 2848
2849 EXPORT_SYMBOL(__free_pages); 2849 EXPORT_SYMBOL(__free_pages);
2850 2850
2851 void free_pages(unsigned long addr, unsigned int order) 2851 void free_pages(unsigned long addr, unsigned int order)
2852 { 2852 {
2853 if (addr != 0) { 2853 if (addr != 0) {
2854 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2854 VM_BUG_ON(!virt_addr_valid((void *)addr));
2855 __free_pages(virt_to_page((void *)addr), order); 2855 __free_pages(virt_to_page((void *)addr), order);
2856 } 2856 }
2857 } 2857 }
2858 2858
2859 EXPORT_SYMBOL(free_pages); 2859 EXPORT_SYMBOL(free_pages);
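Because __get_free_pages() hands back a kernel virtual address rather than a struct page, the matching release is free_pages() with the same order. A short illustrative in-kernel usage sketch, with error handling trimmed and a made-up function name:

    #include <linux/errno.h>
    #include <linux/gfp.h>

    /* Allocate and release two contiguous pages (order 1) by address. */
    static int example_buffer_roundtrip(void)
    {
        unsigned long addr = __get_free_pages(GFP_KERNEL, 1);

        if (!addr)
            return -ENOMEM;

        /* ... use the 2 * PAGE_SIZE buffer at (void *)addr ... */

        free_pages(addr, 1);    /* the order must match the allocation */
        return 0;
    }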
2860 2860
2861 /* 2861 /*
2862 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free 2862 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
2863 * pages allocated with __GFP_KMEMCG. 2863 * pages allocated with __GFP_KMEMCG.
2864 * 2864 *
2865 * Those pages are accounted to a particular memcg, embedded in the 2865 * Those pages are accounted to a particular memcg, embedded in the
2866 * corresponding page_cgroup. To avoid adding a hit in the allocator to search 2866 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
2867 * for that information only to find out that it is NULL for users who have no 2867 * for that information only to find out that it is NULL for users who have no
2868 * interest in that whatsoever, we provide these functions. 2868 * interest in that whatsoever, we provide these functions.
2869 * 2869 *
2870 * The caller knows better which flags it relies on. 2870 * The caller knows better which flags it relies on.
2871 */ 2871 */
2872 void __free_memcg_kmem_pages(struct page *page, unsigned int order) 2872 void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2873 { 2873 {
2874 memcg_kmem_uncharge_pages(page, order); 2874 memcg_kmem_uncharge_pages(page, order);
2875 __free_pages(page, order); 2875 __free_pages(page, order);
2876 } 2876 }
2877 2877
2878 void free_memcg_kmem_pages(unsigned long addr, unsigned int order) 2878 void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2879 { 2879 {
2880 if (addr != 0) { 2880 if (addr != 0) {
2881 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2881 VM_BUG_ON(!virt_addr_valid((void *)addr));
2882 __free_memcg_kmem_pages(virt_to_page((void *)addr), order); 2882 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2883 } 2883 }
2884 } 2884 }
2885 2885
2886 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2886 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2887 { 2887 {
2888 if (addr) { 2888 if (addr) {
2889 unsigned long alloc_end = addr + (PAGE_SIZE << order); 2889 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2890 unsigned long used = addr + PAGE_ALIGN(size); 2890 unsigned long used = addr + PAGE_ALIGN(size);
2891 2891
2892 split_page(virt_to_page((void *)addr), order); 2892 split_page(virt_to_page((void *)addr), order);
2893 while (used < alloc_end) { 2893 while (used < alloc_end) {
2894 free_page(used); 2894 free_page(used);
2895 used += PAGE_SIZE; 2895 used += PAGE_SIZE;
2896 } 2896 }
2897 } 2897 }
2898 return (void *)addr; 2898 return (void *)addr;
2899 } 2899 }
2900 2900
2901 /** 2901 /**
2902 * alloc_pages_exact - allocate an exact number of physically-contiguous pages. 2902 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
2903 * @size: the number of bytes to allocate 2903 * @size: the number of bytes to allocate
2904 * @gfp_mask: GFP flags for the allocation 2904 * @gfp_mask: GFP flags for the allocation
2905 * 2905 *
2906 * This function is similar to alloc_pages(), except that it allocates the 2906 * This function is similar to alloc_pages(), except that it allocates the
2907 * minimum number of pages to satisfy the request. alloc_pages() can only 2907 * minimum number of pages to satisfy the request. alloc_pages() can only
2908 * allocate memory in power-of-two pages. 2908 * allocate memory in power-of-two pages.
2909 * 2909 *
2910 * This function is also limited by MAX_ORDER. 2910 * This function is also limited by MAX_ORDER.
2911 * 2911 *
2912 * Memory allocated by this function must be released by free_pages_exact(). 2912 * Memory allocated by this function must be released by free_pages_exact().
2913 */ 2913 */
2914 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 2914 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2915 { 2915 {
2916 unsigned int order = get_order(size); 2916 unsigned int order = get_order(size);
2917 unsigned long addr; 2917 unsigned long addr;
2918 2918
2919 addr = __get_free_pages(gfp_mask, order); 2919 addr = __get_free_pages(gfp_mask, order);
2920 return make_alloc_exact(addr, order, size); 2920 return make_alloc_exact(addr, order, size);
2921 } 2921 }
2922 EXPORT_SYMBOL(alloc_pages_exact); 2922 EXPORT_SYMBOL(alloc_pages_exact);
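alloc_pages_exact() rounds the request up to a power-of-two order, and make_alloc_exact() then splits the block and gives back every page beyond PAGE_ALIGN(size). A small userspace model of that arithmetic, assuming 4 KiB pages and reimplementing get_order() locally for illustration:

    #include <stdio.h>

    #define MODEL_PAGE_SHIFT 12                     /* assume 4 KiB pages */
    #define MODEL_PAGE_SIZE  (1UL << MODEL_PAGE_SHIFT)

    /* Smallest order whose block (2^order pages) covers 'size' bytes. */
    static unsigned int model_get_order(unsigned long size)
    {
        unsigned int order = 0;
        while ((MODEL_PAGE_SIZE << order) < size)
            order++;
        return order;
    }

    int main(void)
    {
        unsigned long size = 5 * MODEL_PAGE_SIZE + 123;      /* wants 6 pages */
        unsigned int order = model_get_order(size);          /* order 3 = 8 pages */
        unsigned long used = (size + MODEL_PAGE_SIZE - 1) & ~(MODEL_PAGE_SIZE - 1);
        unsigned long allocated = MODEL_PAGE_SIZE << order;

        printf("order %u block: %lu pages, kept %lu, freed back %lu tail pages\n",
               order, allocated >> MODEL_PAGE_SHIFT,
               used >> MODEL_PAGE_SHIFT,
               (allocated - used) >> MODEL_PAGE_SHIFT);
        return 0;
    }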
2923 2923
2924 /** 2924 /**
2925 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 2925 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2926 * pages on a node. 2926 * pages on a node.
2927 * @nid: the preferred node ID where memory should be allocated 2927 * @nid: the preferred node ID where memory should be allocated
2928 * @size: the number of bytes to allocate 2928 * @size: the number of bytes to allocate
2929 * @gfp_mask: GFP flags for the allocation 2929 * @gfp_mask: GFP flags for the allocation
2930 * 2930 *
2931 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 2931 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2932 * back. 2932 * back.
2933 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2933 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2934 * but is not exact. 2934 * but is not exact.
2935 */ 2935 */
2936 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2936 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2937 { 2937 {
2938 unsigned order = get_order(size); 2938 unsigned order = get_order(size);
2939 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2939 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2940 if (!p) 2940 if (!p)
2941 return NULL; 2941 return NULL;
2942 return make_alloc_exact((unsigned long)page_address(p), order, size); 2942 return make_alloc_exact((unsigned long)page_address(p), order, size);
2943 } 2943 }
2944 EXPORT_SYMBOL(alloc_pages_exact_nid); 2944 EXPORT_SYMBOL(alloc_pages_exact_nid);
2945 2945
2946 /** 2946 /**
2947 * free_pages_exact - release memory allocated via alloc_pages_exact() 2947 * free_pages_exact - release memory allocated via alloc_pages_exact()
2948 * @virt: the value returned by alloc_pages_exact. 2948 * @virt: the value returned by alloc_pages_exact.
2949 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2949 * @size: size of allocation, same value as passed to alloc_pages_exact().
2950 * 2950 *
2951 * Release the memory allocated by a previous call to alloc_pages_exact. 2951 * Release the memory allocated by a previous call to alloc_pages_exact.
2952 */ 2952 */
2953 void free_pages_exact(void *virt, size_t size) 2953 void free_pages_exact(void *virt, size_t size)
2954 { 2954 {
2955 unsigned long addr = (unsigned long)virt; 2955 unsigned long addr = (unsigned long)virt;
2956 unsigned long end = addr + PAGE_ALIGN(size); 2956 unsigned long end = addr + PAGE_ALIGN(size);
2957 2957
2958 while (addr < end) { 2958 while (addr < end) {
2959 free_page(addr); 2959 free_page(addr);
2960 addr += PAGE_SIZE; 2960 addr += PAGE_SIZE;
2961 } 2961 }
2962 } 2962 }
2963 EXPORT_SYMBOL(free_pages_exact); 2963 EXPORT_SYMBOL(free_pages_exact);
2964 2964
2965 /** 2965 /**
2966 * nr_free_zone_pages - count number of pages beyond high watermark 2966 * nr_free_zone_pages - count number of pages beyond high watermark
2967 * @offset: The zone index of the highest zone 2967 * @offset: The zone index of the highest zone
2968 * 2968 *
2969 * nr_free_zone_pages() counts the number of pages which are beyond the 2969 * nr_free_zone_pages() counts the number of pages which are beyond the
2970 * high watermark within all zones at or below a given zone index. For each 2970 * high watermark within all zones at or below a given zone index. For each
2971 * zone, the number of pages is calculated as: 2971 * zone, the number of pages is calculated as:
2972 * managed_pages - high_pages 2972 * managed_pages - high_pages
2973 */ 2973 */
2974 static unsigned long nr_free_zone_pages(int offset) 2974 static unsigned long nr_free_zone_pages(int offset)
2975 { 2975 {
2976 struct zoneref *z; 2976 struct zoneref *z;
2977 struct zone *zone; 2977 struct zone *zone;
2978 2978
2979 /* Just pick one node, since fallback list is circular */ 2979 /* Just pick one node, since fallback list is circular */
2980 unsigned long sum = 0; 2980 unsigned long sum = 0;
2981 2981
2982 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2982 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2983 2983
2984 for_each_zone_zonelist(zone, z, zonelist, offset) { 2984 for_each_zone_zonelist(zone, z, zonelist, offset) {
2985 unsigned long size = zone->managed_pages; 2985 unsigned long size = zone->managed_pages;
2986 unsigned long high = high_wmark_pages(zone); 2986 unsigned long high = high_wmark_pages(zone);
2987 if (size > high) 2987 if (size > high)
2988 sum += size - high; 2988 sum += size - high;
2989 } 2989 }
2990 2990
2991 return sum; 2991 return sum;
2992 } 2992 }
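nr_free_zone_pages() is effectively a clamped sum of managed_pages minus the high watermark over the zones the zonelist covers. A userspace model of the calculation, with made-up per-zone numbers:

    #include <stdio.h>

    /* Hypothetical per-zone figures, in pages. */
    struct zone_model { const char *name; unsigned long managed; unsigned long high_wmark; };

    int main(void)
    {
        struct zone_model zones[] = {
            { "DMA32",   491520,  1200 },
            { "Normal", 3670016,  9000 },
        };
        unsigned long sum = 0;

        for (unsigned i = 0; i < sizeof(zones) / sizeof(zones[0]); i++) {
            /* Only count what lies beyond the high watermark. */
            if (zones[i].managed > zones[i].high_wmark)
                sum += zones[i].managed - zones[i].high_wmark;
        }
        printf("pages beyond the high watermark: %lu\n", sum);
        return 0;
    }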
2993 2993
2994 /** 2994 /**
2995 * nr_free_buffer_pages - count number of pages beyond high watermark 2995 * nr_free_buffer_pages - count number of pages beyond high watermark
2996 * 2996 *
2997 * nr_free_buffer_pages() counts the number of pages which are beyond the high 2997 * nr_free_buffer_pages() counts the number of pages which are beyond the high
2998 * watermark within ZONE_DMA and ZONE_NORMAL. 2998 * watermark within ZONE_DMA and ZONE_NORMAL.
2999 */ 2999 */
3000 unsigned long nr_free_buffer_pages(void) 3000 unsigned long nr_free_buffer_pages(void)
3001 { 3001 {
3002 return nr_free_zone_pages(gfp_zone(GFP_USER)); 3002 return nr_free_zone_pages(gfp_zone(GFP_USER));
3003 } 3003 }
3004 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 3004 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
3005 3005
3006 /** 3006 /**
3007 * nr_free_pagecache_pages - count number of pages beyond high watermark 3007 * nr_free_pagecache_pages - count number of pages beyond high watermark
3008 * 3008 *
3009 * nr_free_pagecache_pages() counts the number of pages which are beyond the 3009 * nr_free_pagecache_pages() counts the number of pages which are beyond the
3010 * high watermark within all zones. 3010 * high watermark within all zones.
3011 */ 3011 */
3012 unsigned long nr_free_pagecache_pages(void) 3012 unsigned long nr_free_pagecache_pages(void)
3013 { 3013 {
3014 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 3014 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
3015 } 3015 }
3016 3016
3017 static inline void show_node(struct zone *zone) 3017 static inline void show_node(struct zone *zone)
3018 { 3018 {
3019 if (IS_ENABLED(CONFIG_NUMA)) 3019 if (IS_ENABLED(CONFIG_NUMA))
3020 printk("Node %d ", zone_to_nid(zone)); 3020 printk("Node %d ", zone_to_nid(zone));
3021 } 3021 }
3022 3022
3023 void si_meminfo(struct sysinfo *val) 3023 void si_meminfo(struct sysinfo *val)
3024 { 3024 {
3025 val->totalram = totalram_pages; 3025 val->totalram = totalram_pages;
3026 val->sharedram = 0; 3026 val->sharedram = 0;
3027 val->freeram = global_page_state(NR_FREE_PAGES); 3027 val->freeram = global_page_state(NR_FREE_PAGES);
3028 val->bufferram = nr_blockdev_pages(); 3028 val->bufferram = nr_blockdev_pages();
3029 val->totalhigh = totalhigh_pages; 3029 val->totalhigh = totalhigh_pages;
3030 val->freehigh = nr_free_highpages(); 3030 val->freehigh = nr_free_highpages();
3031 val->mem_unit = PAGE_SIZE; 3031 val->mem_unit = PAGE_SIZE;
3032 } 3032 }
3033 3033
3034 EXPORT_SYMBOL(si_meminfo); 3034 EXPORT_SYMBOL(si_meminfo);
3035 3035
3036 #ifdef CONFIG_NUMA 3036 #ifdef CONFIG_NUMA
3037 void si_meminfo_node(struct sysinfo *val, int nid) 3037 void si_meminfo_node(struct sysinfo *val, int nid)
3038 { 3038 {
3039 int zone_type; /* needs to be signed */ 3039 int zone_type; /* needs to be signed */
3040 unsigned long managed_pages = 0; 3040 unsigned long managed_pages = 0;
3041 pg_data_t *pgdat = NODE_DATA(nid); 3041 pg_data_t *pgdat = NODE_DATA(nid);
3042 3042
3043 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 3043 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
3044 managed_pages += pgdat->node_zones[zone_type].managed_pages; 3044 managed_pages += pgdat->node_zones[zone_type].managed_pages;
3045 val->totalram = managed_pages; 3045 val->totalram = managed_pages;
3046 val->freeram = node_page_state(nid, NR_FREE_PAGES); 3046 val->freeram = node_page_state(nid, NR_FREE_PAGES);
3047 #ifdef CONFIG_HIGHMEM 3047 #ifdef CONFIG_HIGHMEM
3048 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 3048 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
3049 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 3049 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
3050 NR_FREE_PAGES); 3050 NR_FREE_PAGES);
3051 #else 3051 #else
3052 val->totalhigh = 0; 3052 val->totalhigh = 0;
3053 val->freehigh = 0; 3053 val->freehigh = 0;
3054 #endif 3054 #endif
3055 val->mem_unit = PAGE_SIZE; 3055 val->mem_unit = PAGE_SIZE;
3056 } 3056 }
3057 #endif 3057 #endif
3058 3058
3059 /* 3059 /*
3060 * Determine whether the node should be displayed or not, depending on whether 3060 * Determine whether the node should be displayed or not, depending on whether
3061 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 3061 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
3062 */ 3062 */
3063 bool skip_free_areas_node(unsigned int flags, int nid) 3063 bool skip_free_areas_node(unsigned int flags, int nid)
3064 { 3064 {
3065 bool ret = false; 3065 bool ret = false;
3066 unsigned int cpuset_mems_cookie; 3066 unsigned int cpuset_mems_cookie;
3067 3067
3068 if (!(flags & SHOW_MEM_FILTER_NODES)) 3068 if (!(flags & SHOW_MEM_FILTER_NODES))
3069 goto out; 3069 goto out;
3070 3070
3071 do { 3071 do {
3072 cpuset_mems_cookie = read_mems_allowed_begin(); 3072 cpuset_mems_cookie = read_mems_allowed_begin();
3073 ret = !node_isset(nid, cpuset_current_mems_allowed); 3073 ret = !node_isset(nid, cpuset_current_mems_allowed);
3074 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 3074 } while (read_mems_allowed_retry(cpuset_mems_cookie));
3075 out: 3075 out:
3076 return ret; 3076 return ret;
3077 } 3077 }
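skip_free_areas_node() uses the read_mems_allowed_begin()/read_mems_allowed_retry() pair the same way __alloc_pages_nodemask() does earlier in the file: snapshot a cookie, perform the racy read, and redo it if the cpuset's mems_allowed changed meanwhile. A minimal seqcount-style sketch of that retry pattern (userspace, hypothetical names; the real primitives also provide memory ordering):

    #include <stdio.h>

    /* Writer bumps the version on every update; readers retry if it moved. */
    static unsigned int mems_version;
    static int mems_allowed_mask = 0x3;   /* pretend nodes 0 and 1 are allowed */

    static unsigned int read_begin(void) { return mems_version; }
    static int read_retry(unsigned int cookie) { return cookie != mems_version; }

    int main(void)
    {
        unsigned int cookie;
        int allowed;

        do {
            cookie = read_begin();
            allowed = (mems_allowed_mask >> 1) & 1;   /* is node 1 allowed? */
            /* A concurrent cpuset update would bump mems_version here. */
        } while (read_retry(cookie));

        printf("node 1 allowed: %d\n", allowed);
        return 0;
    }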
3078 3078
3079 #define K(x) ((x) << (PAGE_SHIFT-10)) 3079 #define K(x) ((x) << (PAGE_SHIFT-10))
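The K() macro converts a page count to kibibytes: shifting left by PAGE_SHIFT multiplies by the page size and shifting right by 10 divides by 1024, so the combined shift is PAGE_SHIFT - 10. With 4 KiB pages (PAGE_SHIFT = 12), for example, K(300) = 300 << 2 = 1200 kB, the same as 300 * 4096 / 1024.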
3080 3080
3081 static void show_migration_types(unsigned char type) 3081 static void show_migration_types(unsigned char type)
3082 { 3082 {
3083 static const char types[MIGRATE_TYPES] = { 3083 static const char types[MIGRATE_TYPES] = {
3084 [MIGRATE_UNMOVABLE] = 'U', 3084 [MIGRATE_UNMOVABLE] = 'U',
3085 [MIGRATE_RECLAIMABLE] = 'E', 3085 [MIGRATE_RECLAIMABLE] = 'E',
3086 [MIGRATE_MOVABLE] = 'M', 3086 [MIGRATE_MOVABLE] = 'M',
3087 [MIGRATE_RESERVE] = 'R', 3087 [MIGRATE_RESERVE] = 'R',
3088 #ifdef CONFIG_CMA 3088 #ifdef CONFIG_CMA
3089 [MIGRATE_CMA] = 'C', 3089 [MIGRATE_CMA] = 'C',
3090 #endif 3090 #endif
3091 #ifdef CONFIG_MEMORY_ISOLATION 3091 #ifdef CONFIG_MEMORY_ISOLATION
3092 [MIGRATE_ISOLATE] = 'I', 3092 [MIGRATE_ISOLATE] = 'I',
3093 #endif 3093 #endif
3094 }; 3094 };
3095 char tmp[MIGRATE_TYPES + 1]; 3095 char tmp[MIGRATE_TYPES + 1];
3096 char *p = tmp; 3096 char *p = tmp;
3097 int i; 3097 int i;
3098 3098
3099 for (i = 0; i < MIGRATE_TYPES; i++) { 3099 for (i = 0; i < MIGRATE_TYPES; i++) {
3100 if (type & (1 << i)) 3100 if (type & (1 << i))
3101 *p++ = types[i]; 3101 *p++ = types[i];
3102 } 3102 }
3103 3103
3104 *p = '\0'; 3104 *p = '\0';
3105 printk("(%s) ", tmp); 3105 printk("(%s) ", tmp);
3106 } 3106 }
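show_migration_types() packs one bit per migrate type into a byte and prints one letter per set bit. A tiny userspace sketch of the same decode, with the letter table mirroring the one above:

    #include <stdio.h>

    int main(void)
    {
        static const char letters[] = { 'U', 'E', 'M', 'R', 'C', 'I' };
        unsigned char mask = (1 << 0) | (1 << 2);   /* UNMOVABLE | MOVABLE */
        char out[sizeof(letters) + 1];
        char *p = out;

        for (unsigned i = 0; i < sizeof(letters); i++)
            if (mask & (1u << i))
                *p++ = letters[i];
        *p = '\0';

        printf("(%s)\n", out);   /* prints "(UM)" */
        return 0;
    }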
3107 3107
3108 /* 3108 /*
3109 * Show free area list (used inside shift_scroll-lock stuff) 3109 * Show free area list (used inside shift_scroll-lock stuff)
3110 * We also calculate the percentage fragmentation. We do this by counting the 3110 * We also calculate the percentage fragmentation. We do this by counting the
3111 * memory on each free list with the exception of the first item on the list. 3111 * memory on each free list with the exception of the first item on the list.
3112 * Suppresses nodes that are not allowed by current's cpuset if 3112 * Suppresses nodes that are not allowed by current's cpuset if
3113 * SHOW_MEM_FILTER_NODES is passed. 3113 * SHOW_MEM_FILTER_NODES is passed.
3114 */ 3114 */
3115 void show_free_areas(unsigned int filter) 3115 void show_free_areas(unsigned int filter)
3116 { 3116 {
3117 int cpu; 3117 int cpu;
3118 struct zone *zone; 3118 struct zone *zone;
3119 3119
3120 for_each_populated_zone(zone) { 3120 for_each_populated_zone(zone) {
3121 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3121 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3122 continue; 3122 continue;
3123 show_node(zone); 3123 show_node(zone);
3124 printk("%s per-cpu:\n", zone->name); 3124 printk("%s per-cpu:\n", zone->name);
3125 3125
3126 for_each_online_cpu(cpu) { 3126 for_each_online_cpu(cpu) {
3127 struct per_cpu_pageset *pageset; 3127 struct per_cpu_pageset *pageset;
3128 3128
3129 pageset = per_cpu_ptr(zone->pageset, cpu); 3129 pageset = per_cpu_ptr(zone->pageset, cpu);
3130 3130
3131 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 3131 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
3132 cpu, pageset->pcp.high, 3132 cpu, pageset->pcp.high,
3133 pageset->pcp.batch, pageset->pcp.count); 3133 pageset->pcp.batch, pageset->pcp.count);
3134 } 3134 }
3135 } 3135 }
3136 3136
3137 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 3137 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
3138 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 3138 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
3139 " unevictable:%lu" 3139 " unevictable:%lu"
3140 " dirty:%lu writeback:%lu unstable:%lu\n" 3140 " dirty:%lu writeback:%lu unstable:%lu\n"
3141 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 3141 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
3142 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 3142 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
3143 " free_cma:%lu\n", 3143 " free_cma:%lu\n",
3144 global_page_state(NR_ACTIVE_ANON), 3144 global_page_state(NR_ACTIVE_ANON),
3145 global_page_state(NR_INACTIVE_ANON), 3145 global_page_state(NR_INACTIVE_ANON),
3146 global_page_state(NR_ISOLATED_ANON), 3146 global_page_state(NR_ISOLATED_ANON),
3147 global_page_state(NR_ACTIVE_FILE), 3147 global_page_state(NR_ACTIVE_FILE),
3148 global_page_state(NR_INACTIVE_FILE), 3148 global_page_state(NR_INACTIVE_FILE),
3149 global_page_state(NR_ISOLATED_FILE), 3149 global_page_state(NR_ISOLATED_FILE),
3150 global_page_state(NR_UNEVICTABLE), 3150 global_page_state(NR_UNEVICTABLE),
3151 global_page_state(NR_FILE_DIRTY), 3151 global_page_state(NR_FILE_DIRTY),
3152 global_page_state(NR_WRITEBACK), 3152 global_page_state(NR_WRITEBACK),
3153 global_page_state(NR_UNSTABLE_NFS), 3153 global_page_state(NR_UNSTABLE_NFS),
3154 global_page_state(NR_FREE_PAGES), 3154 global_page_state(NR_FREE_PAGES),
3155 global_page_state(NR_SLAB_RECLAIMABLE), 3155 global_page_state(NR_SLAB_RECLAIMABLE),
3156 global_page_state(NR_SLAB_UNRECLAIMABLE), 3156 global_page_state(NR_SLAB_UNRECLAIMABLE),
3157 global_page_state(NR_FILE_MAPPED), 3157 global_page_state(NR_FILE_MAPPED),
3158 global_page_state(NR_SHMEM), 3158 global_page_state(NR_SHMEM),
3159 global_page_state(NR_PAGETABLE), 3159 global_page_state(NR_PAGETABLE),
3160 global_page_state(NR_BOUNCE), 3160 global_page_state(NR_BOUNCE),
3161 global_page_state(NR_FREE_CMA_PAGES)); 3161 global_page_state(NR_FREE_CMA_PAGES));
3162 3162
3163 for_each_populated_zone(zone) { 3163 for_each_populated_zone(zone) {
3164 int i; 3164 int i;
3165 3165
3166 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3166 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3167 continue; 3167 continue;
3168 show_node(zone); 3168 show_node(zone);
3169 printk("%s" 3169 printk("%s"
3170 " free:%lukB" 3170 " free:%lukB"
3171 " min:%lukB" 3171 " min:%lukB"
3172 " low:%lukB" 3172 " low:%lukB"
3173 " high:%lukB" 3173 " high:%lukB"
3174 " active_anon:%lukB" 3174 " active_anon:%lukB"
3175 " inactive_anon:%lukB" 3175 " inactive_anon:%lukB"
3176 " active_file:%lukB" 3176 " active_file:%lukB"
3177 " inactive_file:%lukB" 3177 " inactive_file:%lukB"
3178 " unevictable:%lukB" 3178 " unevictable:%lukB"
3179 " isolated(anon):%lukB" 3179 " isolated(anon):%lukB"
3180 " isolated(file):%lukB" 3180 " isolated(file):%lukB"
3181 " present:%lukB" 3181 " present:%lukB"
3182 " managed:%lukB" 3182 " managed:%lukB"
3183 " mlocked:%lukB" 3183 " mlocked:%lukB"
3184 " dirty:%lukB" 3184 " dirty:%lukB"
3185 " writeback:%lukB" 3185 " writeback:%lukB"
3186 " mapped:%lukB" 3186 " mapped:%lukB"
3187 " shmem:%lukB" 3187 " shmem:%lukB"
3188 " slab_reclaimable:%lukB" 3188 " slab_reclaimable:%lukB"
3189 " slab_unreclaimable:%lukB" 3189 " slab_unreclaimable:%lukB"
3190 " kernel_stack:%lukB" 3190 " kernel_stack:%lukB"
3191 " pagetables:%lukB" 3191 " pagetables:%lukB"
3192 " unstable:%lukB" 3192 " unstable:%lukB"
3193 " bounce:%lukB" 3193 " bounce:%lukB"
3194 " free_cma:%lukB" 3194 " free_cma:%lukB"
3195 " writeback_tmp:%lukB" 3195 " writeback_tmp:%lukB"
3196 " pages_scanned:%lu" 3196 " pages_scanned:%lu"
3197 " all_unreclaimable? %s" 3197 " all_unreclaimable? %s"
3198 "\n", 3198 "\n",
3199 zone->name, 3199 zone->name,
3200 K(zone_page_state(zone, NR_FREE_PAGES)), 3200 K(zone_page_state(zone, NR_FREE_PAGES)),
3201 K(min_wmark_pages(zone)), 3201 K(min_wmark_pages(zone)),
3202 K(low_wmark_pages(zone)), 3202 K(low_wmark_pages(zone)),
3203 K(high_wmark_pages(zone)), 3203 K(high_wmark_pages(zone)),
3204 K(zone_page_state(zone, NR_ACTIVE_ANON)), 3204 K(zone_page_state(zone, NR_ACTIVE_ANON)),
3205 K(zone_page_state(zone, NR_INACTIVE_ANON)), 3205 K(zone_page_state(zone, NR_INACTIVE_ANON)),
3206 K(zone_page_state(zone, NR_ACTIVE_FILE)), 3206 K(zone_page_state(zone, NR_ACTIVE_FILE)),
3207 K(zone_page_state(zone, NR_INACTIVE_FILE)), 3207 K(zone_page_state(zone, NR_INACTIVE_FILE)),
3208 K(zone_page_state(zone, NR_UNEVICTABLE)), 3208 K(zone_page_state(zone, NR_UNEVICTABLE)),
3209 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3209 K(zone_page_state(zone, NR_ISOLATED_ANON)),
3210 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3210 K(zone_page_state(zone, NR_ISOLATED_FILE)),
3211 K(zone->present_pages), 3211 K(zone->present_pages),
3212 K(zone->managed_pages), 3212 K(zone->managed_pages),
3213 K(zone_page_state(zone, NR_MLOCK)), 3213 K(zone_page_state(zone, NR_MLOCK)),
3214 K(zone_page_state(zone, NR_FILE_DIRTY)), 3214 K(zone_page_state(zone, NR_FILE_DIRTY)),
3215 K(zone_page_state(zone, NR_WRITEBACK)), 3215 K(zone_page_state(zone, NR_WRITEBACK)),
3216 K(zone_page_state(zone, NR_FILE_MAPPED)), 3216 K(zone_page_state(zone, NR_FILE_MAPPED)),
3217 K(zone_page_state(zone, NR_SHMEM)), 3217 K(zone_page_state(zone, NR_SHMEM)),
3218 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 3218 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
3219 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 3219 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
3220 zone_page_state(zone, NR_KERNEL_STACK) * 3220 zone_page_state(zone, NR_KERNEL_STACK) *
3221 THREAD_SIZE / 1024, 3221 THREAD_SIZE / 1024,
3222 K(zone_page_state(zone, NR_PAGETABLE)), 3222 K(zone_page_state(zone, NR_PAGETABLE)),
3223 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 3223 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
3224 K(zone_page_state(zone, NR_BOUNCE)), 3224 K(zone_page_state(zone, NR_BOUNCE)),
3225 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3225 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3226 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3226 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3227 K(zone_page_state(zone, NR_PAGES_SCANNED)), 3227 K(zone_page_state(zone, NR_PAGES_SCANNED)),
3228 (!zone_reclaimable(zone) ? "yes" : "no") 3228 (!zone_reclaimable(zone) ? "yes" : "no")
3229 ); 3229 );
3230 printk("lowmem_reserve[]:"); 3230 printk("lowmem_reserve[]:");
3231 for (i = 0; i < MAX_NR_ZONES; i++) 3231 for (i = 0; i < MAX_NR_ZONES; i++)
3232 printk(" %ld", zone->lowmem_reserve[i]); 3232 printk(" %ld", zone->lowmem_reserve[i]);
3233 printk("\n"); 3233 printk("\n");
3234 } 3234 }
3235 3235
3236 for_each_populated_zone(zone) { 3236 for_each_populated_zone(zone) {
3237 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3237 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3238 unsigned char types[MAX_ORDER]; 3238 unsigned char types[MAX_ORDER];
3239 3239
3240 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3240 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3241 continue; 3241 continue;
3242 show_node(zone); 3242 show_node(zone);
3243 printk("%s: ", zone->name); 3243 printk("%s: ", zone->name);
3244 3244
3245 spin_lock_irqsave(&zone->lock, flags); 3245 spin_lock_irqsave(&zone->lock, flags);
3246 for (order = 0; order < MAX_ORDER; order++) { 3246 for (order = 0; order < MAX_ORDER; order++) {
3247 struct free_area *area = &zone->free_area[order]; 3247 struct free_area *area = &zone->free_area[order];
3248 int type; 3248 int type;
3249 3249
3250 nr[order] = area->nr_free; 3250 nr[order] = area->nr_free;
3251 total += nr[order] << order; 3251 total += nr[order] << order;
3252 3252
3253 types[order] = 0; 3253 types[order] = 0;
3254 for (type = 0; type < MIGRATE_TYPES; type++) { 3254 for (type = 0; type < MIGRATE_TYPES; type++) {
3255 if (!list_empty(&area->free_list[type])) 3255 if (!list_empty(&area->free_list[type]))
3256 types[order] |= 1 << type; 3256 types[order] |= 1 << type;
3257 } 3257 }
3258 } 3258 }
3259 spin_unlock_irqrestore(&zone->lock, flags); 3259 spin_unlock_irqrestore(&zone->lock, flags);
3260 for (order = 0; order < MAX_ORDER; order++) { 3260 for (order = 0; order < MAX_ORDER; order++) {
3261 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3261 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3262 if (nr[order]) 3262 if (nr[order])
3263 show_migration_types(types[order]); 3263 show_migration_types(types[order]);
3264 } 3264 }
3265 printk("= %lukB\n", K(total)); 3265 printk("= %lukB\n", K(total));
3266 } 3266 }
3267 3267
3268 hugetlb_show_meminfo(); 3268 hugetlb_show_meminfo();
3269 3269
3270 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 3270 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
3271 3271
3272 show_swap_cache_info(); 3272 show_swap_cache_info();
3273 } 3273 }
3274 3274
3275 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 3275 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3276 { 3276 {
3277 zoneref->zone = zone; 3277 zoneref->zone = zone;
3278 zoneref->zone_idx = zone_idx(zone); 3278 zoneref->zone_idx = zone_idx(zone);
3279 } 3279 }
3280 3280
3281 /* 3281 /*
3282 * Builds allocation fallback zone lists. 3282 * Builds allocation fallback zone lists.
3283 * 3283 *
3284 * Add all populated zones of a node to the zonelist. 3284 * Add all populated zones of a node to the zonelist.
3285 */ 3285 */
3286 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 3286 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3287 int nr_zones) 3287 int nr_zones)
3288 { 3288 {
3289 struct zone *zone; 3289 struct zone *zone;
3290 enum zone_type zone_type = MAX_NR_ZONES; 3290 enum zone_type zone_type = MAX_NR_ZONES;
3291 3291
3292 do { 3292 do {
3293 zone_type--; 3293 zone_type--;
3294 zone = pgdat->node_zones + zone_type; 3294 zone = pgdat->node_zones + zone_type;
3295 if (populated_zone(zone)) { 3295 if (populated_zone(zone)) {
3296 zoneref_set_zone(zone, 3296 zoneref_set_zone(zone,
3297 &zonelist->_zonerefs[nr_zones++]); 3297 &zonelist->_zonerefs[nr_zones++]);
3298 check_highest_zone(zone_type); 3298 check_highest_zone(zone_type);
3299 } 3299 }
3300 } while (zone_type); 3300 } while (zone_type);
3301 3301
3302 return nr_zones; 3302 return nr_zones;
3303 } 3303 }
3304 3304
3305 3305
3306 /* 3306 /*
3307 * zonelist_order: 3307 * zonelist_order:
3308 * 0 = automatic detection of better ordering. 3308 * 0 = automatic detection of better ordering.
3309 * 1 = order by ([node] distance, -zonetype) 3309 * 1 = order by ([node] distance, -zonetype)
3310 * 2 = order by (-zonetype, [node] distance) 3310 * 2 = order by (-zonetype, [node] distance)
3311 * 3311 *
3312 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 3312 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
3313 * the same zonelist. So only NUMA can configure this param. 3313 * the same zonelist. So only NUMA can configure this param.
3314 */ 3314 */
3315 #define ZONELIST_ORDER_DEFAULT 0 3315 #define ZONELIST_ORDER_DEFAULT 0
3316 #define ZONELIST_ORDER_NODE 1 3316 #define ZONELIST_ORDER_NODE 1
3317 #define ZONELIST_ORDER_ZONE 2 3317 #define ZONELIST_ORDER_ZONE 2
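The difference between the two orderings is easiest to see on a concrete layout. The sketch below assumes a hypothetical two-node machine where only node 0 has a DMA32 zone: node order exhausts all of node 0, including its low zone, before spilling to node 1, while zone order keeps the low zone last even though it is local.

    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical layout: node 0 = { Normal, DMA32 }, node 1 = { Normal }. */
        const char *node_order[] = { "Node0/Normal", "Node0/DMA32", "Node1/Normal" };
        const char *zone_order[] = { "Node0/Normal", "Node1/Normal", "Node0/DMA32" };

        printf("numa_zonelist_order=node: ");
        for (unsigned i = 0; i < 3; i++)
            printf("%s ", node_order[i]);
        printf("\nnuma_zonelist_order=zone: ");
        for (unsigned i = 0; i < 3; i++)
            printf("%s ", zone_order[i]);
        printf("\n");
        return 0;
    }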
3318 3318
3319 /* zonelist order in the kernel. 3319 /* zonelist order in the kernel.
3320 * set_zonelist_order() will set this to NODE or ZONE. 3320 * set_zonelist_order() will set this to NODE or ZONE.
3321 */ 3321 */
3322 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 3322 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
3323 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 3323 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
3324 3324
3325 3325
3326 #ifdef CONFIG_NUMA 3326 #ifdef CONFIG_NUMA
3327 /* The value the user specified; may be changed by config */ 3327 /* The value the user specified; may be changed by config */
3328 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3328 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3329 /* string for sysctl */ 3329 /* string for sysctl */
3330 #define NUMA_ZONELIST_ORDER_LEN 16 3330 #define NUMA_ZONELIST_ORDER_LEN 16
3331 char numa_zonelist_order[16] = "default"; 3331 char numa_zonelist_order[16] = "default";
3332 3332
3333 /* 3333 /*
3334 * interface for configuring zonelist ordering. 3334 * interface for configuring zonelist ordering.
3335 * command line option "numa_zonelist_order" 3335 * command line option "numa_zonelist_order"
3336 * = "[dD]efault - default, automatic configuration. 3336 * = "[dD]efault - default, automatic configuration.
3337 * = "[nN]ode - order by node locality, then by zone within node 3337 * = "[nN]ode - order by node locality, then by zone within node
3338 * = "[zZ]one - order by zone, then by locality within zone 3338 * = "[zZ]one - order by zone, then by locality within zone
3339 */ 3339 */
3340 3340
3341 static int __parse_numa_zonelist_order(char *s) 3341 static int __parse_numa_zonelist_order(char *s)
3342 { 3342 {
3343 if (*s == 'd' || *s == 'D') { 3343 if (*s == 'd' || *s == 'D') {
3344 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3344 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3345 } else if (*s == 'n' || *s == 'N') { 3345 } else if (*s == 'n' || *s == 'N') {
3346 user_zonelist_order = ZONELIST_ORDER_NODE; 3346 user_zonelist_order = ZONELIST_ORDER_NODE;
3347 } else if (*s == 'z' || *s == 'Z') { 3347 } else if (*s == 'z' || *s == 'Z') {
3348 user_zonelist_order = ZONELIST_ORDER_ZONE; 3348 user_zonelist_order = ZONELIST_ORDER_ZONE;
3349 } else { 3349 } else {
3350 printk(KERN_WARNING 3350 printk(KERN_WARNING
3351 "Ignoring invalid numa_zonelist_order value: " 3351 "Ignoring invalid numa_zonelist_order value: "
3352 "%s\n", s); 3352 "%s\n", s);
3353 return -EINVAL; 3353 return -EINVAL;
3354 } 3354 }
3355 return 0; 3355 return 0;
3356 } 3356 }
3357 3357
3358 static __init int setup_numa_zonelist_order(char *s) 3358 static __init int setup_numa_zonelist_order(char *s)
3359 { 3359 {
3360 int ret; 3360 int ret;
3361 3361
3362 if (!s) 3362 if (!s)
3363 return 0; 3363 return 0;
3364 3364
3365 ret = __parse_numa_zonelist_order(s); 3365 ret = __parse_numa_zonelist_order(s);
3366 if (ret == 0) 3366 if (ret == 0)
3367 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 3367 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3368 3368
3369 return ret; 3369 return ret;
3370 } 3370 }
3371 early_param("numa_zonelist_order", setup_numa_zonelist_order); 3371 early_param("numa_zonelist_order", setup_numa_zonelist_order);
3372 3372
3373 /* 3373 /*
3374 * sysctl handler for numa_zonelist_order 3374 * sysctl handler for numa_zonelist_order
3375 */ 3375 */
3376 int numa_zonelist_order_handler(ctl_table *table, int write, 3376 int numa_zonelist_order_handler(ctl_table *table, int write,
3377 void __user *buffer, size_t *length, 3377 void __user *buffer, size_t *length,
3378 loff_t *ppos) 3378 loff_t *ppos)
3379 { 3379 {
3380 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 3380 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3381 int ret; 3381 int ret;
3382 static DEFINE_MUTEX(zl_order_mutex); 3382 static DEFINE_MUTEX(zl_order_mutex);
3383 3383
3384 mutex_lock(&zl_order_mutex); 3384 mutex_lock(&zl_order_mutex);
3385 if (write) { 3385 if (write) {
3386 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { 3386 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
3387 ret = -EINVAL; 3387 ret = -EINVAL;
3388 goto out; 3388 goto out;
3389 } 3389 }
3390 strcpy(saved_string, (char *)table->data); 3390 strcpy(saved_string, (char *)table->data);
3391 } 3391 }
3392 ret = proc_dostring(table, write, buffer, length, ppos); 3392 ret = proc_dostring(table, write, buffer, length, ppos);
3393 if (ret) 3393 if (ret)
3394 goto out; 3394 goto out;
3395 if (write) { 3395 if (write) {
3396 int oldval = user_zonelist_order; 3396 int oldval = user_zonelist_order;
3397 3397
3398 ret = __parse_numa_zonelist_order((char *)table->data); 3398 ret = __parse_numa_zonelist_order((char *)table->data);
3399 if (ret) { 3399 if (ret) {
3400 /* 3400 /*
3401 * bogus value. restore saved string 3401 * bogus value. restore saved string
3402 */ 3402 */
3403 strncpy((char *)table->data, saved_string, 3403 strncpy((char *)table->data, saved_string,
3404 NUMA_ZONELIST_ORDER_LEN); 3404 NUMA_ZONELIST_ORDER_LEN);
3405 user_zonelist_order = oldval; 3405 user_zonelist_order = oldval;
3406 } else if (oldval != user_zonelist_order) { 3406 } else if (oldval != user_zonelist_order) {
3407 mutex_lock(&zonelists_mutex); 3407 mutex_lock(&zonelists_mutex);
3408 build_all_zonelists(NULL, NULL); 3408 build_all_zonelists(NULL, NULL);
3409 mutex_unlock(&zonelists_mutex); 3409 mutex_unlock(&zonelists_mutex);
3410 } 3410 }
3411 } 3411 }
3412 out: 3412 out:
3413 mutex_unlock(&zl_order_mutex); 3413 mutex_unlock(&zl_order_mutex);
3414 return ret; 3414 return ret;
3415 } 3415 }
3416 3416
3417 3417
3418 #define MAX_NODE_LOAD (nr_online_nodes) 3418 #define MAX_NODE_LOAD (nr_online_nodes)
3419 static int node_load[MAX_NUMNODES]; 3419 static int node_load[MAX_NUMNODES];
3420 3420
3421 /** 3421 /**
3422 * find_next_best_node - find the next node that should appear in a given node's fallback list 3422 * find_next_best_node - find the next node that should appear in a given node's fallback list
3423 * @node: node whose fallback list we're appending 3423 * @node: node whose fallback list we're appending
3424 * @used_node_mask: nodemask_t of already used nodes 3424 * @used_node_mask: nodemask_t of already used nodes
3425 * 3425 *
3426 * We use a number of factors to determine which is the next node that should 3426 * We use a number of factors to determine which is the next node that should
3427 * appear on a given node's fallback list. The node should not have appeared 3427 * appear on a given node's fallback list. The node should not have appeared
3428 * already in @node's fallback list, and it should be the next closest node 3428 * already in @node's fallback list, and it should be the next closest node
3429 * according to the distance array (which contains arbitrary distance values 3429 * according to the distance array (which contains arbitrary distance values
3430 * from each node to each node in the system), and should also prefer nodes 3430 * from each node to each node in the system), and should also prefer nodes
3431 * with no CPUs, since presumably they'll have very little allocation pressure 3431 * with no CPUs, since presumably they'll have very little allocation pressure
3432 * on them otherwise. 3432 * on them otherwise.
3433 * It returns -1 if no node is found. 3433 * It returns -1 if no node is found.
3434 */ 3434 */
3435 static int find_next_best_node(int node, nodemask_t *used_node_mask) 3435 static int find_next_best_node(int node, nodemask_t *used_node_mask)
3436 { 3436 {
3437 int n, val; 3437 int n, val;
3438 int min_val = INT_MAX; 3438 int min_val = INT_MAX;
3439 int best_node = NUMA_NO_NODE; 3439 int best_node = NUMA_NO_NODE;
3440 const struct cpumask *tmp = cpumask_of_node(0); 3440 const struct cpumask *tmp = cpumask_of_node(0);
3441 3441
3442 /* Use the local node if we haven't already */ 3442 /* Use the local node if we haven't already */
3443 if (!node_isset(node, *used_node_mask)) { 3443 if (!node_isset(node, *used_node_mask)) {
3444 node_set(node, *used_node_mask); 3444 node_set(node, *used_node_mask);
3445 return node; 3445 return node;
3446 } 3446 }
3447 3447
3448 for_each_node_state(n, N_MEMORY) { 3448 for_each_node_state(n, N_MEMORY) {
3449 3449
3450 /* Don't want a node to appear more than once */ 3450 /* Don't want a node to appear more than once */
3451 if (node_isset(n, *used_node_mask)) 3451 if (node_isset(n, *used_node_mask))
3452 continue; 3452 continue;
3453 3453
3454 /* Use the distance array to find the distance */ 3454 /* Use the distance array to find the distance */
3455 val = node_distance(node, n); 3455 val = node_distance(node, n);
3456 3456
3457 /* Penalize nodes under us ("prefer the next node") */ 3457 /* Penalize nodes under us ("prefer the next node") */
3458 val += (n < node); 3458 val += (n < node);
3459 3459
3460 /* Give preference to headless and unused nodes */ 3460 /* Give preference to headless and unused nodes */
3461 tmp = cpumask_of_node(n); 3461 tmp = cpumask_of_node(n);
3462 if (!cpumask_empty(tmp)) 3462 if (!cpumask_empty(tmp))
3463 val += PENALTY_FOR_NODE_WITH_CPUS; 3463 val += PENALTY_FOR_NODE_WITH_CPUS;
3464 3464
3465 /* Slight preference for less loaded node */ 3465 /* Slight preference for less loaded node */
3466 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 3466 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3467 val += node_load[n]; 3467 val += node_load[n];
3468 3468
3469 if (val < min_val) { 3469 if (val < min_val) {
3470 min_val = val; 3470 min_val = val;
3471 best_node = n; 3471 best_node = n;
3472 } 3472 }
3473 } 3473 }
3474 3474
3475 if (best_node >= 0) 3475 if (best_node >= 0)
3476 node_set(best_node, *used_node_mask); 3476 node_set(best_node, *used_node_mask);
3477 3477
3478 return best_node; 3478 return best_node;
3479 } 3479 }
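find_next_best_node() turns each candidate into a score: the node distance, plus one if the candidate is numerically below the current node, plus a penalty if it has CPUs, scaled so that node_load only breaks ties between otherwise equal candidates; the lowest score wins. A small userspace simulation with made-up distances and penalty values (SCALE and CPU_PENALTY stand in for the kernel constants):

    #include <stdio.h>
    #include <limits.h>

    #define NR_NODES    3
    #define CPU_PENALTY 1                /* stand-in for PENALTY_FOR_NODE_WITH_CPUS */
    #define SCALE       (NR_NODES * NR_NODES)

    int main(void)
    {
        /* Hypothetical SLIT: distance[from][to]. */
        int distance[NR_NODES][NR_NODES] = {
            { 10, 20, 20 },
            { 20, 10, 20 },
            { 20, 20, 10 },
        };
        int has_cpus[NR_NODES]  = { 1, 1, 0 };   /* node 2 is memory-only */
        int node_load[NR_NODES] = { 0, 0, 0 };
        int from = 0, best = -1, min_val = INT_MAX;

        for (int n = 0; n < NR_NODES; n++) {
            if (n == from)                 /* "already used" local node */
                continue;
            int val = distance[from][n];
            val += (n < from);             /* prefer the next node, not earlier ones */
            if (has_cpus[n])
                val += CPU_PENALTY;        /* prefer headless nodes */
            val = val * SCALE + node_load[n];
            if (val < min_val) {
                min_val = val;
                best = n;
            }
        }
        printf("next best node after %d: %d\n", from, best);   /* node 2 wins */
        return 0;
    }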
3480 3480
3481 3481
3482 /* 3482 /*
3483 * Build zonelists ordered by node and zones within node. 3483 * Build zonelists ordered by node and zones within node.
3484 * This results in maximum locality--normal zone overflows into local 3484 * This results in maximum locality--normal zone overflows into local
3485 * DMA zone, if any--but risks exhausting DMA zone. 3485 * DMA zone, if any--but risks exhausting DMA zone.
3486 */ 3486 */
3487 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 3487 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3488 { 3488 {
3489 int j; 3489 int j;
3490 struct zonelist *zonelist; 3490 struct zonelist *zonelist;
3491 3491
3492 zonelist = &pgdat->node_zonelists[0]; 3492 zonelist = &pgdat->node_zonelists[0];
3493 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3493 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3494 ; 3494 ;
3495 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3495 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3496 zonelist->_zonerefs[j].zone = NULL; 3496 zonelist->_zonerefs[j].zone = NULL;
3497 zonelist->_zonerefs[j].zone_idx = 0; 3497 zonelist->_zonerefs[j].zone_idx = 0;
3498 } 3498 }
3499 3499
3500 /* 3500 /*
3501 * Build gfp_thisnode zonelists 3501 * Build gfp_thisnode zonelists
3502 */ 3502 */
3503 static void build_thisnode_zonelists(pg_data_t *pgdat) 3503 static void build_thisnode_zonelists(pg_data_t *pgdat)
3504 { 3504 {
3505 int j; 3505 int j;
3506 struct zonelist *zonelist; 3506 struct zonelist *zonelist;
3507 3507
3508 zonelist = &pgdat->node_zonelists[1]; 3508 zonelist = &pgdat->node_zonelists[1];
3509 j = build_zonelists_node(pgdat, zonelist, 0); 3509 j = build_zonelists_node(pgdat, zonelist, 0);
3510 zonelist->_zonerefs[j].zone = NULL; 3510 zonelist->_zonerefs[j].zone = NULL;
3511 zonelist->_zonerefs[j].zone_idx = 0; 3511 zonelist->_zonerefs[j].zone_idx = 0;
3512 } 3512 }
3513 3513
3514 /* 3514 /*
3515 * Build zonelists ordered by zone and nodes within zones. 3515 * Build zonelists ordered by zone and nodes within zones.
3516 * This results in conserving DMA zone[s] until all Normal memory is 3516 * This results in conserving DMA zone[s] until all Normal memory is
3517 * exhausted, but results in overflowing to remote node while memory 3517 * exhausted, but results in overflowing to remote node while memory
3518 * may still exist in local DMA zone. 3518 * may still exist in local DMA zone.
3519 */ 3519 */
3520 static int node_order[MAX_NUMNODES]; 3520 static int node_order[MAX_NUMNODES];
3521 3521
3522 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 3522 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3523 { 3523 {
3524 int pos, j, node; 3524 int pos, j, node;
3525 int zone_type; /* needs to be signed */ 3525 int zone_type; /* needs to be signed */
3526 struct zone *z; 3526 struct zone *z;
3527 struct zonelist *zonelist; 3527 struct zonelist *zonelist;
3528 3528
3529 zonelist = &pgdat->node_zonelists[0]; 3529 zonelist = &pgdat->node_zonelists[0];
3530 pos = 0; 3530 pos = 0;
3531 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 3531 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3532 for (j = 0; j < nr_nodes; j++) { 3532 for (j = 0; j < nr_nodes; j++) {
3533 node = node_order[j]; 3533 node = node_order[j];
3534 z = &NODE_DATA(node)->node_zones[zone_type]; 3534 z = &NODE_DATA(node)->node_zones[zone_type];
3535 if (populated_zone(z)) { 3535 if (populated_zone(z)) {
3536 zoneref_set_zone(z, 3536 zoneref_set_zone(z,
3537 &zonelist->_zonerefs[pos++]); 3537 &zonelist->_zonerefs[pos++]);
3538 check_highest_zone(zone_type); 3538 check_highest_zone(zone_type);
3539 } 3539 }
3540 } 3540 }
3541 } 3541 }
3542 zonelist->_zonerefs[pos].zone = NULL; 3542 zonelist->_zonerefs[pos].zone = NULL;
3543 zonelist->_zonerefs[pos].zone_idx = 0; 3543 zonelist->_zonerefs[pos].zone_idx = 0;
3544 } 3544 }
3545 3545
3546 static int default_zonelist_order(void) 3546 static int default_zonelist_order(void)
3547 { 3547 {
3548 int nid, zone_type; 3548 int nid, zone_type;
3549 unsigned long low_kmem_size, total_size; 3549 unsigned long low_kmem_size, total_size;
3550 struct zone *z; 3550 struct zone *z;
3551 int average_size; 3551 int average_size;
3552 /* 3552 /*
3553 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system. 3553 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system.
3554 * If they are really small and used heavily, the system can fall 3554 * If they are really small and used heavily, the system can fall
3555 * into OOM very easily. 3555 * into OOM very easily.
3556 * This function detects the ZONE_DMA/DMA32 size and configures the zone order. 3556 * This function detects the ZONE_DMA/DMA32 size and configures the zone order.
3557 */ 3557 */
3558 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ 3558 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
3559 low_kmem_size = 0; 3559 low_kmem_size = 0;
3560 total_size = 0; 3560 total_size = 0;
3561 for_each_online_node(nid) { 3561 for_each_online_node(nid) {
3562 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3562 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3563 z = &NODE_DATA(nid)->node_zones[zone_type]; 3563 z = &NODE_DATA(nid)->node_zones[zone_type];
3564 if (populated_zone(z)) { 3564 if (populated_zone(z)) {
3565 if (zone_type < ZONE_NORMAL) 3565 if (zone_type < ZONE_NORMAL)
3566 low_kmem_size += z->managed_pages; 3566 low_kmem_size += z->managed_pages;
3567 total_size += z->managed_pages; 3567 total_size += z->managed_pages;
3568 } else if (zone_type == ZONE_NORMAL) { 3568 } else if (zone_type == ZONE_NORMAL) {
3569 /* 3569 /*
3570 * If any node has only lowmem, then node order 3570 * If any node has only lowmem, then node order
3571 * is preferred to allow kernel allocations 3571 * is preferred to allow kernel allocations
3572 * locally; otherwise, they can easily infringe 3572 * locally; otherwise, they can easily infringe
3573 * on other nodes when there is an abundance of 3573 * on other nodes when there is an abundance of
3574 * lowmem available to allocate from. 3574 * lowmem available to allocate from.
3575 */ 3575 */
3576 return ZONELIST_ORDER_NODE; 3576 return ZONELIST_ORDER_NODE;
3577 } 3577 }
3578 } 3578 }
3579 } 3579 }
3580 if (!low_kmem_size || /* there is no DMA area. */ 3580 if (!low_kmem_size || /* there is no DMA area. */
3581 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 3581 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3582 return ZONELIST_ORDER_NODE; 3582 return ZONELIST_ORDER_NODE;
3583 /* 3583 /*
3584 * look into each node's config. 3584 * look into each node's config.
3585 * If there is a node whose DMA/DMA32 memory is a very big share of 3585 * If there is a node whose DMA/DMA32 memory is a very big share of
3586 * its local memory, NODE_ORDER may be suitable. 3586 * its local memory, NODE_ORDER may be suitable.
3587 */ 3587 */
3588 average_size = total_size / 3588 average_size = total_size /
3589 (nodes_weight(node_states[N_MEMORY]) + 1); 3589 (nodes_weight(node_states[N_MEMORY]) + 1);
3590 for_each_online_node(nid) { 3590 for_each_online_node(nid) {
3591 low_kmem_size = 0; 3591 low_kmem_size = 0;
3592 total_size = 0; 3592 total_size = 0;
3593 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3593 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3594 z = &NODE_DATA(nid)->node_zones[zone_type]; 3594 z = &NODE_DATA(nid)->node_zones[zone_type];
3595 if (populated_zone(z)) { 3595 if (populated_zone(z)) {
3596 if (zone_type < ZONE_NORMAL) 3596 if (zone_type < ZONE_NORMAL)
3597 low_kmem_size += z->present_pages; 3597 low_kmem_size += z->present_pages;
3598 total_size += z->present_pages; 3598 total_size += z->present_pages;
3599 } 3599 }
3600 } 3600 }
3601 if (low_kmem_size && 3601 if (low_kmem_size &&
3602 total_size > average_size && /* ignore small node */ 3602 total_size > average_size && /* ignore small node */
3603 low_kmem_size > total_size * 70/100) 3603 low_kmem_size > total_size * 70/100)
3604 return ZONELIST_ORDER_NODE; 3604 return ZONELIST_ORDER_NODE;
3605 } 3605 }
3606 return ZONELIST_ORDER_ZONE; 3606 return ZONELIST_ORDER_ZONE;
3607 } 3607 }
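default_zonelist_order() reduces to two ratio checks: if lowmem (DMA/DMA32) is absent or makes up more than half of all memory, node order is chosen; otherwise, if any sufficiently large node is more than 70% lowmem, node order is chosen as well; only then does it fall back to zone order. A worked example of those checks with made-up sizes:

    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical machine: pages of lowmem (DMA/DMA32) vs total, per node. */
        unsigned long low[]   = { 1048576,  0 };        /* node 0 has 4 GiB DMA32 */
        unsigned long total[] = { 16777216, 16777216 }; /* 64 GiB per node */
        unsigned long low_sum = low[0] + low[1];
        unsigned long total_sum = total[0] + total[1];
        unsigned long average = total_sum / (2 + 1);
        const char *order = "zone";

        if (!low_sum || low_sum > total_sum / 2) {
            order = "node";
        } else {
            for (int nid = 0; nid < 2; nid++) {
                /* A big node that is mostly lowmem wants node order. */
                if (low[nid] && total[nid] > average &&
                    low[nid] > total[nid] * 70 / 100) {
                    order = "node";
                    break;
                }
            }
        }
        printf("default numa_zonelist_order: %s\n", order);    /* "zone" here */
        return 0;
    }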
3608 3608
3609 static void set_zonelist_order(void) 3609 static void set_zonelist_order(void)
3610 { 3610 {
3611 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 3611 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3612 current_zonelist_order = default_zonelist_order(); 3612 current_zonelist_order = default_zonelist_order();
3613 else 3613 else
3614 current_zonelist_order = user_zonelist_order; 3614 current_zonelist_order = user_zonelist_order;
3615 } 3615 }
3616 3616
3617 static void build_zonelists(pg_data_t *pgdat) 3617 static void build_zonelists(pg_data_t *pgdat)
3618 { 3618 {
3619 int j, node, load; 3619 int j, node, load;
3620 enum zone_type i; 3620 enum zone_type i;
3621 nodemask_t used_mask; 3621 nodemask_t used_mask;
3622 int local_node, prev_node; 3622 int local_node, prev_node;
3623 struct zonelist *zonelist; 3623 struct zonelist *zonelist;
3624 int order = current_zonelist_order; 3624 int order = current_zonelist_order;
3625 3625
3626 /* initialize zonelists */ 3626 /* initialize zonelists */
3627 for (i = 0; i < MAX_ZONELISTS; i++) { 3627 for (i = 0; i < MAX_ZONELISTS; i++) {
3628 zonelist = pgdat->node_zonelists + i; 3628 zonelist = pgdat->node_zonelists + i;
3629 zonelist->_zonerefs[0].zone = NULL; 3629 zonelist->_zonerefs[0].zone = NULL;
3630 zonelist->_zonerefs[0].zone_idx = 0; 3630 zonelist->_zonerefs[0].zone_idx = 0;
3631 } 3631 }
3632 3632
3633 /* NUMA-aware ordering of nodes */ 3633 /* NUMA-aware ordering of nodes */
3634 local_node = pgdat->node_id; 3634 local_node = pgdat->node_id;
3635 load = nr_online_nodes; 3635 load = nr_online_nodes;
3636 prev_node = local_node; 3636 prev_node = local_node;
3637 nodes_clear(used_mask); 3637 nodes_clear(used_mask);
3638 3638
3639 memset(node_order, 0, sizeof(node_order)); 3639 memset(node_order, 0, sizeof(node_order));
3640 j = 0; 3640 j = 0;
3641 3641
3642 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3642 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3643 /* 3643 /*
3644 * We don't want to pressure a particular node. 3644 * We don't want to pressure a particular node.
3645 * So we add a penalty to the first node in the same 3645 * So we add a penalty to the first node in the same
3646 * distance group to make the ordering round-robin. 3646 * distance group to make the ordering round-robin.
3647 */ 3647 */
3648 if (node_distance(local_node, node) != 3648 if (node_distance(local_node, node) !=
3649 node_distance(local_node, prev_node)) 3649 node_distance(local_node, prev_node))
3650 node_load[node] = load; 3650 node_load[node] = load;
3651 3651
3652 prev_node = node; 3652 prev_node = node;
3653 load--; 3653 load--;
3654 if (order == ZONELIST_ORDER_NODE) 3654 if (order == ZONELIST_ORDER_NODE)
3655 build_zonelists_in_node_order(pgdat, node); 3655 build_zonelists_in_node_order(pgdat, node);
3656 else 3656 else
3657 node_order[j++] = node; /* remember order */ 3657 node_order[j++] = node; /* remember order */
3658 } 3658 }
3659 3659
3660 if (order == ZONELIST_ORDER_ZONE) { 3660 if (order == ZONELIST_ORDER_ZONE) {
3661 /* calculate node order -- i.e., DMA last! */ 3661 /* calculate node order -- i.e., DMA last! */
3662 build_zonelists_in_zone_order(pgdat, j); 3662 build_zonelists_in_zone_order(pgdat, j);
3663 } 3663 }
3664 3664
3665 build_thisnode_zonelists(pgdat); 3665 build_thisnode_zonelists(pgdat);
3666 } 3666 }
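Editor's note: the node_load penalty in the loop above is what keeps equidistant nodes from all landing in the same position of every zonelist. The following userspace sketch replays just that penalty rule on a made-up distance table and visit order (both hypothetical, as is the choice of local node 0), to show that only the first node of each new distance group is charged:

#include <stdio.h>

/* Illustrative sketch, not kernel code: replay the penalty rule from
 * build_zonelists() above on a hypothetical node ordering. */
#define NR_NODES 4

static const int distance[NR_NODES][NR_NODES] = {
        { 10, 20, 20, 40 },
        { 20, 10, 40, 20 },
        { 20, 40, 10, 20 },
        { 40, 20, 20, 10 },
};

int main(void)
{
        /* pretend find_next_best_node() returned nodes in this order
         * for local node 0 */
        const int best_order[NR_NODES] = { 0, 1, 2, 3 };
        int node_load[NR_NODES] = { 0 };
        int local_node = 0, prev_node = 0, load = NR_NODES;

        for (int i = 0; i < NR_NODES; i++) {
                int node = best_order[i];

                /* same rule as above: the first node of each new
                 * distance group takes the penalty */
                if (distance[local_node][node] !=
                    distance[local_node][prev_node])
                        node_load[node] = load;
                prev_node = node;
                load--;
        }

        for (int n = 0; n < NR_NODES; n++)
                printf("node %d: load penalty %d\n", n, node_load[n]);
        return 0;
}

Compiled and run, this prints a non-zero penalty only for nodes 1 and 3, the first members of the distance-20 and distance-40 groups.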
3667 3667
3668 /* Construct the zonelist performance cache - see further mmzone.h */ 3668 /* Construct the zonelist performance cache - see further mmzone.h */
3669 static void build_zonelist_cache(pg_data_t *pgdat) 3669 static void build_zonelist_cache(pg_data_t *pgdat)
3670 { 3670 {
3671 struct zonelist *zonelist; 3671 struct zonelist *zonelist;
3672 struct zonelist_cache *zlc; 3672 struct zonelist_cache *zlc;
3673 struct zoneref *z; 3673 struct zoneref *z;
3674 3674
3675 zonelist = &pgdat->node_zonelists[0]; 3675 zonelist = &pgdat->node_zonelists[0];
3676 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 3676 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3677 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 3677 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3678 for (z = zonelist->_zonerefs; z->zone; z++) 3678 for (z = zonelist->_zonerefs; z->zone; z++)
3679 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 3679 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3680 } 3680 }
3681 3681
3682 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3682 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3683 /* 3683 /*
3684 * Return node id of node used for "local" allocations. 3684 * Return node id of node used for "local" allocations.
3685 * I.e., first node id of first zone in arg node's generic zonelist. 3685 * I.e., first node id of first zone in arg node's generic zonelist.
3686 * Used for initializing percpu 'numa_mem', which is used primarily 3686 * Used for initializing percpu 'numa_mem', which is used primarily
3687 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3687 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3688 */ 3688 */
3689 int local_memory_node(int node) 3689 int local_memory_node(int node)
3690 { 3690 {
3691 struct zone *zone; 3691 struct zone *zone;
3692 3692
3693 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3693 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3694 gfp_zone(GFP_KERNEL), 3694 gfp_zone(GFP_KERNEL),
3695 NULL, 3695 NULL,
3696 &zone); 3696 &zone);
3697 return zone->node; 3697 return zone->node;
3698 } 3698 }
3699 #endif 3699 #endif
3700 3700
3701 #else /* CONFIG_NUMA */ 3701 #else /* CONFIG_NUMA */
3702 3702
3703 static void set_zonelist_order(void) 3703 static void set_zonelist_order(void)
3704 { 3704 {
3705 current_zonelist_order = ZONELIST_ORDER_ZONE; 3705 current_zonelist_order = ZONELIST_ORDER_ZONE;
3706 } 3706 }
3707 3707
3708 static void build_zonelists(pg_data_t *pgdat) 3708 static void build_zonelists(pg_data_t *pgdat)
3709 { 3709 {
3710 int node, local_node; 3710 int node, local_node;
3711 enum zone_type j; 3711 enum zone_type j;
3712 struct zonelist *zonelist; 3712 struct zonelist *zonelist;
3713 3713
3714 local_node = pgdat->node_id; 3714 local_node = pgdat->node_id;
3715 3715
3716 zonelist = &pgdat->node_zonelists[0]; 3716 zonelist = &pgdat->node_zonelists[0];
3717 j = build_zonelists_node(pgdat, zonelist, 0); 3717 j = build_zonelists_node(pgdat, zonelist, 0);
3718 3718
3719 /* 3719 /*
3720 * Now we build the zonelist so that it contains the zones 3720 * Now we build the zonelist so that it contains the zones
3721 * of all the other nodes. 3721 * of all the other nodes.
3722 * We don't want to pressure a particular node, so when 3722 * We don't want to pressure a particular node, so when
3723 * building the zones for node N, we make sure that the 3723 * building the zones for node N, we make sure that the
3724 * zones coming right after the local ones are those from 3724 * zones coming right after the local ones are those from
3725 * node N+1 (wrapping around past the last node) 3725 * node N+1 (wrapping around past the last node)
3726 */ 3726 */
3727 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3727 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3728 if (!node_online(node)) 3728 if (!node_online(node))
3729 continue; 3729 continue;
3730 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3730 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3731 } 3731 }
3732 for (node = 0; node < local_node; node++) { 3732 for (node = 0; node < local_node; node++) {
3733 if (!node_online(node)) 3733 if (!node_online(node))
3734 continue; 3734 continue;
3735 j = build_zonelists_node(NODE_DATA(node), zonelist, j); 3735 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3736 } 3736 }
3737 3737
3738 zonelist->_zonerefs[j].zone = NULL; 3738 zonelist->_zonerefs[j].zone = NULL;
3739 zonelist->_zonerefs[j].zone_idx = 0; 3739 zonelist->_zonerefs[j].zone_idx = 0;
3740 } 3740 }
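Editor's note: for the non-NUMA build above, the two loops visit the remaining nodes in a simple wrapped order starting just after the local node. A tiny sketch of that visiting order, assuming four possible nodes that are all online (the node count and local node id are made up):

#include <stdio.h>

/* Sketch only: print the order in which the two loops above would
 * visit remote nodes for a given local node. */
#define MAX_NUMNODES 4

int main(void)
{
        int local_node = 2;

        printf("local node %d, remote order:", local_node);
        for (int node = local_node + 1; node < MAX_NUMNODES; node++)
                printf(" %d", node);
        for (int node = 0; node < local_node; node++)
                printf(" %d", node);
        printf("\n");   /* -> local node 2, remote order: 3 0 1 */
        return 0;
}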
3741 3741
3742 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3742 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3743 static void build_zonelist_cache(pg_data_t *pgdat) 3743 static void build_zonelist_cache(pg_data_t *pgdat)
3744 { 3744 {
3745 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3745 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3746 } 3746 }
3747 3747
3748 #endif /* CONFIG_NUMA */ 3748 #endif /* CONFIG_NUMA */
3749 3749
3750 /* 3750 /*
3751 * Boot pageset table. One per cpu which is going to be used for all 3751 * Boot pageset table. One per cpu which is going to be used for all
3752 * zones and all nodes. The parameters will be set in such a way 3752 * zones and all nodes. The parameters will be set in such a way
3753 * that an item put on a list will immediately be handed over to 3753 * that an item put on a list will immediately be handed over to
3754 * the buddy list. This is safe since pageset manipulation is done 3754 * the buddy list. This is safe since pageset manipulation is done
3755 * with interrupts disabled. 3755 * with interrupts disabled.
3756 * 3756 *
3757 * The boot_pagesets must be kept even after bootup is complete for 3757 * The boot_pagesets must be kept even after bootup is complete for
3758 * unused processors and/or zones. They do play a role for bootstrapping 3758 * unused processors and/or zones. They do play a role for bootstrapping
3759 * hotplugged processors. 3759 * hotplugged processors.
3760 * 3760 *
3761 * zoneinfo_show() and maybe other functions do 3761 * zoneinfo_show() and maybe other functions do
3762 * not check if the processor is online before following the pageset pointer. 3762 * not check if the processor is online before following the pageset pointer.
3763 * Other parts of the kernel may not check if the zone is available. 3763 * Other parts of the kernel may not check if the zone is available.
3764 */ 3764 */
3765 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3765 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3766 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3766 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3767 static void setup_zone_pageset(struct zone *zone); 3767 static void setup_zone_pageset(struct zone *zone);
3768 3768
3769 /* 3769 /*
3770 * Global mutex to protect against size modification of zonelists 3770 * Global mutex to protect against size modification of zonelists
3771 * as well as to serialize pageset setup for the new populated zone. 3771 * as well as to serialize pageset setup for the new populated zone.
3772 */ 3772 */
3773 DEFINE_MUTEX(zonelists_mutex); 3773 DEFINE_MUTEX(zonelists_mutex);
3774 3774
3775 /* the return value is an int only to match stop_machine() */ 3775 /* the return value is an int only to match stop_machine() */
3776 static int __build_all_zonelists(void *data) 3776 static int __build_all_zonelists(void *data)
3777 { 3777 {
3778 int nid; 3778 int nid;
3779 int cpu; 3779 int cpu;
3780 pg_data_t *self = data; 3780 pg_data_t *self = data;
3781 3781
3782 #ifdef CONFIG_NUMA 3782 #ifdef CONFIG_NUMA
3783 memset(node_load, 0, sizeof(node_load)); 3783 memset(node_load, 0, sizeof(node_load));
3784 #endif 3784 #endif
3785 3785
3786 if (self && !node_online(self->node_id)) { 3786 if (self && !node_online(self->node_id)) {
3787 build_zonelists(self); 3787 build_zonelists(self);
3788 build_zonelist_cache(self); 3788 build_zonelist_cache(self);
3789 } 3789 }
3790 3790
3791 for_each_online_node(nid) { 3791 for_each_online_node(nid) {
3792 pg_data_t *pgdat = NODE_DATA(nid); 3792 pg_data_t *pgdat = NODE_DATA(nid);
3793 3793
3794 build_zonelists(pgdat); 3794 build_zonelists(pgdat);
3795 build_zonelist_cache(pgdat); 3795 build_zonelist_cache(pgdat);
3796 } 3796 }
3797 3797
3798 /* 3798 /*
3799 * Initialize the boot_pagesets that are going to be used 3799 * Initialize the boot_pagesets that are going to be used
3800 * for bootstrapping processors. The real pagesets for 3800 * for bootstrapping processors. The real pagesets for
3801 * each zone will be allocated later when the per cpu 3801 * each zone will be allocated later when the per cpu
3802 * allocator is available. 3802 * allocator is available.
3803 * 3803 *
3804 * boot_pagesets are used also for bootstrapping offline 3804 * boot_pagesets are used also for bootstrapping offline
3805 * cpus if the system is already booted because the pagesets 3805 * cpus if the system is already booted because the pagesets
3806 * are needed to initialize allocators on a specific cpu too. 3806 * are needed to initialize allocators on a specific cpu too.
3807 * F.e. the percpu allocator needs the page allocator which 3807 * F.e. the percpu allocator needs the page allocator which
3808 * needs the percpu allocator in order to allocate its pagesets 3808 * needs the percpu allocator in order to allocate its pagesets
3809 * (a chicken-egg dilemma). 3809 * (a chicken-egg dilemma).
3810 */ 3810 */
3811 for_each_possible_cpu(cpu) { 3811 for_each_possible_cpu(cpu) {
3812 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3812 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3813 3813
3814 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3814 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3815 /* 3815 /*
3816 * We now know the "local memory node" for each node-- 3816 * We now know the "local memory node" for each node--
3817 * i.e., the node of the first zone in the generic zonelist. 3817 * i.e., the node of the first zone in the generic zonelist.
3818 * Set up numa_mem percpu variable for on-line cpus. During 3818 * Set up numa_mem percpu variable for on-line cpus. During
3819 * boot, only the boot cpu should be on-line; we'll init the 3819 * boot, only the boot cpu should be on-line; we'll init the
3820 * secondary cpus' numa_mem as they come on-line. During 3820 * secondary cpus' numa_mem as they come on-line. During
3821 * node/memory hotplug, we'll fixup all on-line cpus. 3821 * node/memory hotplug, we'll fixup all on-line cpus.
3822 */ 3822 */
3823 if (cpu_online(cpu)) 3823 if (cpu_online(cpu))
3824 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3824 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3825 #endif 3825 #endif
3826 } 3826 }
3827 3827
3828 return 0; 3828 return 0;
3829 } 3829 }
3830 3830
3831 /* 3831 /*
3832 * Called with zonelists_mutex held always 3832 * Called with zonelists_mutex held always
3833 * unless system_state == SYSTEM_BOOTING. 3833 * unless system_state == SYSTEM_BOOTING.
3834 */ 3834 */
3835 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3835 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3836 { 3836 {
3837 set_zonelist_order(); 3837 set_zonelist_order();
3838 3838
3839 if (system_state == SYSTEM_BOOTING) { 3839 if (system_state == SYSTEM_BOOTING) {
3840 __build_all_zonelists(NULL); 3840 __build_all_zonelists(NULL);
3841 mminit_verify_zonelist(); 3841 mminit_verify_zonelist();
3842 cpuset_init_current_mems_allowed(); 3842 cpuset_init_current_mems_allowed();
3843 } else { 3843 } else {
3844 #ifdef CONFIG_MEMORY_HOTPLUG 3844 #ifdef CONFIG_MEMORY_HOTPLUG
3845 if (zone) 3845 if (zone)
3846 setup_zone_pageset(zone); 3846 setup_zone_pageset(zone);
3847 #endif 3847 #endif
3848 /* we have to stop all cpus to guarantee there is no user 3848 /* we have to stop all cpus to guarantee there is no user
3849 of the zonelist */ 3849 of the zonelist */
3850 stop_machine(__build_all_zonelists, pgdat, NULL); 3850 stop_machine(__build_all_zonelists, pgdat, NULL);
3851 /* cpuset refresh routine should be here */ 3851 /* cpuset refresh routine should be here */
3852 } 3852 }
3853 vm_total_pages = nr_free_pagecache_pages(); 3853 vm_total_pages = nr_free_pagecache_pages();
3854 /* 3854 /*
3855 * Disable grouping by mobility if the number of pages in the 3855 * Disable grouping by mobility if the number of pages in the
3856 * system is too low to allow the mechanism to work. It would be 3856 * system is too low to allow the mechanism to work. It would be
3857 * more accurate, but expensive to check per-zone. This check is 3857 * more accurate, but expensive to check per-zone. This check is
3858 * made on memory-hotadd so a system can start with mobility 3858 * made on memory-hotadd so a system can start with mobility
3859 * disabled and enable it later 3859 * disabled and enable it later
3860 */ 3860 */
3861 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3861 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3862 page_group_by_mobility_disabled = 1; 3862 page_group_by_mobility_disabled = 1;
3863 else 3863 else
3864 page_group_by_mobility_disabled = 0; 3864 page_group_by_mobility_disabled = 0;
3865 3865
3866 printk("Built %i zonelists in %s order, mobility grouping %s. " 3866 printk("Built %i zonelists in %s order, mobility grouping %s. "
3867 "Total pages: %ld\n", 3867 "Total pages: %ld\n",
3868 nr_online_nodes, 3868 nr_online_nodes,
3869 zonelist_order_name[current_zonelist_order], 3869 zonelist_order_name[current_zonelist_order],
3870 page_group_by_mobility_disabled ? "off" : "on", 3870 page_group_by_mobility_disabled ? "off" : "on",
3871 vm_total_pages); 3871 vm_total_pages);
3872 #ifdef CONFIG_NUMA 3872 #ifdef CONFIG_NUMA
3873 printk("Policy zone: %s\n", zone_names[policy_zone]); 3873 printk("Policy zone: %s\n", zone_names[policy_zone]);
3874 #endif 3874 #endif
3875 } 3875 }
3876 3876
3877 /* 3877 /*
3878 * Helper functions to size the waitqueue hash table. 3878 * Helper functions to size the waitqueue hash table.
3879 * Essentially these want to choose hash table sizes sufficiently 3879 * Essentially these want to choose hash table sizes sufficiently
3880 * large so that collisions trying to wait on pages are rare. 3880 * large so that collisions trying to wait on pages are rare.
3881 * But in fact, the number of active page waitqueues on typical 3881 * But in fact, the number of active page waitqueues on typical
3882 * systems is ridiculously low, less than 200. So this is 3882 * systems is ridiculously low, less than 200. So this is
3883 * conservative, even though it seems large. 3883 * conservative, even though it seems large.
3884 * 3884 *
3885 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3885 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3886 * waitqueues, i.e. the size of the waitq table given the number of pages. 3886 * waitqueues, i.e. the size of the waitq table given the number of pages.
3887 */ 3887 */
3888 #define PAGES_PER_WAITQUEUE 256 3888 #define PAGES_PER_WAITQUEUE 256
3889 3889
3890 #ifndef CONFIG_MEMORY_HOTPLUG 3890 #ifndef CONFIG_MEMORY_HOTPLUG
3891 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3891 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3892 { 3892 {
3893 unsigned long size = 1; 3893 unsigned long size = 1;
3894 3894
3895 pages /= PAGES_PER_WAITQUEUE; 3895 pages /= PAGES_PER_WAITQUEUE;
3896 3896
3897 while (size < pages) 3897 while (size < pages)
3898 size <<= 1; 3898 size <<= 1;
3899 3899
3900 /* 3900 /*
3901 * Once we have dozens or even hundreds of threads sleeping 3901 * Once we have dozens or even hundreds of threads sleeping
3902 * on IO we've got bigger problems than wait queue collision. 3902 * on IO we've got bigger problems than wait queue collision.
3903 * Limit the size of the wait table to a reasonable size. 3903 * Limit the size of the wait table to a reasonable size.
3904 */ 3904 */
3905 size = min(size, 4096UL); 3905 size = min(size, 4096UL);
3906 3906
3907 return max(size, 4UL); 3907 return max(size, 4UL);
3908 } 3908 }
3909 #else 3909 #else
3910 /* 3910 /*
3911 * A zone's size might be changed by hot-add, so it is not possible to determine 3911 * A zone's size might be changed by hot-add, so it is not possible to determine
3912 * a suitable size for its wait_table. So we use the maximum size now. 3912 * a suitable size for its wait_table. So we use the maximum size now.
3913 * 3913 *
3914 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3914 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3915 * 3915 *
3916 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3916 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3917 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3917 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3918 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3918 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3919 * 3919 *
3920 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 3920 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
3921 * or more by the traditional way. (See above). It equals: 3921 * or more by the traditional way. (See above). It equals:
3922 * 3922 *
3923 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3923 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3924 * ia64(16K page size) : = ( 8G + 4M)byte. 3924 * ia64(16K page size) : = ( 8G + 4M)byte.
3925 * powerpc (64K page size) : = (32G +16M)byte. 3925 * powerpc (64K page size) : = (32G +16M)byte.
3926 */ 3926 */
3927 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3927 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3928 { 3928 {
3929 return 4096UL; 3929 return 4096UL;
3930 } 3930 }
3931 #endif 3931 #endif
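Editor's note: as a concrete check of the !CONFIG_MEMORY_HOTPLUG sizing above, this userspace sketch recomputes wait_table_hash_nr_entries() for a few example zone sizes; the zone sizes are arbitrary and min()/max() are spelled out as plain comparisons:

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

/* Userspace copy of the non-hotplug variant above, for illustration only. */
static unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
        unsigned long size = 1;

        pages /= PAGES_PER_WAITQUEUE;
        while (size < pages)
                size <<= 1;
        if (size > 4096UL)
                size = 4096UL;
        return size < 4UL ? 4UL : size;
}

int main(void)
{
        /* e.g. 128MB, 1GB and 4GB zones with 4KB pages (assumed) */
        unsigned long zones[] = { 32768, 262144, 1048576 };

        for (int i = 0; i < 3; i++)
                printf("%lu pages -> %lu waitqueues\n",
                       zones[i], wait_table_hash_nr_entries(zones[i]));
        return 0;       /* prints 128, 1024, 4096 */
}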
3932 3932
3933 /* 3933 /*
3934 * This is an integer logarithm so that shifts can be used later 3934 * This is an integer logarithm so that shifts can be used later
3935 * to extract the more random high bits from the multiplicative 3935 * to extract the more random high bits from the multiplicative
3936 * hash function before the remainder is taken. 3936 * hash function before the remainder is taken.
3937 */ 3937 */
3938 static inline unsigned long wait_table_bits(unsigned long size) 3938 static inline unsigned long wait_table_bits(unsigned long size)
3939 { 3939 {
3940 return ffz(~size); 3940 return ffz(~size);
3941 } 3941 }
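Editor's note: for the power-of-two sizes produced above, ffz(~size) is simply log2(size): ~size has exactly one zero bit, at the position of size's single one bit. A one-line userspace check, using the GCC builtin as a stand-in for ffz():

#include <stdio.h>

int main(void)
{
        unsigned long size = 4096;      /* a power-of-two table size */

        /* for a power of two, ffz(~size) == __builtin_ctzl(size) == log2(size) */
        printf("wait_table_bits(%lu) = %d\n", size, __builtin_ctzl(size));
        return 0;       /* prints: wait_table_bits(4096) = 12 */
}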
3942 3942
3943 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3943 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3944 3944
3945 /* 3945 /*
3946 * Check if a pageblock contains reserved pages 3946 * Check if a pageblock contains reserved pages
3947 */ 3947 */
3948 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3948 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3949 { 3949 {
3950 unsigned long pfn; 3950 unsigned long pfn;
3951 3951
3952 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3952 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3953 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3953 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3954 return 1; 3954 return 1;
3955 } 3955 }
3956 return 0; 3956 return 0;
3957 } 3957 }
3958 3958
3959 /* 3959 /*
3960 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3960 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3961 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3961 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3962 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3962 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3963 * higher will lead to a bigger reserve which will get freed as contiguous 3963 * higher will lead to a bigger reserve which will get freed as contiguous
3964 * blocks as reclaim kicks in 3964 * blocks as reclaim kicks in
3965 */ 3965 */
3966 static void setup_zone_migrate_reserve(struct zone *zone) 3966 static void setup_zone_migrate_reserve(struct zone *zone)
3967 { 3967 {
3968 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3968 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3969 struct page *page; 3969 struct page *page;
3970 unsigned long block_migratetype; 3970 unsigned long block_migratetype;
3971 int reserve; 3971 int reserve;
3972 int old_reserve; 3972 int old_reserve;
3973 3973
3974 /* 3974 /*
3975 * Get the start pfn, end pfn and the number of blocks to reserve 3975 * Get the start pfn, end pfn and the number of blocks to reserve
3976 * We have to be careful to be aligned to pageblock_nr_pages to 3976 * We have to be careful to be aligned to pageblock_nr_pages to
3977 * make sure that we always check pfn_valid for the first page in 3977 * make sure that we always check pfn_valid for the first page in
3978 * the block. 3978 * the block.
3979 */ 3979 */
3980 start_pfn = zone->zone_start_pfn; 3980 start_pfn = zone->zone_start_pfn;
3981 end_pfn = zone_end_pfn(zone); 3981 end_pfn = zone_end_pfn(zone);
3982 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3982 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3983 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3983 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3984 pageblock_order; 3984 pageblock_order;
3985 3985
3986 /* 3986 /*
3987 * Reserve blocks are generally in place to help high-order atomic 3987 * Reserve blocks are generally in place to help high-order atomic
3988 * allocations that are short-lived. A min_free_kbytes value that 3988 * allocations that are short-lived. A min_free_kbytes value that
3989 * would result in more than 2 reserve blocks for atomic allocations 3989 * would result in more than 2 reserve blocks for atomic allocations
3990 * is assumed to be in place to help anti-fragmentation for the 3990 * is assumed to be in place to help anti-fragmentation for the
3991 * future allocation of hugepages at runtime. 3991 * future allocation of hugepages at runtime.
3992 */ 3992 */
3993 reserve = min(2, reserve); 3993 reserve = min(2, reserve);
3994 old_reserve = zone->nr_migrate_reserve_block; 3994 old_reserve = zone->nr_migrate_reserve_block;
3995 3995
3996 /* On memory hot-add, we almost always need to do nothing */ 3996 /* On memory hot-add, we almost always need to do nothing */
3997 if (reserve == old_reserve) 3997 if (reserve == old_reserve)
3998 return; 3998 return;
3999 zone->nr_migrate_reserve_block = reserve; 3999 zone->nr_migrate_reserve_block = reserve;
4000 4000
4001 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 4001 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
4002 if (!pfn_valid(pfn)) 4002 if (!pfn_valid(pfn))
4003 continue; 4003 continue;
4004 page = pfn_to_page(pfn); 4004 page = pfn_to_page(pfn);
4005 4005
4006 /* Watch out for overlapping nodes */ 4006 /* Watch out for overlapping nodes */
4007 if (page_to_nid(page) != zone_to_nid(zone)) 4007 if (page_to_nid(page) != zone_to_nid(zone))
4008 continue; 4008 continue;
4009 4009
4010 block_migratetype = get_pageblock_migratetype(page); 4010 block_migratetype = get_pageblock_migratetype(page);
4011 4011
4012 /* Only test what is necessary when the reserves are not met */ 4012 /* Only test what is necessary when the reserves are not met */
4013 if (reserve > 0) { 4013 if (reserve > 0) {
4014 /* 4014 /*
4015 * Blocks with reserved pages will never be freed, so 4015 * Blocks with reserved pages will never be freed, so
4016 * skip them. 4016 * skip them.
4017 */ 4017 */
4018 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 4018 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
4019 if (pageblock_is_reserved(pfn, block_end_pfn)) 4019 if (pageblock_is_reserved(pfn, block_end_pfn))
4020 continue; 4020 continue;
4021 4021
4022 /* If this block is reserved, account for it */ 4022 /* If this block is reserved, account for it */
4023 if (block_migratetype == MIGRATE_RESERVE) { 4023 if (block_migratetype == MIGRATE_RESERVE) {
4024 reserve--; 4024 reserve--;
4025 continue; 4025 continue;
4026 } 4026 }
4027 4027
4028 /* Suitable for reserving if this block is movable */ 4028 /* Suitable for reserving if this block is movable */
4029 if (block_migratetype == MIGRATE_MOVABLE) { 4029 if (block_migratetype == MIGRATE_MOVABLE) {
4030 set_pageblock_migratetype(page, 4030 set_pageblock_migratetype(page,
4031 MIGRATE_RESERVE); 4031 MIGRATE_RESERVE);
4032 move_freepages_block(zone, page, 4032 move_freepages_block(zone, page,
4033 MIGRATE_RESERVE); 4033 MIGRATE_RESERVE);
4034 reserve--; 4034 reserve--;
4035 continue; 4035 continue;
4036 } 4036 }
4037 } else if (!old_reserve) { 4037 } else if (!old_reserve) {
4038 /* 4038 /*
4039 * At boot time we don't need to scan the whole zone 4039 * At boot time we don't need to scan the whole zone
4040 * for turning off MIGRATE_RESERVE. 4040 * for turning off MIGRATE_RESERVE.
4041 */ 4041 */
4042 break; 4042 break;
4043 } 4043 }
4044 4044
4045 /* 4045 /*
4046 * If the reserve is met and this is a previous reserved block, 4046 * If the reserve is met and this is a previous reserved block,
4047 * take it back 4047 * take it back
4048 */ 4048 */
4049 if (block_migratetype == MIGRATE_RESERVE) { 4049 if (block_migratetype == MIGRATE_RESERVE) {
4050 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4050 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4051 move_freepages_block(zone, page, MIGRATE_MOVABLE); 4051 move_freepages_block(zone, page, MIGRATE_MOVABLE);
4052 } 4052 }
4053 } 4053 }
4054 } 4054 }
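Editor's note: to make the reserve sizing at the top of setup_zone_migrate_reserve() concrete, here is a small hedged calculation. The pageblock_order of 9 (512 pages per block) and the example watermarks are assumptions for illustration, not values taken from this patch:

#include <stdio.h>

#define PAGEBLOCK_ORDER     9                       /* assumed */
#define PAGEBLOCK_NR_PAGES  (1UL << PAGEBLOCK_ORDER)

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
        return ((x + to - 1) / to) * to;
}

int main(void)
{
        unsigned long min_wmarks[] = { 100, 1000, 10000 };  /* example watermarks */

        for (int i = 0; i < 3; i++) {
                unsigned long reserve =
                        roundup_ul(min_wmarks[i], PAGEBLOCK_NR_PAGES)
                                >> PAGEBLOCK_ORDER;
                if (reserve > 2)        /* reserve = min(2, reserve) */
                        reserve = 2;
                printf("min_wmark %5lu pages -> %lu reserve block(s)\n",
                       min_wmarks[i], reserve);
        }
        return 0;       /* 100 -> 1, 1000 -> 2, 10000 -> capped at 2 */
}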
4055 4055
4056 /* 4056 /*
4057 * Initially all pages are reserved - free ones are freed 4057 * Initially all pages are reserved - free ones are freed
4058 * up by free_all_bootmem() once the early boot process is 4058 * up by free_all_bootmem() once the early boot process is
4059 * done. Non-atomic initialization, single-pass. 4059 * done. Non-atomic initialization, single-pass.
4060 */ 4060 */
4061 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 4061 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4062 unsigned long start_pfn, enum memmap_context context) 4062 unsigned long start_pfn, enum memmap_context context)
4063 { 4063 {
4064 struct page *page; 4064 struct page *page;
4065 unsigned long end_pfn = start_pfn + size; 4065 unsigned long end_pfn = start_pfn + size;
4066 unsigned long pfn; 4066 unsigned long pfn;
4067 struct zone *z; 4067 struct zone *z;
4068 4068
4069 if (highest_memmap_pfn < end_pfn - 1) 4069 if (highest_memmap_pfn < end_pfn - 1)
4070 highest_memmap_pfn = end_pfn - 1; 4070 highest_memmap_pfn = end_pfn - 1;
4071 4071
4072 z = &NODE_DATA(nid)->node_zones[zone]; 4072 z = &NODE_DATA(nid)->node_zones[zone];
4073 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 4073 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
4074 /* 4074 /*
4075 * There can be holes in boot-time mem_map[]s 4075 * There can be holes in boot-time mem_map[]s
4076 * handed to this function. They do not 4076 * handed to this function. They do not
4077 * exist on hotplugged memory. 4077 * exist on hotplugged memory.
4078 */ 4078 */
4079 if (context == MEMMAP_EARLY) { 4079 if (context == MEMMAP_EARLY) {
4080 if (!early_pfn_valid(pfn)) 4080 if (!early_pfn_valid(pfn))
4081 continue; 4081 continue;
4082 if (!early_pfn_in_nid(pfn, nid)) 4082 if (!early_pfn_in_nid(pfn, nid))
4083 continue; 4083 continue;
4084 } 4084 }
4085 page = pfn_to_page(pfn); 4085 page = pfn_to_page(pfn);
4086 set_page_links(page, zone, nid, pfn); 4086 set_page_links(page, zone, nid, pfn);
4087 mminit_verify_page_links(page, zone, nid, pfn); 4087 mminit_verify_page_links(page, zone, nid, pfn);
4088 init_page_count(page); 4088 init_page_count(page);
4089 page_mapcount_reset(page); 4089 page_mapcount_reset(page);
4090 page_nid_reset_last(page); 4090 page_nid_reset_last(page);
4091 SetPageReserved(page); 4091 SetPageReserved(page);
4092 /* 4092 /*
4093 * Mark the block movable so that blocks are reserved for 4093 * Mark the block movable so that blocks are reserved for
4094 * movable at startup. This will force kernel allocations 4094 * movable at startup. This will force kernel allocations
4095 * to reserve their blocks rather than leaking throughout 4095 * to reserve their blocks rather than leaking throughout
4096 * the address space during boot when many long-lived 4096 * the address space during boot when many long-lived
4097 * kernel allocations are made. Later some blocks near 4097 * kernel allocations are made. Later some blocks near
4098 * the start are marked MIGRATE_RESERVE by 4098 * the start are marked MIGRATE_RESERVE by
4099 * setup_zone_migrate_reserve() 4099 * setup_zone_migrate_reserve()
4100 * 4100 *
4101 * The bitmap is created for the zone's valid pfn range, but the 4101 * The bitmap is created for the zone's valid pfn range, but the
4102 * memmap can be created for invalid pages (for alignment), so 4102 * memmap can be created for invalid pages (for alignment), so
4103 * check here that set_pageblock_migratetype() is not called for 4103 * check here that set_pageblock_migratetype() is not called for
4104 * a pfn outside the zone. 4104 * a pfn outside the zone.
4105 */ 4105 */
4106 if ((z->zone_start_pfn <= pfn) 4106 if ((z->zone_start_pfn <= pfn)
4107 && (pfn < zone_end_pfn(z)) 4107 && (pfn < zone_end_pfn(z))
4108 && !(pfn & (pageblock_nr_pages - 1))) 4108 && !(pfn & (pageblock_nr_pages - 1)))
4109 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 4109 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4110 4110
4111 INIT_LIST_HEAD(&page->lru); 4111 INIT_LIST_HEAD(&page->lru);
4112 #ifdef WANT_PAGE_VIRTUAL 4112 #ifdef WANT_PAGE_VIRTUAL
4113 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 4113 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
4114 if (!is_highmem_idx(zone)) 4114 if (!is_highmem_idx(zone))
4115 set_page_address(page, __va(pfn << PAGE_SHIFT)); 4115 set_page_address(page, __va(pfn << PAGE_SHIFT));
4116 #endif 4116 #endif
4117 } 4117 }
4118 } 4118 }
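Editor's note: the !(pfn & (pageblock_nr_pages - 1)) test in the loop above fires only on the first pfn of each pageblock, so the migratetype bitmap is written once per block rather than once per page. A quick illustration, assuming a pageblock of 512 pages:

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL        /* assumed, i.e. pageblock_order == 9 */

int main(void)
{
        /* print the pfns in a small range that the check in
         * memmap_init_zone() would treat as pageblock starts */
        for (unsigned long pfn = 1000; pfn < 2200; pfn++)
                if (!(pfn & (PAGEBLOCK_NR_PAGES - 1)))
                        printf("pfn %lu starts a pageblock\n", pfn);
        return 0;       /* prints 1024, 1536, 2048 */
}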
4119 4119
4120 static void __meminit zone_init_free_lists(struct zone *zone) 4120 static void __meminit zone_init_free_lists(struct zone *zone)
4121 { 4121 {
4122 unsigned int order, t; 4122 unsigned int order, t;
4123 for_each_migratetype_order(order, t) { 4123 for_each_migratetype_order(order, t) {
4124 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 4124 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
4125 zone->free_area[order].nr_free = 0; 4125 zone->free_area[order].nr_free = 0;
4126 } 4126 }
4127 } 4127 }
4128 4128
4129 #ifndef __HAVE_ARCH_MEMMAP_INIT 4129 #ifndef __HAVE_ARCH_MEMMAP_INIT
4130 #define memmap_init(size, nid, zone, start_pfn) \ 4130 #define memmap_init(size, nid, zone, start_pfn) \
4131 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 4131 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
4132 #endif 4132 #endif
4133 4133
4134 static int zone_batchsize(struct zone *zone) 4134 static int zone_batchsize(struct zone *zone)
4135 { 4135 {
4136 #ifdef CONFIG_MMU 4136 #ifdef CONFIG_MMU
4137 int batch; 4137 int batch;
4138 4138
4139 /* 4139 /*
4140 * The per-cpu-pages pools are set to around 1000th of the 4140 * The per-cpu-pages pools are set to around 1000th of the
4141 * size of the zone. But no more than 1/2 of a meg. 4141 * size of the zone. But no more than 1/2 of a meg.
4142 * 4142 *
4143 * OK, so we don't know how big the cache is. So guess. 4143 * OK, so we don't know how big the cache is. So guess.
4144 */ 4144 */
4145 batch = zone->managed_pages / 1024; 4145 batch = zone->managed_pages / 1024;
4146 if (batch * PAGE_SIZE > 512 * 1024) 4146 if (batch * PAGE_SIZE > 512 * 1024)
4147 batch = (512 * 1024) / PAGE_SIZE; 4147 batch = (512 * 1024) / PAGE_SIZE;
4148 batch /= 4; /* We effectively *= 4 below */ 4148 batch /= 4; /* We effectively *= 4 below */
4149 if (batch < 1) 4149 if (batch < 1)
4150 batch = 1; 4150 batch = 1;
4151 4151
4152 /* 4152 /*
4153 * Clamp the batch to a 2^n - 1 value. Having a power 4153 * Clamp the batch to a 2^n - 1 value. Having a power
4154 * of 2 value was found to be more likely to have 4154 * of 2 value was found to be more likely to have
4155 * suboptimal cache aliasing properties in some cases. 4155 * suboptimal cache aliasing properties in some cases.
4156 * 4156 *
4157 * For example if 2 tasks are alternately allocating 4157 * For example if 2 tasks are alternately allocating
4158 * batches of pages, one task can end up with a lot 4158 * batches of pages, one task can end up with a lot
4159 * of pages of one half of the possible page colors 4159 * of pages of one half of the possible page colors
4160 * and the other with pages of the other colors. 4160 * and the other with pages of the other colors.
4161 */ 4161 */
4162 batch = rounddown_pow_of_two(batch + batch/2) - 1; 4162 batch = rounddown_pow_of_two(batch + batch/2) - 1;
4163 4163
4164 return batch; 4164 return batch;
4165 4165
4166 #else 4166 #else
4167 /* The deferral and batching of frees should be suppressed under NOMMU 4167 /* The deferral and batching of frees should be suppressed under NOMMU
4168 * conditions. 4168 * conditions.
4169 * 4169 *
4170 * The problem is that NOMMU needs to be able to allocate large chunks 4170 * The problem is that NOMMU needs to be able to allocate large chunks
4171 * of contiguous memory as there's no hardware page translation to 4171 * of contiguous memory as there's no hardware page translation to
4172 * assemble apparent contiguous memory from discontiguous pages. 4172 * assemble apparent contiguous memory from discontiguous pages.
4173 * 4173 *
4174 * Queueing large contiguous runs of pages for batching, however, 4174 * Queueing large contiguous runs of pages for batching, however,
4175 * causes the pages to actually be freed in smaller chunks. As there 4175 * causes the pages to actually be freed in smaller chunks. As there
4176 * can be a significant delay between the individual batches being 4176 * can be a significant delay between the individual batches being
4177 * recycled, this leads to the once large chunks of space being 4177 * recycled, this leads to the once large chunks of space being
4178 * fragmented and becoming unavailable for high-order allocations. 4178 * fragmented and becoming unavailable for high-order allocations.
4179 */ 4179 */
4180 return 0; 4180 return 0;
4181 #endif 4181 #endif
4182 } 4182 }
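Editor's note: the comment in zone_batchsize() compresses several steps; the sketch below redoes the CONFIG_MMU arithmetic for one example zone so the two clamps and the 2^n - 1 rounding are visible. The managed_pages value and the 4KB PAGE_SIZE are assumptions:

#include <stdio.h>

#define PAGE_SIZE 4096UL        /* assumed */

/* minimal round-down-to-power-of-two helper for the sketch */
static unsigned long rounddown_pow_of_two(unsigned long n)
{
        unsigned long p = 1;
        while (p * 2 <= n)
                p *= 2;
        return p;
}

int main(void)
{
        unsigned long managed_pages = 1000000;  /* ~4GB zone, example */
        unsigned long batch;

        batch = managed_pages / 1024;           /* ~1/1000th of the zone */
        if (batch * PAGE_SIZE > 512 * 1024)     /* cap at 1/2MB worth */
                batch = (512 * 1024) / PAGE_SIZE;
        batch /= 4;                             /* effectively *= 4 later */
        if (batch < 1)
                batch = 1;
        batch = rounddown_pow_of_two(batch + batch / 2) - 1;

        printf("managed_pages=%lu -> batch=%lu\n", managed_pages, batch);
        return 0;       /* prints batch=31 for this example */
}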
4183 4183
4184 /* 4184 /*
4185 * pcp->high and pcp->batch values are related and dependent on one another: 4185 * pcp->high and pcp->batch values are related and dependent on one another:
4186 * ->batch must never be higher than ->high. 4186 * ->batch must never be higher than ->high.
4187 * The following function updates them in a safe manner without read side 4187 * The following function updates them in a safe manner without read side
4188 * locking. 4188 * locking.
4189 * 4189 *
4190 * Any new users of pcp->batch and pcp->high should ensure they can cope with 4190 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4191 * those fields changing asynchronously (according to the above rule). 4191 * those fields changing asynchronously (according to the above rule).
4192 * 4192 *
4193 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function 4193 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
4194 * outside of boot time (or some other assurance that no concurrent updaters 4194 * outside of boot time (or some other assurance that no concurrent updaters
4195 * exist). 4195 * exist).
4196 */ 4196 */
4197 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, 4197 static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4198 unsigned long batch) 4198 unsigned long batch)
4199 { 4199 {
4200 /* start with a fail safe value for batch */ 4200 /* start with a fail safe value for batch */
4201 pcp->batch = 1; 4201 pcp->batch = 1;
4202 smp_wmb(); 4202 smp_wmb();
4203 4203
4204 /* Update high, then batch, in order */ 4204 /* Update high, then batch, in order */
4205 pcp->high = high; 4205 pcp->high = high;
4206 smp_wmb(); 4206 smp_wmb();
4207 4207
4208 pcp->batch = batch; 4208 pcp->batch = batch;
4209 } 4209 }
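Editor's note: the three ordered stores in pageset_update() are arranged so that, at each intermediate point, the pair a reader could observe still satisfies the rule that ->batch never exceeds ->high. The sketch below walks the same store sequence with example values and asserts that after each step; the single-threaded asserts stand in for a concurrent reader, so this illustrates the ordering argument, not the barriers themselves:

#include <assert.h>
#include <stdio.h>

struct pcp { unsigned long high, batch; };

static void check(const struct pcp *p)
{
        assert(p->batch <= p->high);    /* the rule described above */
}

int main(void)
{
        struct pcp pcp = { .high = 6 * 31, .batch = 31 };   /* example old values */
        unsigned long new_high = 96, new_batch = 24;        /* example new values */

        /* 1. fail-safe batch first: 1 is <= any plausible high */
        pcp.batch = 1;
        check(&pcp);

        /* 2. then the new high */
        pcp.high = new_high;
        check(&pcp);

        /* 3. finally the new batch, which is <= the new high */
        pcp.batch = new_batch;
        check(&pcp);

        printf("high=%lu batch=%lu, rule held after every store\n",
               pcp.high, pcp.batch);
        return 0;
}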
4210 4210
4211 /* a companion to pageset_set_high() */ 4211 /* a companion to pageset_set_high() */
4212 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) 4212 static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4213 { 4213 {
4214 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); 4214 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4215 } 4215 }
4216 4216
4217 static void pageset_init(struct per_cpu_pageset *p) 4217 static void pageset_init(struct per_cpu_pageset *p)
4218 { 4218 {
4219 struct per_cpu_pages *pcp; 4219 struct per_cpu_pages *pcp;
4220 int migratetype; 4220 int migratetype;
4221 4221
4222 memset(p, 0, sizeof(*p)); 4222 memset(p, 0, sizeof(*p));
4223 4223
4224 pcp = &p->pcp; 4224 pcp = &p->pcp;
4225 pcp->count = 0; 4225 pcp->count = 0;
4226 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4226 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4227 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4227 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4228 } 4228 }
4229 4229
4230 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4230 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4231 { 4231 {
4232 pageset_init(p); 4232 pageset_init(p);
4233 pageset_set_batch(p, batch); 4233 pageset_set_batch(p, batch);
4234 } 4234 }
4235 4235
4236 /* 4236 /*
4237 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist 4237 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
4238 * to the value high for the pageset p. 4238 * to the value high for the pageset p.
4239 */ 4239 */
4240 static void pageset_set_high(struct per_cpu_pageset *p, 4240 static void pageset_set_high(struct per_cpu_pageset *p,
4241 unsigned long high) 4241 unsigned long high)
4242 { 4242 {
4243 unsigned long batch = max(1UL, high / 4); 4243 unsigned long batch = max(1UL, high / 4);
4244 if ((high / 4) > (PAGE_SHIFT * 8)) 4244 if ((high / 4) > (PAGE_SHIFT * 8))
4245 batch = PAGE_SHIFT * 8; 4245 batch = PAGE_SHIFT * 8;
4246 4246
4247 pageset_update(&p->pcp, high, batch); 4247 pageset_update(&p->pcp, high, batch);
4248 } 4248 }
4249 4249
4250 static void pageset_set_high_and_batch(struct zone *zone, 4250 static void pageset_set_high_and_batch(struct zone *zone,
4251 struct per_cpu_pageset *pcp) 4251 struct per_cpu_pageset *pcp)
4252 { 4252 {
4253 if (percpu_pagelist_fraction) 4253 if (percpu_pagelist_fraction)
4254 pageset_set_high(pcp, 4254 pageset_set_high(pcp,
4255 (zone->managed_pages / 4255 (zone->managed_pages /
4256 percpu_pagelist_fraction)); 4256 percpu_pagelist_fraction));
4257 else 4257 else
4258 pageset_set_batch(pcp, zone_batchsize(zone)); 4258 pageset_set_batch(pcp, zone_batchsize(zone));
4259 } 4259 }
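Editor's note: when percpu_pagelist_fraction is non-zero, pageset_set_high() derives both values from the zone size. A numeric example with assumed zone size, sysctl value and a PAGE_SHIFT of 12:

#include <stdio.h>

#define PAGE_SHIFT 12UL         /* assumed, 4KB pages */

int main(void)
{
        unsigned long managed_pages = 1000000;  /* example zone */
        unsigned long fraction = 100;           /* example sysctl value */
        unsigned long high, batch;

        high = managed_pages / fraction;        /* 10000 pages per cpu list */
        batch = high / 4;
        if (batch < 1)
                batch = 1;
        if (high / 4 > PAGE_SHIFT * 8)          /* clamp batch to 96 here */
                batch = PAGE_SHIFT * 8;

        printf("high=%lu batch=%lu\n", high, batch);
        return 0;       /* prints high=10000 batch=96 */
}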
4260 4260
4261 static void __meminit zone_pageset_init(struct zone *zone, int cpu) 4261 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4262 { 4262 {
4263 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4263 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4264 4264
4265 pageset_init(pcp); 4265 pageset_init(pcp);
4266 pageset_set_high_and_batch(zone, pcp); 4266 pageset_set_high_and_batch(zone, pcp);
4267 } 4267 }
4268 4268
4269 static void __meminit setup_zone_pageset(struct zone *zone) 4269 static void __meminit setup_zone_pageset(struct zone *zone)
4270 { 4270 {
4271 int cpu; 4271 int cpu;
4272 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4272 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4273 for_each_possible_cpu(cpu) 4273 for_each_possible_cpu(cpu)
4274 zone_pageset_init(zone, cpu); 4274 zone_pageset_init(zone, cpu);
4275 } 4275 }
4276 4276
4277 /* 4277 /*
4278 * Allocate per cpu pagesets and initialize them. 4278 * Allocate per cpu pagesets and initialize them.
4279 * Before this call only boot pagesets were available. 4279 * Before this call only boot pagesets were available.
4280 */ 4280 */
4281 void __init setup_per_cpu_pageset(void) 4281 void __init setup_per_cpu_pageset(void)
4282 { 4282 {
4283 struct zone *zone; 4283 struct zone *zone;
4284 4284
4285 for_each_populated_zone(zone) 4285 for_each_populated_zone(zone)
4286 setup_zone_pageset(zone); 4286 setup_zone_pageset(zone);
4287 } 4287 }
4288 4288
4289 static noinline __init_refok 4289 static noinline __init_refok
4290 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4290 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4291 { 4291 {
4292 int i; 4292 int i;
4293 struct pglist_data *pgdat = zone->zone_pgdat; 4293 struct pglist_data *pgdat = zone->zone_pgdat;
4294 size_t alloc_size; 4294 size_t alloc_size;
4295 4295
4296 /* 4296 /*
4297 * The per-page waitqueue mechanism uses hashed waitqueues 4297 * The per-page waitqueue mechanism uses hashed waitqueues
4298 * per zone. 4298 * per zone.
4299 */ 4299 */
4300 zone->wait_table_hash_nr_entries = 4300 zone->wait_table_hash_nr_entries =
4301 wait_table_hash_nr_entries(zone_size_pages); 4301 wait_table_hash_nr_entries(zone_size_pages);
4302 zone->wait_table_bits = 4302 zone->wait_table_bits =
4303 wait_table_bits(zone->wait_table_hash_nr_entries); 4303 wait_table_bits(zone->wait_table_hash_nr_entries);
4304 alloc_size = zone->wait_table_hash_nr_entries 4304 alloc_size = zone->wait_table_hash_nr_entries
4305 * sizeof(wait_queue_head_t); 4305 * sizeof(wait_queue_head_t);
4306 4306
4307 if (!slab_is_available()) { 4307 if (!slab_is_available()) {
4308 zone->wait_table = (wait_queue_head_t *) 4308 zone->wait_table = (wait_queue_head_t *)
4309 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4309 alloc_bootmem_node_nopanic(pgdat, alloc_size);
4310 } else { 4310 } else {
4311 /* 4311 /*
4312 * This case means that a zone whose size was 0 gets new memory 4312 * This case means that a zone whose size was 0 gets new memory
4313 * via memory hot-add. 4313 * via memory hot-add.
4314 * But it may be the case that a new node was hot-added. In 4314 * But it may be the case that a new node was hot-added. In
4315 * this case vmalloc() will not be able to use this new node's 4315 * this case vmalloc() will not be able to use this new node's
4316 * memory - this wait_table must be initialized to use this new 4316 * memory - this wait_table must be initialized to use this new
4317 * node itself as well. 4317 * node itself as well.
4318 * To use this new node's memory, further consideration will be 4318 * To use this new node's memory, further consideration will be
4319 * necessary. 4319 * necessary.
4320 */ 4320 */
4321 zone->wait_table = vmalloc(alloc_size); 4321 zone->wait_table = vmalloc(alloc_size);
4322 } 4322 }
4323 if (!zone->wait_table) 4323 if (!zone->wait_table)
4324 return -ENOMEM; 4324 return -ENOMEM;
4325 4325
4326 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4326 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4327 init_waitqueue_head(zone->wait_table + i); 4327 init_waitqueue_head(zone->wait_table + i);
4328 4328
4329 return 0; 4329 return 0;
4330 } 4330 }
4331 4331
4332 static __meminit void zone_pcp_init(struct zone *zone) 4332 static __meminit void zone_pcp_init(struct zone *zone)
4333 { 4333 {
4334 /* 4334 /*
4335 * per cpu subsystem is not up at this point. The following code 4335 * per cpu subsystem is not up at this point. The following code
4336 * relies on the ability of the linker to provide the 4336 * relies on the ability of the linker to provide the
4337 * offset of a (static) per cpu variable into the per cpu area. 4337 * offset of a (static) per cpu variable into the per cpu area.
4338 */ 4338 */
4339 zone->pageset = &boot_pageset; 4339 zone->pageset = &boot_pageset;
4340 4340
4341 if (zone->present_pages) 4341 if (zone->present_pages)
4342 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4342 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4343 zone->name, zone->present_pages, 4343 zone->name, zone->present_pages,
4344 zone_batchsize(zone)); 4344 zone_batchsize(zone));
4345 } 4345 }
4346 4346
4347 int __meminit init_currently_empty_zone(struct zone *zone, 4347 int __meminit init_currently_empty_zone(struct zone *zone,
4348 unsigned long zone_start_pfn, 4348 unsigned long zone_start_pfn,
4349 unsigned long size, 4349 unsigned long size,
4350 enum memmap_context context) 4350 enum memmap_context context)
4351 { 4351 {
4352 struct pglist_data *pgdat = zone->zone_pgdat; 4352 struct pglist_data *pgdat = zone->zone_pgdat;
4353 int ret; 4353 int ret;
4354 ret = zone_wait_table_init(zone, size); 4354 ret = zone_wait_table_init(zone, size);
4355 if (ret) 4355 if (ret)
4356 return ret; 4356 return ret;
4357 pgdat->nr_zones = zone_idx(zone) + 1; 4357 pgdat->nr_zones = zone_idx(zone) + 1;
4358 4358
4359 zone->zone_start_pfn = zone_start_pfn; 4359 zone->zone_start_pfn = zone_start_pfn;
4360 4360
4361 mminit_dprintk(MMINIT_TRACE, "memmap_init", 4361 mminit_dprintk(MMINIT_TRACE, "memmap_init",
4362 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 4362 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
4363 pgdat->node_id, 4363 pgdat->node_id,
4364 (unsigned long)zone_idx(zone), 4364 (unsigned long)zone_idx(zone),
4365 zone_start_pfn, (zone_start_pfn + size)); 4365 zone_start_pfn, (zone_start_pfn + size));
4366 4366
4367 zone_init_free_lists(zone); 4367 zone_init_free_lists(zone);
4368 4368
4369 return 0; 4369 return 0;
4370 } 4370 }
4371 4371
4372 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4372 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4373 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4373 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4374 /* 4374 /*
4375 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4375 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
4376 * Architectures may implement their own version but if add_active_range() 4376 * Architectures may implement their own version but if add_active_range()
4377 * was used and there are no special requirements, this is a convenient 4377 * was used and there are no special requirements, this is a convenient
4378 * alternative 4378 * alternative
4379 */ 4379 */
4380 int __meminit __early_pfn_to_nid(unsigned long pfn) 4380 int __meminit __early_pfn_to_nid(unsigned long pfn)
4381 { 4381 {
4382 unsigned long start_pfn, end_pfn; 4382 unsigned long start_pfn, end_pfn;
4383 int nid; 4383 int nid;
4384 /* 4384 /*
4385 * NOTE: The following SMP-unsafe globals are only used early in boot 4385 * NOTE: The following SMP-unsafe globals are only used early in boot
4386 * when the kernel is running single-threaded. 4386 * when the kernel is running single-threaded.
4387 */ 4387 */
4388 static unsigned long __meminitdata last_start_pfn, last_end_pfn; 4388 static unsigned long __meminitdata last_start_pfn, last_end_pfn;
4389 static int __meminitdata last_nid; 4389 static int __meminitdata last_nid;
4390 4390
4391 if (last_start_pfn <= pfn && pfn < last_end_pfn) 4391 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4392 return last_nid; 4392 return last_nid;
4393 4393
4394 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); 4394 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
4395 if (nid != -1) { 4395 if (nid != -1) {
4396 last_start_pfn = start_pfn; 4396 last_start_pfn = start_pfn;
4397 last_end_pfn = end_pfn; 4397 last_end_pfn = end_pfn;
4398 last_nid = nid; 4398 last_nid = nid;
4399 } 4399 }
4400 4400
4401 return nid; 4401 return nid;
4402 } 4402 }
4403 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4403 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
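Editor's note: the last_start_pfn/last_end_pfn/last_nid trio above is a single-entry range cache: consecutive queries for neighbouring pfns skip the memblock search entirely. A generic userspace sketch of the same pattern, with a made-up lookup table standing in for memblock_search_pfn_nid():

#include <stdio.h>

/* hypothetical backing data: pfn ranges and the node that owns them */
struct range { unsigned long start, end; int nid; };
static const struct range ranges[] = {
        { 0,      0x8000,  0 },
        { 0x8000, 0x10000, 1 },
};

static int slow_lookups;

static int slow_pfn_to_nid(unsigned long pfn,
                           unsigned long *start, unsigned long *end)
{
        slow_lookups++;
        for (unsigned i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
                if (pfn >= ranges[i].start && pfn < ranges[i].end) {
                        *start = ranges[i].start;
                        *end = ranges[i].end;
                        return ranges[i].nid;
                }
        return -1;
}

static int cached_pfn_to_nid(unsigned long pfn)
{
        static unsigned long last_start, last_end;  /* empty cache: 0..0 */
        static int last_nid = -1;

        if (last_start <= pfn && pfn < last_end)
                return last_nid;                    /* cache hit */

        last_nid = slow_pfn_to_nid(pfn, &last_start, &last_end);
        return last_nid;
}

int main(void)
{
        /* mostly-sequential queries, as during early memmap init */
        for (unsigned long pfn = 0; pfn < 0x10000; pfn += 0x100)
                cached_pfn_to_nid(pfn);
        printf("256 queries, %d slow lookups\n", slow_lookups);
        return 0;       /* only 2 slow lookups */
}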
4404 4404
4405 int __meminit early_pfn_to_nid(unsigned long pfn) 4405 int __meminit early_pfn_to_nid(unsigned long pfn)
4406 { 4406 {
4407 int nid; 4407 int nid;
4408 4408
4409 nid = __early_pfn_to_nid(pfn); 4409 nid = __early_pfn_to_nid(pfn);
4410 if (nid >= 0) 4410 if (nid >= 0)
4411 return nid; 4411 return nid;
4412 /* just returns 0 */ 4412 /* just returns 0 */
4413 return 0; 4413 return 0;
4414 } 4414 }
4415 4415
4416 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4416 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4417 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4417 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4418 { 4418 {
4419 int nid; 4419 int nid;
4420 4420
4421 nid = __early_pfn_to_nid(pfn); 4421 nid = __early_pfn_to_nid(pfn);
4422 if (nid >= 0 && nid != node) 4422 if (nid >= 0 && nid != node)
4423 return false; 4423 return false;
4424 return true; 4424 return true;
4425 } 4425 }
4426 #endif 4426 #endif
4427 4427
4428 /** 4428 /**
4429 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4429 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4430 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4430 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4431 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4431 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4432 * 4432 *
4433 * If an architecture guarantees that all ranges registered with 4433 * If an architecture guarantees that all ranges registered with
4434 * add_active_ranges() contain no holes and may be freed, this 4434 * add_active_ranges() contain no holes and may be freed, this
4435 * function may be used instead of calling free_bootmem() manually. 4435 * function may be used instead of calling free_bootmem() manually.
4436 */ 4436 */
4437 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4437 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4438 { 4438 {
4439 unsigned long start_pfn, end_pfn; 4439 unsigned long start_pfn, end_pfn;
4440 int i, this_nid; 4440 int i, this_nid;
4441 4441
4442 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4442 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4443 start_pfn = min(start_pfn, max_low_pfn); 4443 start_pfn = min(start_pfn, max_low_pfn);
4444 end_pfn = min(end_pfn, max_low_pfn); 4444 end_pfn = min(end_pfn, max_low_pfn);
4445 4445
4446 if (start_pfn < end_pfn) 4446 if (start_pfn < end_pfn)
4447 free_bootmem_node(NODE_DATA(this_nid), 4447 free_bootmem_node(NODE_DATA(this_nid),
4448 PFN_PHYS(start_pfn), 4448 PFN_PHYS(start_pfn),
4449 (end_pfn - start_pfn) << PAGE_SHIFT); 4449 (end_pfn - start_pfn) << PAGE_SHIFT);
4450 } 4450 }
4451 } 4451 }
4452 4452
4453 /** 4453 /**
4454 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4454 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4455 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4455 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4456 * 4456 *
4457 * If an architecture guarantees that all ranges registered with 4457 * If an architecture guarantees that all ranges registered with
4458 * add_active_ranges() contain no holes and may be freed, this 4458 * add_active_ranges() contain no holes and may be freed, this
4459 * function may be used instead of calling memory_present() manually. 4459 * function may be used instead of calling memory_present() manually.
4460 */ 4460 */
4461 void __init sparse_memory_present_with_active_regions(int nid) 4461 void __init sparse_memory_present_with_active_regions(int nid)
4462 { 4462 {
4463 unsigned long start_pfn, end_pfn; 4463 unsigned long start_pfn, end_pfn;
4464 int i, this_nid; 4464 int i, this_nid;
4465 4465
4466 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4466 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4467 memory_present(this_nid, start_pfn, end_pfn); 4467 memory_present(this_nid, start_pfn, end_pfn);
4468 } 4468 }
4469 4469
4470 /** 4470 /**
4471 * get_pfn_range_for_nid - Return the start and end page frames for a node 4471 * get_pfn_range_for_nid - Return the start and end page frames for a node
4472 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4472 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4473 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4473 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4474 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4474 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4475 * 4475 *
4476 * It returns the start and end page frame of a node based on information 4476 * It returns the start and end page frame of a node based on information
4477 * provided by an arch calling add_active_range(). If called for a node 4477 * provided by an arch calling add_active_range(). If called for a node
4478 * with no available memory, a warning is printed and the start and end 4478 * with no available memory, a warning is printed and the start and end
4479 * PFNs will be 0. 4479 * PFNs will be 0.
4480 */ 4480 */
4481 void __meminit get_pfn_range_for_nid(unsigned int nid, 4481 void __meminit get_pfn_range_for_nid(unsigned int nid,
4482 unsigned long *start_pfn, unsigned long *end_pfn) 4482 unsigned long *start_pfn, unsigned long *end_pfn)
4483 { 4483 {
4484 unsigned long this_start_pfn, this_end_pfn; 4484 unsigned long this_start_pfn, this_end_pfn;
4485 int i; 4485 int i;
4486 4486
4487 *start_pfn = -1UL; 4487 *start_pfn = -1UL;
4488 *end_pfn = 0; 4488 *end_pfn = 0;
4489 4489
4490 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4490 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4491 *start_pfn = min(*start_pfn, this_start_pfn); 4491 *start_pfn = min(*start_pfn, this_start_pfn);
4492 *end_pfn = max(*end_pfn, this_end_pfn); 4492 *end_pfn = max(*end_pfn, this_end_pfn);
4493 } 4493 }
4494 4494
4495 if (*start_pfn == -1UL) 4495 if (*start_pfn == -1UL)
4496 *start_pfn = 0; 4496 *start_pfn = 0;
4497 } 4497 }
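Editor's note: get_pfn_range_for_nid() is a min/max fold over the registered ranges, with -1UL as the "nothing seen yet" sentinel. A short example with two made-up ranges shows that holes between ranges are included in the reported span:

#include <stdio.h>

int main(void)
{
        /* hypothetical active ranges for one node */
        unsigned long starts[] = { 0x1000, 0x9000 };
        unsigned long ends[]   = { 0x5000, 0xc000 };
        unsigned long start_pfn = -1UL, end_pfn = 0;

        for (int i = 0; i < 2; i++) {
                start_pfn = starts[i] < start_pfn ? starts[i] : start_pfn;
                end_pfn   = ends[i] > end_pfn ? ends[i] : end_pfn;
        }
        if (start_pfn == -1UL)          /* node had no memory at all */
                start_pfn = 0;

        printf("node spans pfns 0x%lx -> 0x%lx\n", start_pfn, end_pfn);
        return 0;       /* 0x1000 -> 0xc000, holes included */
}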
4498 4498
4499 /* 4499 /*
4500 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4500 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4501 * assumption is made that zones within a node are ordered in monotonic 4501 * assumption is made that zones within a node are ordered in monotonic
4502 * increasing memory addresses so that the "highest" populated zone is used 4502 * increasing memory addresses so that the "highest" populated zone is used
4503 */ 4503 */
4504 static void __init find_usable_zone_for_movable(void) 4504 static void __init find_usable_zone_for_movable(void)
4505 { 4505 {
4506 int zone_index; 4506 int zone_index;
4507 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4507 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4508 if (zone_index == ZONE_MOVABLE) 4508 if (zone_index == ZONE_MOVABLE)
4509 continue; 4509 continue;
4510 4510
4511 if (arch_zone_highest_possible_pfn[zone_index] > 4511 if (arch_zone_highest_possible_pfn[zone_index] >
4512 arch_zone_lowest_possible_pfn[zone_index]) 4512 arch_zone_lowest_possible_pfn[zone_index])
4513 break; 4513 break;
4514 } 4514 }
4515 4515
4516 VM_BUG_ON(zone_index == -1); 4516 VM_BUG_ON(zone_index == -1);
4517 movable_zone = zone_index; 4517 movable_zone = zone_index;
4518 } 4518 }
4519 4519
4520 /* 4520 /*
4521 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4521 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4522 * because it is sized independently of the architecture. Unlike the other zones, 4522 * because it is sized independently of the architecture. Unlike the other zones,
4523 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4523 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4524 * in each node depending on the size of each node and how evenly kernelcore 4524 * in each node depending on the size of each node and how evenly kernelcore
4525 * is distributed. This helper function adjusts the zone ranges 4525 * is distributed. This helper function adjusts the zone ranges
4526 * provided by the architecture for a given node by using the end of the 4526 * provided by the architecture for a given node by using the end of the
4527 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4527 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4528 * zones within a node are in order of monotonically increasing memory addresses 4528 * zones within a node are in order of monotonically increasing memory addresses
4529 */ 4529 */
4530 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4530 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4531 unsigned long zone_type, 4531 unsigned long zone_type,
4532 unsigned long node_start_pfn, 4532 unsigned long node_start_pfn,
4533 unsigned long node_end_pfn, 4533 unsigned long node_end_pfn,
4534 unsigned long *zone_start_pfn, 4534 unsigned long *zone_start_pfn,
4535 unsigned long *zone_end_pfn) 4535 unsigned long *zone_end_pfn)
4536 { 4536 {
4537 /* Only adjust if ZONE_MOVABLE is on this node */ 4537 /* Only adjust if ZONE_MOVABLE is on this node */
4538 if (zone_movable_pfn[nid]) { 4538 if (zone_movable_pfn[nid]) {
4539 /* Size ZONE_MOVABLE */ 4539 /* Size ZONE_MOVABLE */
4540 if (zone_type == ZONE_MOVABLE) { 4540 if (zone_type == ZONE_MOVABLE) {
4541 *zone_start_pfn = zone_movable_pfn[nid]; 4541 *zone_start_pfn = zone_movable_pfn[nid];
4542 *zone_end_pfn = min(node_end_pfn, 4542 *zone_end_pfn = min(node_end_pfn,
4543 arch_zone_highest_possible_pfn[movable_zone]); 4543 arch_zone_highest_possible_pfn[movable_zone]);
4544 4544
4545 /* Adjust for ZONE_MOVABLE starting within this range */ 4545 /* Adjust for ZONE_MOVABLE starting within this range */
4546 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4546 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4547 *zone_end_pfn > zone_movable_pfn[nid]) { 4547 *zone_end_pfn > zone_movable_pfn[nid]) {
4548 *zone_end_pfn = zone_movable_pfn[nid]; 4548 *zone_end_pfn = zone_movable_pfn[nid];
4549 4549
4550 /* Check if this whole range is within ZONE_MOVABLE */ 4550 /* Check if this whole range is within ZONE_MOVABLE */
4551 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4551 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4552 *zone_start_pfn = *zone_end_pfn; 4552 *zone_start_pfn = *zone_end_pfn;
4553 } 4553 }
4554 } 4554 }
4555 4555
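/*
 * Illustrative walk-through of the three cases above, using hypothetical
 * numbers (not taken from any particular machine): suppose a node spans
 * PFNs 0x100000-0x200000 and zone_movable_pfn[nid] == 0x180000.
 *  - ZONE_MOVABLE itself gets [0x180000, min(0x200000, highest usable PFN)).
 *  - A zone whose adjusted range was [0x100000, 0x1c0000) straddles the
 *    boundary, so its end is clipped to 0x180000.
 *  - A zone starting at or above 0x180000 ends up empty because its start
 *    is set equal to its end.
 */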
4556 /* 4556 /*
4557 * Return the number of pages a zone spans in a node, including holes 4557 * Return the number of pages a zone spans in a node, including holes
4558 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4558 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4559 */ 4559 */
4560 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4560 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4561 unsigned long zone_type, 4561 unsigned long zone_type,
4562 unsigned long node_start_pfn, 4562 unsigned long node_start_pfn,
4563 unsigned long node_end_pfn, 4563 unsigned long node_end_pfn,
4564 unsigned long *ignored) 4564 unsigned long *ignored)
4565 { 4565 {
4566 unsigned long zone_start_pfn, zone_end_pfn; 4566 unsigned long zone_start_pfn, zone_end_pfn;
4567 4567
4568 /* Get the start and end of the zone */ 4568 /* Get the start and end of the zone */
4569 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4569 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4570 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4570 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4571 adjust_zone_range_for_zone_movable(nid, zone_type, 4571 adjust_zone_range_for_zone_movable(nid, zone_type,
4572 node_start_pfn, node_end_pfn, 4572 node_start_pfn, node_end_pfn,
4573 &zone_start_pfn, &zone_end_pfn); 4573 &zone_start_pfn, &zone_end_pfn);
4574 4574
4575 /* Check that this node has pages within the zone's required range */ 4575 /* Check that this node has pages within the zone's required range */
4576 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4576 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4577 return 0; 4577 return 0;
4578 4578
4579 /* Move the zone boundaries inside the node if necessary */ 4579 /* Move the zone boundaries inside the node if necessary */
4580 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4580 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4581 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4581 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4582 4582
4583 /* Return the spanned pages */ 4583 /* Return the spanned pages */
4584 return zone_end_pfn - zone_start_pfn; 4584 return zone_end_pfn - zone_start_pfn;
4585 } 4585 }
4586 4586
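/*
 * Rough example with made-up numbers: if the architecture says a zone may
 * cover PFNs [0x100000, 0x1000000) and this node spans [0x180000, 0x200000),
 * the zone boundaries are clipped to the node, giving
 * 0x200000 - 0x180000 = 0x80000 spanned pages (holes included).  A zone
 * whose architectural range lies entirely outside the node returns 0.
 */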
4587 /* 4587 /*
4588 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4588 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4589 * then all holes in the requested range will be accounted for. 4589 * then all holes in the requested range will be accounted for.
4590 */ 4590 */
4591 unsigned long __meminit __absent_pages_in_range(int nid, 4591 unsigned long __meminit __absent_pages_in_range(int nid,
4592 unsigned long range_start_pfn, 4592 unsigned long range_start_pfn,
4593 unsigned long range_end_pfn) 4593 unsigned long range_end_pfn)
4594 { 4594 {
4595 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4595 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4596 unsigned long start_pfn, end_pfn; 4596 unsigned long start_pfn, end_pfn;
4597 int i; 4597 int i;
4598 4598
4599 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4599 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4600 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4600 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4601 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4601 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4602 nr_absent -= end_pfn - start_pfn; 4602 nr_absent -= end_pfn - start_pfn;
4603 } 4603 }
4604 return nr_absent; 4604 return nr_absent;
4605 } 4605 }
4606 4606
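/*
 * Example of the accounting above (hypothetical memblock layout): for the
 * range [0x1000, 0x5000) with registered memory at [0x1000, 0x2000) and
 * [0x3000, 0x5000), nr_absent starts at 0x4000, loses 0x1000 and then
 * 0x2000 for the present regions, and 0x1000 pages of hole are reported.
 */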
4607 /** 4607 /**
4608 * absent_pages_in_range - Return number of page frames in holes within a range 4608 * absent_pages_in_range - Return number of page frames in holes within a range
4609 * @start_pfn: The start PFN to start searching for holes 4609 * @start_pfn: The start PFN to start searching for holes
4610 * @end_pfn: The end PFN to stop searching for holes 4610 * @end_pfn: The end PFN to stop searching for holes
4611 * 4611 *
4612 * It returns the number of page frames in memory holes within a range. 4612 * It returns the number of page frames in memory holes within a range.
4613 */ 4613 */
4614 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4614 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4615 unsigned long end_pfn) 4615 unsigned long end_pfn)
4616 { 4616 {
4617 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4617 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4618 } 4618 }
4619 4619
4620 /* Return the number of page frames in holes in a zone on a node */ 4620 /* Return the number of page frames in holes in a zone on a node */
4621 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4621 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4622 unsigned long zone_type, 4622 unsigned long zone_type,
4623 unsigned long node_start_pfn, 4623 unsigned long node_start_pfn,
4624 unsigned long node_end_pfn, 4624 unsigned long node_end_pfn,
4625 unsigned long *ignored) 4625 unsigned long *ignored)
4626 { 4626 {
4627 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4627 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4628 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4628 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4629 unsigned long zone_start_pfn, zone_end_pfn; 4629 unsigned long zone_start_pfn, zone_end_pfn;
4630 4630
4631 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4631 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4632 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4632 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4633 4633
4634 adjust_zone_range_for_zone_movable(nid, zone_type, 4634 adjust_zone_range_for_zone_movable(nid, zone_type,
4635 node_start_pfn, node_end_pfn, 4635 node_start_pfn, node_end_pfn,
4636 &zone_start_pfn, &zone_end_pfn); 4636 &zone_start_pfn, &zone_end_pfn);
4637 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4637 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4638 } 4638 }
4639 4639
4640 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4640 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4641 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4641 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4642 unsigned long zone_type, 4642 unsigned long zone_type,
4643 unsigned long node_start_pfn, 4643 unsigned long node_start_pfn,
4644 unsigned long node_end_pfn, 4644 unsigned long node_end_pfn,
4645 unsigned long *zones_size) 4645 unsigned long *zones_size)
4646 { 4646 {
4647 return zones_size[zone_type]; 4647 return zones_size[zone_type];
4648 } 4648 }
4649 4649
4650 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4650 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4651 unsigned long zone_type, 4651 unsigned long zone_type,
4652 unsigned long node_start_pfn, 4652 unsigned long node_start_pfn,
4653 unsigned long node_end_pfn, 4653 unsigned long node_end_pfn,
4654 unsigned long *zholes_size) 4654 unsigned long *zholes_size)
4655 { 4655 {
4656 if (!zholes_size) 4656 if (!zholes_size)
4657 return 0; 4657 return 0;
4658 4658
4659 return zholes_size[zone_type]; 4659 return zholes_size[zone_type];
4660 } 4660 }
4661 4661
4662 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4662 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4663 4663
4664 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4664 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4665 unsigned long node_start_pfn, 4665 unsigned long node_start_pfn,
4666 unsigned long node_end_pfn, 4666 unsigned long node_end_pfn,
4667 unsigned long *zones_size, 4667 unsigned long *zones_size,
4668 unsigned long *zholes_size) 4668 unsigned long *zholes_size)
4669 { 4669 {
4670 unsigned long realtotalpages, totalpages = 0; 4670 unsigned long realtotalpages, totalpages = 0;
4671 enum zone_type i; 4671 enum zone_type i;
4672 4672
4673 for (i = 0; i < MAX_NR_ZONES; i++) 4673 for (i = 0; i < MAX_NR_ZONES; i++)
4674 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4674 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4675 node_start_pfn, 4675 node_start_pfn,
4676 node_end_pfn, 4676 node_end_pfn,
4677 zones_size); 4677 zones_size);
4678 pgdat->node_spanned_pages = totalpages; 4678 pgdat->node_spanned_pages = totalpages;
4679 4679
4680 realtotalpages = totalpages; 4680 realtotalpages = totalpages;
4681 for (i = 0; i < MAX_NR_ZONES; i++) 4681 for (i = 0; i < MAX_NR_ZONES; i++)
4682 realtotalpages -= 4682 realtotalpages -=
4683 zone_absent_pages_in_node(pgdat->node_id, i, 4683 zone_absent_pages_in_node(pgdat->node_id, i,
4684 node_start_pfn, node_end_pfn, 4684 node_start_pfn, node_end_pfn,
4685 zholes_size); 4685 zholes_size);
4686 pgdat->node_present_pages = realtotalpages; 4686 pgdat->node_present_pages = realtotalpages;
4687 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4687 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4688 realtotalpages); 4688 realtotalpages);
4689 } 4689 }
4690 4690
4691 #ifndef CONFIG_SPARSEMEM 4691 #ifndef CONFIG_SPARSEMEM
4692 /* 4692 /*
4693 * Calculate the size of the zone->blockflags rounded to an unsigned long 4693 * Calculate the size of the zone->blockflags rounded to an unsigned long
4694 * Start by rounding zonesize up so it is a multiple of pageblock_nr_pages. 4694 * Start by rounding zonesize up so it is a multiple of pageblock_nr_pages.
4695 * Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, round what is now 4695 * Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, round what is now
4696 * in bits up to the nearest long, and finally return the result in 4696 * in bits up to the nearest long, and finally return the result in
4697 * bytes. 4697 * bytes.
4698 */ 4698 */
4699 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 4699 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4700 { 4700 {
4701 unsigned long usemapsize; 4701 unsigned long usemapsize;
4702 4702
4703 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 4703 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4704 usemapsize = roundup(zonesize, pageblock_nr_pages); 4704 usemapsize = roundup(zonesize, pageblock_nr_pages);
4705 usemapsize = usemapsize >> pageblock_order; 4705 usemapsize = usemapsize >> pageblock_order;
4706 usemapsize *= NR_PAGEBLOCK_BITS; 4706 usemapsize *= NR_PAGEBLOCK_BITS;
4707 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4707 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4708 4708
4709 return usemapsize / 8; 4709 return usemapsize / 8;
4710 } 4710 }
4711 4711
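/*
 * Worked example, assuming the common values pageblock_order == 9
 * (pageblock_nr_pages == 512) and NR_PAGEBLOCK_BITS == 4: a 4GiB zone of
 * 1048576 4KiB pages that starts on a pageblock boundary has
 * 1048576 / 512 = 2048 pageblocks, needing 2048 * 4 = 8192 bits.  8192 is
 * already a multiple of BITS_PER_LONG, so 8192 / 8 = 1024 bytes are
 * allocated for zone->pageblock_flags.
 */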
4712 static void __init setup_usemap(struct pglist_data *pgdat, 4712 static void __init setup_usemap(struct pglist_data *pgdat,
4713 struct zone *zone, 4713 struct zone *zone,
4714 unsigned long zone_start_pfn, 4714 unsigned long zone_start_pfn,
4715 unsigned long zonesize) 4715 unsigned long zonesize)
4716 { 4716 {
4717 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4717 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4718 zone->pageblock_flags = NULL; 4718 zone->pageblock_flags = NULL;
4719 if (usemapsize) 4719 if (usemapsize)
4720 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4720 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4721 usemapsize); 4721 usemapsize);
4722 } 4722 }
4723 #else 4723 #else
4724 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4724 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4725 unsigned long zone_start_pfn, unsigned long zonesize) {} 4725 unsigned long zone_start_pfn, unsigned long zonesize) {}
4726 #endif /* CONFIG_SPARSEMEM */ 4726 #endif /* CONFIG_SPARSEMEM */
4727 4727
4728 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4728 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4729 4729
4730 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4730 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4731 void __paginginit set_pageblock_order(void) 4731 void __paginginit set_pageblock_order(void)
4732 { 4732 {
4733 unsigned int order; 4733 unsigned int order;
4734 4734
4735 /* Check that pageblock_order has not already been set up */ 4735 /* Check that pageblock_order has not already been set up */
4736 if (pageblock_order) 4736 if (pageblock_order)
4737 return; 4737 return;
4738 4738
4739 if (HPAGE_SHIFT > PAGE_SHIFT) 4739 if (HPAGE_SHIFT > PAGE_SHIFT)
4740 order = HUGETLB_PAGE_ORDER; 4740 order = HUGETLB_PAGE_ORDER;
4741 else 4741 else
4742 order = MAX_ORDER - 1; 4742 order = MAX_ORDER - 1;
4743 4743
4744 /* 4744 /*
4745 * Assume the largest contiguous order of interest is a huge page. 4745 * Assume the largest contiguous order of interest is a huge page.
4746 * This value may be variable depending on boot parameters on IA64 and 4746 * This value may be variable depending on boot parameters on IA64 and
4747 * powerpc. 4747 * powerpc.
4748 */ 4748 */
4749 pageblock_order = order; 4749 pageblock_order = order;
4750 } 4750 }
4751 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4751 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4752 4752
4753 /* 4753 /*
4754 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4754 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4755 * is unused as pageblock_order is set at compile-time. See 4755 * is unused as pageblock_order is set at compile-time. See
4756 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4756 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4757 * the kernel config 4757 * the kernel config
4758 */ 4758 */
4759 void __paginginit set_pageblock_order(void) 4759 void __paginginit set_pageblock_order(void)
4760 { 4760 {
4761 } 4761 }
4762 4762
4763 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4763 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4764 4764
4765 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 4765 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4766 unsigned long present_pages) 4766 unsigned long present_pages)
4767 { 4767 {
4768 unsigned long pages = spanned_pages; 4768 unsigned long pages = spanned_pages;
4769 4769
4770 /* 4770 /*
4771 * Provide a more accurate estimation if there are holes within 4771 * Provide a more accurate estimation if there are holes within
4772 * the zone and SPARSEMEM is in use. If there are holes within the 4772 * the zone and SPARSEMEM is in use. If there are holes within the
4773 * zone, each populated memory region may cost us one or two extra 4773 * zone, each populated memory region may cost us one or two extra
4774 * memmap pages due to alignment because the memmap pages for each 4774 * memmap pages due to alignment because the memmap pages for each
4775 * populated region may not be naturally aligned on a page boundary. 4775 * populated region may not be naturally aligned on a page boundary.
4776 * So the (present_pages >> 4) heuristic is a tradeoff for that. 4776 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4777 */ 4777 */
4778 if (spanned_pages > present_pages + (present_pages >> 4) && 4778 if (spanned_pages > present_pages + (present_pages >> 4) &&
4779 IS_ENABLED(CONFIG_SPARSEMEM)) 4779 IS_ENABLED(CONFIG_SPARSEMEM))
4780 pages = present_pages; 4780 pages = present_pages;
4781 4781
4782 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 4782 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4783 } 4783 }
4784 4784
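/*
 * Worked example with round numbers (assuming 4KiB pages and a 64-byte
 * struct page, which are typical but configuration dependent): a sparse
 * zone spanning 1,000,000 pages with only 900,000 present exceeds the
 * present + present/16 threshold, so the estimate is based on the present
 * pages: PAGE_ALIGN(900,000 * 64) >> PAGE_SHIFT == 14,063 memmap pages.
 */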
4785 /* 4785 /*
4786 * Set up the zone data structures: 4786 * Set up the zone data structures:
4787 * - mark all pages reserved 4787 * - mark all pages reserved
4788 * - mark all memory queues empty 4788 * - mark all memory queues empty
4789 * - clear the memory bitmaps 4789 * - clear the memory bitmaps
4790 * 4790 *
4791 * NOTE: pgdat should get zeroed by caller. 4791 * NOTE: pgdat should get zeroed by caller.
4792 */ 4792 */
4793 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4793 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4794 unsigned long node_start_pfn, unsigned long node_end_pfn, 4794 unsigned long node_start_pfn, unsigned long node_end_pfn,
4795 unsigned long *zones_size, unsigned long *zholes_size) 4795 unsigned long *zones_size, unsigned long *zholes_size)
4796 { 4796 {
4797 enum zone_type j; 4797 enum zone_type j;
4798 int nid = pgdat->node_id; 4798 int nid = pgdat->node_id;
4799 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4799 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4800 int ret; 4800 int ret;
4801 4801
4802 pgdat_resize_init(pgdat); 4802 pgdat_resize_init(pgdat);
4803 #ifdef CONFIG_NUMA_BALANCING 4803 #ifdef CONFIG_NUMA_BALANCING
4804 spin_lock_init(&pgdat->numabalancing_migrate_lock); 4804 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4805 pgdat->numabalancing_migrate_nr_pages = 0; 4805 pgdat->numabalancing_migrate_nr_pages = 0;
4806 pgdat->numabalancing_migrate_next_window = jiffies; 4806 pgdat->numabalancing_migrate_next_window = jiffies;
4807 #endif 4807 #endif
4808 init_waitqueue_head(&pgdat->kswapd_wait); 4808 init_waitqueue_head(&pgdat->kswapd_wait);
4809 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4809 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4810 pgdat_page_cgroup_init(pgdat); 4810 pgdat_page_cgroup_init(pgdat);
4811 4811
4812 for (j = 0; j < MAX_NR_ZONES; j++) { 4812 for (j = 0; j < MAX_NR_ZONES; j++) {
4813 struct zone *zone = pgdat->node_zones + j; 4813 struct zone *zone = pgdat->node_zones + j;
4814 unsigned long size, realsize, freesize, memmap_pages; 4814 unsigned long size, realsize, freesize, memmap_pages;
4815 4815
4816 size = zone_spanned_pages_in_node(nid, j, node_start_pfn, 4816 size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
4817 node_end_pfn, zones_size); 4817 node_end_pfn, zones_size);
4818 realsize = freesize = size - zone_absent_pages_in_node(nid, j, 4818 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4819 node_start_pfn, 4819 node_start_pfn,
4820 node_end_pfn, 4820 node_end_pfn,
4821 zholes_size); 4821 zholes_size);
4822 4822
4823 /* 4823 /*
4824 * Adjust freesize so that it accounts for how much memory 4824 * Adjust freesize so that it accounts for how much memory
4825 * is used by this zone for memmap. This affects the watermark 4825 * is used by this zone for memmap. This affects the watermark
4826 * and per-cpu initialisations 4826 * and per-cpu initialisations
4827 */ 4827 */
4828 memmap_pages = calc_memmap_size(size, realsize); 4828 memmap_pages = calc_memmap_size(size, realsize);
4829 if (freesize >= memmap_pages) { 4829 if (freesize >= memmap_pages) {
4830 freesize -= memmap_pages; 4830 freesize -= memmap_pages;
4831 if (memmap_pages) 4831 if (memmap_pages)
4832 printk(KERN_DEBUG 4832 printk(KERN_DEBUG
4833 " %s zone: %lu pages used for memmap\n", 4833 " %s zone: %lu pages used for memmap\n",
4834 zone_names[j], memmap_pages); 4834 zone_names[j], memmap_pages);
4835 } else 4835 } else
4836 printk(KERN_WARNING 4836 printk(KERN_WARNING
4837 " %s zone: %lu pages exceeds freesize %lu\n", 4837 " %s zone: %lu pages exceeds freesize %lu\n",
4838 zone_names[j], memmap_pages, freesize); 4838 zone_names[j], memmap_pages, freesize);
4839 4839
4840 /* Account for reserved pages */ 4840 /* Account for reserved pages */
4841 if (j == 0 && freesize > dma_reserve) { 4841 if (j == 0 && freesize > dma_reserve) {
4842 freesize -= dma_reserve; 4842 freesize -= dma_reserve;
4843 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4843 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4844 zone_names[0], dma_reserve); 4844 zone_names[0], dma_reserve);
4845 } 4845 }
4846 4846
4847 if (!is_highmem_idx(j)) 4847 if (!is_highmem_idx(j))
4848 nr_kernel_pages += freesize; 4848 nr_kernel_pages += freesize;
4849 /* Charge for highmem memmap if there are enough kernel pages */ 4849 /* Charge for highmem memmap if there are enough kernel pages */
4850 else if (nr_kernel_pages > memmap_pages * 2) 4850 else if (nr_kernel_pages > memmap_pages * 2)
4851 nr_kernel_pages -= memmap_pages; 4851 nr_kernel_pages -= memmap_pages;
4852 nr_all_pages += freesize; 4852 nr_all_pages += freesize;
4853 4853
4854 zone->spanned_pages = size; 4854 zone->spanned_pages = size;
4855 zone->present_pages = realsize; 4855 zone->present_pages = realsize;
4856 /* 4856 /*
4857 * Set an approximate value for lowmem here; it will be adjusted 4857 * Set an approximate value for lowmem here; it will be adjusted
4858 * when the bootmem allocator frees pages into the buddy system. 4858 * when the bootmem allocator frees pages into the buddy system.
4859 * And all highmem pages will be managed by the buddy system. 4859 * And all highmem pages will be managed by the buddy system.
4860 */ 4860 */
4861 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; 4861 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4862 #ifdef CONFIG_NUMA 4862 #ifdef CONFIG_NUMA
4863 zone->node = nid; 4863 zone->node = nid;
4864 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) 4864 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4865 / 100; 4865 / 100;
4866 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; 4866 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4867 #endif 4867 #endif
4868 zone->name = zone_names[j]; 4868 zone->name = zone_names[j];
4869 spin_lock_init(&zone->lock); 4869 spin_lock_init(&zone->lock);
4870 spin_lock_init(&zone->lru_lock); 4870 spin_lock_init(&zone->lru_lock);
4871 zone_seqlock_init(zone); 4871 zone_seqlock_init(zone);
4872 zone->zone_pgdat = pgdat; 4872 zone->zone_pgdat = pgdat;
4873 zone_pcp_init(zone); 4873 zone_pcp_init(zone);
4874 4874
4875 /* For bootup, initialized properly in watermark setup */ 4875 /* For bootup, initialized properly in watermark setup */
4876 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); 4876 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
4877 4877
4878 lruvec_init(&zone->lruvec); 4878 lruvec_init(&zone->lruvec);
4879 if (!size) 4879 if (!size)
4880 continue; 4880 continue;
4881 4881
4882 set_pageblock_order(); 4882 set_pageblock_order();
4883 setup_usemap(pgdat, zone, zone_start_pfn, size); 4883 setup_usemap(pgdat, zone, zone_start_pfn, size);
4884 ret = init_currently_empty_zone(zone, zone_start_pfn, 4884 ret = init_currently_empty_zone(zone, zone_start_pfn,
4885 size, MEMMAP_EARLY); 4885 size, MEMMAP_EARLY);
4886 BUG_ON(ret); 4886 BUG_ON(ret);
4887 memmap_init(size, nid, j, zone_start_pfn); 4887 memmap_init(size, nid, j, zone_start_pfn);
4888 zone_start_pfn += size; 4888 zone_start_pfn += size;
4889 } 4889 }
4890 } 4890 }
4891 4891
4892 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4892 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4893 { 4893 {
4894 /* Skip empty nodes */ 4894 /* Skip empty nodes */
4895 if (!pgdat->node_spanned_pages) 4895 if (!pgdat->node_spanned_pages)
4896 return; 4896 return;
4897 4897
4898 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4898 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4899 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4899 /* ia64 gets its own node_mem_map, before this, without bootmem */
4900 if (!pgdat->node_mem_map) { 4900 if (!pgdat->node_mem_map) {
4901 unsigned long size, start, end; 4901 unsigned long size, start, end;
4902 struct page *map; 4902 struct page *map;
4903 4903
4904 /* 4904 /*
4905 * The zone's endpoints aren't required to be MAX_ORDER 4905 * The zone's endpoints aren't required to be MAX_ORDER
4906 * aligned, but the node_mem_map endpoints must be MAX_ORDER 4906 * aligned, but the node_mem_map endpoints must be MAX_ORDER
4907 * aligned for the buddy allocator to function correctly. 4907 * aligned for the buddy allocator to function correctly.
4908 */ 4908 */
4909 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4909 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4910 end = pgdat_end_pfn(pgdat); 4910 end = pgdat_end_pfn(pgdat);
4911 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4911 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4912 size = (end - start) * sizeof(struct page); 4912 size = (end - start) * sizeof(struct page);
4913 map = alloc_remap(pgdat->node_id, size); 4913 map = alloc_remap(pgdat->node_id, size);
4914 if (!map) 4914 if (!map)
4915 map = alloc_bootmem_node_nopanic(pgdat, size); 4915 map = alloc_bootmem_node_nopanic(pgdat, size);
4916 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4916 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4917 } 4917 }
4918 #ifndef CONFIG_NEED_MULTIPLE_NODES 4918 #ifndef CONFIG_NEED_MULTIPLE_NODES
4919 /* 4919 /*
4920 * With no DISCONTIG, the global mem_map is just set as node 0's 4920 * With no DISCONTIG, the global mem_map is just set as node 0's
4921 */ 4921 */
4922 if (pgdat == NODE_DATA(0)) { 4922 if (pgdat == NODE_DATA(0)) {
4923 mem_map = NODE_DATA(0)->node_mem_map; 4923 mem_map = NODE_DATA(0)->node_mem_map;
4924 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4924 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4925 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4925 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4926 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4926 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4927 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4927 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4928 } 4928 }
4929 #endif 4929 #endif
4930 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4930 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4931 } 4931 }
4932 4932
4933 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4933 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4934 unsigned long node_start_pfn, unsigned long *zholes_size) 4934 unsigned long node_start_pfn, unsigned long *zholes_size)
4935 { 4935 {
4936 pg_data_t *pgdat = NODE_DATA(nid); 4936 pg_data_t *pgdat = NODE_DATA(nid);
4937 unsigned long start_pfn = 0; 4937 unsigned long start_pfn = 0;
4938 unsigned long end_pfn = 0; 4938 unsigned long end_pfn = 0;
4939 4939
4940 /* pg_data_t should be reset to zero when it's allocated */ 4940 /* pg_data_t should be reset to zero when it's allocated */
4941 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 4941 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4942 4942
4943 pgdat->node_id = nid; 4943 pgdat->node_id = nid;
4944 pgdat->node_start_pfn = node_start_pfn; 4944 pgdat->node_start_pfn = node_start_pfn;
4945 if (node_state(nid, N_MEMORY)) 4945 if (node_state(nid, N_MEMORY))
4946 init_zone_allows_reclaim(nid); 4946 init_zone_allows_reclaim(nid);
4947 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4947 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4948 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 4948 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4949 #endif 4949 #endif
4950 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 4950 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4951 zones_size, zholes_size); 4951 zones_size, zholes_size);
4952 4952
4953 alloc_node_mem_map(pgdat); 4953 alloc_node_mem_map(pgdat);
4954 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4954 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4955 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4955 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4956 nid, (unsigned long)pgdat, 4956 nid, (unsigned long)pgdat,
4957 (unsigned long)pgdat->node_mem_map); 4957 (unsigned long)pgdat->node_mem_map);
4958 #endif 4958 #endif
4959 4959
4960 free_area_init_core(pgdat, start_pfn, end_pfn, 4960 free_area_init_core(pgdat, start_pfn, end_pfn,
4961 zones_size, zholes_size); 4961 zones_size, zholes_size);
4962 } 4962 }
4963 4963
4964 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4964 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4965 4965
4966 #if MAX_NUMNODES > 1 4966 #if MAX_NUMNODES > 1
4967 /* 4967 /*
4968 * Figure out the number of possible node ids. 4968 * Figure out the number of possible node ids.
4969 */ 4969 */
4970 void __init setup_nr_node_ids(void) 4970 void __init setup_nr_node_ids(void)
4971 { 4971 {
4972 unsigned int node; 4972 unsigned int node;
4973 unsigned int highest = 0; 4973 unsigned int highest = 0;
4974 4974
4975 for_each_node_mask(node, node_possible_map) 4975 for_each_node_mask(node, node_possible_map)
4976 highest = node; 4976 highest = node;
4977 nr_node_ids = highest + 1; 4977 nr_node_ids = highest + 1;
4978 } 4978 }
4979 #endif 4979 #endif
4980 4980
4981 /** 4981 /**
4982 * node_map_pfn_alignment - determine the maximum internode alignment 4982 * node_map_pfn_alignment - determine the maximum internode alignment
4983 * 4983 *
4984 * This function should be called after node map is populated and sorted. 4984 * This function should be called after node map is populated and sorted.
4985 * It calculates the maximum power of two alignment which can distinguish 4985 * It calculates the maximum power of two alignment which can distinguish
4986 * all the nodes. 4986 * all the nodes.
4987 * 4987 *
4988 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4988 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4989 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4989 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4990 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 4990 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4991 * shifted, 1GiB is enough and this function will indicate so. 4991 * shifted, 1GiB is enough and this function will indicate so.
4992 * 4992 *
4993 * This is used to test whether pfn -> nid mapping of the chosen memory 4993 * This is used to test whether pfn -> nid mapping of the chosen memory
4994 * model has fine enough granularity to avoid incorrect mapping for the 4994 * model has fine enough granularity to avoid incorrect mapping for the
4995 * populated node map. 4995 * populated node map.
4996 * 4996 *
4997 * Returns the determined alignment in pfn's. 0 if there is no alignment 4997 * Returns the determined alignment in pfn's. 0 if there is no alignment
4998 * requirement (single node). 4998 * requirement (single node).
4999 */ 4999 */
5000 unsigned long __init node_map_pfn_alignment(void) 5000 unsigned long __init node_map_pfn_alignment(void)
5001 { 5001 {
5002 unsigned long accl_mask = 0, last_end = 0; 5002 unsigned long accl_mask = 0, last_end = 0;
5003 unsigned long start, end, mask; 5003 unsigned long start, end, mask;
5004 int last_nid = -1; 5004 int last_nid = -1;
5005 int i, nid; 5005 int i, nid;
5006 5006
5007 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 5007 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
5008 if (!start || last_nid < 0 || last_nid == nid) { 5008 if (!start || last_nid < 0 || last_nid == nid) {
5009 last_nid = nid; 5009 last_nid = nid;
5010 last_end = end; 5010 last_end = end;
5011 continue; 5011 continue;
5012 } 5012 }
5013 5013
5014 /* 5014 /*
5015 * Start with a mask granular enough to pin-point to the 5015 * Start with a mask granular enough to pin-point to the
5016 * start pfn and tick off bits one-by-one until it becomes 5016 * start pfn and tick off bits one-by-one until it becomes
5017 * too coarse to separate the current node from the last. 5017 * too coarse to separate the current node from the last.
5018 */ 5018 */
5019 mask = ~((1 << __ffs(start)) - 1); 5019 mask = ~((1 << __ffs(start)) - 1);
5020 while (mask && last_end <= (start & (mask << 1))) 5020 while (mask && last_end <= (start & (mask << 1)))
5021 mask <<= 1; 5021 mask <<= 1;
5022 5022
5023 /* accumulate all internode masks */ 5023 /* accumulate all internode masks */
5024 accl_mask |= mask; 5024 accl_mask |= mask;
5025 } 5025 }
5026 5026
5027 /* convert mask to number of pages */ 5027 /* convert mask to number of pages */
5028 return ~accl_mask + 1; 5028 return ~accl_mask + 1;
5029 } 5029 }
5030 5030
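/*
 * Concrete instance of the example in the comment above (hypothetical
 * layout, 4KiB pages): node 0 covering PFNs [0x0, 0x50000) followed by
 * node 1 at [0x50000, 0x90000) puts the boundary at 1.25GiB.  The lowest
 * set bit of 0x50000 is 0x10000, the while loop cannot widen the mask
 * without swallowing the node 0 range, and the function returns 0x10000
 * PFNs, i.e. 256MiB of required alignment.
 */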
5031 /* Find the lowest pfn for a node */ 5031 /* Find the lowest pfn for a node */
5032 static unsigned long __init find_min_pfn_for_node(int nid) 5032 static unsigned long __init find_min_pfn_for_node(int nid)
5033 { 5033 {
5034 unsigned long min_pfn = ULONG_MAX; 5034 unsigned long min_pfn = ULONG_MAX;
5035 unsigned long start_pfn; 5035 unsigned long start_pfn;
5036 int i; 5036 int i;
5037 5037
5038 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 5038 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
5039 min_pfn = min(min_pfn, start_pfn); 5039 min_pfn = min(min_pfn, start_pfn);
5040 5040
5041 if (min_pfn == ULONG_MAX) { 5041 if (min_pfn == ULONG_MAX) {
5042 printk(KERN_WARNING 5042 printk(KERN_WARNING
5043 "Could not find start_pfn for node %d\n", nid); 5043 "Could not find start_pfn for node %d\n", nid);
5044 return 0; 5044 return 0;
5045 } 5045 }
5046 5046
5047 return min_pfn; 5047 return min_pfn;
5048 } 5048 }
5049 5049
5050 /** 5050 /**
5051 * find_min_pfn_with_active_regions - Find the minimum PFN registered 5051 * find_min_pfn_with_active_regions - Find the minimum PFN registered
5052 * 5052 *
5053 * It returns the minimum PFN based on information provided via 5053 * It returns the minimum PFN based on information provided via
5054 * add_active_range(). 5054 * add_active_range().
5055 */ 5055 */
5056 unsigned long __init find_min_pfn_with_active_regions(void) 5056 unsigned long __init find_min_pfn_with_active_regions(void)
5057 { 5057 {
5058 return find_min_pfn_for_node(MAX_NUMNODES); 5058 return find_min_pfn_for_node(MAX_NUMNODES);
5059 } 5059 }
5060 5060
5061 /* 5061 /*
5062 * early_calculate_totalpages() 5062 * early_calculate_totalpages()
5063 * Sum pages in active regions for movable zone. 5063 * Sum pages in active regions for movable zone.
5064 * Populate N_MEMORY for calculating usable_nodes. 5064 * Populate N_MEMORY for calculating usable_nodes.
5065 */ 5065 */
5066 static unsigned long __init early_calculate_totalpages(void) 5066 static unsigned long __init early_calculate_totalpages(void)
5067 { 5067 {
5068 unsigned long totalpages = 0; 5068 unsigned long totalpages = 0;
5069 unsigned long start_pfn, end_pfn; 5069 unsigned long start_pfn, end_pfn;
5070 int i, nid; 5070 int i, nid;
5071 5071
5072 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 5072 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
5073 unsigned long pages = end_pfn - start_pfn; 5073 unsigned long pages = end_pfn - start_pfn;
5074 5074
5075 totalpages += pages; 5075 totalpages += pages;
5076 if (pages) 5076 if (pages)
5077 node_set_state(nid, N_MEMORY); 5077 node_set_state(nid, N_MEMORY);
5078 } 5078 }
5079 return totalpages; 5079 return totalpages;
5080 } 5080 }
5081 5081
5082 /* 5082 /*
5083 * Find the PFN the Movable zone begins in each node. Kernel memory 5083 * Find the PFN the Movable zone begins in each node. Kernel memory
5084 * is spread evenly between nodes as long as the nodes have enough 5084 * is spread evenly between nodes as long as the nodes have enough
5085 * memory. When they don't, some nodes will have more kernelcore than 5085 * memory. When they don't, some nodes will have more kernelcore than
5086 * others 5086 * others
5087 */ 5087 */
5088 static void __init find_zone_movable_pfns_for_nodes(void) 5088 static void __init find_zone_movable_pfns_for_nodes(void)
5089 { 5089 {
5090 int i, nid; 5090 int i, nid;
5091 unsigned long usable_startpfn; 5091 unsigned long usable_startpfn;
5092 unsigned long kernelcore_node, kernelcore_remaining; 5092 unsigned long kernelcore_node, kernelcore_remaining;
5093 /* save the state before borrowing the nodemask */ 5093 /* save the state before borrowing the nodemask */
5094 nodemask_t saved_node_state = node_states[N_MEMORY]; 5094 nodemask_t saved_node_state = node_states[N_MEMORY];
5095 unsigned long totalpages = early_calculate_totalpages(); 5095 unsigned long totalpages = early_calculate_totalpages();
5096 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5096 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
5097 5097
5098 /* 5098 /*
5099 * If movablecore was specified, calculate the corresponding 5099 * If movablecore was specified, calculate the corresponding
5100 * amount of kernelcore so that memory usable for 5100 * amount of kernelcore so that memory usable for
5101 * any allocation type is evenly spread. If both kernelcore 5101 * any allocation type is evenly spread. If both kernelcore
5102 * and movablecore are specified, then the value of kernelcore 5102 * and movablecore are specified, then the value of kernelcore
5103 * will be used for required_kernelcore if it's greater than 5103 * will be used for required_kernelcore if it's greater than
5104 * what movablecore would have allowed. 5104 * what movablecore would have allowed.
5105 */ 5105 */
5106 if (required_movablecore) { 5106 if (required_movablecore) {
5107 unsigned long corepages; 5107 unsigned long corepages;
5108 5108
5109 /* 5109 /*
5110 * Round-up so that ZONE_MOVABLE is at least as large as what 5110 * Round-up so that ZONE_MOVABLE is at least as large as what
5111 * was requested by the user 5111 * was requested by the user
5112 */ 5112 */
5113 required_movablecore = 5113 required_movablecore =
5114 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 5114 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
5115 corepages = totalpages - required_movablecore; 5115 corepages = totalpages - required_movablecore;
5116 5116
5117 required_kernelcore = max(required_kernelcore, corepages); 5117 required_kernelcore = max(required_kernelcore, corepages);
5118 } 5118 }
5119 5119
5120 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 5120 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
5121 if (!required_kernelcore) 5121 if (!required_kernelcore)
5122 goto out; 5122 goto out;
5123 5123
5124 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 5124 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
5125 find_usable_zone_for_movable(); 5125 find_usable_zone_for_movable();
5126 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 5126 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
5127 5127
5128 restart: 5128 restart:
5129 /* Spread kernelcore memory as evenly as possible throughout nodes */ 5129 /* Spread kernelcore memory as evenly as possible throughout nodes */
5130 kernelcore_node = required_kernelcore / usable_nodes; 5130 kernelcore_node = required_kernelcore / usable_nodes;
5131 for_each_node_state(nid, N_MEMORY) { 5131 for_each_node_state(nid, N_MEMORY) {
5132 unsigned long start_pfn, end_pfn; 5132 unsigned long start_pfn, end_pfn;
5133 5133
5134 /* 5134 /*
5135 * Recalculate kernelcore_node if the division per node 5135 * Recalculate kernelcore_node if the division per node
5136 * now exceeds what is necessary to satisfy the requested 5136 * now exceeds what is necessary to satisfy the requested
5137 * amount of memory for the kernel 5137 * amount of memory for the kernel
5138 */ 5138 */
5139 if (required_kernelcore < kernelcore_node) 5139 if (required_kernelcore < kernelcore_node)
5140 kernelcore_node = required_kernelcore / usable_nodes; 5140 kernelcore_node = required_kernelcore / usable_nodes;
5141 5141
5142 /* 5142 /*
5143 * As the map is walked, we track how much memory is usable 5143 * As the map is walked, we track how much memory is usable
5144 * by the kernel using kernelcore_remaining. When it is 5144 * by the kernel using kernelcore_remaining. When it is
5145 * 0, the rest of the node is usable by ZONE_MOVABLE 5145 * 0, the rest of the node is usable by ZONE_MOVABLE
5146 */ 5146 */
5147 kernelcore_remaining = kernelcore_node; 5147 kernelcore_remaining = kernelcore_node;
5148 5148
5149 /* Go through each range of PFNs within this node */ 5149 /* Go through each range of PFNs within this node */
5150 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 5150 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
5151 unsigned long size_pages; 5151 unsigned long size_pages;
5152 5152
5153 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 5153 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
5154 if (start_pfn >= end_pfn) 5154 if (start_pfn >= end_pfn)
5155 continue; 5155 continue;
5156 5156
5157 /* Account for what is only usable for kernelcore */ 5157 /* Account for what is only usable for kernelcore */
5158 if (start_pfn < usable_startpfn) { 5158 if (start_pfn < usable_startpfn) {
5159 unsigned long kernel_pages; 5159 unsigned long kernel_pages;
5160 kernel_pages = min(end_pfn, usable_startpfn) 5160 kernel_pages = min(end_pfn, usable_startpfn)
5161 - start_pfn; 5161 - start_pfn;
5162 5162
5163 kernelcore_remaining -= min(kernel_pages, 5163 kernelcore_remaining -= min(kernel_pages,
5164 kernelcore_remaining); 5164 kernelcore_remaining);
5165 required_kernelcore -= min(kernel_pages, 5165 required_kernelcore -= min(kernel_pages,
5166 required_kernelcore); 5166 required_kernelcore);
5167 5167
5168 /* Continue if range is now fully accounted */ 5168 /* Continue if range is now fully accounted */
5169 if (end_pfn <= usable_startpfn) { 5169 if (end_pfn <= usable_startpfn) {
5170 5170
5171 /* 5171 /*
5172 * Push zone_movable_pfn to the end so 5172 * Push zone_movable_pfn to the end so
5173 * that if we have to rebalance 5173 * that if we have to rebalance
5174 * kernelcore across nodes, we will 5174 * kernelcore across nodes, we will
5175 * not double account here 5175 * not double account here
5176 */ 5176 */
5177 zone_movable_pfn[nid] = end_pfn; 5177 zone_movable_pfn[nid] = end_pfn;
5178 continue; 5178 continue;
5179 } 5179 }
5180 start_pfn = usable_startpfn; 5180 start_pfn = usable_startpfn;
5181 } 5181 }
5182 5182
5183 /* 5183 /*
5184 * The usable PFN range for ZONE_MOVABLE is from 5184 * The usable PFN range for ZONE_MOVABLE is from
5185 * start_pfn->end_pfn. Calculate size_pages as the 5185 * start_pfn->end_pfn. Calculate size_pages as the
5186 * number of pages used as kernelcore 5186 * number of pages used as kernelcore
5187 */ 5187 */
5188 size_pages = end_pfn - start_pfn; 5188 size_pages = end_pfn - start_pfn;
5189 if (size_pages > kernelcore_remaining) 5189 if (size_pages > kernelcore_remaining)
5190 size_pages = kernelcore_remaining; 5190 size_pages = kernelcore_remaining;
5191 zone_movable_pfn[nid] = start_pfn + size_pages; 5191 zone_movable_pfn[nid] = start_pfn + size_pages;
5192 5192
5193 /* 5193 /*
5194 * Some kernelcore has been met, update counts and 5194 * Some kernelcore has been met, update counts and
5195 * break if the kernelcore for this node has been 5195 * break if the kernelcore for this node has been
5196 * satisfied 5196 * satisfied
5197 */ 5197 */
5198 required_kernelcore -= min(required_kernelcore, 5198 required_kernelcore -= min(required_kernelcore,
5199 size_pages); 5199 size_pages);
5200 kernelcore_remaining -= size_pages; 5200 kernelcore_remaining -= size_pages;
5201 if (!kernelcore_remaining) 5201 if (!kernelcore_remaining)
5202 break; 5202 break;
5203 } 5203 }
5204 } 5204 }
5205 5205
5206 /* 5206 /*
5207 * If there is still required_kernelcore, we do another pass with one 5207 * If there is still required_kernelcore, we do another pass with one
5208 * less node in the count. This will push zone_movable_pfn[nid] further 5208 * less node in the count. This will push zone_movable_pfn[nid] further
5209 * along on the nodes that still have memory until kernelcore is 5209 * along on the nodes that still have memory until kernelcore is
5210 * satisfied 5210 * satisfied
5211 */ 5211 */
5212 usable_nodes--; 5212 usable_nodes--;
5213 if (usable_nodes && required_kernelcore > usable_nodes) 5213 if (usable_nodes && required_kernelcore > usable_nodes)
5214 goto restart; 5214 goto restart;
5215 5215
5216 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5216 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
5217 for (nid = 0; nid < MAX_NUMNODES; nid++) 5217 for (nid = 0; nid < MAX_NUMNODES; nid++)
5218 zone_movable_pfn[nid] = 5218 zone_movable_pfn[nid] =
5219 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 5219 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
5220 5220
5221 out: 5221 out:
5222 /* restore the node_state */ 5222 /* restore the node_state */
5223 node_states[N_MEMORY] = saved_node_state; 5223 node_states[N_MEMORY] = saved_node_state;
5224 } 5224 }
5225 5225
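/*
 * Sketch of the outcome with made-up numbers: booting with kernelcore=2G
 * on two nodes of 4GiB each (4KiB pages) asks for 524288 kernel pages,
 * i.e. 262144 per node.  Each node's zone_movable_pfn[] therefore lands
 * 262144 PFNs past its start (rounded up to MAX_ORDER_NR_PAGES), and the
 * remaining ~3GiB per node becomes ZONE_MOVABLE.
 */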
5226 /* Any regular or high memory on that node? */ 5226 /* Any regular or high memory on that node? */
5227 static void check_for_memory(pg_data_t *pgdat, int nid) 5227 static void check_for_memory(pg_data_t *pgdat, int nid)
5228 { 5228 {
5229 enum zone_type zone_type; 5229 enum zone_type zone_type;
5230 5230
5231 if (N_MEMORY == N_NORMAL_MEMORY) 5231 if (N_MEMORY == N_NORMAL_MEMORY)
5232 return; 5232 return;
5233 5233
5234 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 5234 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
5235 struct zone *zone = &pgdat->node_zones[zone_type]; 5235 struct zone *zone = &pgdat->node_zones[zone_type];
5236 if (zone->present_pages) { 5236 if (zone->present_pages) {
5237 node_set_state(nid, N_HIGH_MEMORY); 5237 node_set_state(nid, N_HIGH_MEMORY);
5238 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 5238 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
5239 zone_type <= ZONE_NORMAL) 5239 zone_type <= ZONE_NORMAL)
5240 node_set_state(nid, N_NORMAL_MEMORY); 5240 node_set_state(nid, N_NORMAL_MEMORY);
5241 break; 5241 break;
5242 } 5242 }
5243 } 5243 }
5244 } 5244 }
5245 5245
5246 /** 5246 /**
5247 * free_area_init_nodes - Initialise all pg_data_t and zone data 5247 * free_area_init_nodes - Initialise all pg_data_t and zone data
5248 * @max_zone_pfn: an array of max PFNs for each zone 5248 * @max_zone_pfn: an array of max PFNs for each zone
5249 * 5249 *
5250 * This will call free_area_init_node() for each active node in the system. 5250 * This will call free_area_init_node() for each active node in the system.
5251 * Using the page ranges provided by add_active_range(), the size of each 5251 * Using the page ranges provided by add_active_range(), the size of each
5252 * zone in each node and their holes are calculated. If the maximum PFNs 5252 * zone in each node and their holes are calculated. If the maximum PFNs
5253 * of two adjacent zones match, the zone is assumed to be empty. 5253 * of two adjacent zones match, the zone is assumed to be empty.
5254 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 5254 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
5255 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 5255 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
5256 * starts where the previous one ended. For example, ZONE_DMA32 starts 5256 * starts where the previous one ended. For example, ZONE_DMA32 starts
5257 * at arch_max_dma_pfn. 5257 * at arch_max_dma_pfn.
5258 */ 5258 */
5259 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 5259 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5260 { 5260 {
5261 unsigned long start_pfn, end_pfn; 5261 unsigned long start_pfn, end_pfn;
5262 int i, nid; 5262 int i, nid;
5263 5263
5264 /* Record where the zone boundaries are */ 5264 /* Record where the zone boundaries are */
5265 memset(arch_zone_lowest_possible_pfn, 0, 5265 memset(arch_zone_lowest_possible_pfn, 0,
5266 sizeof(arch_zone_lowest_possible_pfn)); 5266 sizeof(arch_zone_lowest_possible_pfn));
5267 memset(arch_zone_highest_possible_pfn, 0, 5267 memset(arch_zone_highest_possible_pfn, 0,
5268 sizeof(arch_zone_highest_possible_pfn)); 5268 sizeof(arch_zone_highest_possible_pfn));
5269 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 5269 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
5270 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 5270 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
5271 for (i = 1; i < MAX_NR_ZONES; i++) { 5271 for (i = 1; i < MAX_NR_ZONES; i++) {
5272 if (i == ZONE_MOVABLE) 5272 if (i == ZONE_MOVABLE)
5273 continue; 5273 continue;
5274 arch_zone_lowest_possible_pfn[i] = 5274 arch_zone_lowest_possible_pfn[i] =
5275 arch_zone_highest_possible_pfn[i-1]; 5275 arch_zone_highest_possible_pfn[i-1];
5276 arch_zone_highest_possible_pfn[i] = 5276 arch_zone_highest_possible_pfn[i] =
5277 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 5277 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
5278 } 5278 }
5279 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 5279 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
5280 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 5280 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
5281 5281
5282 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5282 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
5283 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5283 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
5284 find_zone_movable_pfns_for_nodes(); 5284 find_zone_movable_pfns_for_nodes();
5285 5285
5286 /* Print out the zone ranges */ 5286 /* Print out the zone ranges */
5287 printk("Zone ranges:\n"); 5287 printk("Zone ranges:\n");
5288 for (i = 0; i < MAX_NR_ZONES; i++) { 5288 for (i = 0; i < MAX_NR_ZONES; i++) {
5289 if (i == ZONE_MOVABLE) 5289 if (i == ZONE_MOVABLE)
5290 continue; 5290 continue;
5291 printk(KERN_CONT " %-8s ", zone_names[i]); 5291 printk(KERN_CONT " %-8s ", zone_names[i]);
5292 if (arch_zone_lowest_possible_pfn[i] == 5292 if (arch_zone_lowest_possible_pfn[i] ==
5293 arch_zone_highest_possible_pfn[i]) 5293 arch_zone_highest_possible_pfn[i])
5294 printk(KERN_CONT "empty\n"); 5294 printk(KERN_CONT "empty\n");
5295 else 5295 else
5296 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5296 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
5297 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5297 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5298 (arch_zone_highest_possible_pfn[i] 5298 (arch_zone_highest_possible_pfn[i]
5299 << PAGE_SHIFT) - 1); 5299 << PAGE_SHIFT) - 1);
5300 } 5300 }
5301 5301
5302 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5302 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5303 printk("Movable zone start for each node\n"); 5303 printk("Movable zone start for each node\n");
5304 for (i = 0; i < MAX_NUMNODES; i++) { 5304 for (i = 0; i < MAX_NUMNODES; i++) {
5305 if (zone_movable_pfn[i]) 5305 if (zone_movable_pfn[i])
5306 printk(" Node %d: %#010lx\n", i, 5306 printk(" Node %d: %#010lx\n", i,
5307 zone_movable_pfn[i] << PAGE_SHIFT); 5307 zone_movable_pfn[i] << PAGE_SHIFT);
5308 } 5308 }
5309 5309
5310 /* Print out the early node map */ 5310 /* Print out the early node map */
5311 printk("Early memory node ranges\n"); 5311 printk("Early memory node ranges\n");
5312 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5312 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5313 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5313 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5314 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5314 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5315 5315
5316 /* Initialise every node */ 5316 /* Initialise every node */
5317 mminit_verify_pageflags_layout(); 5317 mminit_verify_pageflags_layout();
5318 setup_nr_node_ids(); 5318 setup_nr_node_ids();
5319 for_each_online_node(nid) { 5319 for_each_online_node(nid) {
5320 pg_data_t *pgdat = NODE_DATA(nid); 5320 pg_data_t *pgdat = NODE_DATA(nid);
5321 free_area_init_node(nid, NULL, 5321 free_area_init_node(nid, NULL,
5322 find_min_pfn_for_node(nid), NULL); 5322 find_min_pfn_for_node(nid), NULL);
5323 5323
5324 /* Any memory on that node */ 5324 /* Any memory on that node */
5325 if (pgdat->node_present_pages) 5325 if (pgdat->node_present_pages)
5326 node_set_state(nid, N_MEMORY); 5326 node_set_state(nid, N_MEMORY);
5327 check_for_memory(pgdat, nid); 5327 check_for_memory(pgdat, nid);
5328 } 5328 }
5329 } 5329 }
5330 5330
5331 static int __init cmdline_parse_core(char *p, unsigned long *core) 5331 static int __init cmdline_parse_core(char *p, unsigned long *core)
5332 { 5332 {
5333 unsigned long long coremem; 5333 unsigned long long coremem;
5334 if (!p) 5334 if (!p)
5335 return -EINVAL; 5335 return -EINVAL;
5336 5336
5337 coremem = memparse(p, &p); 5337 coremem = memparse(p, &p);
5338 *core = coremem >> PAGE_SHIFT; 5338 *core = coremem >> PAGE_SHIFT;
5339 5339
5340 /* Paranoid check that UL is enough for the coremem value */ 5340 /* Paranoid check that UL is enough for the coremem value */
5341 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 5341 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
5342 5342
5343 return 0; 5343 return 0;
5344 } 5344 }
5345 5345
5346 /* 5346 /*
5347 * kernelcore=size sets the amount of memory to use for allocations that 5347 * kernelcore=size sets the amount of memory to use for allocations that
5348 * cannot be reclaimed or migrated. 5348 * cannot be reclaimed or migrated.
5349 */ 5349 */
5350 static int __init cmdline_parse_kernelcore(char *p) 5350 static int __init cmdline_parse_kernelcore(char *p)
5351 { 5351 {
5352 return cmdline_parse_core(p, &required_kernelcore); 5352 return cmdline_parse_core(p, &required_kernelcore);
5353 } 5353 }
5354 5354
5355 /* 5355 /*
5356 * movablecore=size sets the amount of memory to use for allocations that 5356 * movablecore=size sets the amount of memory to use for allocations that
5357 * can be reclaimed or migrated. 5357 * can be reclaimed or migrated.
5358 */ 5358 */
5359 static int __init cmdline_parse_movablecore(char *p) 5359 static int __init cmdline_parse_movablecore(char *p)
5360 { 5360 {
5361 return cmdline_parse_core(p, &required_movablecore); 5361 return cmdline_parse_core(p, &required_movablecore);
5362 } 5362 }
5363 5363
5364 early_param("kernelcore", cmdline_parse_kernelcore); 5364 early_param("kernelcore", cmdline_parse_kernelcore);
5365 early_param("movablecore", cmdline_parse_movablecore); 5365 early_param("movablecore", cmdline_parse_movablecore);
5366 5366
5367 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5367 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5368 5368
5369 void adjust_managed_page_count(struct page *page, long count) 5369 void adjust_managed_page_count(struct page *page, long count)
5370 { 5370 {
5371 spin_lock(&managed_page_count_lock); 5371 spin_lock(&managed_page_count_lock);
5372 page_zone(page)->managed_pages += count; 5372 page_zone(page)->managed_pages += count;
5373 totalram_pages += count; 5373 totalram_pages += count;
5374 #ifdef CONFIG_HIGHMEM 5374 #ifdef CONFIG_HIGHMEM
5375 if (PageHighMem(page)) 5375 if (PageHighMem(page))
5376 totalhigh_pages += count; 5376 totalhigh_pages += count;
5377 #endif 5377 #endif
5378 spin_unlock(&managed_page_count_lock); 5378 spin_unlock(&managed_page_count_lock);
5379 } 5379 }
5380 EXPORT_SYMBOL(adjust_managed_page_count); 5380 EXPORT_SYMBOL(adjust_managed_page_count);
5381 5381
5382 unsigned long free_reserved_area(void *start, void *end, int poison, char *s) 5382 unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
5383 { 5383 {
5384 void *pos; 5384 void *pos;
5385 unsigned long pages = 0; 5385 unsigned long pages = 0;
5386 5386
5387 start = (void *)PAGE_ALIGN((unsigned long)start); 5387 start = (void *)PAGE_ALIGN((unsigned long)start);
5388 end = (void *)((unsigned long)end & PAGE_MASK); 5388 end = (void *)((unsigned long)end & PAGE_MASK);
5389 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { 5389 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5390 if ((unsigned int)poison <= 0xFF) 5390 if ((unsigned int)poison <= 0xFF)
5391 memset(pos, poison, PAGE_SIZE); 5391 memset(pos, poison, PAGE_SIZE);
5392 free_reserved_page(virt_to_page(pos)); 5392 free_reserved_page(virt_to_page(pos));
5393 } 5393 }
5394 5394
5395 if (pages && s) 5395 if (pages && s)
5396 pr_info("Freeing %s memory: %ldK (%p - %p)\n", 5396 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
5397 s, pages << (PAGE_SHIFT - 10), start, end); 5397 s, pages << (PAGE_SHIFT - 10), start, end);
5398 5398
5399 return pages; 5399 return pages;
5400 } 5400 }
5401 EXPORT_SYMBOL(free_reserved_area); 5401 EXPORT_SYMBOL(free_reserved_area);
5402 5402
5403 #ifdef CONFIG_HIGHMEM 5403 #ifdef CONFIG_HIGHMEM
5404 void free_highmem_page(struct page *page) 5404 void free_highmem_page(struct page *page)
5405 { 5405 {
5406 __free_reserved_page(page); 5406 __free_reserved_page(page);
5407 totalram_pages++; 5407 totalram_pages++;
5408 page_zone(page)->managed_pages++; 5408 page_zone(page)->managed_pages++;
5409 totalhigh_pages++; 5409 totalhigh_pages++;
5410 } 5410 }
5411 #endif 5411 #endif
5412 5412
5413 5413
5414 void __init mem_init_print_info(const char *str) 5414 void __init mem_init_print_info(const char *str)
5415 { 5415 {
5416 unsigned long physpages, codesize, datasize, rosize, bss_size; 5416 unsigned long physpages, codesize, datasize, rosize, bss_size;
5417 unsigned long init_code_size, init_data_size; 5417 unsigned long init_code_size, init_data_size;
5418 5418
5419 physpages = get_num_physpages(); 5419 physpages = get_num_physpages();
5420 codesize = _etext - _stext; 5420 codesize = _etext - _stext;
5421 datasize = _edata - _sdata; 5421 datasize = _edata - _sdata;
5422 rosize = __end_rodata - __start_rodata; 5422 rosize = __end_rodata - __start_rodata;
5423 bss_size = __bss_stop - __bss_start; 5423 bss_size = __bss_stop - __bss_start;
5424 init_data_size = __init_end - __init_begin; 5424 init_data_size = __init_end - __init_begin;
5425 init_code_size = _einittext - _sinittext; 5425 init_code_size = _einittext - _sinittext;
5426 5426
5427 /* 5427 /*
5428 * Detect special cases and adjust section sizes accordingly: 5428 * Detect special cases and adjust section sizes accordingly:
5429 * 1) .init.* may be embedded into .data sections 5429 * 1) .init.* may be embedded into .data sections
5430 * 2) .init.text.* may be out of [__init_begin, __init_end], 5430 * 2) .init.text.* may be out of [__init_begin, __init_end],
5431 * please refer to arch/tile/kernel/vmlinux.lds.S. 5431 * please refer to arch/tile/kernel/vmlinux.lds.S.
5432 * 3) .rodata.* may be embedded into .text or .data sections. 5432 * 3) .rodata.* may be embedded into .text or .data sections.
5433 */ 5433 */
5434 #define adj_init_size(start, end, size, pos, adj) \ 5434 #define adj_init_size(start, end, size, pos, adj) \
5435 do { \ 5435 do { \
5436 if (start <= pos && pos < end && size > adj) \ 5436 if (start <= pos && pos < end && size > adj) \
5437 size -= adj; \ 5437 size -= adj; \
5438 } while (0) 5438 } while (0)
5439 5439
5440 adj_init_size(__init_begin, __init_end, init_data_size, 5440 adj_init_size(__init_begin, __init_end, init_data_size,
5441 _sinittext, init_code_size); 5441 _sinittext, init_code_size);
5442 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); 5442 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5443 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); 5443 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5444 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); 5444 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5445 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); 5445 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5446 5446
5447 #undef adj_init_size 5447 #undef adj_init_size
5448 5448
5449 printk("Memory: %luK/%luK available " 5449 printk("Memory: %luK/%luK available "
5450 "(%luK kernel code, %luK rwdata, %luK rodata, " 5450 "(%luK kernel code, %luK rwdata, %luK rodata, "
5451 "%luK init, %luK bss, %luK reserved" 5451 "%luK init, %luK bss, %luK reserved"
5452 #ifdef CONFIG_HIGHMEM 5452 #ifdef CONFIG_HIGHMEM
5453 ", %luK highmem" 5453 ", %luK highmem"
5454 #endif 5454 #endif
5455 "%s%s)\n", 5455 "%s%s)\n",
5456 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), 5456 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5457 codesize >> 10, datasize >> 10, rosize >> 10, 5457 codesize >> 10, datasize >> 10, rosize >> 10,
5458 (init_data_size + init_code_size) >> 10, bss_size >> 10, 5458 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5459 (physpages - totalram_pages) << (PAGE_SHIFT-10), 5459 (physpages - totalram_pages) << (PAGE_SHIFT-10),
5460 #ifdef CONFIG_HIGHMEM 5460 #ifdef CONFIG_HIGHMEM
5461 totalhigh_pages << (PAGE_SHIFT-10), 5461 totalhigh_pages << (PAGE_SHIFT-10),
5462 #endif 5462 #endif
5463 str ? ", " : "", str ? str : ""); 5463 str ? ", " : "", str ? str : "");
5464 } 5464 }
5465 5465
5466 /** 5466 /**
5467 * set_dma_reserve - set the specified number of pages reserved in the first zone 5467 * set_dma_reserve - set the specified number of pages reserved in the first zone
5468 * @new_dma_reserve: The number of pages to mark reserved 5468 * @new_dma_reserve: The number of pages to mark reserved
5469 * 5469 *
5470 * The per-cpu batchsize and zone watermarks are determined by present_pages. 5470 * The per-cpu batchsize and zone watermarks are determined by present_pages.
5471 * In the DMA zone, a significant percentage may be consumed by kernel image 5471 * In the DMA zone, a significant percentage may be consumed by kernel image
5472 * and other unfreeable allocations which can skew the watermarks badly. This 5472 * and other unfreeable allocations which can skew the watermarks badly. This
5473 * function may optionally be used to account for unfreeable pages in the 5473 * function may optionally be used to account for unfreeable pages in the
5474 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 5474 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
5475 * smaller per-cpu batchsize. 5475 * smaller per-cpu batchsize.
5476 */ 5476 */
5477 void __init set_dma_reserve(unsigned long new_dma_reserve) 5477 void __init set_dma_reserve(unsigned long new_dma_reserve)
5478 { 5478 {
5479 dma_reserve = new_dma_reserve; 5479 dma_reserve = new_dma_reserve;
5480 } 5480 }
5481 5481
5482 void __init free_area_init(unsigned long *zones_size) 5482 void __init free_area_init(unsigned long *zones_size)
5483 { 5483 {
5484 free_area_init_node(0, zones_size, 5484 free_area_init_node(0, zones_size,
5485 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 5485 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
5486 } 5486 }
5487 5487
5488 static int page_alloc_cpu_notify(struct notifier_block *self, 5488 static int page_alloc_cpu_notify(struct notifier_block *self,
5489 unsigned long action, void *hcpu) 5489 unsigned long action, void *hcpu)
5490 { 5490 {
5491 int cpu = (unsigned long)hcpu; 5491 int cpu = (unsigned long)hcpu;
5492 5492
5493 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 5493 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
5494 lru_add_drain_cpu(cpu); 5494 lru_add_drain_cpu(cpu);
5495 drain_pages(cpu); 5495 drain_pages(cpu);
5496 5496
5497 /* 5497 /*
5498 * Spill the event counters of the dead processor 5498 * Spill the event counters of the dead processor
5499 * into the current processor's event counters. 5499 * into the current processor's event counters.
5500 * This artificially elevates the count of the current 5500 * This artificially elevates the count of the current
5501 * processor. 5501 * processor.
5502 */ 5502 */
5503 vm_events_fold_cpu(cpu); 5503 vm_events_fold_cpu(cpu);
5504 5504
5505 /* 5505 /*
5506 * Zero the differential counters of the dead processor 5506 * Zero the differential counters of the dead processor
5507 * so that the vm statistics are consistent. 5507 * so that the vm statistics are consistent.
5508 * 5508 *
5509 * This is only okay since the processor is dead and cannot 5509 * This is only okay since the processor is dead and cannot
5510 * race with what we are doing. 5510 * race with what we are doing.
5511 */ 5511 */
5512 cpu_vm_stats_fold(cpu); 5512 cpu_vm_stats_fold(cpu);
5513 } 5513 }
5514 return NOTIFY_OK; 5514 return NOTIFY_OK;
5515 } 5515 }
5516 5516
5517 void __init page_alloc_init(void) 5517 void __init page_alloc_init(void)
5518 { 5518 {
5519 hotcpu_notifier(page_alloc_cpu_notify, 0); 5519 hotcpu_notifier(page_alloc_cpu_notify, 0);
5520 } 5520 }
5521 5521
5522 /* 5522 /*
5523 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 5523 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
5524 * or min_free_kbytes changes. 5524 * or min_free_kbytes changes.
5525 */ 5525 */
5526 static void calculate_totalreserve_pages(void) 5526 static void calculate_totalreserve_pages(void)
5527 { 5527 {
5528 struct pglist_data *pgdat; 5528 struct pglist_data *pgdat;
5529 unsigned long reserve_pages = 0; 5529 unsigned long reserve_pages = 0;
5530 enum zone_type i, j; 5530 enum zone_type i, j;
5531 5531
5532 for_each_online_pgdat(pgdat) { 5532 for_each_online_pgdat(pgdat) {
5533 for (i = 0; i < MAX_NR_ZONES; i++) { 5533 for (i = 0; i < MAX_NR_ZONES; i++) {
5534 struct zone *zone = pgdat->node_zones + i; 5534 struct zone *zone = pgdat->node_zones + i;
5535 long max = 0; 5535 long max = 0;
5536 5536
5537 /* Find valid and maximum lowmem_reserve in the zone */ 5537 /* Find valid and maximum lowmem_reserve in the zone */
5538 for (j = i; j < MAX_NR_ZONES; j++) { 5538 for (j = i; j < MAX_NR_ZONES; j++) {
5539 if (zone->lowmem_reserve[j] > max) 5539 if (zone->lowmem_reserve[j] > max)
5540 max = zone->lowmem_reserve[j]; 5540 max = zone->lowmem_reserve[j];
5541 } 5541 }
5542 5542
5543 /* we treat the high watermark as reserved pages. */ 5543 /* we treat the high watermark as reserved pages. */
5544 max += high_wmark_pages(zone); 5544 max += high_wmark_pages(zone);
5545 5545
5546 if (max > zone->managed_pages) 5546 if (max > zone->managed_pages)
5547 max = zone->managed_pages; 5547 max = zone->managed_pages;
5548 reserve_pages += max; 5548 reserve_pages += max;
5549 /* 5549 /*
5550 * Lowmem reserves are not available to 5550 * Lowmem reserves are not available to
5551 * GFP_HIGHUSER page cache allocations and 5551 * GFP_HIGHUSER page cache allocations and
5552 * kswapd tries to balance zones to their high 5552 * kswapd tries to balance zones to their high
5553 * watermark. As a result, neither should be 5553 * watermark. As a result, neither should be
5554 * regarded as dirtyable memory, to prevent a 5554 * regarded as dirtyable memory, to prevent a
5555 * situation where reclaim has to clean pages 5555 * situation where reclaim has to clean pages
5556 * in order to balance the zones. 5556 * in order to balance the zones.
5557 */ 5557 */
5558 zone->dirty_balance_reserve = max; 5558 zone->dirty_balance_reserve = max;
5559 } 5559 }
5560 } 5560 }
5561 dirty_balance_reserve = reserve_pages; 5561 dirty_balance_reserve = reserve_pages;
5562 totalreserve_pages = reserve_pages; 5562 totalreserve_pages = reserve_pages;
5563 } 5563 }
5564 5564
5565 /* 5565 /*
5566 * setup_per_zone_lowmem_reserve - called whenever 5566 * setup_per_zone_lowmem_reserve - called whenever
5567 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 5567 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
5568 * has a correct lowmem_reserve value, so an adequate number of 5568 * has a correct lowmem_reserve value, so an adequate number of
5569 * pages are left in the zone after a successful __alloc_pages(). 5569 * pages are left in the zone after a successful __alloc_pages().
5570 */ 5570 */
5571 static void setup_per_zone_lowmem_reserve(void) 5571 static void setup_per_zone_lowmem_reserve(void)
5572 { 5572 {
5573 struct pglist_data *pgdat; 5573 struct pglist_data *pgdat;
5574 enum zone_type j, idx; 5574 enum zone_type j, idx;
5575 5575
5576 for_each_online_pgdat(pgdat) { 5576 for_each_online_pgdat(pgdat) {
5577 for (j = 0; j < MAX_NR_ZONES; j++) { 5577 for (j = 0; j < MAX_NR_ZONES; j++) {
5578 struct zone *zone = pgdat->node_zones + j; 5578 struct zone *zone = pgdat->node_zones + j;
5579 unsigned long managed_pages = zone->managed_pages; 5579 unsigned long managed_pages = zone->managed_pages;
5580 5580
5581 zone->lowmem_reserve[j] = 0; 5581 zone->lowmem_reserve[j] = 0;
5582 5582
5583 idx = j; 5583 idx = j;
5584 while (idx) { 5584 while (idx) {
5585 struct zone *lower_zone; 5585 struct zone *lower_zone;
5586 5586
5587 idx--; 5587 idx--;
5588 5588
5589 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5589 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5590 sysctl_lowmem_reserve_ratio[idx] = 1; 5590 sysctl_lowmem_reserve_ratio[idx] = 1;
5591 5591
5592 lower_zone = pgdat->node_zones + idx; 5592 lower_zone = pgdat->node_zones + idx;
5593 lower_zone->lowmem_reserve[j] = managed_pages / 5593 lower_zone->lowmem_reserve[j] = managed_pages /
5594 sysctl_lowmem_reserve_ratio[idx]; 5594 sysctl_lowmem_reserve_ratio[idx];
5595 managed_pages += lower_zone->managed_pages; 5595 managed_pages += lower_zone->managed_pages;
5596 } 5596 }
5597 } 5597 }
5598 } 5598 }
5599 5599
5600 /* update totalreserve_pages */ 5600 /* update totalreserve_pages */
5601 calculate_totalreserve_pages(); 5601 calculate_totalreserve_pages();
5602 } 5602 }
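The loop above gives every lower zone a reserve against each higher zone: the cumulative managed pages above it, divided by sysctl_lowmem_reserve_ratio. Below is a minimal userspace sketch of that arithmetic, assuming a hypothetical two-zone layout (4096 DMA pages, 1M Normal pages) and a ratio of 256; the names and values are illustrative, not taken from a running kernel.

/* Illustrative only: the lowmem_reserve arithmetic with made-up zone sizes. */
#include <stdio.h>

#define NR_ZONES 2      /* [0] = DMA, [1] = Normal (hypothetical layout) */

int main(void)
{
	unsigned long managed[NR_ZONES] = { 4096, 1048576 };  /* pages per zone */
	unsigned long ratio[NR_ZONES]   = { 256, 256 };       /* assumed ratios */
	unsigned long reserve[NR_ZONES][NR_ZONES] = { { 0 } };
	int j, idx;

	for (j = 0; j < NR_ZONES; j++) {
		unsigned long pages = managed[j];

		/* walk the zones below j, accumulating managed pages as we go */
		for (idx = j - 1; idx >= 0; idx--) {
			reserve[idx][j] = pages / ratio[idx];
			pages += managed[idx];
		}
	}

	/* With these numbers DMA keeps 1048576 / 256 = 4096 pages away from Normal-capable requests. */
	printf("DMA reserve against Normal allocations: %lu pages\n", reserve[0][1]);
	return 0;
}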
5603 5603
5604 static void __setup_per_zone_wmarks(void) 5604 static void __setup_per_zone_wmarks(void)
5605 { 5605 {
5606 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5606 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5607 unsigned long lowmem_pages = 0; 5607 unsigned long lowmem_pages = 0;
5608 struct zone *zone; 5608 struct zone *zone;
5609 unsigned long flags; 5609 unsigned long flags;
5610 5610
5611 /* Calculate total number of !ZONE_HIGHMEM pages */ 5611 /* Calculate total number of !ZONE_HIGHMEM pages */
5612 for_each_zone(zone) { 5612 for_each_zone(zone) {
5613 if (!is_highmem(zone)) 5613 if (!is_highmem(zone))
5614 lowmem_pages += zone->managed_pages; 5614 lowmem_pages += zone->managed_pages;
5615 } 5615 }
5616 5616
5617 for_each_zone(zone) { 5617 for_each_zone(zone) {
5618 u64 tmp; 5618 u64 tmp;
5619 5619
5620 spin_lock_irqsave(&zone->lock, flags); 5620 spin_lock_irqsave(&zone->lock, flags);
5621 tmp = (u64)pages_min * zone->managed_pages; 5621 tmp = (u64)pages_min * zone->managed_pages;
5622 do_div(tmp, lowmem_pages); 5622 do_div(tmp, lowmem_pages);
5623 if (is_highmem(zone)) { 5623 if (is_highmem(zone)) {
5624 /* 5624 /*
5625 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5625 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5626 * need highmem pages, so cap pages_min to a small 5626 * need highmem pages, so cap pages_min to a small
5627 * value here. 5627 * value here.
5628 * 5628 *
5629 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5629 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5630 * deltas control async page reclaim, and so should 5630 * deltas control async page reclaim, and so should
5631 * not be capped for highmem. 5631 * not be capped for highmem.
5632 */ 5632 */
5633 unsigned long min_pages; 5633 unsigned long min_pages;
5634 5634
5635 min_pages = zone->managed_pages / 1024; 5635 min_pages = zone->managed_pages / 1024;
5636 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); 5636 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
5637 zone->watermark[WMARK_MIN] = min_pages; 5637 zone->watermark[WMARK_MIN] = min_pages;
5638 } else { 5638 } else {
5639 /* 5639 /*
5640 * If it's a lowmem zone, reserve a number of pages 5640 * If it's a lowmem zone, reserve a number of pages
5641 * proportionate to the zone's size. 5641 * proportionate to the zone's size.
5642 */ 5642 */
5643 zone->watermark[WMARK_MIN] = tmp; 5643 zone->watermark[WMARK_MIN] = tmp;
5644 } 5644 }
5645 5645
5646 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5646 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5647 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5647 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5648 5648
5649 __mod_zone_page_state(zone, NR_ALLOC_BATCH, 5649 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
5650 high_wmark_pages(zone) - 5650 high_wmark_pages(zone) -
5651 low_wmark_pages(zone) - 5651 low_wmark_pages(zone) -
5652 zone_page_state(zone, NR_ALLOC_BATCH)); 5652 zone_page_state(zone, NR_ALLOC_BATCH));
5653 5653
5654 setup_zone_migrate_reserve(zone); 5654 setup_zone_migrate_reserve(zone);
5655 spin_unlock_irqrestore(&zone->lock, flags); 5655 spin_unlock_irqrestore(&zone->lock, flags);
5656 } 5656 }
5657 5657
5658 /* update totalreserve_pages */ 5658 /* update totalreserve_pages */
5659 calculate_totalreserve_pages(); 5659 calculate_totalreserve_pages();
5660 } 5660 }
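For a lowmem zone the code above splits min_free_kbytes across zones in proportion to managed pages, then derives the low and high marks by adding a quarter and a half of that share to the min mark. The stand-alone sketch below reproduces the arithmetic, assuming 4 KiB pages, a single 1 GiB lowmem zone and min_free_kbytes = 4096; all values are examples.

/* Illustrative only: per-zone watermark arithmetic reproduced in userspace. */
#include <stdio.h>

#define PAGE_SHIFT 12   /* assumption: 4 KiB pages */

int main(void)
{
	unsigned long min_free_kbytes = 4096;                   /* example value */
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long lowmem_pages = 262144;                    /* 1 GiB of lowmem */
	unsigned long zone_managed = 262144;                    /* single-zone example */

	unsigned long long tmp = (unsigned long long)pages_min * zone_managed;
	tmp /= lowmem_pages;                                    /* this zone's share of pages_min */

	unsigned long wmark_min  = tmp;                         /* lowmem zone: no highmem clamp */
	unsigned long wmark_low  = wmark_min + (tmp >> 2);
	unsigned long wmark_high = wmark_min + (tmp >> 1);

	printf("min=%lu low=%lu high=%lu pages\n", wmark_min, wmark_low, wmark_high);
	return 0;
}

With these inputs the zone ends up with min=1024, low=1280 and high=1536 pages.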
5661 5661
5662 /** 5662 /**
5663 * setup_per_zone_wmarks - called when min_free_kbytes changes 5663 * setup_per_zone_wmarks - called when min_free_kbytes changes
5664 * or when memory is hot-{added|removed} 5664 * or when memory is hot-{added|removed}
5665 * 5665 *
5666 * Ensures that the watermark[min,low,high] values for each zone are set 5666 * Ensures that the watermark[min,low,high] values for each zone are set
5667 * correctly with respect to min_free_kbytes. 5667 * correctly with respect to min_free_kbytes.
5668 */ 5668 */
5669 void setup_per_zone_wmarks(void) 5669 void setup_per_zone_wmarks(void)
5670 { 5670 {
5671 mutex_lock(&zonelists_mutex); 5671 mutex_lock(&zonelists_mutex);
5672 __setup_per_zone_wmarks(); 5672 __setup_per_zone_wmarks();
5673 mutex_unlock(&zonelists_mutex); 5673 mutex_unlock(&zonelists_mutex);
5674 } 5674 }
5675 5675
5676 /* 5676 /*
5677 * The inactive anon list should be small enough that the VM never has to 5677 * The inactive anon list should be small enough that the VM never has to
5678 * do too much work, but large enough that each inactive page has a chance 5678 * do too much work, but large enough that each inactive page has a chance
5679 * to be referenced again before it is swapped out. 5679 * to be referenced again before it is swapped out.
5680 * 5680 *
5681 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5681 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5682 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5682 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5683 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5683 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5684 * the anonymous pages are kept on the inactive list. 5684 * the anonymous pages are kept on the inactive list.
5685 * 5685 *
5686 * total target max 5686 * total target max
5687 * memory ratio inactive anon 5687 * memory ratio inactive anon
5688 * ------------------------------------- 5688 * -------------------------------------
5689 * 10MB 1 5MB 5689 * 10MB 1 5MB
5690 * 100MB 1 50MB 5690 * 100MB 1 50MB
5691 * 1GB 3 250MB 5691 * 1GB 3 250MB
5692 * 10GB 10 0.9GB 5692 * 10GB 10 0.9GB
5693 * 100GB 31 3GB 5693 * 100GB 31 3GB
5694 * 1TB 101 10GB 5694 * 1TB 101 10GB
5695 * 10TB 320 32GB 5695 * 10TB 320 32GB
5696 */ 5696 */
5697 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5697 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5698 { 5698 {
5699 unsigned int gb, ratio; 5699 unsigned int gb, ratio;
5700 5700
5701 /* Zone size in gigabytes */ 5701 /* Zone size in gigabytes */
5702 gb = zone->managed_pages >> (30 - PAGE_SHIFT); 5702 gb = zone->managed_pages >> (30 - PAGE_SHIFT);
5703 if (gb) 5703 if (gb)
5704 ratio = int_sqrt(10 * gb); 5704 ratio = int_sqrt(10 * gb);
5705 else 5705 else
5706 ratio = 1; 5706 ratio = 1;
5707 5707
5708 zone->inactive_ratio = ratio; 5708 zone->inactive_ratio = ratio;
5709 } 5709 }
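The ratio is int_sqrt(10 * zone size in GiB), which is where the table above comes from. A quick check of a few sizes, using a naive integer square root as a stand-in for the kernel's int_sqrt():

/* Illustrative only: inactive_ratio = int_sqrt(10 * zone size in GiB). */
#include <stdio.h>

static unsigned long int_sqrt(unsigned long x)   /* naive stand-in for the kernel helper */
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	unsigned long sizes_gb[] = { 1, 4, 10, 100, 1024 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes_gb) / sizeof(sizes_gb[0]); i++) {
		unsigned long gb = sizes_gb[i];
		unsigned long ratio = gb ? int_sqrt(10 * gb) : 1;

		/* 1 GiB -> 3, 10 GiB -> 10, 100 GiB -> 31, 1 TiB -> 101, as in the table */
		printf("%5lu GiB zone -> inactive_ratio %lu\n", gb, ratio);
	}
	return 0;
}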
5710 5710
5711 static void __meminit setup_per_zone_inactive_ratio(void) 5711 static void __meminit setup_per_zone_inactive_ratio(void)
5712 { 5712 {
5713 struct zone *zone; 5713 struct zone *zone;
5714 5714
5715 for_each_zone(zone) 5715 for_each_zone(zone)
5716 calculate_zone_inactive_ratio(zone); 5716 calculate_zone_inactive_ratio(zone);
5717 } 5717 }
5718 5718
5719 /* 5719 /*
5720 * Initialise min_free_kbytes. 5720 * Initialise min_free_kbytes.
5721 * 5721 *
5722 * For small machines we want it small (128k min). For large machines 5722 * For small machines we want it small (128k min). For large machines
5723 * we want it large (64MB max). But it is not linear, because network 5723 * we want it large (64MB max). But it is not linear, because network
5724 * bandwidth does not increase linearly with machine size. We use 5724 * bandwidth does not increase linearly with machine size. We use
5725 * 5725 *
5726 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5726 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5727 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5727 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5728 * 5728 *
5729 * which yields 5729 * which yields
5730 * 5730 *
5731 * 16MB: 512k 5731 * 16MB: 512k
5732 * 32MB: 724k 5732 * 32MB: 724k
5733 * 64MB: 1024k 5733 * 64MB: 1024k
5734 * 128MB: 1448k 5734 * 128MB: 1448k
5735 * 256MB: 2048k 5735 * 256MB: 2048k
5736 * 512MB: 2896k 5736 * 512MB: 2896k
5737 * 1024MB: 4096k 5737 * 1024MB: 4096k
5738 * 2048MB: 5792k 5738 * 2048MB: 5792k
5739 * 4096MB: 8192k 5739 * 4096MB: 8192k
5740 * 8192MB: 11584k 5740 * 8192MB: 11584k
5741 * 16384MB: 16384k 5741 * 16384MB: 16384k
5742 */ 5742 */
5743 int __meminit init_per_zone_wmark_min(void) 5743 int __meminit init_per_zone_wmark_min(void)
5744 { 5744 {
5745 unsigned long lowmem_kbytes; 5745 unsigned long lowmem_kbytes;
5746 int new_min_free_kbytes; 5746 int new_min_free_kbytes;
5747 5747
5748 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5748 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5749 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5749 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5750 5750
5751 if (new_min_free_kbytes > user_min_free_kbytes) { 5751 if (new_min_free_kbytes > user_min_free_kbytes) {
5752 min_free_kbytes = new_min_free_kbytes; 5752 min_free_kbytes = new_min_free_kbytes;
5753 if (min_free_kbytes < 128) 5753 if (min_free_kbytes < 128)
5754 min_free_kbytes = 128; 5754 min_free_kbytes = 128;
5755 if (min_free_kbytes > 65536) 5755 if (min_free_kbytes > 65536)
5756 min_free_kbytes = 65536; 5756 min_free_kbytes = 65536;
5757 } else { 5757 } else {
5758 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", 5758 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
5759 new_min_free_kbytes, user_min_free_kbytes); 5759 new_min_free_kbytes, user_min_free_kbytes);
5760 } 5760 }
5761 setup_per_zone_wmarks(); 5761 setup_per_zone_wmarks();
5762 refresh_zone_stat_thresholds(); 5762 refresh_zone_stat_thresholds();
5763 setup_per_zone_lowmem_reserve(); 5763 setup_per_zone_lowmem_reserve();
5764 setup_per_zone_inactive_ratio(); 5764 setup_per_zone_inactive_ratio();
5765 return 0; 5765 return 0;
5766 } 5766 }
5767 module_init(init_per_zone_wmark_min) 5767 module_init(init_per_zone_wmark_min)
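The sizing formula above, min_free_kbytes = sqrt(lowmem_kbytes * 16) clamped to [128k, 64M], can be reproduced outside the kernel; with 1 GiB of lowmem it lands on the 4096k row of the table. Example values only:

/* Illustrative only: the min_free_kbytes sizing formula with its clamp. */
#include <stdio.h>

static unsigned long int_sqrt(unsigned long x)   /* naive stand-in for the kernel helper */
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	unsigned long lowmem_kbytes = 1024UL * 1024;   /* assume 1 GiB of lowmem */
	long min_free_kbytes = int_sqrt(lowmem_kbytes * 16);

	if (min_free_kbytes < 128)
		min_free_kbytes = 128;
	if (min_free_kbytes > 65536)
		min_free_kbytes = 65536;

	printf("lowmem %lu kB -> min_free_kbytes %ld kB\n", lowmem_kbytes, min_free_kbytes);
	return 0;
}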
5768 5768
5769 /* 5769 /*
5770 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5770 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5771 * that we can call two helper functions whenever min_free_kbytes 5771 * that we can call two helper functions whenever min_free_kbytes
5772 * changes. 5772 * changes.
5773 */ 5773 */
5774 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5774 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5775 void __user *buffer, size_t *length, loff_t *ppos) 5775 void __user *buffer, size_t *length, loff_t *ppos)
5776 { 5776 {
5777 int rc; 5777 int rc;
5778 5778
5779 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5779 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5780 if (rc) 5780 if (rc)
5781 return rc; 5781 return rc;
5782 5782
5783 if (write) { 5783 if (write) {
5784 user_min_free_kbytes = min_free_kbytes; 5784 user_min_free_kbytes = min_free_kbytes;
5785 setup_per_zone_wmarks(); 5785 setup_per_zone_wmarks();
5786 } 5786 }
5787 return 0; 5787 return 0;
5788 } 5788 }
5789 5789
5790 #ifdef CONFIG_NUMA 5790 #ifdef CONFIG_NUMA
5791 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5791 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5792 void __user *buffer, size_t *length, loff_t *ppos) 5792 void __user *buffer, size_t *length, loff_t *ppos)
5793 { 5793 {
5794 struct zone *zone; 5794 struct zone *zone;
5795 int rc; 5795 int rc;
5796 5796
5797 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5797 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5798 if (rc) 5798 if (rc)
5799 return rc; 5799 return rc;
5800 5800
5801 for_each_zone(zone) 5801 for_each_zone(zone)
5802 zone->min_unmapped_pages = (zone->managed_pages * 5802 zone->min_unmapped_pages = (zone->managed_pages *
5803 sysctl_min_unmapped_ratio) / 100; 5803 sysctl_min_unmapped_ratio) / 100;
5804 return 0; 5804 return 0;
5805 } 5805 }
5806 5806
5807 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5807 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5808 void __user *buffer, size_t *length, loff_t *ppos) 5808 void __user *buffer, size_t *length, loff_t *ppos)
5809 { 5809 {
5810 struct zone *zone; 5810 struct zone *zone;
5811 int rc; 5811 int rc;
5812 5812
5813 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5813 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5814 if (rc) 5814 if (rc)
5815 return rc; 5815 return rc;
5816 5816
5817 for_each_zone(zone) 5817 for_each_zone(zone)
5818 zone->min_slab_pages = (zone->managed_pages * 5818 zone->min_slab_pages = (zone->managed_pages *
5819 sysctl_min_slab_ratio) / 100; 5819 sysctl_min_slab_ratio) / 100;
5820 return 0; 5820 return 0;
5821 } 5821 }
5822 #endif 5822 #endif
5823 5823
5824 /* 5824 /*
5825 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5825 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5826 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5826 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5827 * whenever sysctl_lowmem_reserve_ratio changes. 5827 * whenever sysctl_lowmem_reserve_ratio changes.
5828 * 5828 *
5829 * The reserve ratio has no relation to the minimum watermarks. The 5829 * The reserve ratio has no relation to the minimum watermarks. The
5830 * lowmem reserve ratio is only meaningful as a function of the 5830 * lowmem reserve ratio is only meaningful as a function of the
5831 * boot-time zone sizes. 5831 * boot-time zone sizes.
5832 */ 5832 */
5833 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5833 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5834 void __user *buffer, size_t *length, loff_t *ppos) 5834 void __user *buffer, size_t *length, loff_t *ppos)
5835 { 5835 {
5836 proc_dointvec_minmax(table, write, buffer, length, ppos); 5836 proc_dointvec_minmax(table, write, buffer, length, ppos);
5837 setup_per_zone_lowmem_reserve(); 5837 setup_per_zone_lowmem_reserve();
5838 return 0; 5838 return 0;
5839 } 5839 }
5840 5840
5841 /* 5841 /*
5842 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5842 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5843 * cpu. It is the fraction of total pages in each zone that a hot per-cpu 5843 * cpu. It is the fraction of total pages in each zone that a hot per-cpu
5844 * pagelist can hold before it gets flushed back to the buddy allocator. 5844 * pagelist can hold before it gets flushed back to the buddy allocator.
5845 */ 5845 */
5846 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5846 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5847 void __user *buffer, size_t *length, loff_t *ppos) 5847 void __user *buffer, size_t *length, loff_t *ppos)
5848 { 5848 {
5849 struct zone *zone; 5849 struct zone *zone;
5850 int old_percpu_pagelist_fraction; 5850 int old_percpu_pagelist_fraction;
5851 int ret; 5851 int ret;
5852 5852
5853 mutex_lock(&pcp_batch_high_lock); 5853 mutex_lock(&pcp_batch_high_lock);
5854 old_percpu_pagelist_fraction = percpu_pagelist_fraction; 5854 old_percpu_pagelist_fraction = percpu_pagelist_fraction;
5855 5855
5856 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5856 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5857 if (!write || ret < 0) 5857 if (!write || ret < 0)
5858 goto out; 5858 goto out;
5859 5859
5860 /* Sanity checking to avoid pcp imbalance */ 5860 /* Sanity checking to avoid pcp imbalance */
5861 if (percpu_pagelist_fraction && 5861 if (percpu_pagelist_fraction &&
5862 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { 5862 percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
5863 percpu_pagelist_fraction = old_percpu_pagelist_fraction; 5863 percpu_pagelist_fraction = old_percpu_pagelist_fraction;
5864 ret = -EINVAL; 5864 ret = -EINVAL;
5865 goto out; 5865 goto out;
5866 } 5866 }
5867 5867
5868 /* No change? */ 5868 /* No change? */
5869 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) 5869 if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
5870 goto out; 5870 goto out;
5871 5871
5872 for_each_populated_zone(zone) { 5872 for_each_populated_zone(zone) {
5873 unsigned int cpu; 5873 unsigned int cpu;
5874 5874
5875 for_each_possible_cpu(cpu) 5875 for_each_possible_cpu(cpu)
5876 pageset_set_high_and_batch(zone, 5876 pageset_set_high_and_batch(zone,
5877 per_cpu_ptr(zone->pageset, cpu)); 5877 per_cpu_ptr(zone->pageset, cpu));
5878 } 5878 }
5879 out: 5879 out:
5880 mutex_unlock(&pcp_batch_high_lock); 5880 mutex_unlock(&pcp_batch_high_lock);
5881 return ret; 5881 return ret;
5882 } 5882 }
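With a non-zero fraction, each populated zone's per-CPU list is allowed to hold roughly managed_pages / percpu_pagelist_fraction pages before it is drained (pageset_set_high_and_batch(), called above, derives the batch value from that high mark). A sketch of what a few fraction values imply for a hypothetical 4 GiB zone; the zone size and fractions are made up:

/* Illustrative only: what a given percpu_pagelist_fraction implies for pcp->high. */
#include <stdio.h>

int main(void)
{
	unsigned long managed_pages = 1048576;   /* example zone: 4 GiB with 4 KiB pages */
	unsigned int fractions[] = { 8, 64, 512 };
	unsigned int i;

	for (i = 0; i < sizeof(fractions) / sizeof(fractions[0]); i++) {
		unsigned long high = managed_pages / fractions[i];

		printf("fraction %3u -> pcp->high = %lu pages per CPU\n",
		       fractions[i], high);
	}
	return 0;
}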
5883 5883
5884 int hashdist = HASHDIST_DEFAULT; 5884 int hashdist = HASHDIST_DEFAULT;
5885 5885
5886 #ifdef CONFIG_NUMA 5886 #ifdef CONFIG_NUMA
5887 static int __init set_hashdist(char *str) 5887 static int __init set_hashdist(char *str)
5888 { 5888 {
5889 if (!str) 5889 if (!str)
5890 return 0; 5890 return 0;
5891 hashdist = simple_strtoul(str, &str, 0); 5891 hashdist = simple_strtoul(str, &str, 0);
5892 return 1; 5892 return 1;
5893 } 5893 }
5894 __setup("hashdist=", set_hashdist); 5894 __setup("hashdist=", set_hashdist);
5895 #endif 5895 #endif
5896 5896
5897 /* 5897 /*
5898 * allocate a large system hash table from bootmem 5898 * allocate a large system hash table from bootmem
5899 * - it is assumed that the hash table must contain an exact power-of-2 5899 * - it is assumed that the hash table must contain an exact power-of-2
5900 * quantity of entries 5900 * quantity of entries
5901 * - limit is the number of hash buckets, not the total allocation size 5901 * - limit is the number of hash buckets, not the total allocation size
5902 */ 5902 */
5903 void *__init alloc_large_system_hash(const char *tablename, 5903 void *__init alloc_large_system_hash(const char *tablename,
5904 unsigned long bucketsize, 5904 unsigned long bucketsize,
5905 unsigned long numentries, 5905 unsigned long numentries,
5906 int scale, 5906 int scale,
5907 int flags, 5907 int flags,
5908 unsigned int *_hash_shift, 5908 unsigned int *_hash_shift,
5909 unsigned int *_hash_mask, 5909 unsigned int *_hash_mask,
5910 unsigned long low_limit, 5910 unsigned long low_limit,
5911 unsigned long high_limit) 5911 unsigned long high_limit)
5912 { 5912 {
5913 unsigned long long max = high_limit; 5913 unsigned long long max = high_limit;
5914 unsigned long log2qty, size; 5914 unsigned long log2qty, size;
5915 void *table = NULL; 5915 void *table = NULL;
5916 5916
5917 /* allow the kernel cmdline to have a say */ 5917 /* allow the kernel cmdline to have a say */
5918 if (!numentries) { 5918 if (!numentries) {
5919 /* round applicable memory size up to nearest megabyte */ 5919 /* round applicable memory size up to nearest megabyte */
5920 numentries = nr_kernel_pages; 5920 numentries = nr_kernel_pages;
5921 5921
5922 /* Rounding up is not necessary when PAGE_SIZE >= 1MB */ 5922 /* Rounding up is not necessary when PAGE_SIZE >= 1MB */
5923 if (PAGE_SHIFT < 20) 5923 if (PAGE_SHIFT < 20)
5924 numentries = round_up(numentries, (1<<20)/PAGE_SIZE); 5924 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
5925 5925
5926 /* limit to 1 bucket per 2^scale bytes of low memory */ 5926 /* limit to 1 bucket per 2^scale bytes of low memory */
5927 if (scale > PAGE_SHIFT) 5927 if (scale > PAGE_SHIFT)
5928 numentries >>= (scale - PAGE_SHIFT); 5928 numentries >>= (scale - PAGE_SHIFT);
5929 else 5929 else
5930 numentries <<= (PAGE_SHIFT - scale); 5930 numentries <<= (PAGE_SHIFT - scale);
5931 5931
5932 /* Make sure we've got at least a 0-order allocation.. */ 5932 /* Make sure we've got at least a 0-order allocation.. */
5933 if (unlikely(flags & HASH_SMALL)) { 5933 if (unlikely(flags & HASH_SMALL)) {
5934 /* Makes no sense without HASH_EARLY */ 5934 /* Makes no sense without HASH_EARLY */
5935 WARN_ON(!(flags & HASH_EARLY)); 5935 WARN_ON(!(flags & HASH_EARLY));
5936 if (!(numentries >> *_hash_shift)) { 5936 if (!(numentries >> *_hash_shift)) {
5937 numentries = 1UL << *_hash_shift; 5937 numentries = 1UL << *_hash_shift;
5938 BUG_ON(!numentries); 5938 BUG_ON(!numentries);
5939 } 5939 }
5940 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5940 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5941 numentries = PAGE_SIZE / bucketsize; 5941 numentries = PAGE_SIZE / bucketsize;
5942 } 5942 }
5943 numentries = roundup_pow_of_two(numentries); 5943 numentries = roundup_pow_of_two(numentries);
5944 5944
5945 /* limit allocation size to 1/16 total memory by default */ 5945 /* limit allocation size to 1/16 total memory by default */
5946 if (max == 0) { 5946 if (max == 0) {
5947 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5947 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5948 do_div(max, bucketsize); 5948 do_div(max, bucketsize);
5949 } 5949 }
5950 max = min(max, 0x80000000ULL); 5950 max = min(max, 0x80000000ULL);
5951 5951
5952 if (numentries < low_limit) 5952 if (numentries < low_limit)
5953 numentries = low_limit; 5953 numentries = low_limit;
5954 if (numentries > max) 5954 if (numentries > max)
5955 numentries = max; 5955 numentries = max;
5956 5956
5957 log2qty = ilog2(numentries); 5957 log2qty = ilog2(numentries);
5958 5958
5959 do { 5959 do {
5960 size = bucketsize << log2qty; 5960 size = bucketsize << log2qty;
5961 if (flags & HASH_EARLY) 5961 if (flags & HASH_EARLY)
5962 table = alloc_bootmem_nopanic(size); 5962 table = alloc_bootmem_nopanic(size);
5963 else if (hashdist) 5963 else if (hashdist)
5964 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5964 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5965 else { 5965 else {
5966 /* 5966 /*
5967 * If bucketsize is not a power of two, we may free 5967 * If bucketsize is not a power of two, we may free
5968 * some pages at the end of the hash table, which 5968 * some pages at the end of the hash table, which
5969 * alloc_pages_exact() does automatically 5969 * alloc_pages_exact() does automatically
5970 */ 5970 */
5971 if (get_order(size) < MAX_ORDER) { 5971 if (get_order(size) < MAX_ORDER) {
5972 table = alloc_pages_exact(size, GFP_ATOMIC); 5972 table = alloc_pages_exact(size, GFP_ATOMIC);
5973 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5973 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5974 } 5974 }
5975 } 5975 }
5976 } while (!table && size > PAGE_SIZE && --log2qty); 5976 } while (!table && size > PAGE_SIZE && --log2qty);
5977 5977
5978 if (!table) 5978 if (!table)
5979 panic("Failed to allocate %s hash table\n", tablename); 5979 panic("Failed to allocate %s hash table\n", tablename);
5980 5980
5981 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5981 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5982 tablename, 5982 tablename,
5983 (1UL << log2qty), 5983 (1UL << log2qty),
5984 ilog2(size) - PAGE_SHIFT, 5984 ilog2(size) - PAGE_SHIFT,
5985 size); 5985 size);
5986 5986
5987 if (_hash_shift) 5987 if (_hash_shift)
5988 *_hash_shift = log2qty; 5988 *_hash_shift = log2qty;
5989 if (_hash_mask) 5989 if (_hash_mask)
5990 *_hash_mask = (1 << log2qty) - 1; 5990 *_hash_mask = (1 << log2qty) - 1;
5991 5991
5992 return table; 5992 return table;
5993 } 5993 }
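Ignoring the command-line override, the megabyte rounding and the 1/16-of-memory cap, the sizing boils down to: scale nr_kernel_pages by 2^(PAGE_SHIFT - scale), round up to a power of two, and take the log2 of that as the hash shift. A stand-alone sketch of that arithmetic with made-up numbers (8 GiB of lowmem, one bucket per 16 KiB, two-pointer buckets):

/* Illustrative only: the bucket-count sizing arithmetic, outside the kernel. */
#include <stdio.h>

#define PAGE_SHIFT 12                    /* assumption: 4 KiB pages */

static unsigned long roundup_pow_of_two(unsigned long x)
{
	unsigned long r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned long nr_kernel_pages = 2097152;   /* example: 8 GiB of lowmem */
	int scale = 14;                            /* one bucket per 2^14 = 16 KiB */
	unsigned long bucketsize = sizeof(void *) * 2;
	unsigned long numentries = nr_kernel_pages;
	unsigned long log2qty = 0;

	if (scale > PAGE_SHIFT)
		numentries >>= (scale - PAGE_SHIFT);
	else
		numentries <<= (PAGE_SHIFT - scale);
	numentries = roundup_pow_of_two(numentries);

	while ((1UL << (log2qty + 1)) <= numentries)
		log2qty++;

	printf("%lu buckets (2^%lu), table size %lu bytes\n",
	       numentries, log2qty, bucketsize << log2qty);
	return 0;
}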
5994 5994
5995 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5995 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5996 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5996 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5997 unsigned long pfn) 5997 unsigned long pfn)
5998 { 5998 {
5999 #ifdef CONFIG_SPARSEMEM 5999 #ifdef CONFIG_SPARSEMEM
6000 return __pfn_to_section(pfn)->pageblock_flags; 6000 return __pfn_to_section(pfn)->pageblock_flags;
6001 #else 6001 #else
6002 return zone->pageblock_flags; 6002 return zone->pageblock_flags;
6003 #endif /* CONFIG_SPARSEMEM */ 6003 #endif /* CONFIG_SPARSEMEM */
6004 } 6004 }
6005 6005
6006 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 6006 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
6007 { 6007 {
6008 #ifdef CONFIG_SPARSEMEM 6008 #ifdef CONFIG_SPARSEMEM
6009 pfn &= (PAGES_PER_SECTION-1); 6009 pfn &= (PAGES_PER_SECTION-1);
6010 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 6010 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
6011 #else 6011 #else
6012 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); 6012 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
6013 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 6013 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
6014 #endif /* CONFIG_SPARSEMEM */ 6014 #endif /* CONFIG_SPARSEMEM */
6015 } 6015 }
6016 6016
6017 /** 6017 /**
6018 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages 6018 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
6019 * @page: The page within the block of interest 6019 * @page: The page within the block of interest
6020 * @pfn: The target page frame number 6020 * @pfn: The target page frame number
6021 * @end_bitidx: The last bit of interest 6021 * @end_bitidx: The last bit of interest
6022 * returns pageblock_bits flags 6022 * returns pageblock_bits flags
6023 */ 6023 */
6024 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, 6024 unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
6025 unsigned long end_bitidx, 6025 unsigned long end_bitidx,
6026 unsigned long mask) 6026 unsigned long mask)
6027 { 6027 {
6028 struct zone *zone; 6028 struct zone *zone;
6029 unsigned long *bitmap; 6029 unsigned long *bitmap;
6030 unsigned long bitidx, word_bitidx; 6030 unsigned long bitidx, word_bitidx;
6031 unsigned long word; 6031 unsigned long word;
6032 6032
6033 zone = page_zone(page); 6033 zone = page_zone(page);
6034 bitmap = get_pageblock_bitmap(zone, pfn); 6034 bitmap = get_pageblock_bitmap(zone, pfn);
6035 bitidx = pfn_to_bitidx(zone, pfn); 6035 bitidx = pfn_to_bitidx(zone, pfn);
6036 word_bitidx = bitidx / BITS_PER_LONG; 6036 word_bitidx = bitidx / BITS_PER_LONG;
6037 bitidx &= (BITS_PER_LONG-1); 6037 bitidx &= (BITS_PER_LONG-1);
6038 6038
6039 word = bitmap[word_bitidx]; 6039 word = bitmap[word_bitidx];
6040 bitidx += end_bitidx; 6040 bitidx += end_bitidx;
6041 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; 6041 return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
6042 } 6042 }
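The lookup reduces to reading one word of the pageblock bitmap and masking out a small group of bits, so one unsigned long describes BITS_PER_LONG / NR_PAGEBLOCK_BITS pageblocks. The toy program below shows the packing idea with 4-bit groups; note that the kernel indexes groups from the top of the word (the BITS_PER_LONG - bitidx - 1 shift), which this sketch intentionally does not reproduce.

/* Illustrative only: packing 4-bit flag groups into one word and reading one back. */
#include <stdio.h>

#define BITS_PER_LONG   (8 * sizeof(unsigned long))
#define GROUP_BITS      4UL                 /* NR_PAGEBLOCK_BITS in the kernel */
#define GROUP_MASK      ((1UL << GROUP_BITS) - 1)

static unsigned long get_group(unsigned long word, unsigned int idx)
{
	return (word >> (idx * GROUP_BITS)) & GROUP_MASK;
}

static unsigned long set_group(unsigned long word, unsigned int idx, unsigned long val)
{
	unsigned long shift = idx * GROUP_BITS;

	return (word & ~(GROUP_MASK << shift)) | ((val & GROUP_MASK) << shift);
}

int main(void)
{
	unsigned long word = 0;

	word = set_group(word, 0, 2);   /* e.g. a migratetype-like value for block 0 */
	word = set_group(word, 3, 5);   /* another group further into the word */

	printf("one word holds %lu groups; group 3 = %lu\n",
	       (unsigned long)(BITS_PER_LONG / GROUP_BITS), get_group(word, 3));
	return 0;
}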
6043 6043
6044 /** 6044 /**
6045 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages 6045 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
6046 * @page: The page within the block of interest 6046 * @page: The page within the block of interest
6047 * @start_bitidx: The first bit of interest 6047 * @start_bitidx: The first bit of interest
6048 * @end_bitidx: The last bit of interest 6048 * @end_bitidx: The last bit of interest
6049 * @flags: The flags to set 6049 * @flags: The flags to set
6050 */ 6050 */
6051 void set_pfnblock_flags_mask(struct page *page, unsigned long flags, 6051 void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
6052 unsigned long pfn, 6052 unsigned long pfn,
6053 unsigned long end_bitidx, 6053 unsigned long end_bitidx,
6054 unsigned long mask) 6054 unsigned long mask)
6055 { 6055 {
6056 struct zone *zone; 6056 struct zone *zone;
6057 unsigned long *bitmap; 6057 unsigned long *bitmap;
6058 unsigned long bitidx, word_bitidx; 6058 unsigned long bitidx, word_bitidx;
6059 unsigned long old_word, word; 6059 unsigned long old_word, word;
6060 6060
6061 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 6061 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
6062 6062
6063 zone = page_zone(page); 6063 zone = page_zone(page);
6064 bitmap = get_pageblock_bitmap(zone, pfn); 6064 bitmap = get_pageblock_bitmap(zone, pfn);
6065 bitidx = pfn_to_bitidx(zone, pfn); 6065 bitidx = pfn_to_bitidx(zone, pfn);
6066 word_bitidx = bitidx / BITS_PER_LONG; 6066 word_bitidx = bitidx / BITS_PER_LONG;
6067 bitidx &= (BITS_PER_LONG-1); 6067 bitidx &= (BITS_PER_LONG-1);
6068 6068
6069 VM_BUG_ON(!zone_spans_pfn(zone, pfn)); 6069 VM_BUG_ON(!zone_spans_pfn(zone, pfn));
6070 6070
6071 bitidx += end_bitidx; 6071 bitidx += end_bitidx;
6072 mask <<= (BITS_PER_LONG - bitidx - 1); 6072 mask <<= (BITS_PER_LONG - bitidx - 1);
6073 flags <<= (BITS_PER_LONG - bitidx - 1); 6073 flags <<= (BITS_PER_LONG - bitidx - 1);
6074 6074
6075 word = ACCESS_ONCE(bitmap[word_bitidx]); 6075 word = ACCESS_ONCE(bitmap[word_bitidx]);
6076 for (;;) { 6076 for (;;) {
6077 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 6077 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6078 if (word == old_word) 6078 if (word == old_word)
6079 break; 6079 break;
6080 word = old_word; 6080 word = old_word;
6081 } 6081 }
6082 } 6082 }
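The update loop above is the usual lock-free read-modify-write: read the word, build the new value, cmpxchg it in, and retry if another CPU changed the word in the meantime. The same pattern in stand-alone C11, operating on a made-up flags word rather than the pageblock bitmap:

/* Illustrative only: the cmpxchg retry pattern from the function above, in C11. */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long flags_word;

static void set_field(unsigned long mask, unsigned long val)
{
	unsigned long old = atomic_load(&flags_word);

	/* Retry until no other thread has modified the word under us. */
	while (!atomic_compare_exchange_weak(&flags_word, &old,
					     (old & ~mask) | (val & mask)))
		;       /* 'old' is reloaded by the failed exchange */
}

int main(void)
{
	set_field(0xfUL, 0x5);          /* set the low 4-bit group to 5 */
	set_field(0xf0UL, 0x30);        /* set the next group to 3 */
	printf("flags word = %#lx\n", atomic_load(&flags_word));
	return 0;
}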
6083 6083
6084 /* 6084 /*
6085 * This function checks whether the pageblock includes unmovable pages or not. 6085 * This function checks whether the pageblock includes unmovable pages or not.
6086 * If @count is not zero, up to @count unmovable pages are tolerated. 6086 * If @count is not zero, up to @count unmovable pages are tolerated.
6087 * 6087 *
6088 * The PageLRU check without isolation or lru_lock could race, so a 6088 * The PageLRU check without isolation or lru_lock could race, so a
6089 * MIGRATE_MOVABLE block might include unmovable pages. This means the 6089 * MIGRATE_MOVABLE block might include unmovable pages. This means the
6090 * function cannot be expected to be exact. 6090 * function cannot be expected to be exact.
6091 */ 6091 */
6092 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 6092 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6093 bool skip_hwpoisoned_pages) 6093 bool skip_hwpoisoned_pages)
6094 { 6094 {
6095 unsigned long pfn, iter, found; 6095 unsigned long pfn, iter, found;
6096 int mt; 6096 int mt;
6097 6097
6098 /* 6098 /*
6099 * To avoid counting pages still sitting on per-cpu LRU caches as unmovable, lru_add_drain_all() should be called first. 6099 * To avoid counting pages still sitting on per-cpu LRU caches as unmovable, lru_add_drain_all() should be called first.
6100 * A ZONE_MOVABLE zone never contains unmovable pages. 6100 * A ZONE_MOVABLE zone never contains unmovable pages.
6101 */ 6101 */
6102 if (zone_idx(zone) == ZONE_MOVABLE) 6102 if (zone_idx(zone) == ZONE_MOVABLE)
6103 return false; 6103 return false;
6104 mt = get_pageblock_migratetype(page); 6104 mt = get_pageblock_migratetype(page);
6105 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 6105 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
6106 return false; 6106 return false;
6107 6107
6108 pfn = page_to_pfn(page); 6108 pfn = page_to_pfn(page);
6109 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 6109 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
6110 unsigned long check = pfn + iter; 6110 unsigned long check = pfn + iter;
6111 6111
6112 if (!pfn_valid_within(check)) 6112 if (!pfn_valid_within(check))
6113 continue; 6113 continue;
6114 6114
6115 page = pfn_to_page(check); 6115 page = pfn_to_page(check);
6116 6116
6117 /* 6117 /*
6118 * Hugepages are not in LRU lists, but they're movable. 6118 * Hugepages are not in LRU lists, but they're movable.
6119 * We need not scan over tail pages because we don't 6119 * We need not scan over tail pages because we don't
6120 * handle each tail page individually in migration. 6120 * handle each tail page individually in migration.
6121 */ 6121 */
6122 if (PageHuge(page)) { 6122 if (PageHuge(page)) {
6123 iter = round_up(iter + 1, 1<<compound_order(page)) - 1; 6123 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
6124 continue; 6124 continue;
6125 } 6125 }
6126 6126
6127 /* 6127 /*
6128 * We can't use page_count without pinning the page 6128 * We can't use page_count without pinning the page
6129 * because another CPU can free the compound page. 6129 * because another CPU can free the compound page.
6130 * This check already skips compound tails of THP 6130 * This check already skips compound tails of THP
6131 * because their page->_count is zero at all times. 6131 * because their page->_count is zero at all times.
6132 */ 6132 */
6133 if (!atomic_read(&page->_count)) { 6133 if (!atomic_read(&page->_count)) {
6134 if (PageBuddy(page)) 6134 if (PageBuddy(page))
6135 iter += (1 << page_order(page)) - 1; 6135 iter += (1 << page_order(page)) - 1;
6136 continue; 6136 continue;
6137 } 6137 }
6138 6138
6139 /* 6139 /*
6140 * The HWPoisoned page may not be in the buddy system, and 6140 * The HWPoisoned page may not be in the buddy system, and
6141 * its page_count() is not 0. 6141 * its page_count() is not 0.
6142 */ 6142 */
6143 if (skip_hwpoisoned_pages && PageHWPoison(page)) 6143 if (skip_hwpoisoned_pages && PageHWPoison(page))
6144 continue; 6144 continue;
6145 6145
6146 if (!PageLRU(page)) 6146 if (!PageLRU(page))
6147 found++; 6147 found++;
6148 /* 6148 /*
6149 * If there are RECLAIMABLE pages, we need to check them too. 6149 * If there are RECLAIMABLE pages, we need to check them too.
6150 * But for now, memory offline itself doesn't call shrink_slab() 6150 * But for now, memory offline itself doesn't call shrink_slab()
6151 * and that still needs to be fixed. 6151 * and that still needs to be fixed.
6152 */ 6152 */
6153 /* 6153 /*
6154 * If the page is not RAM, page_count() should be 0. We don't 6154 * If the page is not RAM, page_count() should be 0. We don't
6155 * need any further checks. This is a _used_, not-movable page. 6155 * need any further checks. This is a _used_, not-movable page.
6156 * 6156 *
6157 * The problematic thing here is PG_reserved pages. PG_reserved 6157 * The problematic thing here is PG_reserved pages. PG_reserved
6158 * is set to both of a memory hole page and a _used_ kernel 6158 * is set to both of a memory hole page and a _used_ kernel
6159 * page at boot. 6159 * page at boot.
6160 */ 6160 */
6161 if (found > count) 6161 if (found > count)
6162 return true; 6162 return true;
6163 } 6163 }
6164 return false; 6164 return false;
6165 } 6165 }
6166 6166
6167 bool is_pageblock_removable_nolock(struct page *page) 6167 bool is_pageblock_removable_nolock(struct page *page)
6168 { 6168 {
6169 struct zone *zone; 6169 struct zone *zone;
6170 unsigned long pfn; 6170 unsigned long pfn;
6171 6171
6172 /* 6172 /*
6173 * We have to be careful here because we are iterating over memory 6173 * We have to be careful here because we are iterating over memory
6174 * sections which are not zone aware so we might end up outside of 6174 * sections which are not zone aware so we might end up outside of
6175 * the zone but still within the section. 6175 * the zone but still within the section.
6176 * We have to take care about the node as well. If the node is offline 6176 * We have to take care about the node as well. If the node is offline
6177 * its NODE_DATA will be NULL - see page_zone. 6177 * its NODE_DATA will be NULL - see page_zone.
6178 */ 6178 */
6179 if (!node_online(page_to_nid(page))) 6179 if (!node_online(page_to_nid(page)))
6180 return false; 6180 return false;
6181 6181
6182 zone = page_zone(page); 6182 zone = page_zone(page);
6183 pfn = page_to_pfn(page); 6183 pfn = page_to_pfn(page);
6184 if (!zone_spans_pfn(zone, pfn)) 6184 if (!zone_spans_pfn(zone, pfn))
6185 return false; 6185 return false;
6186 6186
6187 return !has_unmovable_pages(zone, page, 0, true); 6187 return !has_unmovable_pages(zone, page, 0, true);
6188 } 6188 }
6189 6189
6190 #ifdef CONFIG_CMA 6190 #ifdef CONFIG_CMA
6191 6191
6192 static unsigned long pfn_max_align_down(unsigned long pfn) 6192 static unsigned long pfn_max_align_down(unsigned long pfn)
6193 { 6193 {
6194 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 6194 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
6195 pageblock_nr_pages) - 1); 6195 pageblock_nr_pages) - 1);
6196 } 6196 }
6197 6197
6198 static unsigned long pfn_max_align_up(unsigned long pfn) 6198 static unsigned long pfn_max_align_up(unsigned long pfn)
6199 { 6199 {
6200 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 6200 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
6201 pageblock_nr_pages)); 6201 pageblock_nr_pages));
6202 } 6202 }
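Both helpers round the requested PFN range out to the larger of MAX_ORDER_NR_PAGES and pageblock_nr_pages, so isolation never cuts through a maximal buddy or a pageblock. The rough numbers below assume the common x86-64 defaults (MAX_ORDER 11, pageblock_order 9); the PFNs themselves are arbitrary examples.

/* Illustrative only: rounding a PFN range out to the larger alignment unit. */
#include <stdio.h>

#define MAX_ORDER_NR_PAGES   1024UL   /* assumption: MAX_ORDER = 11 */
#define PAGEBLOCK_NR_PAGES    512UL   /* assumption: pageblock_order = 9 */

static unsigned long align_unit(void)
{
	return MAX_ORDER_NR_PAGES > PAGEBLOCK_NR_PAGES ?
	       MAX_ORDER_NR_PAGES : PAGEBLOCK_NR_PAGES;
}

static unsigned long pfn_align_down(unsigned long pfn)
{
	return pfn & ~(align_unit() - 1);
}

static unsigned long pfn_align_up(unsigned long pfn)
{
	return (pfn + align_unit() - 1) & ~(align_unit() - 1);
}

int main(void)
{
	unsigned long start = 262500, end = 270000;    /* arbitrary example PFNs */

	/* The wider, aligned range is what actually gets isolated. */
	printf("isolate [%lu, %lu) instead of [%lu, %lu)\n",
	       pfn_align_down(start), pfn_align_up(end), start, end);
	return 0;
}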
6203 6203
6204 /* [start, end) must belong to a single zone. */ 6204 /* [start, end) must belong to a single zone. */
6205 static int __alloc_contig_migrate_range(struct compact_control *cc, 6205 static int __alloc_contig_migrate_range(struct compact_control *cc,
6206 unsigned long start, unsigned long end) 6206 unsigned long start, unsigned long end)
6207 { 6207 {
6208 /* This function is based on compact_zone() from compaction.c. */ 6208 /* This function is based on compact_zone() from compaction.c. */
6209 unsigned long nr_reclaimed; 6209 unsigned long nr_reclaimed;
6210 unsigned long pfn = start; 6210 unsigned long pfn = start;
6211 unsigned int tries = 0; 6211 unsigned int tries = 0;
6212 int ret = 0; 6212 int ret = 0;
6213 6213
6214 migrate_prep(); 6214 migrate_prep();
6215 6215
6216 while (pfn < end || !list_empty(&cc->migratepages)) { 6216 while (pfn < end || !list_empty(&cc->migratepages)) {
6217 if (fatal_signal_pending(current)) { 6217 if (fatal_signal_pending(current)) {
6218 ret = -EINTR; 6218 ret = -EINTR;
6219 break; 6219 break;
6220 } 6220 }
6221 6221
6222 if (list_empty(&cc->migratepages)) { 6222 if (list_empty(&cc->migratepages)) {
6223 cc->nr_migratepages = 0; 6223 cc->nr_migratepages = 0;
6224 pfn = isolate_migratepages_range(cc->zone, cc, 6224 pfn = isolate_migratepages_range(cc->zone, cc,
6225 pfn, end, true); 6225 pfn, end, true);
6226 if (!pfn) { 6226 if (!pfn) {
6227 ret = -EINTR; 6227 ret = -EINTR;
6228 break; 6228 break;
6229 } 6229 }
6230 tries = 0; 6230 tries = 0;
6231 } else if (++tries == 5) { 6231 } else if (++tries == 5) {
6232 ret = ret < 0 ? ret : -EBUSY; 6232 ret = ret < 0 ? ret : -EBUSY;
6233 break; 6233 break;
6234 } 6234 }
6235 6235
6236 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 6236 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
6237 &cc->migratepages); 6237 &cc->migratepages);
6238 cc->nr_migratepages -= nr_reclaimed; 6238 cc->nr_migratepages -= nr_reclaimed;
6239 6239
6240 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 6240 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
6241 NULL, 0, cc->mode, MR_CMA); 6241 NULL, 0, cc->mode, MR_CMA);
6242 } 6242 }
6243 if (ret < 0) { 6243 if (ret < 0) {
6244 putback_movable_pages(&cc->migratepages); 6244 putback_movable_pages(&cc->migratepages);
6245 return ret; 6245 return ret;
6246 } 6246 }
6247 return 0; 6247 return 0;
6248 } 6248 }
6249 6249
6250 /** 6250 /**
6251 * alloc_contig_range() -- tries to allocate given range of pages 6251 * alloc_contig_range() -- tries to allocate given range of pages
6252 * @start: start PFN to allocate 6252 * @start: start PFN to allocate
6253 * @end: one-past-the-last PFN to allocate 6253 * @end: one-past-the-last PFN to allocate
6254 * @migratetype: migratetype of the underlying pageblocks (either 6254 * @migratetype: migratetype of the underlying pageblocks (either
6255 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 6255 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
6256 * in range must have the same migratetype and it must 6256 * in range must have the same migratetype and it must
6257 * be either of the two. 6257 * be either of the two.
6258 * 6258 *
6259 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 6259 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
6260 * aligned; however, it's the caller's responsibility to guarantee that 6260 * aligned; however, it's the caller's responsibility to guarantee that
6261 * we are the only thread that changes the migrate type of the pageblocks the 6261 * we are the only thread that changes the migrate type of the pageblocks the
6262 * pages fall in. 6262 * pages fall in.
6263 * 6263 *
6264 * The PFN range must belong to a single zone. 6264 * The PFN range must belong to a single zone.
6265 * 6265 *
6266 * Returns zero on success or a negative error code. On success, all 6266 * Returns zero on success or a negative error code. On success, all
6267 * pages whose PFN is in [start, end) are allocated for the caller and 6267 * pages whose PFN is in [start, end) are allocated for the caller and
6268 * need to be freed with free_contig_range(). 6268 * need to be freed with free_contig_range().
6269 */ 6269 */
6270 int alloc_contig_range(unsigned long start, unsigned long end, 6270 int alloc_contig_range(unsigned long start, unsigned long end,
6271 unsigned migratetype) 6271 unsigned migratetype)
6272 { 6272 {
6273 unsigned long outer_start, outer_end; 6273 unsigned long outer_start, outer_end;
6274 int ret = 0, order; 6274 int ret = 0, order;
6275 6275
6276 struct compact_control cc = { 6276 struct compact_control cc = {
6277 .nr_migratepages = 0, 6277 .nr_migratepages = 0,
6278 .order = -1, 6278 .order = -1,
6279 .zone = page_zone(pfn_to_page(start)), 6279 .zone = page_zone(pfn_to_page(start)),
6280 .mode = MIGRATE_SYNC, 6280 .mode = MIGRATE_SYNC,
6281 .ignore_skip_hint = true, 6281 .ignore_skip_hint = true,
6282 }; 6282 };
6283 INIT_LIST_HEAD(&cc.migratepages); 6283 INIT_LIST_HEAD(&cc.migratepages);
6284 6284
6285 /* 6285 /*
6286 * What we do here is we mark all pageblocks in range as 6286 * What we do here is we mark all pageblocks in range as
6287 * MIGRATE_ISOLATE. Because pageblock and max order pages may 6287 * MIGRATE_ISOLATE. Because pageblock and max order pages may
6288 * have different sizes, and due to the way the page allocator 6288 * have different sizes, and due to the way the page allocator
6289 * works, we align the range to the bigger of the two sizes so 6289 * works, we align the range to the bigger of the two sizes so
6290 * that the page allocator won't try to merge buddies from 6290 * that the page allocator won't try to merge buddies from
6291 * different pageblocks and change MIGRATE_ISOLATE to some 6291 * different pageblocks and change MIGRATE_ISOLATE to some
6292 * other migration type. 6292 * other migration type.
6293 * 6293 *
6294 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 6294 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
6295 * migrate the pages from an unaligned range (ie. pages that 6295 * migrate the pages from an unaligned range (ie. pages that
6296 * we are interested in). This will put all the pages in 6296 * we are interested in). This will put all the pages in
6297 * range back to page allocator as MIGRATE_ISOLATE. 6297 * range back to page allocator as MIGRATE_ISOLATE.
6298 * 6298 *
6299 * When this is done, we take the pages in range from page 6299 * When this is done, we take the pages in range from page
6300 * allocator removing them from the buddy system. This way 6300 * allocator removing them from the buddy system. This way
6301 * page allocator will never consider using them. 6301 * page allocator will never consider using them.
6302 * 6302 *
6303 * This lets us mark the pageblocks back as 6303 * This lets us mark the pageblocks back as
6304 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 6304 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
6305 * aligned range but not in the unaligned, original range are 6305 * aligned range but not in the unaligned, original range are
6306 * put back to page allocator so that buddy can use them. 6306 * put back to page allocator so that buddy can use them.
6307 */ 6307 */
6308 6308
6309 ret = start_isolate_page_range(pfn_max_align_down(start), 6309 ret = start_isolate_page_range(pfn_max_align_down(start),
6310 pfn_max_align_up(end), migratetype, 6310 pfn_max_align_up(end), migratetype,
6311 false); 6311 false);
6312 if (ret) 6312 if (ret)
6313 return ret; 6313 return ret;
6314 6314
6315 ret = __alloc_contig_migrate_range(&cc, start, end); 6315 ret = __alloc_contig_migrate_range(&cc, start, end);
6316 if (ret) 6316 if (ret)
6317 goto done; 6317 goto done;
6318 6318
6319 /* 6319 /*
6320 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 6320 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
6321 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 6321 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
6322 * more, all pages in [start, end) are free in page allocator. 6322 * more, all pages in [start, end) are free in page allocator.
6323 * What we are going to do is to allocate all pages from 6323 * What we are going to do is to allocate all pages from
6324 * [start, end) (that is remove them from page allocator). 6324 * [start, end) (that is remove them from page allocator).
6325 * 6325 *
6326 * The only problem is that pages at the beginning and at the 6326 * The only problem is that pages at the beginning and at the
6327 * end of the interesting range may not be aligned with pages that 6327 * end of the interesting range may not be aligned with pages that
6328 * page allocator holds, ie. they can be part of higher order 6328 * page allocator holds, ie. they can be part of higher order
6329 * pages. Because of this, we reserve the bigger range and 6329 * pages. Because of this, we reserve the bigger range and
6330 * once this is done free the pages we are not interested in. 6330 * once this is done free the pages we are not interested in.
6331 * 6331 *
6332 * We don't have to hold zone->lock here because the pages are 6332 * We don't have to hold zone->lock here because the pages are
6333 * isolated thus they won't get removed from buddy. 6333 * isolated thus they won't get removed from buddy.
6334 */ 6334 */
6335 6335
6336 lru_add_drain_all(); 6336 lru_add_drain_all();
6337 drain_all_pages(); 6337 drain_all_pages();
6338 6338
6339 order = 0; 6339 order = 0;
6340 outer_start = start; 6340 outer_start = start;
6341 while (!PageBuddy(pfn_to_page(outer_start))) { 6341 while (!PageBuddy(pfn_to_page(outer_start))) {
6342 if (++order >= MAX_ORDER) { 6342 if (++order >= MAX_ORDER) {
6343 ret = -EBUSY; 6343 ret = -EBUSY;
6344 goto done; 6344 goto done;
6345 } 6345 }
6346 outer_start &= ~0UL << order; 6346 outer_start &= ~0UL << order;
6347 } 6347 }
6348 6348
6349 /* Make sure the range is really isolated. */ 6349 /* Make sure the range is really isolated. */
6350 if (test_pages_isolated(outer_start, end, false)) { 6350 if (test_pages_isolated(outer_start, end, false)) {
6351 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 6351 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
6352 outer_start, end); 6352 outer_start, end);
6353 ret = -EBUSY; 6353 ret = -EBUSY;
6354 goto done; 6354 goto done;
6355 } 6355 }
6356 6356
6357 6357
6358 /* Grab isolated pages from freelists. */ 6358 /* Grab isolated pages from freelists. */
6359 outer_end = isolate_freepages_range(&cc, outer_start, end); 6359 outer_end = isolate_freepages_range(&cc, outer_start, end);
6360 if (!outer_end) { 6360 if (!outer_end) {
6361 ret = -EBUSY; 6361 ret = -EBUSY;
6362 goto done; 6362 goto done;
6363 } 6363 }
6364 6364
6365 /* Free head and tail (if any) */ 6365 /* Free head and tail (if any) */
6366 if (start != outer_start) 6366 if (start != outer_start)
6367 free_contig_range(outer_start, start - outer_start); 6367 free_contig_range(outer_start, start - outer_start);
6368 if (end != outer_end) 6368 if (end != outer_end)
6369 free_contig_range(end, outer_end - end); 6369 free_contig_range(end, outer_end - end);
6370 6370
6371 done: 6371 done:
6372 undo_isolate_page_range(pfn_max_align_down(start), 6372 undo_isolate_page_range(pfn_max_align_down(start),
6373 pfn_max_align_up(end), migratetype); 6373 pfn_max_align_up(end), migratetype);
6374 return ret; 6374 return ret;
6375 } 6375 }
6376 6376
6377 void free_contig_range(unsigned long pfn, unsigned nr_pages) 6377 void free_contig_range(unsigned long pfn, unsigned nr_pages)
6378 { 6378 {
6379 unsigned int count = 0; 6379 unsigned int count = 0;
6380 6380
6381 for (; nr_pages--; pfn++) { 6381 for (; nr_pages--; pfn++) {
6382 struct page *page = pfn_to_page(pfn); 6382 struct page *page = pfn_to_page(pfn);
6383 6383
6384 count += page_count(page) != 1; 6384 count += page_count(page) != 1;
6385 __free_page(page); 6385 __free_page(page);
6386 } 6386 }
6387 WARN(count != 0, "%d pages are still in use!\n", count); 6387 WARN(count != 0, "%d pages are still in use!\n", count);
6388 } 6388 }
6389 #endif 6389 #endif
6390 6390
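Taken together, the kernel-doc and block comments above describe the expected calling pattern: pick a PFN range that lies in a single zone and whose pageblocks were reserved as MIGRATE_CMA or MIGRATE_MOVABLE, call alloc_contig_range(), use the pages, then hand every page back (still with refcount 1) via free_contig_range(). A minimal caller sketch follows; the helper names and the assumption that the range was reserved elsewhere (for example by a CMA-style area) are illustrative, not part of the code above.

	/*
	 * Illustrative sketch only: allocate nr_pages physically contiguous
	 * pages starting at base_pfn and return the first struct page.
	 * Assumes [base_pfn, base_pfn + nr_pages) sits in one zone and its
	 * pageblocks were set to MIGRATE_CMA beforehand.
	 */
	static struct page *grab_contig_pages_sketch(unsigned long base_pfn,
						     unsigned long nr_pages)
	{
		int ret;

		ret = alloc_contig_range(base_pfn, base_pfn + nr_pages,
					 MIGRATE_CMA);
		if (ret)	/* -EINTR, -EBUSY, ... from the paths above */
			return NULL;

		return pfn_to_page(base_pfn);
	}

	static void put_contig_pages_sketch(struct page *page,
					    unsigned long nr_pages)
	{
		/* each page must still have refcount 1, or the WARN above fires */
		free_contig_range(page_to_pfn(page), nr_pages);
	}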
6391 #ifdef CONFIG_MEMORY_HOTPLUG 6391 #ifdef CONFIG_MEMORY_HOTPLUG
6392 /* 6392 /*
6393 * The zone indicated has a new number of managed_pages; batch sizes and percpu 6393 * The zone indicated has a new number of managed_pages; batch sizes and percpu
6394 * page high values need to be recalculated. 6394 * page high values need to be recalculated.
6395 */ 6395 */
6396 void __meminit zone_pcp_update(struct zone *zone) 6396 void __meminit zone_pcp_update(struct zone *zone)
6397 { 6397 {
6398 unsigned cpu; 6398 unsigned cpu;
6399 mutex_lock(&pcp_batch_high_lock); 6399 mutex_lock(&pcp_batch_high_lock);
6400 for_each_possible_cpu(cpu) 6400 for_each_possible_cpu(cpu)
6401 pageset_set_high_and_batch(zone, 6401 pageset_set_high_and_batch(zone,
6402 per_cpu_ptr(zone->pageset, cpu)); 6402 per_cpu_ptr(zone->pageset, cpu));
6403 mutex_unlock(&pcp_batch_high_lock); 6403 mutex_unlock(&pcp_batch_high_lock);
6404 } 6404 }
6405 #endif 6405 #endif
6406 6406
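zone_pcp_update() is only useful after zone->managed_pages has changed, which in this tree happens from the memory hotplug paths; the per-cpu ->batch and ->high values are derived from the zone size and go stale otherwise. A hedged sketch of such a caller (the helper name and the way the delta is applied are illustrative, not the actual hotplug code):

	/*
	 * Illustrative only (CONFIG_MEMORY_HOTPLUG): after an online/offline
	 * event has changed the number of pages the zone manages, recompute
	 * the per-cpu batch/high values on every possible CPU.
	 */
	static void zone_resized_sketch(struct zone *zone, long nr_pages_delta)
	{
		zone->managed_pages += nr_pages_delta;	/* assumed done by hotplug code */
		zone_pcp_update(zone);
	}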
6407 void zone_pcp_reset(struct zone *zone) 6407 void zone_pcp_reset(struct zone *zone)
6408 { 6408 {
6409 unsigned long flags; 6409 unsigned long flags;
6410 int cpu; 6410 int cpu;
6411 struct per_cpu_pageset *pset; 6411 struct per_cpu_pageset *pset;
6412 6412
6413 /* avoid races with drain_pages() */ 6413 /* avoid races with drain_pages() */
6414 local_irq_save(flags); 6414 local_irq_save(flags);
6415 if (zone->pageset != &boot_pageset) { 6415 if (zone->pageset != &boot_pageset) {
6416 for_each_online_cpu(cpu) { 6416 for_each_online_cpu(cpu) {
6417 pset = per_cpu_ptr(zone->pageset, cpu); 6417 pset = per_cpu_ptr(zone->pageset, cpu);
6418 drain_zonestat(zone, pset); 6418 drain_zonestat(zone, pset);
6419 } 6419 }
6420 free_percpu(zone->pageset); 6420 free_percpu(zone->pageset);
6421 zone->pageset = &boot_pageset; 6421 zone->pageset = &boot_pageset;
6422 } 6422 }
6423 local_irq_restore(flags); 6423 local_irq_restore(flags);
6424 } 6424 }
6425 6425
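zone_pcp_reset() covers the opposite case: a zone that has lost all of its memory folds its per-cpu vmstat deltas back into the zone counters, frees the dynamically allocated pagesets and falls back to the static boot_pageset. A hypothetical teardown-style caller (not the kernel's actual offline path):

	/*
	 * Illustrative sketch: once a zone has been emptied, e.g. by a
	 * hotremove-style operation, drop its per-cpu pagesets.
	 */
	static void zone_emptied_sketch(struct zone *zone)
	{
		if (!populated_zone(zone))
			zone_pcp_reset(zone);
	}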
6426 #ifdef CONFIG_MEMORY_HOTREMOVE 6426 #ifdef CONFIG_MEMORY_HOTREMOVE
6427 /* 6427 /*
6428 * All pages in the range must be isolated before calling this. 6428 * All pages in the range must be isolated before calling this.
6429 */ 6429 */
6430 void 6430 void
6431 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 6431 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6432 { 6432 {
6433 struct page *page; 6433 struct page *page;
6434 struct zone *zone; 6434 struct zone *zone;
6435 unsigned int order, i; 6435 unsigned int order, i;
6436 unsigned long pfn; 6436 unsigned long pfn;
6437 unsigned long flags; 6437 unsigned long flags;
6438 /* find the first valid pfn */ 6438 /* find the first valid pfn */
6439 for (pfn = start_pfn; pfn < end_pfn; pfn++) 6439 for (pfn = start_pfn; pfn < end_pfn; pfn++)
6440 if (pfn_valid(pfn)) 6440 if (pfn_valid(pfn))
6441 break; 6441 break;
6442 if (pfn == end_pfn) 6442 if (pfn == end_pfn)
6443 return; 6443 return;
6444 zone = page_zone(pfn_to_page(pfn)); 6444 zone = page_zone(pfn_to_page(pfn));
6445 spin_lock_irqsave(&zone->lock, flags); 6445 spin_lock_irqsave(&zone->lock, flags);
6446 pfn = start_pfn; 6446 pfn = start_pfn;
6447 while (pfn < end_pfn) { 6447 while (pfn < end_pfn) {
6448 if (!pfn_valid(pfn)) { 6448 if (!pfn_valid(pfn)) {
6449 pfn++; 6449 pfn++;
6450 continue; 6450 continue;
6451 } 6451 }
6452 page = pfn_to_page(pfn); 6452 page = pfn_to_page(pfn);
6453 /* 6453 /*
6454 * The HWPoisoned page may not be in the buddy system, and 6454 * The HWPoisoned page may not be in the buddy system, and
6455 * page_count() is not 0. 6455 * page_count() is not 0.
6456 */ 6456 */
6457 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 6457 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6458 pfn++; 6458 pfn++;
6459 SetPageReserved(page); 6459 SetPageReserved(page);
6460 continue; 6460 continue;
6461 } 6461 }
6462 6462
6463 BUG_ON(page_count(page)); 6463 BUG_ON(page_count(page));
6464 BUG_ON(!PageBuddy(page)); 6464 BUG_ON(!PageBuddy(page));
6465 order = page_order(page); 6465 order = page_order(page);
6466 #ifdef CONFIG_DEBUG_VM 6466 #ifdef CONFIG_DEBUG_VM
6467 printk(KERN_INFO "remove from free list %lx %d %lx\n", 6467 printk(KERN_INFO "remove from free list %lx %d %lx\n",
6468 pfn, 1 << order, end_pfn); 6468 pfn, 1 << order, end_pfn);
6469 #endif 6469 #endif
6470 list_del(&page->lru); 6470 list_del(&page->lru);
6471 rmv_page_order(page); 6471 rmv_page_order(page);
6472 zone->free_area[order].nr_free--; 6472 zone->free_area[order].nr_free--;
6473 for (i = 0; i < (1 << order); i++) 6473 for (i = 0; i < (1 << order); i++)
6474 SetPageReserved((page+i)); 6474 SetPageReserved((page+i));
6475 pfn += (1 << order); 6475 pfn += (1 << order);
6476 } 6476 }
6477 spin_unlock_irqrestore(&zone->lock, flags); 6477 spin_unlock_irqrestore(&zone->lock, flags);
6478 } 6478 }
6479 #endif 6479 #endif
6480 6480
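The scan in __offline_isolated_pages() advances in buddy-sized steps: a free page of order N owns 1 << N contiguous PFNs, every one of which is marked Reserved before the loop skips past the whole buddy. A small worked example of that arithmetic (the helper and the numbers are illustrative):

	/*
	 * Worked example: a free buddy of order 3 starting at pfn 0x1000
	 * covers pfns 0x1000..0x1007, so its 8 struct pages are reserved
	 * and the scan resumes at pfn 0x1008.
	 */
	static unsigned long reserve_buddy_sketch(struct page *page,
						  unsigned long pfn,
						  unsigned int order)
	{
		unsigned long i;

		for (i = 0; i < (1UL << order); i++)
			SetPageReserved(page + i);

		return pfn + (1UL << order);	/* next pfn to examine */
	}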
6481 #ifdef CONFIG_MEMORY_FAILURE 6481 #ifdef CONFIG_MEMORY_FAILURE
6482 bool is_free_buddy_page(struct page *page) 6482 bool is_free_buddy_page(struct page *page)
6483 { 6483 {
6484 struct zone *zone = page_zone(page); 6484 struct zone *zone = page_zone(page);
6485 unsigned long pfn = page_to_pfn(page); 6485 unsigned long pfn = page_to_pfn(page);
6486 unsigned long flags; 6486 unsigned long flags;
6487 unsigned int order; 6487 unsigned int order;
6488 6488
6489 spin_lock_irqsave(&zone->lock, flags); 6489 spin_lock_irqsave(&zone->lock, flags);
6490 for (order = 0; order < MAX_ORDER; order++) { 6490 for (order = 0; order < MAX_ORDER; order++) {
6491 struct page *page_head = page - (pfn & ((1 << order) - 1)); 6491 struct page *page_head = page - (pfn & ((1 << order) - 1));
6492 6492
6493 if (PageBuddy(page_head) && page_order(page_head) >= order) 6493 if (PageBuddy(page_head) && page_order(page_head) >= order)
6494 break; 6494 break;
6495 } 6495 }
6496 spin_unlock_irqrestore(&zone->lock, flags); 6496 spin_unlock_irqrestore(&zone->lock, flags);
6497 6497
6498 return order < MAX_ORDER; 6498 return order < MAX_ORDER;
6499 } 6499 }
6500 #endif 6500 #endif
6501 6501
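The loop in is_free_buddy_page() walks up the candidate buddy sizes: at each order it masks the low bits off the PFN to find the would-be buddy head, and the page is free exactly when some head below MAX_ORDER is PageBuddy() with a large enough page_order(). A worked example of the head computation (numbers are illustrative):

	/*
	 * Worked example: for pfn 0x1235 at order 4, pfn & ((1 << 4) - 1) == 5,
	 * so the candidate head sits 5 struct pages earlier, at pfn 0x1230.
	 * If that head is PageBuddy() with page_order() >= 4, then pfn 0x1235
	 * lies inside a free order-4 (or larger) buddy.
	 */
	static struct page *buddy_head_sketch(struct page *page,
					      unsigned long pfn,
					      unsigned int order)
	{
		return page - (pfn & ((1UL << order) - 1));
	}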
6502 static const struct trace_print_flags pageflag_names[] = { 6502 static const struct trace_print_flags pageflag_names[] = {
6503 {1UL << PG_locked, "locked" }, 6503 {1UL << PG_locked, "locked" },
6504 {1UL << PG_error, "error" }, 6504 {1UL << PG_error, "error" },
6505 {1UL << PG_referenced, "referenced" }, 6505 {1UL << PG_referenced, "referenced" },
6506 {1UL << PG_uptodate, "uptodate" }, 6506 {1UL << PG_uptodate, "uptodate" },
6507 {1UL << PG_dirty, "dirty" }, 6507 {1UL << PG_dirty, "dirty" },
6508 {1UL << PG_lru, "lru" }, 6508 {1UL << PG_lru, "lru" },
6509 {1UL << PG_active, "active" }, 6509 {1UL << PG_active, "active" },
6510 {1UL << PG_slab, "slab" }, 6510 {1UL << PG_slab, "slab" },
6511 {1UL << PG_owner_priv_1, "owner_priv_1" }, 6511 {1UL << PG_owner_priv_1, "owner_priv_1" },
6512 {1UL << PG_arch_1, "arch_1" }, 6512 {1UL << PG_arch_1, "arch_1" },
6513 {1UL << PG_reserved, "reserved" }, 6513 {1UL << PG_reserved, "reserved" },
6514 {1UL << PG_private, "private" }, 6514 {1UL << PG_private, "private" },
6515 {1UL << PG_private_2, "private_2" }, 6515 {1UL << PG_private_2, "private_2" },
6516 {1UL << PG_writeback, "writeback" }, 6516 {1UL << PG_writeback, "writeback" },
6517 #ifdef CONFIG_PAGEFLAGS_EXTENDED 6517 #ifdef CONFIG_PAGEFLAGS_EXTENDED
6518 {1UL << PG_head, "head" }, 6518 {1UL << PG_head, "head" },
6519 {1UL << PG_tail, "tail" }, 6519 {1UL << PG_tail, "tail" },
6520 #else 6520 #else
6521 {1UL << PG_compound, "compound" }, 6521 {1UL << PG_compound, "compound" },
6522 #endif 6522 #endif
6523 {1UL << PG_swapcache, "swapcache" }, 6523 {1UL << PG_swapcache, "swapcache" },
6524 {1UL << PG_mappedtodisk, "mappedtodisk" }, 6524 {1UL << PG_mappedtodisk, "mappedtodisk" },
6525 {1UL << PG_reclaim, "reclaim" }, 6525 {1UL << PG_reclaim, "reclaim" },
6526 {1UL << PG_swapbacked, "swapbacked" }, 6526 {1UL << PG_swapbacked, "swapbacked" },
6527 {1UL << PG_unevictable, "unevictable" }, 6527 {1UL << PG_unevictable, "unevictable" },
6528 #ifdef CONFIG_MMU 6528 #ifdef CONFIG_MMU
6529 {1UL << PG_mlocked, "mlocked" }, 6529 {1UL << PG_mlocked, "mlocked" },
6530 #endif 6530 #endif
6531 #ifdef CONFIG_ARCH_USES_PG_UNCACHED 6531 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
6532 {1UL << PG_uncached, "uncached" }, 6532 {1UL << PG_uncached, "uncached" },
6533 #endif 6533 #endif
6534 #ifdef CONFIG_MEMORY_FAILURE 6534 #ifdef CONFIG_MEMORY_FAILURE
6535 {1UL << PG_hwpoison, "hwpoison" }, 6535 {1UL << PG_hwpoison, "hwpoison" },
6536 #endif 6536 #endif
6537 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6537 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6538 {1UL << PG_compound_lock, "compound_lock" }, 6538 {1UL << PG_compound_lock, "compound_lock" },
6539 #endif 6539 #endif
6540 }; 6540 };
6541 6541
6542 static void dump_page_flags(unsigned long flags) 6542 static void dump_page_flags(unsigned long flags)
6543 { 6543 {
6544 const char *delim = ""; 6544 const char *delim = "";
6545 unsigned long mask; 6545 unsigned long mask;
6546 int i; 6546 int i;
6547 6547
6548 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 6548 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6549 6549
6550 printk(KERN_ALERT "page flags: %#lx(", flags); 6550 printk(KERN_ALERT "page flags: %#lx(", flags);
6551 6551
6552 /* remove zone id */ 6552 /* remove zone id */
6553 flags &= (1UL << NR_PAGEFLAGS) - 1; 6553 flags &= (1UL << NR_PAGEFLAGS) - 1;
6554 6554
6555 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { 6555 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6556 6556
6557 mask = pageflag_names[i].mask; 6557 mask = pageflag_names[i].mask;
6558 if ((flags & mask) != mask) 6558 if ((flags & mask) != mask)
6559 continue; 6559 continue;
6560 6560
6561 flags &= ~mask; 6561 flags &= ~mask;
6562 printk("%s%s", delim, pageflag_names[i].name); 6562 printk("%s%s", delim, pageflag_names[i].name);
6563 delim = "|"; 6563 delim = "|";
6564 } 6564 }
6565 6565
6566 /* check for left over flags */ 6566 /* check for left over flags */
6567 if (flags) 6567 if (flags)
6568 printk("%s%#lx", delim, flags); 6568 printk("%s%#lx", delim, flags);
6569 6569
6570 printk(")\n"); 6570 printk(")\n");
6571 } 6571 }
6572 6572
6573 void dump_page(struct page *page) 6573 void dump_page(struct page *page)
6574 { 6574 {
6575 printk(KERN_ALERT 6575 printk(KERN_ALERT
6576 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6576 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6577 page, atomic_read(&page->_count), page_mapcount(page), 6577 page, atomic_read(&page->_count), page_mapcount(page),
6578 page->mapping, page->index); 6578 page->mapping, page->index);
6579 dump_page_flags(page->flags); 6579 dump_page_flags(page->flags);
6580 mem_cgroup_print_bad_page(page); 6580 mem_cgroup_print_bad_page(page);
6581 } 6581 }
6582 6582
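dump_page() is the usual entry point for ad-hoc page debugging: it prints the refcount, mapcount, mapping and index, then decodes page->flags through pageflag_names, emitting the recognised bits separated by '|' and any leftover bits as a raw hex value. A hypothetical debug check using it (the helper and the condition are made up for illustration; only dump_page() itself comes from the code above):

	/*
	 * Hypothetical debugging helper: complain loudly about a page that
	 * is unexpectedly still mapped, and dump its state for inspection.
	 */
	static void check_page_unmapped_sketch(struct page *page)
	{
		if (page_mapcount(page)) {
			pr_alert("unexpectedly mapped page:\n");
			dump_page(page);
		}
	}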