Commit 2a6f512412c7aecd04134721ea392cc496e6c017

Authored by Srinivas Pandruvada
Committed by Linus Torvalds
1 parent ffb22af5b7

CMA: make putback_lru_pages() call conditional

As per the documentation and the other callers of putback_lru_pages(),
putback_lru_pages() should be called on error only.  Make the CMA code
behave consistently.

[akpm@linux-foundation.org: remove a test-n-branch in the wrapup code]
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Acked-by: Michal Nazarewicz <mina86@mina86.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
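
The changed hunk itself is not part of the portion of the diff shown below. As a rough, hedged illustration of the calling convention the commit message describes, here is a minimal self-contained C sketch: do_migrate() and putback_to_lru() are hypothetical stand-ins, not the kernel's migrate_pages()/putback_lru_pages() signatures; only the control flow (put back on error only) is the point.

/*
 * Sketch only: models "call the putback helper on error only".
 * Names and failure condition are illustrative assumptions.
 */
#include <stdio.h>

struct isolated_list {
	int nr_pages;		/* pages isolated from the LRU, awaiting migration */
};

/* Pretend migration: 0 on success, negative error code on failure. */
static int do_migrate(struct isolated_list *list)
{
	if (list->nr_pages > 8)		/* arbitrary stand-in failure condition */
		return -1;
	list->nr_pages = 0;		/* on success every page was moved away */
	return 0;
}

/* Pretend putback: return still-isolated pages to the LRU lists. */
static void putback_to_lru(struct isolated_list *list)
{
	printf("putting back %d pages\n", list->nr_pages);
	list->nr_pages = 0;
}

/* The pattern the patch moves the CMA path to: putback only on error. */
static int migrate_range(struct isolated_list *list)
{
	int ret = do_migrate(list);

	if (ret < 0) {
		putback_to_lru(list);
		return ret;
	}
	return 0;
}

int main(void)
{
	struct isolated_list small = { .nr_pages = 4 };
	struct isolated_list large = { .nr_pages = 16 };

	printf("small range -> %d\n", migrate_range(&small));	/* succeeds, no putback */
	printf("large range -> %d\n", migrate_range(&large));	/* fails, putback runs */
	return 0;
}

On success the migration path has already drained the list, so as the commit message reads the contract, the putback step is only meaningful in the error branch; the patch makes the CMA path follow that convention like the other callers.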

Showing 1 changed file, mm/page_alloc.c, with 5 additions and 3 deletions:

1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/compaction.h> 54 #include <linux/compaction.h>
55 #include <trace/events/kmem.h> 55 #include <trace/events/kmem.h>
56 #include <linux/ftrace_event.h> 56 #include <linux/ftrace_event.h>
57 #include <linux/memcontrol.h> 57 #include <linux/memcontrol.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/migrate.h> 59 #include <linux/migrate.h>
60 #include <linux/page-debug-flags.h> 60 #include <linux/page-debug-flags.h>
61 #include <linux/sched/rt.h> 61 #include <linux/sched/rt.h>
62 62
63 #include <asm/tlbflush.h> 63 #include <asm/tlbflush.h>
64 #include <asm/div64.h> 64 #include <asm/div64.h>
65 #include "internal.h" 65 #include "internal.h"
66 66
67 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 67 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
68 DEFINE_PER_CPU(int, numa_node); 68 DEFINE_PER_CPU(int, numa_node);
69 EXPORT_PER_CPU_SYMBOL(numa_node); 69 EXPORT_PER_CPU_SYMBOL(numa_node);
70 #endif 70 #endif
71 71
72 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 72 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
73 /* 73 /*
74 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 74 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
75 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 75 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
76 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 76 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
77 * defined in <linux/topology.h>. 77 * defined in <linux/topology.h>.
78 */ 78 */
79 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 79 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
80 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 80 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
81 #endif 81 #endif
82 82
83 /* 83 /*
84 * Array of node states. 84 * Array of node states.
85 */ 85 */
86 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 86 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
87 [N_POSSIBLE] = NODE_MASK_ALL, 87 [N_POSSIBLE] = NODE_MASK_ALL,
88 [N_ONLINE] = { { [0] = 1UL } }, 88 [N_ONLINE] = { { [0] = 1UL } },
89 #ifndef CONFIG_NUMA 89 #ifndef CONFIG_NUMA
90 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 90 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
91 #ifdef CONFIG_HIGHMEM 91 #ifdef CONFIG_HIGHMEM
92 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 92 [N_HIGH_MEMORY] = { { [0] = 1UL } },
93 #endif 93 #endif
94 #ifdef CONFIG_MOVABLE_NODE 94 #ifdef CONFIG_MOVABLE_NODE
95 [N_MEMORY] = { { [0] = 1UL } }, 95 [N_MEMORY] = { { [0] = 1UL } },
96 #endif 96 #endif
97 [N_CPU] = { { [0] = 1UL } }, 97 [N_CPU] = { { [0] = 1UL } },
98 #endif /* NUMA */ 98 #endif /* NUMA */
99 }; 99 };
100 EXPORT_SYMBOL(node_states); 100 EXPORT_SYMBOL(node_states);
101 101
102 unsigned long totalram_pages __read_mostly; 102 unsigned long totalram_pages __read_mostly;
103 unsigned long totalreserve_pages __read_mostly; 103 unsigned long totalreserve_pages __read_mostly;
104 /* 104 /*
105 * When calculating the number of globally allowed dirty pages, there 105 * When calculating the number of globally allowed dirty pages, there
106 * is a certain number of per-zone reserves that should not be 106 * is a certain number of per-zone reserves that should not be
107 * considered dirtyable memory. This is the sum of those reserves 107 * considered dirtyable memory. This is the sum of those reserves
108 * over all existing zones that contribute dirtyable memory. 108 * over all existing zones that contribute dirtyable memory.
109 */ 109 */
110 unsigned long dirty_balance_reserve __read_mostly; 110 unsigned long dirty_balance_reserve __read_mostly;
111 111
112 int percpu_pagelist_fraction; 112 int percpu_pagelist_fraction;
113 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 113 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
114 114
115 #ifdef CONFIG_PM_SLEEP 115 #ifdef CONFIG_PM_SLEEP
116 /* 116 /*
117 * The following functions are used by the suspend/hibernate code to temporarily 117 * The following functions are used by the suspend/hibernate code to temporarily
118 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 118 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
119 * while devices are suspended. To avoid races with the suspend/hibernate code, 119 * while devices are suspended. To avoid races with the suspend/hibernate code,
120 * they should always be called with pm_mutex held (gfp_allowed_mask also should 120 * they should always be called with pm_mutex held (gfp_allowed_mask also should
121 * only be modified with pm_mutex held, unless the suspend/hibernate code is 121 * only be modified with pm_mutex held, unless the suspend/hibernate code is
122 * guaranteed not to run in parallel with that modification). 122 * guaranteed not to run in parallel with that modification).
123 */ 123 */
124 124
125 static gfp_t saved_gfp_mask; 125 static gfp_t saved_gfp_mask;
126 126
127 void pm_restore_gfp_mask(void) 127 void pm_restore_gfp_mask(void)
128 { 128 {
129 WARN_ON(!mutex_is_locked(&pm_mutex)); 129 WARN_ON(!mutex_is_locked(&pm_mutex));
130 if (saved_gfp_mask) { 130 if (saved_gfp_mask) {
131 gfp_allowed_mask = saved_gfp_mask; 131 gfp_allowed_mask = saved_gfp_mask;
132 saved_gfp_mask = 0; 132 saved_gfp_mask = 0;
133 } 133 }
134 } 134 }
135 135
136 void pm_restrict_gfp_mask(void) 136 void pm_restrict_gfp_mask(void)
137 { 137 {
138 WARN_ON(!mutex_is_locked(&pm_mutex)); 138 WARN_ON(!mutex_is_locked(&pm_mutex));
139 WARN_ON(saved_gfp_mask); 139 WARN_ON(saved_gfp_mask);
140 saved_gfp_mask = gfp_allowed_mask; 140 saved_gfp_mask = gfp_allowed_mask;
141 gfp_allowed_mask &= ~GFP_IOFS; 141 gfp_allowed_mask &= ~GFP_IOFS;
142 } 142 }
143 143
144 bool pm_suspended_storage(void) 144 bool pm_suspended_storage(void)
145 { 145 {
146 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 146 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
147 return false; 147 return false;
148 return true; 148 return true;
149 } 149 }
150 #endif /* CONFIG_PM_SLEEP */ 150 #endif /* CONFIG_PM_SLEEP */
151 151
152 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 152 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
153 int pageblock_order __read_mostly; 153 int pageblock_order __read_mostly;
154 #endif 154 #endif
155 155
156 static void __free_pages_ok(struct page *page, unsigned int order); 156 static void __free_pages_ok(struct page *page, unsigned int order);
157 157
158 /* 158 /*
159 * results with 256, 32 in the lowmem_reserve sysctl: 159 * results with 256, 32 in the lowmem_reserve sysctl:
160 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 160 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
161 * 1G machine -> (16M dma, 784M normal, 224M high) 161 * 1G machine -> (16M dma, 784M normal, 224M high)
162 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 162 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
163 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 163 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
164 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 164 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
165 * 165 *
166 * TBD: should special case ZONE_DMA32 machines here - in those we normally 166 * TBD: should special case ZONE_DMA32 machines here - in those we normally
167 * don't need any ZONE_NORMAL reservation 167 * don't need any ZONE_NORMAL reservation
168 */ 168 */
169 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 169 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
170 #ifdef CONFIG_ZONE_DMA 170 #ifdef CONFIG_ZONE_DMA
171 256, 171 256,
172 #endif 172 #endif
173 #ifdef CONFIG_ZONE_DMA32 173 #ifdef CONFIG_ZONE_DMA32
174 256, 174 256,
175 #endif 175 #endif
176 #ifdef CONFIG_HIGHMEM 176 #ifdef CONFIG_HIGHMEM
177 32, 177 32,
178 #endif 178 #endif
179 32, 179 32,
180 }; 180 };
181 181
182 EXPORT_SYMBOL(totalram_pages); 182 EXPORT_SYMBOL(totalram_pages);
183 183
184 static char * const zone_names[MAX_NR_ZONES] = { 184 static char * const zone_names[MAX_NR_ZONES] = {
185 #ifdef CONFIG_ZONE_DMA 185 #ifdef CONFIG_ZONE_DMA
186 "DMA", 186 "DMA",
187 #endif 187 #endif
188 #ifdef CONFIG_ZONE_DMA32 188 #ifdef CONFIG_ZONE_DMA32
189 "DMA32", 189 "DMA32",
190 #endif 190 #endif
191 "Normal", 191 "Normal",
192 #ifdef CONFIG_HIGHMEM 192 #ifdef CONFIG_HIGHMEM
193 "HighMem", 193 "HighMem",
194 #endif 194 #endif
195 "Movable", 195 "Movable",
196 }; 196 };
197 197
198 int min_free_kbytes = 1024; 198 int min_free_kbytes = 1024;
199 199
200 static unsigned long __meminitdata nr_kernel_pages; 200 static unsigned long __meminitdata nr_kernel_pages;
201 static unsigned long __meminitdata nr_all_pages; 201 static unsigned long __meminitdata nr_all_pages;
202 static unsigned long __meminitdata dma_reserve; 202 static unsigned long __meminitdata dma_reserve;
203 203
204 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 204 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
205 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 205 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
206 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 206 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
207 static unsigned long __initdata required_kernelcore; 207 static unsigned long __initdata required_kernelcore;
208 static unsigned long __initdata required_movablecore; 208 static unsigned long __initdata required_movablecore;
209 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 209 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
210 210
211 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 211 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
212 int movable_zone; 212 int movable_zone;
213 EXPORT_SYMBOL(movable_zone); 213 EXPORT_SYMBOL(movable_zone);
214 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 214 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
215 215
216 #if MAX_NUMNODES > 1 216 #if MAX_NUMNODES > 1
217 int nr_node_ids __read_mostly = MAX_NUMNODES; 217 int nr_node_ids __read_mostly = MAX_NUMNODES;
218 int nr_online_nodes __read_mostly = 1; 218 int nr_online_nodes __read_mostly = 1;
219 EXPORT_SYMBOL(nr_node_ids); 219 EXPORT_SYMBOL(nr_node_ids);
220 EXPORT_SYMBOL(nr_online_nodes); 220 EXPORT_SYMBOL(nr_online_nodes);
221 #endif 221 #endif
222 222
223 int page_group_by_mobility_disabled __read_mostly; 223 int page_group_by_mobility_disabled __read_mostly;
224 224
225 void set_pageblock_migratetype(struct page *page, int migratetype) 225 void set_pageblock_migratetype(struct page *page, int migratetype)
226 { 226 {
227 227
228 if (unlikely(page_group_by_mobility_disabled)) 228 if (unlikely(page_group_by_mobility_disabled))
229 migratetype = MIGRATE_UNMOVABLE; 229 migratetype = MIGRATE_UNMOVABLE;
230 230
231 set_pageblock_flags_group(page, (unsigned long)migratetype, 231 set_pageblock_flags_group(page, (unsigned long)migratetype,
232 PB_migrate, PB_migrate_end); 232 PB_migrate, PB_migrate_end);
233 } 233 }
234 234
235 bool oom_killer_disabled __read_mostly; 235 bool oom_killer_disabled __read_mostly;
236 236
237 #ifdef CONFIG_DEBUG_VM 237 #ifdef CONFIG_DEBUG_VM
238 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 238 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
239 { 239 {
240 int ret = 0; 240 int ret = 0;
241 unsigned seq; 241 unsigned seq;
242 unsigned long pfn = page_to_pfn(page); 242 unsigned long pfn = page_to_pfn(page);
243 243
244 do { 244 do {
245 seq = zone_span_seqbegin(zone); 245 seq = zone_span_seqbegin(zone);
246 if (pfn >= zone->zone_start_pfn + zone->spanned_pages) 246 if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
247 ret = 1; 247 ret = 1;
248 else if (pfn < zone->zone_start_pfn) 248 else if (pfn < zone->zone_start_pfn)
249 ret = 1; 249 ret = 1;
250 } while (zone_span_seqretry(zone, seq)); 250 } while (zone_span_seqretry(zone, seq));
251 251
252 return ret; 252 return ret;
253 } 253 }
254 254
255 static int page_is_consistent(struct zone *zone, struct page *page) 255 static int page_is_consistent(struct zone *zone, struct page *page)
256 { 256 {
257 if (!pfn_valid_within(page_to_pfn(page))) 257 if (!pfn_valid_within(page_to_pfn(page)))
258 return 0; 258 return 0;
259 if (zone != page_zone(page)) 259 if (zone != page_zone(page))
260 return 0; 260 return 0;
261 261
262 return 1; 262 return 1;
263 } 263 }
264 /* 264 /*
265 * Temporary debugging check for pages not lying within a given zone. 265 * Temporary debugging check for pages not lying within a given zone.
266 */ 266 */
267 static int bad_range(struct zone *zone, struct page *page) 267 static int bad_range(struct zone *zone, struct page *page)
268 { 268 {
269 if (page_outside_zone_boundaries(zone, page)) 269 if (page_outside_zone_boundaries(zone, page))
270 return 1; 270 return 1;
271 if (!page_is_consistent(zone, page)) 271 if (!page_is_consistent(zone, page))
272 return 1; 272 return 1;
273 273
274 return 0; 274 return 0;
275 } 275 }
276 #else 276 #else
277 static inline int bad_range(struct zone *zone, struct page *page) 277 static inline int bad_range(struct zone *zone, struct page *page)
278 { 278 {
279 return 0; 279 return 0;
280 } 280 }
281 #endif 281 #endif
282 282
283 static void bad_page(struct page *page) 283 static void bad_page(struct page *page)
284 { 284 {
285 static unsigned long resume; 285 static unsigned long resume;
286 static unsigned long nr_shown; 286 static unsigned long nr_shown;
287 static unsigned long nr_unshown; 287 static unsigned long nr_unshown;
288 288
289 /* Don't complain about poisoned pages */ 289 /* Don't complain about poisoned pages */
290 if (PageHWPoison(page)) { 290 if (PageHWPoison(page)) {
291 reset_page_mapcount(page); /* remove PageBuddy */ 291 reset_page_mapcount(page); /* remove PageBuddy */
292 return; 292 return;
293 } 293 }
294 294
295 /* 295 /*
296 * Allow a burst of 60 reports, then keep quiet for that minute; 296 * Allow a burst of 60 reports, then keep quiet for that minute;
297 * or allow a steady drip of one report per second. 297 * or allow a steady drip of one report per second.
298 */ 298 */
299 if (nr_shown == 60) { 299 if (nr_shown == 60) {
300 if (time_before(jiffies, resume)) { 300 if (time_before(jiffies, resume)) {
301 nr_unshown++; 301 nr_unshown++;
302 goto out; 302 goto out;
303 } 303 }
304 if (nr_unshown) { 304 if (nr_unshown) {
305 printk(KERN_ALERT 305 printk(KERN_ALERT
306 "BUG: Bad page state: %lu messages suppressed\n", 306 "BUG: Bad page state: %lu messages suppressed\n",
307 nr_unshown); 307 nr_unshown);
308 nr_unshown = 0; 308 nr_unshown = 0;
309 } 309 }
310 nr_shown = 0; 310 nr_shown = 0;
311 } 311 }
312 if (nr_shown++ == 0) 312 if (nr_shown++ == 0)
313 resume = jiffies + 60 * HZ; 313 resume = jiffies + 60 * HZ;
314 314
315 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 315 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
316 current->comm, page_to_pfn(page)); 316 current->comm, page_to_pfn(page));
317 dump_page(page); 317 dump_page(page);
318 318
319 print_modules(); 319 print_modules();
320 dump_stack(); 320 dump_stack();
321 out: 321 out:
322 /* Leave bad fields for debug, except PageBuddy could make trouble */ 322 /* Leave bad fields for debug, except PageBuddy could make trouble */
323 reset_page_mapcount(page); /* remove PageBuddy */ 323 reset_page_mapcount(page); /* remove PageBuddy */
324 add_taint(TAINT_BAD_PAGE); 324 add_taint(TAINT_BAD_PAGE);
325 } 325 }
326 326
327 /* 327 /*
328 * Higher-order pages are called "compound pages". They are structured thusly: 328 * Higher-order pages are called "compound pages". They are structured thusly:
329 * 329 *
330 * The first PAGE_SIZE page is called the "head page". 330 * The first PAGE_SIZE page is called the "head page".
331 * 331 *
332 * The remaining PAGE_SIZE pages are called "tail pages". 332 * The remaining PAGE_SIZE pages are called "tail pages".
333 * 333 *
334 * All pages have PG_compound set. All tail pages have their ->first_page 334 * All pages have PG_compound set. All tail pages have their ->first_page
335 * pointing at the head page. 335 * pointing at the head page.
336 * 336 *
337 * The first tail page's ->lru.next holds the address of the compound page's 337 * The first tail page's ->lru.next holds the address of the compound page's
338 * put_page() function. Its ->lru.prev holds the order of allocation. 338 * put_page() function. Its ->lru.prev holds the order of allocation.
339 * This usage means that zero-order pages may not be compound. 339 * This usage means that zero-order pages may not be compound.
340 */ 340 */
341 341
342 static void free_compound_page(struct page *page) 342 static void free_compound_page(struct page *page)
343 { 343 {
344 __free_pages_ok(page, compound_order(page)); 344 __free_pages_ok(page, compound_order(page));
345 } 345 }
346 346
347 void prep_compound_page(struct page *page, unsigned long order) 347 void prep_compound_page(struct page *page, unsigned long order)
348 { 348 {
349 int i; 349 int i;
350 int nr_pages = 1 << order; 350 int nr_pages = 1 << order;
351 351
352 set_compound_page_dtor(page, free_compound_page); 352 set_compound_page_dtor(page, free_compound_page);
353 set_compound_order(page, order); 353 set_compound_order(page, order);
354 __SetPageHead(page); 354 __SetPageHead(page);
355 for (i = 1; i < nr_pages; i++) { 355 for (i = 1; i < nr_pages; i++) {
356 struct page *p = page + i; 356 struct page *p = page + i;
357 __SetPageTail(p); 357 __SetPageTail(p);
358 set_page_count(p, 0); 358 set_page_count(p, 0);
359 p->first_page = page; 359 p->first_page = page;
360 } 360 }
361 } 361 }
362 362
363 /* update __split_huge_page_refcount if you change this function */ 363 /* update __split_huge_page_refcount if you change this function */
364 static int destroy_compound_page(struct page *page, unsigned long order) 364 static int destroy_compound_page(struct page *page, unsigned long order)
365 { 365 {
366 int i; 366 int i;
367 int nr_pages = 1 << order; 367 int nr_pages = 1 << order;
368 int bad = 0; 368 int bad = 0;
369 369
370 if (unlikely(compound_order(page) != order)) { 370 if (unlikely(compound_order(page) != order)) {
371 bad_page(page); 371 bad_page(page);
372 bad++; 372 bad++;
373 } 373 }
374 374
375 __ClearPageHead(page); 375 __ClearPageHead(page);
376 376
377 for (i = 1; i < nr_pages; i++) { 377 for (i = 1; i < nr_pages; i++) {
378 struct page *p = page + i; 378 struct page *p = page + i;
379 379
380 if (unlikely(!PageTail(p) || (p->first_page != page))) { 380 if (unlikely(!PageTail(p) || (p->first_page != page))) {
381 bad_page(page); 381 bad_page(page);
382 bad++; 382 bad++;
383 } 383 }
384 __ClearPageTail(p); 384 __ClearPageTail(p);
385 } 385 }
386 386
387 return bad; 387 return bad;
388 } 388 }
389 389
390 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 390 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
391 { 391 {
392 int i; 392 int i;
393 393
394 /* 394 /*
395 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 395 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
396 * and __GFP_HIGHMEM from hard or soft interrupt context. 396 * and __GFP_HIGHMEM from hard or soft interrupt context.
397 */ 397 */
398 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 398 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
399 for (i = 0; i < (1 << order); i++) 399 for (i = 0; i < (1 << order); i++)
400 clear_highpage(page + i); 400 clear_highpage(page + i);
401 } 401 }
402 402
403 #ifdef CONFIG_DEBUG_PAGEALLOC 403 #ifdef CONFIG_DEBUG_PAGEALLOC
404 unsigned int _debug_guardpage_minorder; 404 unsigned int _debug_guardpage_minorder;
405 405
406 static int __init debug_guardpage_minorder_setup(char *buf) 406 static int __init debug_guardpage_minorder_setup(char *buf)
407 { 407 {
408 unsigned long res; 408 unsigned long res;
409 409
410 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 410 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
411 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 411 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
412 return 0; 412 return 0;
413 } 413 }
414 _debug_guardpage_minorder = res; 414 _debug_guardpage_minorder = res;
415 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 415 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
416 return 0; 416 return 0;
417 } 417 }
418 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 418 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
419 419
420 static inline void set_page_guard_flag(struct page *page) 420 static inline void set_page_guard_flag(struct page *page)
421 { 421 {
422 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 422 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
423 } 423 }
424 424
425 static inline void clear_page_guard_flag(struct page *page) 425 static inline void clear_page_guard_flag(struct page *page)
426 { 426 {
427 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 427 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
428 } 428 }
429 #else 429 #else
430 static inline void set_page_guard_flag(struct page *page) { } 430 static inline void set_page_guard_flag(struct page *page) { }
431 static inline void clear_page_guard_flag(struct page *page) { } 431 static inline void clear_page_guard_flag(struct page *page) { }
432 #endif 432 #endif
433 433
434 static inline void set_page_order(struct page *page, int order) 434 static inline void set_page_order(struct page *page, int order)
435 { 435 {
436 set_page_private(page, order); 436 set_page_private(page, order);
437 __SetPageBuddy(page); 437 __SetPageBuddy(page);
438 } 438 }
439 439
440 static inline void rmv_page_order(struct page *page) 440 static inline void rmv_page_order(struct page *page)
441 { 441 {
442 __ClearPageBuddy(page); 442 __ClearPageBuddy(page);
443 set_page_private(page, 0); 443 set_page_private(page, 0);
444 } 444 }
445 445
446 /* 446 /*
447 * Locate the struct page for both the matching buddy in our 447 * Locate the struct page for both the matching buddy in our
448 * pair (buddy1) and the combined O(n+1) page they form (page). 448 * pair (buddy1) and the combined O(n+1) page they form (page).
449 * 449 *
450 * 1) Any buddy B1 will have an order O twin B2 which satisfies 450 * 1) Any buddy B1 will have an order O twin B2 which satisfies
451 * the following equation: 451 * the following equation:
452 * B2 = B1 ^ (1 << O) 452 * B2 = B1 ^ (1 << O)
453 * For example, if the starting buddy (buddy2) is #8 its order 453 * For example, if the starting buddy (buddy2) is #8 its order
454 * 1 buddy is #10: 454 * 1 buddy is #10:
455 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 455 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
456 * 456 *
457 * 2) Any buddy B will have an order O+1 parent P which 457 * 2) Any buddy B will have an order O+1 parent P which
458 * satisfies the following equation: 458 * satisfies the following equation:
459 * P = B & ~(1 << O) 459 * P = B & ~(1 << O)
460 * 460 *
461 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 461 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
462 */ 462 */
463 static inline unsigned long 463 static inline unsigned long
464 __find_buddy_index(unsigned long page_idx, unsigned int order) 464 __find_buddy_index(unsigned long page_idx, unsigned int order)
465 { 465 {
466 return page_idx ^ (1 << order); 466 return page_idx ^ (1 << order);
467 } 467 }
468 468
469 /* 469 /*
470 * This function checks whether a page is free && is the buddy 470 * This function checks whether a page is free && is the buddy
471 * we can do coalesce a page and its buddy if 471 * we can do coalesce a page and its buddy if
472 * (a) the buddy is not in a hole && 472 * (a) the buddy is not in a hole &&
473 * (b) the buddy is in the buddy system && 473 * (b) the buddy is in the buddy system &&
474 * (c) a page and its buddy have the same order && 474 * (c) a page and its buddy have the same order &&
475 * (d) a page and its buddy are in the same zone. 475 * (d) a page and its buddy are in the same zone.
476 * 476 *
477 * For recording whether a page is in the buddy system, we set ->_mapcount -2. 477 * For recording whether a page is in the buddy system, we set ->_mapcount -2.
478 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. 478 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
479 * 479 *
480 * For recording page's order, we use page_private(page). 480 * For recording page's order, we use page_private(page).
481 */ 481 */
482 static inline int page_is_buddy(struct page *page, struct page *buddy, 482 static inline int page_is_buddy(struct page *page, struct page *buddy,
483 int order) 483 int order)
484 { 484 {
485 if (!pfn_valid_within(page_to_pfn(buddy))) 485 if (!pfn_valid_within(page_to_pfn(buddy)))
486 return 0; 486 return 0;
487 487
488 if (page_zone_id(page) != page_zone_id(buddy)) 488 if (page_zone_id(page) != page_zone_id(buddy))
489 return 0; 489 return 0;
490 490
491 if (page_is_guard(buddy) && page_order(buddy) == order) { 491 if (page_is_guard(buddy) && page_order(buddy) == order) {
492 VM_BUG_ON(page_count(buddy) != 0); 492 VM_BUG_ON(page_count(buddy) != 0);
493 return 1; 493 return 1;
494 } 494 }
495 495
496 if (PageBuddy(buddy) && page_order(buddy) == order) { 496 if (PageBuddy(buddy) && page_order(buddy) == order) {
497 VM_BUG_ON(page_count(buddy) != 0); 497 VM_BUG_ON(page_count(buddy) != 0);
498 return 1; 498 return 1;
499 } 499 }
500 return 0; 500 return 0;
501 } 501 }
502 502
503 /* 503 /*
504 * Freeing function for a buddy system allocator. 504 * Freeing function for a buddy system allocator.
505 * 505 *
506 * The concept of a buddy system is to maintain direct-mapped table 506 * The concept of a buddy system is to maintain direct-mapped table
507 * (containing bit values) for memory blocks of various "orders". 507 * (containing bit values) for memory blocks of various "orders".
508 * The bottom level table contains the map for the smallest allocatable 508 * The bottom level table contains the map for the smallest allocatable
509 * units of memory (here, pages), and each level above it describes 509 * units of memory (here, pages), and each level above it describes
510 * pairs of units from the levels below, hence, "buddies". 510 * pairs of units from the levels below, hence, "buddies".
511 * At a high level, all that happens here is marking the table entry 511 * At a high level, all that happens here is marking the table entry
512 * at the bottom level available, and propagating the changes upward 512 * at the bottom level available, and propagating the changes upward
513 * as necessary, plus some accounting needed to play nicely with other 513 * as necessary, plus some accounting needed to play nicely with other
514 * parts of the VM system. 514 * parts of the VM system.
515 * At each level, we keep a list of pages, which are heads of continuous 515 * At each level, we keep a list of pages, which are heads of continuous
516 * free pages of length of (1 << order) and marked with _mapcount -2. Page's 516 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
517 * order is recorded in page_private(page) field. 517 * order is recorded in page_private(page) field.
518 * So when we are allocating or freeing one, we can derive the state of the 518 * So when we are allocating or freeing one, we can derive the state of the
519 * other. That is, if we allocate a small block, and both were 519 * other. That is, if we allocate a small block, and both were
520 * free, the remainder of the region must be split into blocks. 520 * free, the remainder of the region must be split into blocks.
521 * If a block is freed, and its buddy is also free, then this 521 * If a block is freed, and its buddy is also free, then this
522 * triggers coalescing into a block of larger size. 522 * triggers coalescing into a block of larger size.
523 * 523 *
524 * -- nyc 524 * -- nyc
525 */ 525 */
526 526
527 static inline void __free_one_page(struct page *page, 527 static inline void __free_one_page(struct page *page,
528 struct zone *zone, unsigned int order, 528 struct zone *zone, unsigned int order,
529 int migratetype) 529 int migratetype)
530 { 530 {
531 unsigned long page_idx; 531 unsigned long page_idx;
532 unsigned long combined_idx; 532 unsigned long combined_idx;
533 unsigned long uninitialized_var(buddy_idx); 533 unsigned long uninitialized_var(buddy_idx);
534 struct page *buddy; 534 struct page *buddy;
535 535
536 if (unlikely(PageCompound(page))) 536 if (unlikely(PageCompound(page)))
537 if (unlikely(destroy_compound_page(page, order))) 537 if (unlikely(destroy_compound_page(page, order)))
538 return; 538 return;
539 539
540 VM_BUG_ON(migratetype == -1); 540 VM_BUG_ON(migratetype == -1);
541 541
542 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 542 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
543 543
544 VM_BUG_ON(page_idx & ((1 << order) - 1)); 544 VM_BUG_ON(page_idx & ((1 << order) - 1));
545 VM_BUG_ON(bad_range(zone, page)); 545 VM_BUG_ON(bad_range(zone, page));
546 546
547 while (order < MAX_ORDER-1) { 547 while (order < MAX_ORDER-1) {
548 buddy_idx = __find_buddy_index(page_idx, order); 548 buddy_idx = __find_buddy_index(page_idx, order);
549 buddy = page + (buddy_idx - page_idx); 549 buddy = page + (buddy_idx - page_idx);
550 if (!page_is_buddy(page, buddy, order)) 550 if (!page_is_buddy(page, buddy, order))
551 break; 551 break;
552 /* 552 /*
553 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 553 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
554 * merge with it and move up one order. 554 * merge with it and move up one order.
555 */ 555 */
556 if (page_is_guard(buddy)) { 556 if (page_is_guard(buddy)) {
557 clear_page_guard_flag(buddy); 557 clear_page_guard_flag(buddy);
558 set_page_private(page, 0); 558 set_page_private(page, 0);
559 __mod_zone_freepage_state(zone, 1 << order, 559 __mod_zone_freepage_state(zone, 1 << order,
560 migratetype); 560 migratetype);
561 } else { 561 } else {
562 list_del(&buddy->lru); 562 list_del(&buddy->lru);
563 zone->free_area[order].nr_free--; 563 zone->free_area[order].nr_free--;
564 rmv_page_order(buddy); 564 rmv_page_order(buddy);
565 } 565 }
566 combined_idx = buddy_idx & page_idx; 566 combined_idx = buddy_idx & page_idx;
567 page = page + (combined_idx - page_idx); 567 page = page + (combined_idx - page_idx);
568 page_idx = combined_idx; 568 page_idx = combined_idx;
569 order++; 569 order++;
570 } 570 }
571 set_page_order(page, order); 571 set_page_order(page, order);
572 572
573 /* 573 /*
574 * If this is not the largest possible page, check if the buddy 574 * If this is not the largest possible page, check if the buddy
575 * of the next-highest order is free. If it is, it's possible 575 * of the next-highest order is free. If it is, it's possible
576 * that pages are being freed that will coalesce soon. In case, 576 * that pages are being freed that will coalesce soon. In case,
577 * that is happening, add the free page to the tail of the list 577 * that is happening, add the free page to the tail of the list
578 * so it's less likely to be used soon and more likely to be merged 578 * so it's less likely to be used soon and more likely to be merged
579 * as a higher order page 579 * as a higher order page
580 */ 580 */
581 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 581 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
582 struct page *higher_page, *higher_buddy; 582 struct page *higher_page, *higher_buddy;
583 combined_idx = buddy_idx & page_idx; 583 combined_idx = buddy_idx & page_idx;
584 higher_page = page + (combined_idx - page_idx); 584 higher_page = page + (combined_idx - page_idx);
585 buddy_idx = __find_buddy_index(combined_idx, order + 1); 585 buddy_idx = __find_buddy_index(combined_idx, order + 1);
586 higher_buddy = higher_page + (buddy_idx - combined_idx); 586 higher_buddy = higher_page + (buddy_idx - combined_idx);
587 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 587 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
588 list_add_tail(&page->lru, 588 list_add_tail(&page->lru,
589 &zone->free_area[order].free_list[migratetype]); 589 &zone->free_area[order].free_list[migratetype]);
590 goto out; 590 goto out;
591 } 591 }
592 } 592 }
593 593
594 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 594 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
595 out: 595 out:
596 zone->free_area[order].nr_free++; 596 zone->free_area[order].nr_free++;
597 } 597 }
598 598
599 static inline int free_pages_check(struct page *page) 599 static inline int free_pages_check(struct page *page)
600 { 600 {
601 if (unlikely(page_mapcount(page) | 601 if (unlikely(page_mapcount(page) |
602 (page->mapping != NULL) | 602 (page->mapping != NULL) |
603 (atomic_read(&page->_count) != 0) | 603 (atomic_read(&page->_count) != 0) |
604 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 604 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
605 (mem_cgroup_bad_page_check(page)))) { 605 (mem_cgroup_bad_page_check(page)))) {
606 bad_page(page); 606 bad_page(page);
607 return 1; 607 return 1;
608 } 608 }
609 reset_page_last_nid(page); 609 reset_page_last_nid(page);
610 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 610 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
611 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 611 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
612 return 0; 612 return 0;
613 } 613 }
614 614
615 /* 615 /*
616 * Frees a number of pages from the PCP lists 616 * Frees a number of pages from the PCP lists
617 * Assumes all pages on list are in same zone, and of same order. 617 * Assumes all pages on list are in same zone, and of same order.
618 * count is the number of pages to free. 618 * count is the number of pages to free.
619 * 619 *
620 * If the zone was previously in an "all pages pinned" state then look to 620 * If the zone was previously in an "all pages pinned" state then look to
621 * see if this freeing clears that state. 621 * see if this freeing clears that state.
622 * 622 *
623 * And clear the zone's pages_scanned counter, to hold off the "all pages are 623 * And clear the zone's pages_scanned counter, to hold off the "all pages are
624 * pinned" detection logic. 624 * pinned" detection logic.
625 */ 625 */
626 static void free_pcppages_bulk(struct zone *zone, int count, 626 static void free_pcppages_bulk(struct zone *zone, int count,
627 struct per_cpu_pages *pcp) 627 struct per_cpu_pages *pcp)
628 { 628 {
629 int migratetype = 0; 629 int migratetype = 0;
630 int batch_free = 0; 630 int batch_free = 0;
631 int to_free = count; 631 int to_free = count;
632 632
633 spin_lock(&zone->lock); 633 spin_lock(&zone->lock);
634 zone->all_unreclaimable = 0; 634 zone->all_unreclaimable = 0;
635 zone->pages_scanned = 0; 635 zone->pages_scanned = 0;
636 636
637 while (to_free) { 637 while (to_free) {
638 struct page *page; 638 struct page *page;
639 struct list_head *list; 639 struct list_head *list;
640 640
641 /* 641 /*
642 * Remove pages from lists in a round-robin fashion. A 642 * Remove pages from lists in a round-robin fashion. A
643 * batch_free count is maintained that is incremented when an 643 * batch_free count is maintained that is incremented when an
644 * empty list is encountered. This is so more pages are freed 644 * empty list is encountered. This is so more pages are freed
645 * off fuller lists instead of spinning excessively around empty 645 * off fuller lists instead of spinning excessively around empty
646 * lists 646 * lists
647 */ 647 */
648 do { 648 do {
649 batch_free++; 649 batch_free++;
650 if (++migratetype == MIGRATE_PCPTYPES) 650 if (++migratetype == MIGRATE_PCPTYPES)
651 migratetype = 0; 651 migratetype = 0;
652 list = &pcp->lists[migratetype]; 652 list = &pcp->lists[migratetype];
653 } while (list_empty(list)); 653 } while (list_empty(list));
654 654
655 /* This is the only non-empty list. Free them all. */ 655 /* This is the only non-empty list. Free them all. */
656 if (batch_free == MIGRATE_PCPTYPES) 656 if (batch_free == MIGRATE_PCPTYPES)
657 batch_free = to_free; 657 batch_free = to_free;
658 658
659 do { 659 do {
660 int mt; /* migratetype of the to-be-freed page */ 660 int mt; /* migratetype of the to-be-freed page */
661 661
662 page = list_entry(list->prev, struct page, lru); 662 page = list_entry(list->prev, struct page, lru);
663 /* must delete as __free_one_page list manipulates */ 663 /* must delete as __free_one_page list manipulates */
664 list_del(&page->lru); 664 list_del(&page->lru);
665 mt = get_freepage_migratetype(page); 665 mt = get_freepage_migratetype(page);
666 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 666 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
667 __free_one_page(page, zone, 0, mt); 667 __free_one_page(page, zone, 0, mt);
668 trace_mm_page_pcpu_drain(page, 0, mt); 668 trace_mm_page_pcpu_drain(page, 0, mt);
669 if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { 669 if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
670 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 670 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
671 if (is_migrate_cma(mt)) 671 if (is_migrate_cma(mt))
672 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 672 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
673 } 673 }
674 } while (--to_free && --batch_free && !list_empty(list)); 674 } while (--to_free && --batch_free && !list_empty(list));
675 } 675 }
676 spin_unlock(&zone->lock); 676 spin_unlock(&zone->lock);
677 } 677 }
678 678
679 static void free_one_page(struct zone *zone, struct page *page, int order, 679 static void free_one_page(struct zone *zone, struct page *page, int order,
680 int migratetype) 680 int migratetype)
681 { 681 {
682 spin_lock(&zone->lock); 682 spin_lock(&zone->lock);
683 zone->all_unreclaimable = 0; 683 zone->all_unreclaimable = 0;
684 zone->pages_scanned = 0; 684 zone->pages_scanned = 0;
685 685
686 __free_one_page(page, zone, order, migratetype); 686 __free_one_page(page, zone, order, migratetype);
687 if (unlikely(migratetype != MIGRATE_ISOLATE)) 687 if (unlikely(migratetype != MIGRATE_ISOLATE))
688 __mod_zone_freepage_state(zone, 1 << order, migratetype); 688 __mod_zone_freepage_state(zone, 1 << order, migratetype);
689 spin_unlock(&zone->lock); 689 spin_unlock(&zone->lock);
690 } 690 }
691 691
692 static bool free_pages_prepare(struct page *page, unsigned int order) 692 static bool free_pages_prepare(struct page *page, unsigned int order)
693 { 693 {
694 int i; 694 int i;
695 int bad = 0; 695 int bad = 0;
696 696
697 trace_mm_page_free(page, order); 697 trace_mm_page_free(page, order);
698 kmemcheck_free_shadow(page, order); 698 kmemcheck_free_shadow(page, order);
699 699
700 if (PageAnon(page)) 700 if (PageAnon(page))
701 page->mapping = NULL; 701 page->mapping = NULL;
702 for (i = 0; i < (1 << order); i++) 702 for (i = 0; i < (1 << order); i++)
703 bad += free_pages_check(page + i); 703 bad += free_pages_check(page + i);
704 if (bad) 704 if (bad)
705 return false; 705 return false;
706 706
707 if (!PageHighMem(page)) { 707 if (!PageHighMem(page)) {
708 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 708 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
709 debug_check_no_obj_freed(page_address(page), 709 debug_check_no_obj_freed(page_address(page),
710 PAGE_SIZE << order); 710 PAGE_SIZE << order);
711 } 711 }
712 arch_free_page(page, order); 712 arch_free_page(page, order);
713 kernel_map_pages(page, 1 << order, 0); 713 kernel_map_pages(page, 1 << order, 0);
714 714
715 return true; 715 return true;
716 } 716 }
717 717
718 static void __free_pages_ok(struct page *page, unsigned int order) 718 static void __free_pages_ok(struct page *page, unsigned int order)
719 { 719 {
720 unsigned long flags; 720 unsigned long flags;
721 int migratetype; 721 int migratetype;
722 722
723 if (!free_pages_prepare(page, order)) 723 if (!free_pages_prepare(page, order))
724 return; 724 return;
725 725
726 local_irq_save(flags); 726 local_irq_save(flags);
727 __count_vm_events(PGFREE, 1 << order); 727 __count_vm_events(PGFREE, 1 << order);
728 migratetype = get_pageblock_migratetype(page); 728 migratetype = get_pageblock_migratetype(page);
729 set_freepage_migratetype(page, migratetype); 729 set_freepage_migratetype(page, migratetype);
730 free_one_page(page_zone(page), page, order, migratetype); 730 free_one_page(page_zone(page), page, order, migratetype);
731 local_irq_restore(flags); 731 local_irq_restore(flags);
732 } 732 }
733 733
734 /* 734 /*
735 * Read access to zone->managed_pages is safe because it's unsigned long, 735 * Read access to zone->managed_pages is safe because it's unsigned long,
736 * but we still need to serialize writers. Currently all callers of 736 * but we still need to serialize writers. Currently all callers of
737 * __free_pages_bootmem() except put_page_bootmem() should only be used 737 * __free_pages_bootmem() except put_page_bootmem() should only be used
738 * at boot time. So for shorter boot time, we shift the burden to 738 * at boot time. So for shorter boot time, we shift the burden to
739 * put_page_bootmem() to serialize writers. 739 * put_page_bootmem() to serialize writers.
740 */ 740 */
741 void __meminit __free_pages_bootmem(struct page *page, unsigned int order) 741 void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
742 { 742 {
743 unsigned int nr_pages = 1 << order; 743 unsigned int nr_pages = 1 << order;
744 unsigned int loop; 744 unsigned int loop;
745 745
746 prefetchw(page); 746 prefetchw(page);
747 for (loop = 0; loop < nr_pages; loop++) { 747 for (loop = 0; loop < nr_pages; loop++) {
748 struct page *p = &page[loop]; 748 struct page *p = &page[loop];
749 749
750 if (loop + 1 < nr_pages) 750 if (loop + 1 < nr_pages)
751 prefetchw(p + 1); 751 prefetchw(p + 1);
752 __ClearPageReserved(p); 752 __ClearPageReserved(p);
753 set_page_count(p, 0); 753 set_page_count(p, 0);
754 } 754 }
755 755
756 page_zone(page)->managed_pages += 1 << order; 756 page_zone(page)->managed_pages += 1 << order;
757 set_page_refcounted(page); 757 set_page_refcounted(page);
758 __free_pages(page, order); 758 __free_pages(page, order);
759 } 759 }
760 760
761 #ifdef CONFIG_CMA 761 #ifdef CONFIG_CMA
762 /* Free whole pageblock and set it's migration type to MIGRATE_CMA. */ 762 /* Free whole pageblock and set it's migration type to MIGRATE_CMA. */
763 void __init init_cma_reserved_pageblock(struct page *page) 763 void __init init_cma_reserved_pageblock(struct page *page)
764 { 764 {
765 unsigned i = pageblock_nr_pages; 765 unsigned i = pageblock_nr_pages;
766 struct page *p = page; 766 struct page *p = page;
767 767
768 do { 768 do {
769 __ClearPageReserved(p); 769 __ClearPageReserved(p);
770 set_page_count(p, 0); 770 set_page_count(p, 0);
771 } while (++p, --i); 771 } while (++p, --i);
772 772
773 set_page_refcounted(page); 773 set_page_refcounted(page);
774 set_pageblock_migratetype(page, MIGRATE_CMA); 774 set_pageblock_migratetype(page, MIGRATE_CMA);
775 __free_pages(page, pageblock_order); 775 __free_pages(page, pageblock_order);
776 totalram_pages += pageblock_nr_pages; 776 totalram_pages += pageblock_nr_pages;
777 #ifdef CONFIG_HIGHMEM 777 #ifdef CONFIG_HIGHMEM
778 if (PageHighMem(page)) 778 if (PageHighMem(page))
779 totalhigh_pages += pageblock_nr_pages; 779 totalhigh_pages += pageblock_nr_pages;
780 #endif 780 #endif
781 } 781 }
782 #endif 782 #endif
783 783
784 /* 784 /*
785 * The order of subdivision here is critical for the IO subsystem. 785 * The order of subdivision here is critical for the IO subsystem.
786 * Please do not alter this order without good reasons and regression 786 * Please do not alter this order without good reasons and regression
787 * testing. Specifically, as large blocks of memory are subdivided, 787 * testing. Specifically, as large blocks of memory are subdivided,
788 * the order in which smaller blocks are delivered depends on the order 788 * the order in which smaller blocks are delivered depends on the order
789 * they're subdivided in this function. This is the primary factor 789 * they're subdivided in this function. This is the primary factor
790 * influencing the order in which pages are delivered to the IO 790 * influencing the order in which pages are delivered to the IO
791 * subsystem according to empirical testing, and this is also justified 791 * subsystem according to empirical testing, and this is also justified
792 * by considering the behavior of a buddy system containing a single 792 * by considering the behavior of a buddy system containing a single
793 * large block of memory acted on by a series of small allocations. 793 * large block of memory acted on by a series of small allocations.
794 * This behavior is a critical factor in sglist merging's success. 794 * This behavior is a critical factor in sglist merging's success.
795 * 795 *
796 * -- nyc 796 * -- nyc
797 */ 797 */
798 static inline void expand(struct zone *zone, struct page *page, 798 static inline void expand(struct zone *zone, struct page *page,
799 int low, int high, struct free_area *area, 799 int low, int high, struct free_area *area,
800 int migratetype) 800 int migratetype)
801 { 801 {
802 unsigned long size = 1 << high; 802 unsigned long size = 1 << high;
803 803
804 while (high > low) { 804 while (high > low) {
805 area--; 805 area--;
806 high--; 806 high--;
807 size >>= 1; 807 size >>= 1;
808 VM_BUG_ON(bad_range(zone, &page[size])); 808 VM_BUG_ON(bad_range(zone, &page[size]));
809 809
810 #ifdef CONFIG_DEBUG_PAGEALLOC 810 #ifdef CONFIG_DEBUG_PAGEALLOC
811 if (high < debug_guardpage_minorder()) { 811 if (high < debug_guardpage_minorder()) {
812 /* 812 /*
813 * Mark as guard pages (or page), that will allow to 813 * Mark as guard pages (or page), that will allow to
814 * merge back to allocator when buddy will be freed. 814 * merge back to allocator when buddy will be freed.
815 * Corresponding page table entries will not be touched, 815 * Corresponding page table entries will not be touched,
816 * pages will stay not present in virtual address space 816 * pages will stay not present in virtual address space
817 */ 817 */
818 INIT_LIST_HEAD(&page[size].lru); 818 INIT_LIST_HEAD(&page[size].lru);
819 set_page_guard_flag(&page[size]); 819 set_page_guard_flag(&page[size]);
820 set_page_private(&page[size], high); 820 set_page_private(&page[size], high);
821 /* Guard pages are not available for any usage */ 821 /* Guard pages are not available for any usage */
822 __mod_zone_freepage_state(zone, -(1 << high), 822 __mod_zone_freepage_state(zone, -(1 << high),
823 migratetype); 823 migratetype);
824 continue; 824 continue;
825 } 825 }
826 #endif 826 #endif
827 list_add(&page[size].lru, &area->free_list[migratetype]); 827 list_add(&page[size].lru, &area->free_list[migratetype]);
828 area->nr_free++; 828 area->nr_free++;
829 set_page_order(&page[size], high); 829 set_page_order(&page[size], high);
830 } 830 }
831 } 831 }
832 832
833 /* 833 /*
834 * This page is about to be returned from the page allocator 834 * This page is about to be returned from the page allocator
835 */ 835 */
836 static inline int check_new_page(struct page *page) 836 static inline int check_new_page(struct page *page)
837 { 837 {
838 if (unlikely(page_mapcount(page) | 838 if (unlikely(page_mapcount(page) |
839 (page->mapping != NULL) | 839 (page->mapping != NULL) |
840 (atomic_read(&page->_count) != 0) | 840 (atomic_read(&page->_count) != 0) |
841 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 841 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
842 (mem_cgroup_bad_page_check(page)))) { 842 (mem_cgroup_bad_page_check(page)))) {
843 bad_page(page); 843 bad_page(page);
844 return 1; 844 return 1;
845 } 845 }
846 return 0; 846 return 0;
847 } 847 }
848 848
849 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 849 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
850 { 850 {
851 int i; 851 int i;
852 852
853 for (i = 0; i < (1 << order); i++) { 853 for (i = 0; i < (1 << order); i++) {
854 struct page *p = page + i; 854 struct page *p = page + i;
855 if (unlikely(check_new_page(p))) 855 if (unlikely(check_new_page(p)))
856 return 1; 856 return 1;
857 } 857 }
858 858
859 set_page_private(page, 0); 859 set_page_private(page, 0);
860 set_page_refcounted(page); 860 set_page_refcounted(page);
861 861
862 arch_alloc_page(page, order); 862 arch_alloc_page(page, order);
863 kernel_map_pages(page, 1 << order, 1); 863 kernel_map_pages(page, 1 << order, 1);
864 864
865 if (gfp_flags & __GFP_ZERO) 865 if (gfp_flags & __GFP_ZERO)
866 prep_zero_page(page, order, gfp_flags); 866 prep_zero_page(page, order, gfp_flags);
867 867
868 if (order && (gfp_flags & __GFP_COMP)) 868 if (order && (gfp_flags & __GFP_COMP))
869 prep_compound_page(page, order); 869 prep_compound_page(page, order);
870 870
871 return 0; 871 return 0;
872 } 872 }
873 873
874 /* 874 /*
875 * Go through the free lists for the given migratetype and remove 875 * Go through the free lists for the given migratetype and remove
876 * the smallest available page from the freelists 876 * the smallest available page from the freelists
877 */ 877 */
878 static inline 878 static inline
879 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 879 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
880 int migratetype) 880 int migratetype)
881 { 881 {
882 unsigned int current_order; 882 unsigned int current_order;
883 struct free_area * area; 883 struct free_area * area;
884 struct page *page; 884 struct page *page;
885 885
886 /* Find a page of the appropriate size in the preferred list */ 886 /* Find a page of the appropriate size in the preferred list */
887 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 887 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
888 area = &(zone->free_area[current_order]); 888 area = &(zone->free_area[current_order]);
889 if (list_empty(&area->free_list[migratetype])) 889 if (list_empty(&area->free_list[migratetype]))
890 continue; 890 continue;
891 891
892 page = list_entry(area->free_list[migratetype].next, 892 page = list_entry(area->free_list[migratetype].next,
893 struct page, lru); 893 struct page, lru);
894 list_del(&page->lru); 894 list_del(&page->lru);
895 rmv_page_order(page); 895 rmv_page_order(page);
896 area->nr_free--; 896 area->nr_free--;
897 expand(zone, page, order, current_order, area, migratetype); 897 expand(zone, page, order, current_order, area, migratetype);
898 return page; 898 return page;
899 } 899 }
900 900
901 return NULL; 901 return NULL;
902 } 902 }
903 903
904 904
905 /* 905 /*
906 * This array describes the order lists are fallen back to when 906 * This array describes the order lists are fallen back to when
907 * the free lists for the desirable migrate type are depleted 907 * the free lists for the desirable migrate type are depleted
908 */ 908 */
909 static int fallbacks[MIGRATE_TYPES][4] = { 909 static int fallbacks[MIGRATE_TYPES][4] = {
910 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 910 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
911 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 911 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
912 #ifdef CONFIG_CMA 912 #ifdef CONFIG_CMA
913 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 913 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
914 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 914 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
915 #else 915 #else
916 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 916 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
917 #endif 917 #endif
918 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 918 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
919 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 919 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
920 }; 920 };
921 921
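/*
 * A hedged sketch of how a fallback table like the one above is consulted:
 * walk the row for the starting type until a type with free pages is found,
 * stopping at the reserve sentinel. The enum, fallback_table[] and has_free()
 * below are illustrative only, not the kernel's definitions.
 */

#include <stdio.h>

enum mt { MT_UNMOVABLE, MT_RECLAIMABLE, MT_MOVABLE, MT_RESERVE, MT_TYPES };

static const int fallback_table[MT_TYPES][3] = {
        [MT_UNMOVABLE]   = { MT_RECLAIMABLE, MT_MOVABLE, MT_RESERVE },
        [MT_RECLAIMABLE] = { MT_UNMOVABLE, MT_MOVABLE, MT_RESERVE },
        [MT_MOVABLE]     = { MT_RECLAIMABLE, MT_UNMOVABLE, MT_RESERVE },
        [MT_RESERVE]     = { MT_RESERVE },
};

/* stub: pretend only the movable lists have pages left */
static int has_free(int mt)
{
        return mt == MT_MOVABLE;
}

static int find_fallback(int start_mt)
{
        for (int i = 0; ; i++) {
                int mt = fallback_table[start_mt][i];

                if (mt == MT_RESERVE)   /* reserve is handled separately */
                        return -1;
                if (has_free(mt))
                        return mt;
        }
}

int main(void)
{
        printf("unmovable request falls back to type %d (movable)\n",
               find_fallback(MT_UNMOVABLE));
        return 0;
}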
922 /* 922 /*
923 * Move the free pages in a range to the free lists of the requested type. 923 * Move the free pages in a range to the free lists of the requested type.
924 * Note that start_page and end_page need not be aligned on a pageblock 924 * Note that start_page and end_page need not be aligned on a pageblock
925 * boundary. If alignment is required, use move_freepages_block() 925 * boundary. If alignment is required, use move_freepages_block()
926 */ 926 */
927 int move_freepages(struct zone *zone, 927 int move_freepages(struct zone *zone,
928 struct page *start_page, struct page *end_page, 928 struct page *start_page, struct page *end_page,
929 int migratetype) 929 int migratetype)
930 { 930 {
931 struct page *page; 931 struct page *page;
932 unsigned long order; 932 unsigned long order;
933 int pages_moved = 0; 933 int pages_moved = 0;
934 934
935 #ifndef CONFIG_HOLES_IN_ZONE 935 #ifndef CONFIG_HOLES_IN_ZONE
936 /* 936 /*
937 * page_zone is not safe to call in this context when 937 * page_zone is not safe to call in this context when
938 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 938 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
939 * anyway as we check zone boundaries in move_freepages_block(). 939 * anyway as we check zone boundaries in move_freepages_block().
940 * Remove at a later date when no bug reports exist related to 940 * Remove at a later date when no bug reports exist related to
941 * grouping pages by mobility 941 * grouping pages by mobility
942 */ 942 */
943 BUG_ON(page_zone(start_page) != page_zone(end_page)); 943 BUG_ON(page_zone(start_page) != page_zone(end_page));
944 #endif 944 #endif
945 945
946 for (page = start_page; page <= end_page;) { 946 for (page = start_page; page <= end_page;) {
947 /* Make sure we are not inadvertently changing nodes */ 947 /* Make sure we are not inadvertently changing nodes */
948 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 948 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
949 949
950 if (!pfn_valid_within(page_to_pfn(page))) { 950 if (!pfn_valid_within(page_to_pfn(page))) {
951 page++; 951 page++;
952 continue; 952 continue;
953 } 953 }
954 954
955 if (!PageBuddy(page)) { 955 if (!PageBuddy(page)) {
956 page++; 956 page++;
957 continue; 957 continue;
958 } 958 }
959 959
960 order = page_order(page); 960 order = page_order(page);
961 list_move(&page->lru, 961 list_move(&page->lru,
962 &zone->free_area[order].free_list[migratetype]); 962 &zone->free_area[order].free_list[migratetype]);
963 set_freepage_migratetype(page, migratetype); 963 set_freepage_migratetype(page, migratetype);
964 page += 1 << order; 964 page += 1 << order;
965 pages_moved += 1 << order; 965 pages_moved += 1 << order;
966 } 966 }
967 967
968 return pages_moved; 968 return pages_moved;
969 } 969 }
970 970
971 int move_freepages_block(struct zone *zone, struct page *page, 971 int move_freepages_block(struct zone *zone, struct page *page,
972 int migratetype) 972 int migratetype)
973 { 973 {
974 unsigned long start_pfn, end_pfn; 974 unsigned long start_pfn, end_pfn;
975 struct page *start_page, *end_page; 975 struct page *start_page, *end_page;
976 976
977 start_pfn = page_to_pfn(page); 977 start_pfn = page_to_pfn(page);
978 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 978 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
979 start_page = pfn_to_page(start_pfn); 979 start_page = pfn_to_page(start_pfn);
980 end_page = start_page + pageblock_nr_pages - 1; 980 end_page = start_page + pageblock_nr_pages - 1;
981 end_pfn = start_pfn + pageblock_nr_pages - 1; 981 end_pfn = start_pfn + pageblock_nr_pages - 1;
982 982
983 /* Do not cross zone boundaries */ 983 /* Do not cross zone boundaries */
984 if (start_pfn < zone->zone_start_pfn) 984 if (start_pfn < zone->zone_start_pfn)
985 start_page = page; 985 start_page = page;
986 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) 986 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
987 return 0; 987 return 0;
988 988
989 return move_freepages(zone, start_page, end_page, migratetype); 989 return move_freepages(zone, start_page, end_page, migratetype);
990 } 990 }
991 991
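/*
 * The only subtle step above is the power-of-two round-down of the pfn to
 * its pageblock. A standalone illustration of that mask arithmetic; the
 * block size below is an assumption (any power of two behaves the same way).
 */

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL        /* e.g. 2MB pageblocks of 4KB pages */

int main(void)
{
        unsigned long pfn = 123456;
        unsigned long start_pfn = pfn & ~(PAGEBLOCK_NR_PAGES - 1);
        unsigned long end_pfn = start_pfn + PAGEBLOCK_NR_PAGES - 1;

        /* 123456 rounds down to 123392; the block spans 512 consecutive pfns */
        printf("pfn %lu lives in pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
        return 0;
}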
992 static void change_pageblock_range(struct page *pageblock_page, 992 static void change_pageblock_range(struct page *pageblock_page,
993 int start_order, int migratetype) 993 int start_order, int migratetype)
994 { 994 {
995 int nr_pageblocks = 1 << (start_order - pageblock_order); 995 int nr_pageblocks = 1 << (start_order - pageblock_order);
996 996
997 while (nr_pageblocks--) { 997 while (nr_pageblocks--) {
998 set_pageblock_migratetype(pageblock_page, migratetype); 998 set_pageblock_migratetype(pageblock_page, migratetype);
999 pageblock_page += pageblock_nr_pages; 999 pageblock_page += pageblock_nr_pages;
1000 } 1000 }
1001 } 1001 }
1002 1002
1003 /* Remove an element from the buddy allocator from the fallback list */ 1003 /* Remove an element from the buddy allocator from the fallback list */
1004 static inline struct page * 1004 static inline struct page *
1005 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 1005 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1006 { 1006 {
1007 struct free_area * area; 1007 struct free_area * area;
1008 int current_order; 1008 int current_order;
1009 struct page *page; 1009 struct page *page;
1010 int migratetype, i; 1010 int migratetype, i;
1011 1011
1012 /* Find the largest possible block of pages in the other list */ 1012 /* Find the largest possible block of pages in the other list */
1013 for (current_order = MAX_ORDER-1; current_order >= order; 1013 for (current_order = MAX_ORDER-1; current_order >= order;
1014 --current_order) { 1014 --current_order) {
1015 for (i = 0;; i++) { 1015 for (i = 0;; i++) {
1016 migratetype = fallbacks[start_migratetype][i]; 1016 migratetype = fallbacks[start_migratetype][i];
1017 1017
1018 /* MIGRATE_RESERVE handled later if necessary */ 1018 /* MIGRATE_RESERVE handled later if necessary */
1019 if (migratetype == MIGRATE_RESERVE) 1019 if (migratetype == MIGRATE_RESERVE)
1020 break; 1020 break;
1021 1021
1022 area = &(zone->free_area[current_order]); 1022 area = &(zone->free_area[current_order]);
1023 if (list_empty(&area->free_list[migratetype])) 1023 if (list_empty(&area->free_list[migratetype]))
1024 continue; 1024 continue;
1025 1025
1026 page = list_entry(area->free_list[migratetype].next, 1026 page = list_entry(area->free_list[migratetype].next,
1027 struct page, lru); 1027 struct page, lru);
1028 area->nr_free--; 1028 area->nr_free--;
1029 1029
1030 /* 1030 /*
1031 * If breaking a large block of pages, move all free 1031 * If breaking a large block of pages, move all free
1032 * pages to the preferred allocation list. If falling 1032 * pages to the preferred allocation list. If falling
1033 * back for a reclaimable kernel allocation, be more 1033 * back for a reclaimable kernel allocation, be more
1034 * aggressive about taking ownership of free pages 1034 * aggressive about taking ownership of free pages
1035 * 1035 *
1036 * On the other hand, never change migration 1036 * On the other hand, never change migration
1037 * type of MIGRATE_CMA pageblocks nor move CMA 1037 * type of MIGRATE_CMA pageblocks nor move CMA
1038 * pages on different free lists. We don't 1038 * pages on different free lists. We don't
1039 * want unmovable pages to be allocated from 1039 * want unmovable pages to be allocated from
1040 * MIGRATE_CMA areas. 1040 * MIGRATE_CMA areas.
1041 */ 1041 */
1042 if (!is_migrate_cma(migratetype) && 1042 if (!is_migrate_cma(migratetype) &&
1043 (unlikely(current_order >= pageblock_order / 2) || 1043 (unlikely(current_order >= pageblock_order / 2) ||
1044 start_migratetype == MIGRATE_RECLAIMABLE || 1044 start_migratetype == MIGRATE_RECLAIMABLE ||
1045 page_group_by_mobility_disabled)) { 1045 page_group_by_mobility_disabled)) {
1046 int pages; 1046 int pages;
1047 pages = move_freepages_block(zone, page, 1047 pages = move_freepages_block(zone, page,
1048 start_migratetype); 1048 start_migratetype);
1049 1049
1050 /* Claim the whole block if over half of it is free */ 1050 /* Claim the whole block if over half of it is free */
1051 if (pages >= (1 << (pageblock_order-1)) || 1051 if (pages >= (1 << (pageblock_order-1)) ||
1052 page_group_by_mobility_disabled) 1052 page_group_by_mobility_disabled)
1053 set_pageblock_migratetype(page, 1053 set_pageblock_migratetype(page,
1054 start_migratetype); 1054 start_migratetype);
1055 1055
1056 migratetype = start_migratetype; 1056 migratetype = start_migratetype;
1057 } 1057 }
1058 1058
1059 /* Remove the page from the freelists */ 1059 /* Remove the page from the freelists */
1060 list_del(&page->lru); 1060 list_del(&page->lru);
1061 rmv_page_order(page); 1061 rmv_page_order(page);
1062 1062
1063 /* Take ownership for orders >= pageblock_order */ 1063 /* Take ownership for orders >= pageblock_order */
1064 if (current_order >= pageblock_order && 1064 if (current_order >= pageblock_order &&
1065 !is_migrate_cma(migratetype)) 1065 !is_migrate_cma(migratetype))
1066 change_pageblock_range(page, current_order, 1066 change_pageblock_range(page, current_order,
1067 start_migratetype); 1067 start_migratetype);
1068 1068
1069 expand(zone, page, order, current_order, area, 1069 expand(zone, page, order, current_order, area,
1070 is_migrate_cma(migratetype) 1070 is_migrate_cma(migratetype)
1071 ? migratetype : start_migratetype); 1071 ? migratetype : start_migratetype);
1072 1072
1073 trace_mm_page_alloc_extfrag(page, order, current_order, 1073 trace_mm_page_alloc_extfrag(page, order, current_order,
1074 start_migratetype, migratetype); 1074 start_migratetype, migratetype);
1075 1075
1076 return page; 1076 return page;
1077 } 1077 }
1078 } 1078 }
1079 1079
1080 return NULL; 1080 return NULL;
1081 } 1081 }
1082 1082
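/*
 * The pageblock-stealing policy above reduces to a threshold decision:
 * claim the whole block for the preferred type only if more than half of it
 * is already free (or grouping by mobility is disabled). A distilled version;
 * should_claim_block() and PAGEBLOCK_ORDER here are illustrative names.
 */

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_ORDER 9               /* 512-page blocks, as an example */

static bool should_claim_block(int pages_moved, bool grouping_disabled)
{
        return pages_moved >= (1 << (PAGEBLOCK_ORDER - 1)) || grouping_disabled;
}

int main(void)
{
        /* 256 of the 512 pages were free, so ownership of the block changes */
        printf("claim block? %d\n", should_claim_block(256, false));
        return 0;
}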
1083 /* 1083 /*
1084 * Do the hard work of removing an element from the buddy allocator. 1084 * Do the hard work of removing an element from the buddy allocator.
1085 * Call me with the zone->lock already held. 1085 * Call me with the zone->lock already held.
1086 */ 1086 */
1087 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1087 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1088 int migratetype) 1088 int migratetype)
1089 { 1089 {
1090 struct page *page; 1090 struct page *page;
1091 1091
1092 retry_reserve: 1092 retry_reserve:
1093 page = __rmqueue_smallest(zone, order, migratetype); 1093 page = __rmqueue_smallest(zone, order, migratetype);
1094 1094
1095 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1095 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1096 page = __rmqueue_fallback(zone, order, migratetype); 1096 page = __rmqueue_fallback(zone, order, migratetype);
1097 1097
1098 /* 1098 /*
1099 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1099 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1100 * is used because __rmqueue_smallest is an inline function 1100 * is used because __rmqueue_smallest is an inline function
1101 * and we want just one call site 1101 * and we want just one call site
1102 */ 1102 */
1103 if (!page) { 1103 if (!page) {
1104 migratetype = MIGRATE_RESERVE; 1104 migratetype = MIGRATE_RESERVE;
1105 goto retry_reserve; 1105 goto retry_reserve;
1106 } 1106 }
1107 } 1107 }
1108 1108
1109 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1109 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1110 return page; 1110 return page;
1111 } 1111 }
1112 1112
1113 /* 1113 /*
1114 * Obtain a specified number of elements from the buddy allocator, all under 1114 * Obtain a specified number of elements from the buddy allocator, all under
1115 * a single hold of the lock, for efficiency. Add them to the supplied list. 1115 * a single hold of the lock, for efficiency. Add them to the supplied list.
1116 * Returns the number of new pages which were placed at *list. 1116 * Returns the number of new pages which were placed at *list.
1117 */ 1117 */
1118 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1118 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1119 unsigned long count, struct list_head *list, 1119 unsigned long count, struct list_head *list,
1120 int migratetype, int cold) 1120 int migratetype, int cold)
1121 { 1121 {
1122 int mt = migratetype, i; 1122 int mt = migratetype, i;
1123 1123
1124 spin_lock(&zone->lock); 1124 spin_lock(&zone->lock);
1125 for (i = 0; i < count; ++i) { 1125 for (i = 0; i < count; ++i) {
1126 struct page *page = __rmqueue(zone, order, migratetype); 1126 struct page *page = __rmqueue(zone, order, migratetype);
1127 if (unlikely(page == NULL)) 1127 if (unlikely(page == NULL))
1128 break; 1128 break;
1129 1129
1130 /* 1130 /*
1131 * Split buddy pages returned by expand() are received here 1131 * Split buddy pages returned by expand() are received here
1132 * in physical page order. The page is added to the caller's 1132 * in physical page order. The page is added to the caller's
1133 * list and the list head then moves forward. From the caller's 1133 * list and the list head then moves forward. From the caller's
1134 * perspective, the linked list is ordered by page number in 1134 * perspective, the linked list is ordered by page number in
1135 * some conditions. This is useful for IO devices that can 1135 * some conditions. This is useful for IO devices that can
1136 * merge IO requests if the physical pages are ordered 1136 * merge IO requests if the physical pages are ordered
1137 * properly. 1137 * properly.
1138 */ 1138 */
1139 if (likely(cold == 0)) 1139 if (likely(cold == 0))
1140 list_add(&page->lru, list); 1140 list_add(&page->lru, list);
1141 else 1141 else
1142 list_add_tail(&page->lru, list); 1142 list_add_tail(&page->lru, list);
1143 if (IS_ENABLED(CONFIG_CMA)) { 1143 if (IS_ENABLED(CONFIG_CMA)) {
1144 mt = get_pageblock_migratetype(page); 1144 mt = get_pageblock_migratetype(page);
1145 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) 1145 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
1146 mt = migratetype; 1146 mt = migratetype;
1147 } 1147 }
1148 set_freepage_migratetype(page, mt); 1148 set_freepage_migratetype(page, mt);
1149 list = &page->lru; 1149 list = &page->lru;
1150 if (is_migrate_cma(mt)) 1150 if (is_migrate_cma(mt))
1151 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1151 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1152 -(1 << order)); 1152 -(1 << order));
1153 } 1153 }
1154 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1154 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1155 spin_unlock(&zone->lock); 1155 spin_unlock(&zone->lock);
1156 return i; 1156 return i;
1157 } 1157 }
1158 1158
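/*
 * The hot/cold placement above (head for hot pages, tail for cold ones) is
 * the standard intrusive doubly-linked list trick. A compact userspace
 * rendition of list_add()/list_add_tail(), purely for illustration:
 */

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h)
{
        h->next = h->prev = h;
}

static void __list_insert(struct list_head *n, struct list_head *prev,
                          struct list_head *next)
{
        next->prev = n;
        n->next = next;
        n->prev = prev;
        prev->next = n;
}

/* hot pages go to the head, so they are handed out again first */
static void list_add(struct list_head *n, struct list_head *h)
{
        __list_insert(n, h, h->next);
}

/* cold pages go to the tail, so they are handed out last */
static void list_add_tail(struct list_head *n, struct list_head *h)
{
        __list_insert(n, h->prev, h);
}

int main(void)
{
        struct list_head pcp, hot, cold;

        list_init(&pcp);
        list_add(&hot, &pcp);
        list_add_tail(&cold, &pcp);

        printf("hot entry sits at the head: %d\n", pcp.next == &hot);
        return 0;
}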
1159 #ifdef CONFIG_NUMA 1159 #ifdef CONFIG_NUMA
1160 /* 1160 /*
1161 * Called from the vmstat counter updater to drain pagesets of this 1161 * Called from the vmstat counter updater to drain pagesets of this
1162 * currently executing processor on remote nodes after they have 1162 * currently executing processor on remote nodes after they have
1163 * expired. 1163 * expired.
1164 * 1164 *
1165 * Note that this function must be called with the thread pinned to 1165 * Note that this function must be called with the thread pinned to
1166 * a single processor. 1166 * a single processor.
1167 */ 1167 */
1168 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1168 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1169 { 1169 {
1170 unsigned long flags; 1170 unsigned long flags;
1171 int to_drain; 1171 int to_drain;
1172 1172
1173 local_irq_save(flags); 1173 local_irq_save(flags);
1174 if (pcp->count >= pcp->batch) 1174 if (pcp->count >= pcp->batch)
1175 to_drain = pcp->batch; 1175 to_drain = pcp->batch;
1176 else 1176 else
1177 to_drain = pcp->count; 1177 to_drain = pcp->count;
1178 if (to_drain > 0) { 1178 if (to_drain > 0) {
1179 free_pcppages_bulk(zone, to_drain, pcp); 1179 free_pcppages_bulk(zone, to_drain, pcp);
1180 pcp->count -= to_drain; 1180 pcp->count -= to_drain;
1181 } 1181 }
1182 local_irq_restore(flags); 1182 local_irq_restore(flags);
1183 } 1183 }
1184 #endif 1184 #endif
1185 1185
1186 /* 1186 /*
1187 * Drain pages of the indicated processor. 1187 * Drain pages of the indicated processor.
1188 * 1188 *
1189 * The processor must either be the current processor and the 1189 * The processor must either be the current processor and the
1190 * thread pinned to the current processor or a processor that 1190 * thread pinned to the current processor or a processor that
1191 * is not online. 1191 * is not online.
1192 */ 1192 */
1193 static void drain_pages(unsigned int cpu) 1193 static void drain_pages(unsigned int cpu)
1194 { 1194 {
1195 unsigned long flags; 1195 unsigned long flags;
1196 struct zone *zone; 1196 struct zone *zone;
1197 1197
1198 for_each_populated_zone(zone) { 1198 for_each_populated_zone(zone) {
1199 struct per_cpu_pageset *pset; 1199 struct per_cpu_pageset *pset;
1200 struct per_cpu_pages *pcp; 1200 struct per_cpu_pages *pcp;
1201 1201
1202 local_irq_save(flags); 1202 local_irq_save(flags);
1203 pset = per_cpu_ptr(zone->pageset, cpu); 1203 pset = per_cpu_ptr(zone->pageset, cpu);
1204 1204
1205 pcp = &pset->pcp; 1205 pcp = &pset->pcp;
1206 if (pcp->count) { 1206 if (pcp->count) {
1207 free_pcppages_bulk(zone, pcp->count, pcp); 1207 free_pcppages_bulk(zone, pcp->count, pcp);
1208 pcp->count = 0; 1208 pcp->count = 0;
1209 } 1209 }
1210 local_irq_restore(flags); 1210 local_irq_restore(flags);
1211 } 1211 }
1212 } 1212 }
1213 1213
1214 /* 1214 /*
1215 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1215 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1216 */ 1216 */
1217 void drain_local_pages(void *arg) 1217 void drain_local_pages(void *arg)
1218 { 1218 {
1219 drain_pages(smp_processor_id()); 1219 drain_pages(smp_processor_id());
1220 } 1220 }
1221 1221
1222 /* 1222 /*
1223 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1223 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1224 * 1224 *
1225 * Note that this code is protected against sending an IPI to an offline 1225 * Note that this code is protected against sending an IPI to an offline
1226 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1226 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1227 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1227 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1228 * nothing keeps CPUs from showing up after we populated the cpumask and 1228 * nothing keeps CPUs from showing up after we populated the cpumask and
1229 * before the call to on_each_cpu_mask(). 1229 * before the call to on_each_cpu_mask().
1230 */ 1230 */
1231 void drain_all_pages(void) 1231 void drain_all_pages(void)
1232 { 1232 {
1233 int cpu; 1233 int cpu;
1234 struct per_cpu_pageset *pcp; 1234 struct per_cpu_pageset *pcp;
1235 struct zone *zone; 1235 struct zone *zone;
1236 1236
1237 /* 1237 /*
1238 * Allocate in the BSS so we won't require allocation in 1238 * Allocate in the BSS so we won't require allocation in
1239 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1239 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1240 */ 1240 */
1241 static cpumask_t cpus_with_pcps; 1241 static cpumask_t cpus_with_pcps;
1242 1242
1243 /* 1243 /*
1244 * We don't care about racing with CPU hotplug events, 1244 * We don't care about racing with CPU hotplug events,
1245 * as the offline notification will cause the notified 1245 * as the offline notification will cause the notified
1246 * cpu to drain its pcp lists, and on_each_cpu_mask() 1246 * cpu to drain its pcp lists, and on_each_cpu_mask()
1247 * disables preemption as part of its processing 1247 * disables preemption as part of its processing
1248 */ 1248 */
1249 for_each_online_cpu(cpu) { 1249 for_each_online_cpu(cpu) {
1250 bool has_pcps = false; 1250 bool has_pcps = false;
1251 for_each_populated_zone(zone) { 1251 for_each_populated_zone(zone) {
1252 pcp = per_cpu_ptr(zone->pageset, cpu); 1252 pcp = per_cpu_ptr(zone->pageset, cpu);
1253 if (pcp->pcp.count) { 1253 if (pcp->pcp.count) {
1254 has_pcps = true; 1254 has_pcps = true;
1255 break; 1255 break;
1256 } 1256 }
1257 } 1257 }
1258 if (has_pcps) 1258 if (has_pcps)
1259 cpumask_set_cpu(cpu, &cpus_with_pcps); 1259 cpumask_set_cpu(cpu, &cpus_with_pcps);
1260 else 1260 else
1261 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1261 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1262 } 1262 }
1263 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1263 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1264 } 1264 }
1265 1265
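/*
 * drain_all_pages() narrows the IPI to CPUs that actually hold per-cpu
 * pages. The same "scan, build a mask, then act only on the set bits" shape
 * in plain C; NR_CPUS and the per-cpu counts below are invented.
 */

#include <stdio.h>

#define NR_CPUS 8

static const int pcp_count[NR_CPUS] = { 0, 3, 0, 0, 7, 0, 0, 1 };

static void drain_cpu(int cpu)
{
        printf("draining cpu %d\n", cpu);       /* an IPI in the kernel */
}

int main(void)
{
        unsigned int cpus_with_pcps = 0;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                if (pcp_count[cpu])
                        cpus_with_pcps |= 1u << cpu;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                if (cpus_with_pcps & (1u << cpu))
                        drain_cpu(cpu);         /* on_each_cpu_mask() above */
        return 0;
}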
1266 #ifdef CONFIG_HIBERNATION 1266 #ifdef CONFIG_HIBERNATION
1267 1267
1268 void mark_free_pages(struct zone *zone) 1268 void mark_free_pages(struct zone *zone)
1269 { 1269 {
1270 unsigned long pfn, max_zone_pfn; 1270 unsigned long pfn, max_zone_pfn;
1271 unsigned long flags; 1271 unsigned long flags;
1272 int order, t; 1272 int order, t;
1273 struct list_head *curr; 1273 struct list_head *curr;
1274 1274
1275 if (!zone->spanned_pages) 1275 if (!zone->spanned_pages)
1276 return; 1276 return;
1277 1277
1278 spin_lock_irqsave(&zone->lock, flags); 1278 spin_lock_irqsave(&zone->lock, flags);
1279 1279
1280 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1280 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1281 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1281 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1282 if (pfn_valid(pfn)) { 1282 if (pfn_valid(pfn)) {
1283 struct page *page = pfn_to_page(pfn); 1283 struct page *page = pfn_to_page(pfn);
1284 1284
1285 if (!swsusp_page_is_forbidden(page)) 1285 if (!swsusp_page_is_forbidden(page))
1286 swsusp_unset_page_free(page); 1286 swsusp_unset_page_free(page);
1287 } 1287 }
1288 1288
1289 for_each_migratetype_order(order, t) { 1289 for_each_migratetype_order(order, t) {
1290 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1290 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1291 unsigned long i; 1291 unsigned long i;
1292 1292
1293 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1293 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1294 for (i = 0; i < (1UL << order); i++) 1294 for (i = 0; i < (1UL << order); i++)
1295 swsusp_set_page_free(pfn_to_page(pfn + i)); 1295 swsusp_set_page_free(pfn_to_page(pfn + i));
1296 } 1296 }
1297 } 1297 }
1298 spin_unlock_irqrestore(&zone->lock, flags); 1298 spin_unlock_irqrestore(&zone->lock, flags);
1299 } 1299 }
1300 #endif /* CONFIG_PM */ 1300 #endif /* CONFIG_PM */
1301 1301
1302 /* 1302 /*
1303 * Free a 0-order page 1303 * Free a 0-order page
1304 * cold == 1 ? free a cold page : free a hot page 1304 * cold == 1 ? free a cold page : free a hot page
1305 */ 1305 */
1306 void free_hot_cold_page(struct page *page, int cold) 1306 void free_hot_cold_page(struct page *page, int cold)
1307 { 1307 {
1308 struct zone *zone = page_zone(page); 1308 struct zone *zone = page_zone(page);
1309 struct per_cpu_pages *pcp; 1309 struct per_cpu_pages *pcp;
1310 unsigned long flags; 1310 unsigned long flags;
1311 int migratetype; 1311 int migratetype;
1312 1312
1313 if (!free_pages_prepare(page, 0)) 1313 if (!free_pages_prepare(page, 0))
1314 return; 1314 return;
1315 1315
1316 migratetype = get_pageblock_migratetype(page); 1316 migratetype = get_pageblock_migratetype(page);
1317 set_freepage_migratetype(page, migratetype); 1317 set_freepage_migratetype(page, migratetype);
1318 local_irq_save(flags); 1318 local_irq_save(flags);
1319 __count_vm_event(PGFREE); 1319 __count_vm_event(PGFREE);
1320 1320
1321 /* 1321 /*
1322 * We only track unmovable, reclaimable and movable on pcp lists. 1322 * We only track unmovable, reclaimable and movable on pcp lists.
1323 * Free ISOLATE pages back to the allocator because they are being 1323 * Free ISOLATE pages back to the allocator because they are being
1324 * offlined, but treat RESERVE as movable pages so we can get those 1324 * offlined, but treat RESERVE as movable pages so we can get those
1325 * areas back if necessary. Otherwise, we may have to free 1325 * areas back if necessary. Otherwise, we may have to free
1326 * excessively into the page allocator 1326 * excessively into the page allocator
1327 */ 1327 */
1328 if (migratetype >= MIGRATE_PCPTYPES) { 1328 if (migratetype >= MIGRATE_PCPTYPES) {
1329 if (unlikely(migratetype == MIGRATE_ISOLATE)) { 1329 if (unlikely(migratetype == MIGRATE_ISOLATE)) {
1330 free_one_page(zone, page, 0, migratetype); 1330 free_one_page(zone, page, 0, migratetype);
1331 goto out; 1331 goto out;
1332 } 1332 }
1333 migratetype = MIGRATE_MOVABLE; 1333 migratetype = MIGRATE_MOVABLE;
1334 } 1334 }
1335 1335
1336 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1336 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1337 if (cold) 1337 if (cold)
1338 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1338 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1339 else 1339 else
1340 list_add(&page->lru, &pcp->lists[migratetype]); 1340 list_add(&page->lru, &pcp->lists[migratetype]);
1341 pcp->count++; 1341 pcp->count++;
1342 if (pcp->count >= pcp->high) { 1342 if (pcp->count >= pcp->high) {
1343 free_pcppages_bulk(zone, pcp->batch, pcp); 1343 free_pcppages_bulk(zone, pcp->batch, pcp);
1344 pcp->count -= pcp->batch; 1344 pcp->count -= pcp->batch;
1345 } 1345 }
1346 1346
1347 out: 1347 out:
1348 local_irq_restore(flags); 1348 local_irq_restore(flags);
1349 } 1349 }
1350 1350
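/*
 * The pcp->high / pcp->batch interplay above is a hysteresis cache: freed
 * pages pile up on a cheap per-cpu list, and only once the list exceeds
 * "high" is a "batch" of them returned to the shared buddy lists. A minimal
 * sketch; the thresholds and flush_to_buddy() are invented.
 */

#include <stdio.h>

#define PCP_HIGH  6
#define PCP_BATCH 2

static int pcp_count;

static void flush_to_buddy(int n)
{
        printf("returning %d pages to the buddy allocator\n", n);
}

static void free_to_pcp(void)
{
        pcp_count++;                            /* page goes on the local list */
        if (pcp_count >= PCP_HIGH) {
                flush_to_buddy(PCP_BATCH);      /* spill a batch, keep the rest */
                pcp_count -= PCP_BATCH;
        }
}

int main(void)
{
        for (int i = 0; i < 10; i++)
                free_to_pcp();
        printf("still cached per-cpu: %d\n", pcp_count);
        return 0;
}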
1351 /* 1351 /*
1352 * Free a list of 0-order pages 1352 * Free a list of 0-order pages
1353 */ 1353 */
1354 void free_hot_cold_page_list(struct list_head *list, int cold) 1354 void free_hot_cold_page_list(struct list_head *list, int cold)
1355 { 1355 {
1356 struct page *page, *next; 1356 struct page *page, *next;
1357 1357
1358 list_for_each_entry_safe(page, next, list, lru) { 1358 list_for_each_entry_safe(page, next, list, lru) {
1359 trace_mm_page_free_batched(page, cold); 1359 trace_mm_page_free_batched(page, cold);
1360 free_hot_cold_page(page, cold); 1360 free_hot_cold_page(page, cold);
1361 } 1361 }
1362 } 1362 }
1363 1363
1364 /* 1364 /*
1365 * split_page takes a non-compound higher-order page, and splits it into 1365 * split_page takes a non-compound higher-order page, and splits it into
1366 * n (1<<order) sub-pages: page[0..n-1] 1366 * n (1<<order) sub-pages: page[0..n-1]
1367 * Each sub-page must be freed individually. 1367 * Each sub-page must be freed individually.
1368 * 1368 *
1369 * Note: this is probably too low level an operation for use in drivers. 1369 * Note: this is probably too low level an operation for use in drivers.
1370 * Please consult with lkml before using this in your driver. 1370 * Please consult with lkml before using this in your driver.
1371 */ 1371 */
1372 void split_page(struct page *page, unsigned int order) 1372 void split_page(struct page *page, unsigned int order)
1373 { 1373 {
1374 int i; 1374 int i;
1375 1375
1376 VM_BUG_ON(PageCompound(page)); 1376 VM_BUG_ON(PageCompound(page));
1377 VM_BUG_ON(!page_count(page)); 1377 VM_BUG_ON(!page_count(page));
1378 1378
1379 #ifdef CONFIG_KMEMCHECK 1379 #ifdef CONFIG_KMEMCHECK
1380 /* 1380 /*
1381 * Split shadow pages too, because free(page[0]) would 1381 * Split shadow pages too, because free(page[0]) would
1382 * otherwise free the whole shadow. 1382 * otherwise free the whole shadow.
1383 */ 1383 */
1384 if (kmemcheck_page_is_tracked(page)) 1384 if (kmemcheck_page_is_tracked(page))
1385 split_page(virt_to_page(page[0].shadow), order); 1385 split_page(virt_to_page(page[0].shadow), order);
1386 #endif 1386 #endif
1387 1387
1388 for (i = 1; i < (1 << order); i++) 1388 for (i = 1; i < (1 << order); i++)
1389 set_page_refcounted(page + i); 1389 set_page_refcounted(page + i);
1390 } 1390 }
1391 1391
1392 static int __isolate_free_page(struct page *page, unsigned int order) 1392 static int __isolate_free_page(struct page *page, unsigned int order)
1393 { 1393 {
1394 unsigned long watermark; 1394 unsigned long watermark;
1395 struct zone *zone; 1395 struct zone *zone;
1396 int mt; 1396 int mt;
1397 1397
1398 BUG_ON(!PageBuddy(page)); 1398 BUG_ON(!PageBuddy(page));
1399 1399
1400 zone = page_zone(page); 1400 zone = page_zone(page);
1401 mt = get_pageblock_migratetype(page); 1401 mt = get_pageblock_migratetype(page);
1402 1402
1403 if (mt != MIGRATE_ISOLATE) { 1403 if (mt != MIGRATE_ISOLATE) {
1404 /* Obey watermarks as if the page was being allocated */ 1404 /* Obey watermarks as if the page was being allocated */
1405 watermark = low_wmark_pages(zone) + (1 << order); 1405 watermark = low_wmark_pages(zone) + (1 << order);
1406 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1406 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1407 return 0; 1407 return 0;
1408 1408
1409 __mod_zone_freepage_state(zone, -(1UL << order), mt); 1409 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1410 } 1410 }
1411 1411
1412 /* Remove page from free list */ 1412 /* Remove page from free list */
1413 list_del(&page->lru); 1413 list_del(&page->lru);
1414 zone->free_area[order].nr_free--; 1414 zone->free_area[order].nr_free--;
1415 rmv_page_order(page); 1415 rmv_page_order(page);
1416 1416
1417 /* Set the pageblock if the isolated page is at least a pageblock */ 1417 /* Set the pageblock if the isolated page is at least a pageblock */
1418 if (order >= pageblock_order - 1) { 1418 if (order >= pageblock_order - 1) {
1419 struct page *endpage = page + (1 << order) - 1; 1419 struct page *endpage = page + (1 << order) - 1;
1420 for (; page < endpage; page += pageblock_nr_pages) { 1420 for (; page < endpage; page += pageblock_nr_pages) {
1421 int mt = get_pageblock_migratetype(page); 1421 int mt = get_pageblock_migratetype(page);
1422 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) 1422 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
1423 set_pageblock_migratetype(page, 1423 set_pageblock_migratetype(page,
1424 MIGRATE_MOVABLE); 1424 MIGRATE_MOVABLE);
1425 } 1425 }
1426 } 1426 }
1427 1427
1428 return 1UL << order; 1428 return 1UL << order;
1429 } 1429 }
1430 1430
1431 /* 1431 /*
1432 * Similar to split_page except the page is already free. As this is only 1432 * Similar to split_page except the page is already free. As this is only
1433 * being used for migration, the migratetype of the block also changes. 1433 * being used for migration, the migratetype of the block also changes.
1434 * As this is called with interrupts disabled, the caller is responsible 1434 * As this is called with interrupts disabled, the caller is responsible
1435 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1435 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1436 * are enabled. 1436 * are enabled.
1437 * 1437 *
1438 * Note: this is probably too low level an operation for use in drivers. 1438 * Note: this is probably too low level an operation for use in drivers.
1439 * Please consult with lkml before using this in your driver. 1439 * Please consult with lkml before using this in your driver.
1440 */ 1440 */
1441 int split_free_page(struct page *page) 1441 int split_free_page(struct page *page)
1442 { 1442 {
1443 unsigned int order; 1443 unsigned int order;
1444 int nr_pages; 1444 int nr_pages;
1445 1445
1446 order = page_order(page); 1446 order = page_order(page);
1447 1447
1448 nr_pages = __isolate_free_page(page, order); 1448 nr_pages = __isolate_free_page(page, order);
1449 if (!nr_pages) 1449 if (!nr_pages)
1450 return 0; 1450 return 0;
1451 1451
1452 /* Split into individual pages */ 1452 /* Split into individual pages */
1453 set_page_refcounted(page); 1453 set_page_refcounted(page);
1454 split_page(page, order); 1454 split_page(page, order);
1455 return nr_pages; 1455 return nr_pages;
1456 } 1456 }
1457 1457
1458 /* 1458 /*
1459 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1459 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1460 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1460 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1461 * or two. 1461 * or two.
1462 */ 1462 */
1463 static inline 1463 static inline
1464 struct page *buffered_rmqueue(struct zone *preferred_zone, 1464 struct page *buffered_rmqueue(struct zone *preferred_zone,
1465 struct zone *zone, int order, gfp_t gfp_flags, 1465 struct zone *zone, int order, gfp_t gfp_flags,
1466 int migratetype) 1466 int migratetype)
1467 { 1467 {
1468 unsigned long flags; 1468 unsigned long flags;
1469 struct page *page; 1469 struct page *page;
1470 int cold = !!(gfp_flags & __GFP_COLD); 1470 int cold = !!(gfp_flags & __GFP_COLD);
1471 1471
1472 again: 1472 again:
1473 if (likely(order == 0)) { 1473 if (likely(order == 0)) {
1474 struct per_cpu_pages *pcp; 1474 struct per_cpu_pages *pcp;
1475 struct list_head *list; 1475 struct list_head *list;
1476 1476
1477 local_irq_save(flags); 1477 local_irq_save(flags);
1478 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1478 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1479 list = &pcp->lists[migratetype]; 1479 list = &pcp->lists[migratetype];
1480 if (list_empty(list)) { 1480 if (list_empty(list)) {
1481 pcp->count += rmqueue_bulk(zone, 0, 1481 pcp->count += rmqueue_bulk(zone, 0,
1482 pcp->batch, list, 1482 pcp->batch, list,
1483 migratetype, cold); 1483 migratetype, cold);
1484 if (unlikely(list_empty(list))) 1484 if (unlikely(list_empty(list)))
1485 goto failed; 1485 goto failed;
1486 } 1486 }
1487 1487
1488 if (cold) 1488 if (cold)
1489 page = list_entry(list->prev, struct page, lru); 1489 page = list_entry(list->prev, struct page, lru);
1490 else 1490 else
1491 page = list_entry(list->next, struct page, lru); 1491 page = list_entry(list->next, struct page, lru);
1492 1492
1493 list_del(&page->lru); 1493 list_del(&page->lru);
1494 pcp->count--; 1494 pcp->count--;
1495 } else { 1495 } else {
1496 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1496 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1497 /* 1497 /*
1498 * __GFP_NOFAIL is not to be used in new code. 1498 * __GFP_NOFAIL is not to be used in new code.
1499 * 1499 *
1500 * All __GFP_NOFAIL callers should be fixed so that they 1500 * All __GFP_NOFAIL callers should be fixed so that they
1501 * properly detect and handle allocation failures. 1501 * properly detect and handle allocation failures.
1502 * 1502 *
1503 * We most definitely don't want callers attempting to 1503 * We most definitely don't want callers attempting to
1504 * allocate greater than order-1 page units with 1504 * allocate greater than order-1 page units with
1505 * __GFP_NOFAIL. 1505 * __GFP_NOFAIL.
1506 */ 1506 */
1507 WARN_ON_ONCE(order > 1); 1507 WARN_ON_ONCE(order > 1);
1508 } 1508 }
1509 spin_lock_irqsave(&zone->lock, flags); 1509 spin_lock_irqsave(&zone->lock, flags);
1510 page = __rmqueue(zone, order, migratetype); 1510 page = __rmqueue(zone, order, migratetype);
1511 spin_unlock(&zone->lock); 1511 spin_unlock(&zone->lock);
1512 if (!page) 1512 if (!page)
1513 goto failed; 1513 goto failed;
1514 __mod_zone_freepage_state(zone, -(1 << order), 1514 __mod_zone_freepage_state(zone, -(1 << order),
1515 get_pageblock_migratetype(page)); 1515 get_pageblock_migratetype(page));
1516 } 1516 }
1517 1517
1518 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1518 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1519 zone_statistics(preferred_zone, zone, gfp_flags); 1519 zone_statistics(preferred_zone, zone, gfp_flags);
1520 local_irq_restore(flags); 1520 local_irq_restore(flags);
1521 1521
1522 VM_BUG_ON(bad_range(zone, page)); 1522 VM_BUG_ON(bad_range(zone, page));
1523 if (prep_new_page(page, order, gfp_flags)) 1523 if (prep_new_page(page, order, gfp_flags))
1524 goto again; 1524 goto again;
1525 return page; 1525 return page;
1526 1526
1527 failed: 1527 failed:
1528 local_irq_restore(flags); 1528 local_irq_restore(flags);
1529 return NULL; 1529 return NULL;
1530 } 1530 }
1531 1531
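/*
 * buffered_rmqueue() is a two-level allocator: order-0 requests come from a
 * per-cpu stash that is refilled in bulk, under the zone lock, only when it
 * runs dry; larger orders go straight to the locked buddy lists. A toy
 * version of the refill-on-empty fast path; BATCH, the stash and
 * refill_from_buddy() are made up for the sketch.
 */

#include <stdio.h>

#define BATCH 4

static int stash[64];
static int stash_count;
static int next_page_id = 100;

static void refill_from_buddy(int batch)
{
        /* rmqueue_bulk() above does this under zone->lock */
        for (int i = 0; i < batch; i++)
                stash[stash_count++] = next_page_id++;
}

static int alloc_order0(void)
{
        if (stash_count == 0)
                refill_from_buddy(BATCH);
        return stash[--stash_count];            /* pop the most recently added */
}

int main(void)
{
        for (int i = 0; i < 6; i++)
                printf("got page %d\n", alloc_order0());
        return 0;
}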
1532 #ifdef CONFIG_FAIL_PAGE_ALLOC 1532 #ifdef CONFIG_FAIL_PAGE_ALLOC
1533 1533
1534 static struct { 1534 static struct {
1535 struct fault_attr attr; 1535 struct fault_attr attr;
1536 1536
1537 u32 ignore_gfp_highmem; 1537 u32 ignore_gfp_highmem;
1538 u32 ignore_gfp_wait; 1538 u32 ignore_gfp_wait;
1539 u32 min_order; 1539 u32 min_order;
1540 } fail_page_alloc = { 1540 } fail_page_alloc = {
1541 .attr = FAULT_ATTR_INITIALIZER, 1541 .attr = FAULT_ATTR_INITIALIZER,
1542 .ignore_gfp_wait = 1, 1542 .ignore_gfp_wait = 1,
1543 .ignore_gfp_highmem = 1, 1543 .ignore_gfp_highmem = 1,
1544 .min_order = 1, 1544 .min_order = 1,
1545 }; 1545 };
1546 1546
1547 static int __init setup_fail_page_alloc(char *str) 1547 static int __init setup_fail_page_alloc(char *str)
1548 { 1548 {
1549 return setup_fault_attr(&fail_page_alloc.attr, str); 1549 return setup_fault_attr(&fail_page_alloc.attr, str);
1550 } 1550 }
1551 __setup("fail_page_alloc=", setup_fail_page_alloc); 1551 __setup("fail_page_alloc=", setup_fail_page_alloc);
1552 1552
1553 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1553 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1554 { 1554 {
1555 if (order < fail_page_alloc.min_order) 1555 if (order < fail_page_alloc.min_order)
1556 return false; 1556 return false;
1557 if (gfp_mask & __GFP_NOFAIL) 1557 if (gfp_mask & __GFP_NOFAIL)
1558 return false; 1558 return false;
1559 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1559 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1560 return false; 1560 return false;
1561 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1561 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1562 return false; 1562 return false;
1563 1563
1564 return should_fail(&fail_page_alloc.attr, 1 << order); 1564 return should_fail(&fail_page_alloc.attr, 1 << order);
1565 } 1565 }
1566 1566
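/*
 * The hook above is a filtered probabilistic "should this call fail?" gate.
 * A self-contained approximation; the 1-in-N interval policy stands in for
 * the kernel's fault_attr machinery and is not its real behaviour.
 */

#include <stdbool.h>
#include <stdio.h>

struct fail_policy {
        unsigned int interval;          /* fail one call in every 'interval' */
        unsigned int min_order;         /* never fail small allocations */
        bool ignore_nofail;             /* mirrors the __GFP_NOFAIL exemption */
};

static bool should_fail(const struct fail_policy *p, unsigned int order,
                        bool nofail, unsigned long nth_call)
{
        if (order < p->min_order)
                return false;
        if (nofail && p->ignore_nofail)
                return false;
        return p->interval && (nth_call % p->interval) == 0;
}

int main(void)
{
        struct fail_policy p = { .interval = 3, .min_order = 1, .ignore_nofail = true };

        for (unsigned long n = 1; n <= 6; n++)
                printf("call %lu, order 2: %s\n", n,
                       should_fail(&p, 2, false, n) ? "inject failure" : "ok");
        return 0;
}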
1567 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1567 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1568 1568
1569 static int __init fail_page_alloc_debugfs(void) 1569 static int __init fail_page_alloc_debugfs(void)
1570 { 1570 {
1571 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1571 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1572 struct dentry *dir; 1572 struct dentry *dir;
1573 1573
1574 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1574 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1575 &fail_page_alloc.attr); 1575 &fail_page_alloc.attr);
1576 if (IS_ERR(dir)) 1576 if (IS_ERR(dir))
1577 return PTR_ERR(dir); 1577 return PTR_ERR(dir);
1578 1578
1579 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1579 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1580 &fail_page_alloc.ignore_gfp_wait)) 1580 &fail_page_alloc.ignore_gfp_wait))
1581 goto fail; 1581 goto fail;
1582 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1582 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1583 &fail_page_alloc.ignore_gfp_highmem)) 1583 &fail_page_alloc.ignore_gfp_highmem))
1584 goto fail; 1584 goto fail;
1585 if (!debugfs_create_u32("min-order", mode, dir, 1585 if (!debugfs_create_u32("min-order", mode, dir,
1586 &fail_page_alloc.min_order)) 1586 &fail_page_alloc.min_order))
1587 goto fail; 1587 goto fail;
1588 1588
1589 return 0; 1589 return 0;
1590 fail: 1590 fail:
1591 debugfs_remove_recursive(dir); 1591 debugfs_remove_recursive(dir);
1592 1592
1593 return -ENOMEM; 1593 return -ENOMEM;
1594 } 1594 }
1595 1595
1596 late_initcall(fail_page_alloc_debugfs); 1596 late_initcall(fail_page_alloc_debugfs);
1597 1597
1598 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1598 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1599 1599
1600 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1600 #else /* CONFIG_FAIL_PAGE_ALLOC */
1601 1601
1602 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1602 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1603 { 1603 {
1604 return false; 1604 return false;
1605 } 1605 }
1606 1606
1607 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1607 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1608 1608
1609 /* 1609 /*
1610 * Return true if free pages are above 'mark'. This takes into account the order 1610 * Return true if free pages are above 'mark'. This takes into account the order
1611 * of the allocation. 1611 * of the allocation.
1612 */ 1612 */
1613 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1613 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1614 int classzone_idx, int alloc_flags, long free_pages) 1614 int classzone_idx, int alloc_flags, long free_pages)
1615 { 1615 {
1616 /* free_pages may go negative - that's OK */ 1616 /* free_pages may go negative - that's OK */
1617 long min = mark; 1617 long min = mark;
1618 long lowmem_reserve = z->lowmem_reserve[classzone_idx]; 1618 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1619 int o; 1619 int o;
1620 1620
1621 free_pages -= (1 << order) - 1; 1621 free_pages -= (1 << order) - 1;
1622 if (alloc_flags & ALLOC_HIGH) 1622 if (alloc_flags & ALLOC_HIGH)
1623 min -= min / 2; 1623 min -= min / 2;
1624 if (alloc_flags & ALLOC_HARDER) 1624 if (alloc_flags & ALLOC_HARDER)
1625 min -= min / 4; 1625 min -= min / 4;
1626 #ifdef CONFIG_CMA 1626 #ifdef CONFIG_CMA
1627 /* If allocation can't use CMA areas don't use free CMA pages */ 1627 /* If allocation can't use CMA areas don't use free CMA pages */
1628 if (!(alloc_flags & ALLOC_CMA)) 1628 if (!(alloc_flags & ALLOC_CMA))
1629 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); 1629 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
1630 #endif 1630 #endif
1631 if (free_pages <= min + lowmem_reserve) 1631 if (free_pages <= min + lowmem_reserve)
1632 return false; 1632 return false;
1633 for (o = 0; o < order; o++) { 1633 for (o = 0; o < order; o++) {
1634 /* At the next order, this order's pages become unavailable */ 1634 /* At the next order, this order's pages become unavailable */
1635 free_pages -= z->free_area[o].nr_free << o; 1635 free_pages -= z->free_area[o].nr_free << o;
1636 1636
1637 /* Require fewer higher order pages to be free */ 1637 /* Require fewer higher order pages to be free */
1638 min >>= 1; 1638 min >>= 1;
1639 1639
1640 if (free_pages <= min) 1640 if (free_pages <= min)
1641 return false; 1641 return false;
1642 } 1642 }
1643 return true; 1643 return true;
1644 } 1644 }
1645 1645
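/*
 * The per-order loop above is the subtle part of the watermark test: pages
 * at orders below the request cannot satisfy it, so they are subtracted
 * from the free count while the required margin is halved at each step. A
 * worked, standalone version with invented numbers:
 */

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

static bool watermark_ok(long free_pages, long mark, long lowmem_reserve,
                         unsigned int order, const long nr_free[MAX_ORDER])
{
        long min = mark;

        free_pages -= (1L << order) - 1;
        if (free_pages <= min + lowmem_reserve)
                return false;

        for (unsigned int o = 0; o < order; o++) {
                free_pages -= nr_free[o] << o;  /* too small for this request */
                min >>= 1;                      /* but demand fewer large pages */
                if (free_pages <= min)
                        return false;
        }
        return true;
}

int main(void)
{
        long nr_free[MAX_ORDER] = { 800, 100, 20, 4 };  /* free blocks per order */
        long total = 800 * 1 + 100 * 2 + 20 * 4 + 4 * 8;        /* 1112 pages */

        printf("order-3 request above the mark? %d\n",
               watermark_ok(total, 128, 0, 3, nr_free));
        return 0;
}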
1646 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1646 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1647 int classzone_idx, int alloc_flags) 1647 int classzone_idx, int alloc_flags)
1648 { 1648 {
1649 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1649 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1650 zone_page_state(z, NR_FREE_PAGES)); 1650 zone_page_state(z, NR_FREE_PAGES));
1651 } 1651 }
1652 1652
1653 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 1653 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1654 int classzone_idx, int alloc_flags) 1654 int classzone_idx, int alloc_flags)
1655 { 1655 {
1656 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1656 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1657 1657
1658 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1658 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1659 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1659 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1660 1660
1661 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1661 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1662 free_pages); 1662 free_pages);
1663 } 1663 }
1664 1664
1665 #ifdef CONFIG_NUMA 1665 #ifdef CONFIG_NUMA
1666 /* 1666 /*
1667 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1667 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1668 * skip over zones that are not allowed by the cpuset, or that have 1668 * skip over zones that are not allowed by the cpuset, or that have
1669 * been recently (in last second) found to be nearly full. See further 1669 * been recently (in last second) found to be nearly full. See further
1670 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1670 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1671 * that have to skip over a lot of full or unallowed zones. 1671 * that have to skip over a lot of full or unallowed zones.
1672 * 1672 *
1673 * If the zonelist cache is present in the passed in zonelist, then 1673 * If the zonelist cache is present in the passed in zonelist, then
1674 * returns a pointer to the allowed node mask (either the current 1674 * returns a pointer to the allowed node mask (either the current
1675 * task's mems_allowed, or node_states[N_MEMORY].) 1675 * task's mems_allowed, or node_states[N_MEMORY].)
1676 * 1676 *
1677 * If the zonelist cache is not available for this zonelist, does 1677 * If the zonelist cache is not available for this zonelist, does
1678 * nothing and returns NULL. 1678 * nothing and returns NULL.
1679 * 1679 *
1680 * If the fullzones BITMAP in the zonelist cache is stale (more than 1680 * If the fullzones BITMAP in the zonelist cache is stale (more than
1681 * a second since last zap'd) then we zap it out (clear its bits.) 1681 * a second since last zap'd) then we zap it out (clear its bits.)
1682 * 1682 *
1683 * We hold off even calling zlc_setup, until after we've checked the 1683 * We hold off even calling zlc_setup, until after we've checked the
1684 * first zone in the zonelist, on the theory that most allocations will 1684 * first zone in the zonelist, on the theory that most allocations will
1685 * be satisfied from that first zone, so best to examine that zone as 1685 * be satisfied from that first zone, so best to examine that zone as
1686 * quickly as we can. 1686 * quickly as we can.
1687 */ 1687 */
1688 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1688 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1689 { 1689 {
1690 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1690 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1691 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1691 nodemask_t *allowednodes; /* zonelist_cache approximation */
1692 1692
1693 zlc = zonelist->zlcache_ptr; 1693 zlc = zonelist->zlcache_ptr;
1694 if (!zlc) 1694 if (!zlc)
1695 return NULL; 1695 return NULL;
1696 1696
1697 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1697 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1698 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1698 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1699 zlc->last_full_zap = jiffies; 1699 zlc->last_full_zap = jiffies;
1700 } 1700 }
1701 1701
1702 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1702 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1703 &cpuset_current_mems_allowed : 1703 &cpuset_current_mems_allowed :
1704 &node_states[N_MEMORY]; 1704 &node_states[N_MEMORY];
1705 return allowednodes; 1705 return allowednodes;
1706 } 1706 }
1707 1707
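/*
 * The zonelist cache is a small negative cache: a per-zone "looked full
 * recently" bit that is cleared wholesale about once a second. The sketch
 * below reproduces that expire-and-skip pattern with an explicit clock
 * value instead of jiffies; all names and numbers are illustrative.
 */

#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 4

static bool fullzones[NR_ZONES];
static long last_full_zap;

static void maybe_zap(long now)
{
        if (now - last_full_zap >= 1) {         /* stale after ~one tick */
                for (int i = 0; i < NR_ZONES; i++)
                        fullzones[i] = false;
                last_full_zap = now;
        }
}

static void mark_zone_full(int z)
{
        fullzones[z] = true;
}

static bool zone_worth_trying(int z, long now)
{
        maybe_zap(now);
        return !fullzones[z];
}

int main(void)
{
        last_full_zap = 100;
        mark_zone_full(1);
        printf("t=100, zone 1 worth trying: %d\n", zone_worth_trying(1, 100)); /* 0 */
        printf("t=102, zone 1 worth trying: %d\n", zone_worth_trying(1, 102)); /* 1 */
        return 0;
}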
1708 /* 1708 /*
1709 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1709 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1710 * if it is worth looking at further for free memory: 1710 * if it is worth looking at further for free memory:
1711 * 1) Check that the zone isn't thought to be full (doesn't have its 1711 * 1) Check that the zone isn't thought to be full (doesn't have its
1712 * bit set in the zonelist_cache fullzones BITMAP). 1712 * bit set in the zonelist_cache fullzones BITMAP).
1713 * 2) Check that the zone's node (obtained from the zonelist_cache 1713 * 2) Check that the zone's node (obtained from the zonelist_cache
1714 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1714 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1715 * Return true (non-zero) if zone is worth looking at further, or 1715 * Return true (non-zero) if zone is worth looking at further, or
1716 * else return false (zero) if it is not. 1716 * else return false (zero) if it is not.
1717 * 1717 *
1718 * This check -ignores- the distinction between various watermarks, 1718 * This check -ignores- the distinction between various watermarks,
1719 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1719 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1720 * found to be full for any variation of these watermarks, it will 1720 * found to be full for any variation of these watermarks, it will
1721 * be considered full for up to one second by all requests, unless 1721 * be considered full for up to one second by all requests, unless
1722 * we are so low on memory on all allowed nodes that we are forced 1722 * we are so low on memory on all allowed nodes that we are forced
1723 * into the second scan of the zonelist. 1723 * into the second scan of the zonelist.
1724 * 1724 *
1725 * In the second scan we ignore this zonelist cache and exactly 1725 * In the second scan we ignore this zonelist cache and exactly
1726 * apply the watermarks to all zones, even if it is slower to do so. 1726 * apply the watermarks to all zones, even if it is slower to do so.
1727 * We are low on memory in the second scan, and should leave no stone 1727 * We are low on memory in the second scan, and should leave no stone
1728 * unturned looking for a free page. 1728 * unturned looking for a free page.
1729 */ 1729 */
1730 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1730 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1731 nodemask_t *allowednodes) 1731 nodemask_t *allowednodes)
1732 { 1732 {
1733 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1733 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1734 int i; /* index of *z in zonelist zones */ 1734 int i; /* index of *z in zonelist zones */
1735 int n; /* node that zone *z is on */ 1735 int n; /* node that zone *z is on */
1736 1736
1737 zlc = zonelist->zlcache_ptr; 1737 zlc = zonelist->zlcache_ptr;
1738 if (!zlc) 1738 if (!zlc)
1739 return 1; 1739 return 1;
1740 1740
1741 i = z - zonelist->_zonerefs; 1741 i = z - zonelist->_zonerefs;
1742 n = zlc->z_to_n[i]; 1742 n = zlc->z_to_n[i];
1743 1743
1744 /* This zone is worth trying if it is allowed but not full */ 1744 /* This zone is worth trying if it is allowed but not full */
1745 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1745 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1746 } 1746 }
1747 1747
1748 /* 1748 /*
1749 * Given 'z' scanning a zonelist, set the corresponding bit in 1749 * Given 'z' scanning a zonelist, set the corresponding bit in
1750 * zlc->fullzones, so that subsequent attempts to allocate a page 1750 * zlc->fullzones, so that subsequent attempts to allocate a page
1751 * from that zone don't waste time re-examining it. 1751 * from that zone don't waste time re-examining it.
1752 */ 1752 */
1753 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1753 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1754 { 1754 {
1755 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1755 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1756 int i; /* index of *z in zonelist zones */ 1756 int i; /* index of *z in zonelist zones */
1757 1757
1758 zlc = zonelist->zlcache_ptr; 1758 zlc = zonelist->zlcache_ptr;
1759 if (!zlc) 1759 if (!zlc)
1760 return; 1760 return;
1761 1761
1762 i = z - zonelist->_zonerefs; 1762 i = z - zonelist->_zonerefs;
1763 1763
1764 set_bit(i, zlc->fullzones); 1764 set_bit(i, zlc->fullzones);
1765 } 1765 }
1766 1766
1767 /* 1767 /*
1768 * clear all zones full, called after direct reclaim makes progress so that 1768 * clear all zones full, called after direct reclaim makes progress so that
1769 * a zone that was recently full is not skipped over for up to a second 1769 * a zone that was recently full is not skipped over for up to a second
1770 */ 1770 */
1771 static void zlc_clear_zones_full(struct zonelist *zonelist) 1771 static void zlc_clear_zones_full(struct zonelist *zonelist)
1772 { 1772 {
1773 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1773 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1774 1774
1775 zlc = zonelist->zlcache_ptr; 1775 zlc = zonelist->zlcache_ptr;
1776 if (!zlc) 1776 if (!zlc)
1777 return; 1777 return;
1778 1778
1779 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1779 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1780 } 1780 }
1781 1781
1782 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1782 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1783 { 1783 {
1784 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1784 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1785 } 1785 }
1786 1786
1787 static void __paginginit init_zone_allows_reclaim(int nid) 1787 static void __paginginit init_zone_allows_reclaim(int nid)
1788 { 1788 {
1789 int i; 1789 int i;
1790 1790
1791 for_each_online_node(i) 1791 for_each_online_node(i)
1792 if (node_distance(nid, i) <= RECLAIM_DISTANCE) 1792 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1793 node_set(i, NODE_DATA(nid)->reclaim_nodes); 1793 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1794 else 1794 else
1795 zone_reclaim_mode = 1; 1795 zone_reclaim_mode = 1;
1796 } 1796 }
1797 1797
1798 #else /* CONFIG_NUMA */ 1798 #else /* CONFIG_NUMA */
1799 1799
1800 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1800 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1801 { 1801 {
1802 return NULL; 1802 return NULL;
1803 } 1803 }
1804 1804
1805 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1805 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1806 nodemask_t *allowednodes) 1806 nodemask_t *allowednodes)
1807 { 1807 {
1808 return 1; 1808 return 1;
1809 } 1809 }
1810 1810
1811 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1811 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1812 { 1812 {
1813 } 1813 }
1814 1814
1815 static void zlc_clear_zones_full(struct zonelist *zonelist) 1815 static void zlc_clear_zones_full(struct zonelist *zonelist)
1816 { 1816 {
1817 } 1817 }
1818 1818
1819 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1819 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1820 { 1820 {
1821 return true; 1821 return true;
1822 } 1822 }
1823 1823
1824 static inline void init_zone_allows_reclaim(int nid) 1824 static inline void init_zone_allows_reclaim(int nid)
1825 { 1825 {
1826 } 1826 }
1827 #endif /* CONFIG_NUMA */ 1827 #endif /* CONFIG_NUMA */
1828 1828
1829 /* 1829 /*
1830 * get_page_from_freelist goes through the zonelist trying to allocate 1830 * get_page_from_freelist goes through the zonelist trying to allocate
1831 * a page. 1831 * a page.
1832 */ 1832 */
1833 static struct page * 1833 static struct page *
1834 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1834 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1835 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1835 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1836 struct zone *preferred_zone, int migratetype) 1836 struct zone *preferred_zone, int migratetype)
1837 { 1837 {
1838 struct zoneref *z; 1838 struct zoneref *z;
1839 struct page *page = NULL; 1839 struct page *page = NULL;
1840 int classzone_idx; 1840 int classzone_idx;
1841 struct zone *zone; 1841 struct zone *zone;
1842 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1842 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1843 int zlc_active = 0; /* set if using zonelist_cache */ 1843 int zlc_active = 0; /* set if using zonelist_cache */
1844 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1844 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1845 1845
1846 classzone_idx = zone_idx(preferred_zone); 1846 classzone_idx = zone_idx(preferred_zone);
1847 zonelist_scan: 1847 zonelist_scan:
1848 /* 1848 /*
1849 * Scan zonelist, looking for a zone with enough free. 1849 * Scan zonelist, looking for a zone with enough free.
1850 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1850 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1851 */ 1851 */
1852 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1852 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1853 high_zoneidx, nodemask) { 1853 high_zoneidx, nodemask) {
1854 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1854 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1855 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1855 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1856 continue; 1856 continue;
1857 if ((alloc_flags & ALLOC_CPUSET) && 1857 if ((alloc_flags & ALLOC_CPUSET) &&
1858 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1858 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1859 continue; 1859 continue;
1860 /* 1860 /*
1861 * When allocating a page cache page for writing, we 1861 * When allocating a page cache page for writing, we
1862 * want to get it from a zone that is within its dirty 1862 * want to get it from a zone that is within its dirty
1863 * limit, such that no single zone holds more than its 1863 * limit, such that no single zone holds more than its
1864 * proportional share of globally allowed dirty pages. 1864 * proportional share of globally allowed dirty pages.
1865 * The dirty limits take into account the zone's 1865 * The dirty limits take into account the zone's
1866 * lowmem reserves and high watermark so that kswapd 1866 * lowmem reserves and high watermark so that kswapd
1867 * should be able to balance it without having to 1867 * should be able to balance it without having to
1868 * write pages from its LRU list. 1868 * write pages from its LRU list.
1869 * 1869 *
1870 * This may look like it could increase pressure on 1870 * This may look like it could increase pressure on
1871 * lower zones by failing allocations in higher zones 1871 * lower zones by failing allocations in higher zones
1872 * before they are full. But the pages that do spill 1872 * before they are full. But the pages that do spill
1873 * over are limited as the lower zones are protected 1873 * over are limited as the lower zones are protected
1874 * by this very same mechanism. It should not become 1874 * by this very same mechanism. It should not become
1875 * a practical burden to them. 1875 * a practical burden to them.
1876 * 1876 *
1877 * XXX: For now, allow allocations to potentially 1877 * XXX: For now, allow allocations to potentially
1878 * exceed the per-zone dirty limit in the slowpath 1878 * exceed the per-zone dirty limit in the slowpath
1879 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1879 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1880 * which is important when on a NUMA setup the allowed 1880 * which is important when on a NUMA setup the allowed
1881 * zones are together not big enough to reach the 1881 * zones are together not big enough to reach the
1882 * global limit. The proper fix for these situations 1882 * global limit. The proper fix for these situations
1883 * will require awareness of zones in the 1883 * will require awareness of zones in the
1884 * dirty-throttling and the flusher threads. 1884 * dirty-throttling and the flusher threads.
1885 */ 1885 */
1886 if ((alloc_flags & ALLOC_WMARK_LOW) && 1886 if ((alloc_flags & ALLOC_WMARK_LOW) &&
1887 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 1887 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1888 goto this_zone_full; 1888 goto this_zone_full;
1889 1889
1890 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1890 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1891 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1891 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1892 unsigned long mark; 1892 unsigned long mark;
1893 int ret; 1893 int ret;
1894 1894
1895 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1895 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1896 if (zone_watermark_ok(zone, order, mark, 1896 if (zone_watermark_ok(zone, order, mark,
1897 classzone_idx, alloc_flags)) 1897 classzone_idx, alloc_flags))
1898 goto try_this_zone; 1898 goto try_this_zone;
1899 1899
1900 if (IS_ENABLED(CONFIG_NUMA) && 1900 if (IS_ENABLED(CONFIG_NUMA) &&
1901 !did_zlc_setup && nr_online_nodes > 1) { 1901 !did_zlc_setup && nr_online_nodes > 1) {
1902 /* 1902 /*
1903 * we do zlc_setup if there are multiple nodes 1903 * we do zlc_setup if there are multiple nodes
1904 * and before considering the first zone allowed 1904 * and before considering the first zone allowed
1905 * by the cpuset. 1905 * by the cpuset.
1906 */ 1906 */
1907 allowednodes = zlc_setup(zonelist, alloc_flags); 1907 allowednodes = zlc_setup(zonelist, alloc_flags);
1908 zlc_active = 1; 1908 zlc_active = 1;
1909 did_zlc_setup = 1; 1909 did_zlc_setup = 1;
1910 } 1910 }
1911 1911
1912 if (zone_reclaim_mode == 0 || 1912 if (zone_reclaim_mode == 0 ||
1913 !zone_allows_reclaim(preferred_zone, zone)) 1913 !zone_allows_reclaim(preferred_zone, zone))
1914 goto this_zone_full; 1914 goto this_zone_full;
1915 1915
1916 /* 1916 /*
1917 * As we may have just activated ZLC, check if the first 1917 * As we may have just activated ZLC, check if the first
1918 * eligible zone has failed zone_reclaim recently. 1918 * eligible zone has failed zone_reclaim recently.
1919 */ 1919 */
1920 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1920 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1921 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1921 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1922 continue; 1922 continue;
1923 1923
1924 ret = zone_reclaim(zone, gfp_mask, order); 1924 ret = zone_reclaim(zone, gfp_mask, order);
1925 switch (ret) { 1925 switch (ret) {
1926 case ZONE_RECLAIM_NOSCAN: 1926 case ZONE_RECLAIM_NOSCAN:
1927 /* did not scan */ 1927 /* did not scan */
1928 continue; 1928 continue;
1929 case ZONE_RECLAIM_FULL: 1929 case ZONE_RECLAIM_FULL:
1930 /* scanned but unreclaimable */ 1930 /* scanned but unreclaimable */
1931 continue; 1931 continue;
1932 default: 1932 default:
1933 /* did we reclaim enough */ 1933 /* did we reclaim enough */
1934 if (!zone_watermark_ok(zone, order, mark, 1934 if (!zone_watermark_ok(zone, order, mark,
1935 classzone_idx, alloc_flags)) 1935 classzone_idx, alloc_flags))
1936 goto this_zone_full; 1936 goto this_zone_full;
1937 } 1937 }
1938 } 1938 }
1939 1939
1940 try_this_zone: 1940 try_this_zone:
1941 page = buffered_rmqueue(preferred_zone, zone, order, 1941 page = buffered_rmqueue(preferred_zone, zone, order,
1942 gfp_mask, migratetype); 1942 gfp_mask, migratetype);
1943 if (page) 1943 if (page)
1944 break; 1944 break;
1945 this_zone_full: 1945 this_zone_full:
1946 if (IS_ENABLED(CONFIG_NUMA)) 1946 if (IS_ENABLED(CONFIG_NUMA))
1947 zlc_mark_zone_full(zonelist, z); 1947 zlc_mark_zone_full(zonelist, z);
1948 } 1948 }
1949 1949
1950 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { 1950 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
1951 /* Disable zlc cache for second zonelist scan */ 1951 /* Disable zlc cache for second zonelist scan */
1952 zlc_active = 0; 1952 zlc_active = 0;
1953 goto zonelist_scan; 1953 goto zonelist_scan;
1954 } 1954 }
1955 1955
1956 if (page) 1956 if (page)
1957 /* 1957 /*
1958 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was 1958 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
1959 * necessary to allocate the page. The expectation is 1959 * necessary to allocate the page. The expectation is
1960 * that the caller is taking steps that will free more 1960 * that the caller is taking steps that will free more
1961 * memory. The caller should avoid the page being used 1961 * memory. The caller should avoid the page being used
1962 * for !PFMEMALLOC purposes. 1962 * for !PFMEMALLOC purposes.
1963 */ 1963 */
1964 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 1964 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
1965 1965
1966 return page; 1966 return page;
1967 } 1967 }
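The failure path at the bottom of get_page_from_freelist() is easy to miss on a first read: if the cache-assisted pass found nothing while zlc_active was set, the zonelist cache is dropped and the scan is repeated. The userspace sketch below, with an invented struct mock_zone and scan_zonelist() standing in for the real zone/zoneref machinery, shows only that two-pass shape and nothing else.

	#include <stdbool.h>
	#include <stddef.h>

	/* Stand-in for a zone plus its zonelist-cache ("zlc") hint. */
	struct mock_zone {
		bool cached_full;	/* zlc remembered this zone as full */
		long free_pages;
	};

	/* Return the first zone that can satisfy 'want' pages, or NULL. */
	static struct mock_zone *scan_zonelist(struct mock_zone *zones, int nr, long want)
	{
		bool zlc_active = true;

	retry:
		for (int i = 0; i < nr; i++) {
			if (zlc_active && zones[i].cached_full)
				continue;	/* first pass trusts the cache, skips "full" zones */
			if (zones[i].free_pages >= want)
				return &zones[i];
		}
		if (zlc_active) {
			zlc_active = false;	/* cache may be stale: rescan without it */
			goto retry;
		}
		return NULL;
	}

	int main(void)
	{
		struct mock_zone zones[2] = {
			{ .cached_full = true,  .free_pages = 512 },	/* stale "full" hint */
			{ .cached_full = false, .free_pages = 0 },
		};

		/* The first pass skips zones[0] and finds nothing; the retry
		 * without the cache succeeds on zones[0]. */
		return scan_zonelist(zones, 2, 128) ? 0 : 1;
	}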
1968 1968
1969 /* 1969 /*
1970 * Large machines with many possible nodes should not always dump per-node 1970 * Large machines with many possible nodes should not always dump per-node
1971 * meminfo in irq context. 1971 * meminfo in irq context.
1972 */ 1972 */
1973 static inline bool should_suppress_show_mem(void) 1973 static inline bool should_suppress_show_mem(void)
1974 { 1974 {
1975 bool ret = false; 1975 bool ret = false;
1976 1976
1977 #if NODES_SHIFT > 8 1977 #if NODES_SHIFT > 8
1978 ret = in_interrupt(); 1978 ret = in_interrupt();
1979 #endif 1979 #endif
1980 return ret; 1980 return ret;
1981 } 1981 }
1982 1982
1983 static DEFINE_RATELIMIT_STATE(nopage_rs, 1983 static DEFINE_RATELIMIT_STATE(nopage_rs,
1984 DEFAULT_RATELIMIT_INTERVAL, 1984 DEFAULT_RATELIMIT_INTERVAL,
1985 DEFAULT_RATELIMIT_BURST); 1985 DEFAULT_RATELIMIT_BURST);
1986 1986
1987 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 1987 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1988 { 1988 {
1989 unsigned int filter = SHOW_MEM_FILTER_NODES; 1989 unsigned int filter = SHOW_MEM_FILTER_NODES;
1990 1990
1991 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 1991 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
1992 debug_guardpage_minorder() > 0) 1992 debug_guardpage_minorder() > 0)
1993 return; 1993 return;
1994 1994
1995 /* 1995 /*
1996 * This documents exceptions given to allocations in certain 1996 * This documents exceptions given to allocations in certain
1997 * contexts that are allowed to allocate outside current's set 1997 * contexts that are allowed to allocate outside current's set
1998 * of allowed nodes. 1998 * of allowed nodes.
1999 */ 1999 */
2000 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2000 if (!(gfp_mask & __GFP_NOMEMALLOC))
2001 if (test_thread_flag(TIF_MEMDIE) || 2001 if (test_thread_flag(TIF_MEMDIE) ||
2002 (current->flags & (PF_MEMALLOC | PF_EXITING))) 2002 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2003 filter &= ~SHOW_MEM_FILTER_NODES; 2003 filter &= ~SHOW_MEM_FILTER_NODES;
2004 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 2004 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
2005 filter &= ~SHOW_MEM_FILTER_NODES; 2005 filter &= ~SHOW_MEM_FILTER_NODES;
2006 2006
2007 if (fmt) { 2007 if (fmt) {
2008 struct va_format vaf; 2008 struct va_format vaf;
2009 va_list args; 2009 va_list args;
2010 2010
2011 va_start(args, fmt); 2011 va_start(args, fmt);
2012 2012
2013 vaf.fmt = fmt; 2013 vaf.fmt = fmt;
2014 vaf.va = &args; 2014 vaf.va = &args;
2015 2015
2016 pr_warn("%pV", &vaf); 2016 pr_warn("%pV", &vaf);
2017 2017
2018 va_end(args); 2018 va_end(args);
2019 } 2019 }
2020 2020
2021 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 2021 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
2022 current->comm, order, gfp_mask); 2022 current->comm, order, gfp_mask);
2023 2023
2024 dump_stack(); 2024 dump_stack();
2025 if (!should_suppress_show_mem()) 2025 if (!should_suppress_show_mem())
2026 show_mem(filter); 2026 show_mem(filter);
2027 } 2027 }
2028 2028
2029 static inline int 2029 static inline int
2030 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 2030 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2031 unsigned long did_some_progress, 2031 unsigned long did_some_progress,
2032 unsigned long pages_reclaimed) 2032 unsigned long pages_reclaimed)
2033 { 2033 {
2034 /* Do not loop if specifically requested */ 2034 /* Do not loop if specifically requested */
2035 if (gfp_mask & __GFP_NORETRY) 2035 if (gfp_mask & __GFP_NORETRY)
2036 return 0; 2036 return 0;
2037 2037
2038 /* Always retry if specifically requested */ 2038 /* Always retry if specifically requested */
2039 if (gfp_mask & __GFP_NOFAIL) 2039 if (gfp_mask & __GFP_NOFAIL)
2040 return 1; 2040 return 1;
2041 2041
2042 /* 2042 /*
2043 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 2043 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2044 * making forward progress without invoking OOM. Suspend also disables 2044 * making forward progress without invoking OOM. Suspend also disables
2045 * storage devices so kswapd will not help. Bail if we are suspending. 2045 * storage devices so kswapd will not help. Bail if we are suspending.
2046 */ 2046 */
2047 if (!did_some_progress && pm_suspended_storage()) 2047 if (!did_some_progress && pm_suspended_storage())
2048 return 0; 2048 return 0;
2049 2049
2050 /* 2050 /*
2051 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 2051 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2052 * means __GFP_NOFAIL, but that may not be true in other 2052 * means __GFP_NOFAIL, but that may not be true in other
2053 * implementations. 2053 * implementations.
2054 */ 2054 */
2055 if (order <= PAGE_ALLOC_COSTLY_ORDER) 2055 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2056 return 1; 2056 return 1;
2057 2057
2058 /* 2058 /*
2059 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 2059 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2060 * specified, then we retry until we no longer reclaim any pages 2060 * specified, then we retry until we no longer reclaim any pages
2061 * (above), or we've reclaimed an order of pages at least as 2061 * (above), or we've reclaimed an order of pages at least as
2062 * large as the allocation's order. In both cases, if the 2062 * large as the allocation's order. In both cases, if the
2063 * allocation still fails, we stop retrying. 2063 * allocation still fails, we stop retrying.
2064 */ 2064 */
2065 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2065 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2066 return 1; 2066 return 1;
2067 2067
2068 return 0; 2068 return 0;
2069 } 2069 }
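As a worked example of the final check in should_alloc_retry(): a costly request (order above PAGE_ALLOC_COSTLY_ORDER, typically 3) that carries __GFP_REPEAT keeps retrying until the cumulative pages_reclaimed reaches 1 << order, or reclaim stops making progress. A trivial standalone program makes those thresholds concrete:

	#include <stdio.h>

	/* Print the cumulative-reclaim threshold that ends __GFP_REPEAT retries. */
	int main(void)
	{
		for (unsigned int order = 4; order <= 6; order++)
			printf("order %u: keep retrying until >= %lu pages reclaimed\n",
			       order, 1UL << order);
		return 0;
	}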
2070 2070
2071 static inline struct page * 2071 static inline struct page *
2072 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2072 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2073 struct zonelist *zonelist, enum zone_type high_zoneidx, 2073 struct zonelist *zonelist, enum zone_type high_zoneidx,
2074 nodemask_t *nodemask, struct zone *preferred_zone, 2074 nodemask_t *nodemask, struct zone *preferred_zone,
2075 int migratetype) 2075 int migratetype)
2076 { 2076 {
2077 struct page *page; 2077 struct page *page;
2078 2078
2079 /* Acquire the OOM killer lock for the zones in zonelist */ 2079 /* Acquire the OOM killer lock for the zones in zonelist */
2080 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2080 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2081 schedule_timeout_uninterruptible(1); 2081 schedule_timeout_uninterruptible(1);
2082 return NULL; 2082 return NULL;
2083 } 2083 }
2084 2084
2085 /* 2085 /*
2086 * Go through the zonelist yet one more time, keeping a very high watermark 2086 * Go through the zonelist yet one more time, keeping a very high watermark
2087 * here; this is only to catch a parallel OOM kill, and we must fail if 2087 * here; this is only to catch a parallel OOM kill, and we must fail if

2088 * we're still under heavy pressure. 2088 * we're still under heavy pressure.
2089 */ 2089 */
2090 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2090 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2091 order, zonelist, high_zoneidx, 2091 order, zonelist, high_zoneidx,
2092 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2092 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2093 preferred_zone, migratetype); 2093 preferred_zone, migratetype);
2094 if (page) 2094 if (page)
2095 goto out; 2095 goto out;
2096 2096
2097 if (!(gfp_mask & __GFP_NOFAIL)) { 2097 if (!(gfp_mask & __GFP_NOFAIL)) {
2098 /* The OOM killer will not help higher order allocs */ 2098 /* The OOM killer will not help higher order allocs */
2099 if (order > PAGE_ALLOC_COSTLY_ORDER) 2099 if (order > PAGE_ALLOC_COSTLY_ORDER)
2100 goto out; 2100 goto out;
2101 /* The OOM killer does not needlessly kill tasks for lowmem */ 2101 /* The OOM killer does not needlessly kill tasks for lowmem */
2102 if (high_zoneidx < ZONE_NORMAL) 2102 if (high_zoneidx < ZONE_NORMAL)
2103 goto out; 2103 goto out;
2104 /* 2104 /*
2105 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2105 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2106 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2106 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2107 * The caller should handle page allocation failure by itself if 2107 * The caller should handle page allocation failure by itself if
2108 * it specifies __GFP_THISNODE. 2108 * it specifies __GFP_THISNODE.
2109 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2109 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2110 */ 2110 */
2111 if (gfp_mask & __GFP_THISNODE) 2111 if (gfp_mask & __GFP_THISNODE)
2112 goto out; 2112 goto out;
2113 } 2113 }
2114 /* Exhausted what can be done so it's blamo time */ 2114 /* Exhausted what can be done so it's blamo time */
2115 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2115 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2116 2116
2117 out: 2117 out:
2118 clear_zonelist_oom(zonelist, gfp_mask); 2118 clear_zonelist_oom(zonelist, gfp_mask);
2119 return page; 2119 return page;
2120 } 2120 }
2121 2121
2122 #ifdef CONFIG_COMPACTION 2122 #ifdef CONFIG_COMPACTION
2123 /* Try memory compaction for high-order allocations before reclaim */ 2123 /* Try memory compaction for high-order allocations before reclaim */
2124 static struct page * 2124 static struct page *
2125 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2125 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2126 struct zonelist *zonelist, enum zone_type high_zoneidx, 2126 struct zonelist *zonelist, enum zone_type high_zoneidx,
2127 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2127 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2128 int migratetype, bool sync_migration, 2128 int migratetype, bool sync_migration,
2129 bool *contended_compaction, bool *deferred_compaction, 2129 bool *contended_compaction, bool *deferred_compaction,
2130 unsigned long *did_some_progress) 2130 unsigned long *did_some_progress)
2131 { 2131 {
2132 if (!order) 2132 if (!order)
2133 return NULL; 2133 return NULL;
2134 2134
2135 if (compaction_deferred(preferred_zone, order)) { 2135 if (compaction_deferred(preferred_zone, order)) {
2136 *deferred_compaction = true; 2136 *deferred_compaction = true;
2137 return NULL; 2137 return NULL;
2138 } 2138 }
2139 2139
2140 current->flags |= PF_MEMALLOC; 2140 current->flags |= PF_MEMALLOC;
2141 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2141 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2142 nodemask, sync_migration, 2142 nodemask, sync_migration,
2143 contended_compaction); 2143 contended_compaction);
2144 current->flags &= ~PF_MEMALLOC; 2144 current->flags &= ~PF_MEMALLOC;
2145 2145
2146 if (*did_some_progress != COMPACT_SKIPPED) { 2146 if (*did_some_progress != COMPACT_SKIPPED) {
2147 struct page *page; 2147 struct page *page;
2148 2148
2149 /* Page migration frees to the PCP lists but we want merging */ 2149 /* Page migration frees to the PCP lists but we want merging */
2150 drain_pages(get_cpu()); 2150 drain_pages(get_cpu());
2151 put_cpu(); 2151 put_cpu();
2152 2152
2153 page = get_page_from_freelist(gfp_mask, nodemask, 2153 page = get_page_from_freelist(gfp_mask, nodemask,
2154 order, zonelist, high_zoneidx, 2154 order, zonelist, high_zoneidx,
2155 alloc_flags & ~ALLOC_NO_WATERMARKS, 2155 alloc_flags & ~ALLOC_NO_WATERMARKS,
2156 preferred_zone, migratetype); 2156 preferred_zone, migratetype);
2157 if (page) { 2157 if (page) {
2158 preferred_zone->compact_blockskip_flush = false; 2158 preferred_zone->compact_blockskip_flush = false;
2159 preferred_zone->compact_considered = 0; 2159 preferred_zone->compact_considered = 0;
2160 preferred_zone->compact_defer_shift = 0; 2160 preferred_zone->compact_defer_shift = 0;
2161 if (order >= preferred_zone->compact_order_failed) 2161 if (order >= preferred_zone->compact_order_failed)
2162 preferred_zone->compact_order_failed = order + 1; 2162 preferred_zone->compact_order_failed = order + 1;
2163 count_vm_event(COMPACTSUCCESS); 2163 count_vm_event(COMPACTSUCCESS);
2164 return page; 2164 return page;
2165 } 2165 }
2166 2166
2167 /* 2167 /*
2168 * It's bad if a compaction run occurs and fails. 2168 * It's bad if a compaction run occurs and fails.
2169 * The most likely reason is that pages exist, 2169 * The most likely reason is that pages exist,
2170 * but not enough to satisfy watermarks. 2170 * but not enough to satisfy watermarks.
2171 */ 2171 */
2172 count_vm_event(COMPACTFAIL); 2172 count_vm_event(COMPACTFAIL);
2173 2173
2174 /* 2174 /*
2175 * As async compaction considers a subset of pageblocks, only 2175 * As async compaction considers a subset of pageblocks, only
2176 * defer if the failure was a sync compaction failure. 2176 * defer if the failure was a sync compaction failure.
2177 */ 2177 */
2178 if (sync_migration) 2178 if (sync_migration)
2179 defer_compaction(preferred_zone, order); 2179 defer_compaction(preferred_zone, order);
2180 2180
2181 cond_resched(); 2181 cond_resched();
2182 } 2182 }
2183 2183
2184 return NULL; 2184 return NULL;
2185 } 2185 }
2186 #else 2186 #else
2187 static inline struct page * 2187 static inline struct page *
2188 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2188 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2189 struct zonelist *zonelist, enum zone_type high_zoneidx, 2189 struct zonelist *zonelist, enum zone_type high_zoneidx,
2190 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2190 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2191 int migratetype, bool sync_migration, 2191 int migratetype, bool sync_migration,
2192 bool *contended_compaction, bool *deferred_compaction, 2192 bool *contended_compaction, bool *deferred_compaction,
2193 unsigned long *did_some_progress) 2193 unsigned long *did_some_progress)
2194 { 2194 {
2195 return NULL; 2195 return NULL;
2196 } 2196 }
2197 #endif /* CONFIG_COMPACTION */ 2197 #endif /* CONFIG_COMPACTION */
2198 2198
2199 /* Perform direct synchronous page reclaim */ 2199 /* Perform direct synchronous page reclaim */
2200 static int 2200 static int
2201 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2201 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2202 nodemask_t *nodemask) 2202 nodemask_t *nodemask)
2203 { 2203 {
2204 struct reclaim_state reclaim_state; 2204 struct reclaim_state reclaim_state;
2205 int progress; 2205 int progress;
2206 2206
2207 cond_resched(); 2207 cond_resched();
2208 2208
2209 /* We now go into synchronous reclaim */ 2209 /* We now go into synchronous reclaim */
2210 cpuset_memory_pressure_bump(); 2210 cpuset_memory_pressure_bump();
2211 current->flags |= PF_MEMALLOC; 2211 current->flags |= PF_MEMALLOC;
2212 lockdep_set_current_reclaim_state(gfp_mask); 2212 lockdep_set_current_reclaim_state(gfp_mask);
2213 reclaim_state.reclaimed_slab = 0; 2213 reclaim_state.reclaimed_slab = 0;
2214 current->reclaim_state = &reclaim_state; 2214 current->reclaim_state = &reclaim_state;
2215 2215
2216 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2216 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2217 2217
2218 current->reclaim_state = NULL; 2218 current->reclaim_state = NULL;
2219 lockdep_clear_current_reclaim_state(); 2219 lockdep_clear_current_reclaim_state();
2220 current->flags &= ~PF_MEMALLOC; 2220 current->flags &= ~PF_MEMALLOC;
2221 2221
2222 cond_resched(); 2222 cond_resched();
2223 2223
2224 return progress; 2224 return progress;
2225 } 2225 }
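__perform_reclaim() brackets try_to_free_pages() with PF_MEMALLOC, and the slowpath further down bails out if it finds that flag already set ("Avoid recursion of direct reclaim"). A minimal userspace sketch of that re-entrancy guard, using a mock flags word and a purely illustrative bit value, looks like this:

	#include <stddef.h>

	static unsigned int task_flags;		/* stands in for current->flags */
	#define MOCK_PF_MEMALLOC 0x1u		/* illustrative bit value only */

	static long mock_reclaim(void)
	{
		long progress;

		task_flags |= MOCK_PF_MEMALLOC;	/* mark: we are inside reclaim */
		progress = 0;			/* ... shrinking caches may itself allocate ... */
		task_flags &= ~MOCK_PF_MEMALLOC;
		return progress;
	}

	static void *mock_slowpath_alloc(void)
	{
		if (task_flags & MOCK_PF_MEMALLOC)
			return NULL;		/* refuse to recurse into reclaim */
		mock_reclaim();
		return NULL;			/* retry of the freelists elided */
	}

	int main(void)
	{
		mock_slowpath_alloc();
		return 0;
	}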
2226 2226
2227 /* The really slow allocator path where we enter direct reclaim */ 2227 /* The really slow allocator path where we enter direct reclaim */
2228 static inline struct page * 2228 static inline struct page *
2229 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2229 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2230 struct zonelist *zonelist, enum zone_type high_zoneidx, 2230 struct zonelist *zonelist, enum zone_type high_zoneidx,
2231 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2231 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2232 int migratetype, unsigned long *did_some_progress) 2232 int migratetype, unsigned long *did_some_progress)
2233 { 2233 {
2234 struct page *page = NULL; 2234 struct page *page = NULL;
2235 bool drained = false; 2235 bool drained = false;
2236 2236
2237 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2237 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2238 nodemask); 2238 nodemask);
2239 if (unlikely(!(*did_some_progress))) 2239 if (unlikely(!(*did_some_progress)))
2240 return NULL; 2240 return NULL;
2241 2241
2242 /* After successful reclaim, reconsider all zones for allocation */ 2242 /* After successful reclaim, reconsider all zones for allocation */
2243 if (IS_ENABLED(CONFIG_NUMA)) 2243 if (IS_ENABLED(CONFIG_NUMA))
2244 zlc_clear_zones_full(zonelist); 2244 zlc_clear_zones_full(zonelist);
2245 2245
2246 retry: 2246 retry:
2247 page = get_page_from_freelist(gfp_mask, nodemask, order, 2247 page = get_page_from_freelist(gfp_mask, nodemask, order,
2248 zonelist, high_zoneidx, 2248 zonelist, high_zoneidx,
2249 alloc_flags & ~ALLOC_NO_WATERMARKS, 2249 alloc_flags & ~ALLOC_NO_WATERMARKS,
2250 preferred_zone, migratetype); 2250 preferred_zone, migratetype);
2251 2251
2252 /* 2252 /*
2253 * If an allocation failed after direct reclaim, it could be because 2253 * If an allocation failed after direct reclaim, it could be because
2254 * pages are pinned on the per-cpu lists. Drain them and try again 2254 * pages are pinned on the per-cpu lists. Drain them and try again
2255 */ 2255 */
2256 if (!page && !drained) { 2256 if (!page && !drained) {
2257 drain_all_pages(); 2257 drain_all_pages();
2258 drained = true; 2258 drained = true;
2259 goto retry; 2259 goto retry;
2260 } 2260 }
2261 2261
2262 return page; 2262 return page;
2263 } 2263 }
2264 2264
2265 /* 2265 /*
2266 * This is called in the allocator slow-path if the allocation request is of 2266 * This is called in the allocator slow-path if the allocation request is of
2267 * sufficient urgency to ignore watermarks and take other desperate measures 2267 * sufficient urgency to ignore watermarks and take other desperate measures
2268 */ 2268 */
2269 static inline struct page * 2269 static inline struct page *
2270 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2270 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2271 struct zonelist *zonelist, enum zone_type high_zoneidx, 2271 struct zonelist *zonelist, enum zone_type high_zoneidx,
2272 nodemask_t *nodemask, struct zone *preferred_zone, 2272 nodemask_t *nodemask, struct zone *preferred_zone,
2273 int migratetype) 2273 int migratetype)
2274 { 2274 {
2275 struct page *page; 2275 struct page *page;
2276 2276
2277 do { 2277 do {
2278 page = get_page_from_freelist(gfp_mask, nodemask, order, 2278 page = get_page_from_freelist(gfp_mask, nodemask, order,
2279 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2279 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2280 preferred_zone, migratetype); 2280 preferred_zone, migratetype);
2281 2281
2282 if (!page && gfp_mask & __GFP_NOFAIL) 2282 if (!page && gfp_mask & __GFP_NOFAIL)
2283 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2283 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2284 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2284 } while (!page && (gfp_mask & __GFP_NOFAIL));
2285 2285
2286 return page; 2286 return page;
2287 } 2287 }
2288 2288
2289 static inline 2289 static inline
2290 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 2290 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
2291 enum zone_type high_zoneidx, 2291 enum zone_type high_zoneidx,
2292 enum zone_type classzone_idx) 2292 enum zone_type classzone_idx)
2293 { 2293 {
2294 struct zoneref *z; 2294 struct zoneref *z;
2295 struct zone *zone; 2295 struct zone *zone;
2296 2296
2297 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2297 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2298 wakeup_kswapd(zone, order, classzone_idx); 2298 wakeup_kswapd(zone, order, classzone_idx);
2299 } 2299 }
2300 2300
2301 static inline int 2301 static inline int
2302 gfp_to_alloc_flags(gfp_t gfp_mask) 2302 gfp_to_alloc_flags(gfp_t gfp_mask)
2303 { 2303 {
2304 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2304 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2305 const gfp_t wait = gfp_mask & __GFP_WAIT; 2305 const gfp_t wait = gfp_mask & __GFP_WAIT;
2306 2306
2307 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2307 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2308 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2308 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2309 2309
2310 /* 2310 /*
2311 * The caller may dip into page reserves a bit more if the caller 2311 * The caller may dip into page reserves a bit more if the caller
2312 * cannot run direct reclaim, or if the caller has realtime scheduling 2312 * cannot run direct reclaim, or if the caller has realtime scheduling
2313 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2313 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2314 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 2314 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
2315 */ 2315 */
2316 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2316 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2317 2317
2318 if (!wait) { 2318 if (!wait) {
2319 /* 2319 /*
2320 * Not worth trying to allocate harder for 2320 * Not worth trying to allocate harder for
2321 * __GFP_NOMEMALLOC even if it can't schedule. 2321 * __GFP_NOMEMALLOC even if it can't schedule.
2322 */ 2322 */
2323 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2323 if (!(gfp_mask & __GFP_NOMEMALLOC))
2324 alloc_flags |= ALLOC_HARDER; 2324 alloc_flags |= ALLOC_HARDER;
2325 /* 2325 /*
2326 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 2326 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
2327 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 2327 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
2328 */ 2328 */
2329 alloc_flags &= ~ALLOC_CPUSET; 2329 alloc_flags &= ~ALLOC_CPUSET;
2330 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2330 } else if (unlikely(rt_task(current)) && !in_interrupt())
2331 alloc_flags |= ALLOC_HARDER; 2331 alloc_flags |= ALLOC_HARDER;
2332 2332
2333 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2333 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2334 if (gfp_mask & __GFP_MEMALLOC) 2334 if (gfp_mask & __GFP_MEMALLOC)
2335 alloc_flags |= ALLOC_NO_WATERMARKS; 2335 alloc_flags |= ALLOC_NO_WATERMARKS;
2336 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) 2336 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2337 alloc_flags |= ALLOC_NO_WATERMARKS; 2337 alloc_flags |= ALLOC_NO_WATERMARKS;
2338 else if (!in_interrupt() && 2338 else if (!in_interrupt() &&
2339 ((current->flags & PF_MEMALLOC) || 2339 ((current->flags & PF_MEMALLOC) ||
2340 unlikely(test_thread_flag(TIF_MEMDIE)))) 2340 unlikely(test_thread_flag(TIF_MEMDIE))))
2341 alloc_flags |= ALLOC_NO_WATERMARKS; 2341 alloc_flags |= ALLOC_NO_WATERMARKS;
2342 } 2342 }
2343 #ifdef CONFIG_CMA 2343 #ifdef CONFIG_CMA
2344 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2344 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2345 alloc_flags |= ALLOC_CMA; 2345 alloc_flags |= ALLOC_CMA;
2346 #endif 2346 #endif
2347 return alloc_flags; 2347 return alloc_flags;
2348 } 2348 }
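The BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH) in gfp_to_alloc_flags() exists so the __GFP_HIGH bit can be copied into alloc_flags with a single mask-and-OR instead of a conditional. A userspace sketch with mocked constants (the 0x20 value is a stand-in chosen for the illustration, not necessarily the kernel's) shows the idea:

	#include <assert.h>

	/* The only property the BUILD_BUG_ON enforces is that the two constants
	 * are equal, so the OR below needs no branch. */
	#define MOCK_GFP_HIGH	0x20u
	#define MOCK_ALLOC_HIGH	0x20u

	static unsigned int mock_gfp_to_alloc_flags(unsigned int gfp_mask)
	{
		unsigned int alloc_flags = 0;

		/* same effect as: if (gfp_mask & GFP_HIGH) alloc_flags |= ALLOC_HIGH; */
		alloc_flags |= (gfp_mask & MOCK_GFP_HIGH);
		return alloc_flags;
	}

	int main(void)
	{
		assert(mock_gfp_to_alloc_flags(MOCK_GFP_HIGH) & MOCK_ALLOC_HIGH);
		assert(mock_gfp_to_alloc_flags(0) == 0);
		return 0;
	}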
2349 2349
2350 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 2350 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2351 { 2351 {
2352 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2352 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2353 } 2353 }
2354 2354
2355 static inline struct page * 2355 static inline struct page *
2356 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2356 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2357 struct zonelist *zonelist, enum zone_type high_zoneidx, 2357 struct zonelist *zonelist, enum zone_type high_zoneidx,
2358 nodemask_t *nodemask, struct zone *preferred_zone, 2358 nodemask_t *nodemask, struct zone *preferred_zone,
2359 int migratetype) 2359 int migratetype)
2360 { 2360 {
2361 const gfp_t wait = gfp_mask & __GFP_WAIT; 2361 const gfp_t wait = gfp_mask & __GFP_WAIT;
2362 struct page *page = NULL; 2362 struct page *page = NULL;
2363 int alloc_flags; 2363 int alloc_flags;
2364 unsigned long pages_reclaimed = 0; 2364 unsigned long pages_reclaimed = 0;
2365 unsigned long did_some_progress; 2365 unsigned long did_some_progress;
2366 bool sync_migration = false; 2366 bool sync_migration = false;
2367 bool deferred_compaction = false; 2367 bool deferred_compaction = false;
2368 bool contended_compaction = false; 2368 bool contended_compaction = false;
2369 2369
2370 /* 2370 /*
2371 * In the slowpath, we sanity check order to avoid ever trying to 2371 * In the slowpath, we sanity check order to avoid ever trying to
2372 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2372 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2373 * be using allocators in order of preference for an area that is 2373 * be using allocators in order of preference for an area that is
2374 * too large. 2374 * too large.
2375 */ 2375 */
2376 if (order >= MAX_ORDER) { 2376 if (order >= MAX_ORDER) {
2377 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2377 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2378 return NULL; 2378 return NULL;
2379 } 2379 }
2380 2380
2381 /* 2381 /*
2382 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2382 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2383 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2383 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2384 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2384 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
2385 * using a larger set of nodes after it has established that the 2385 * using a larger set of nodes after it has established that the
2386 * allowed per node queues are empty and that nodes are 2386 * allowed per node queues are empty and that nodes are
2387 * over allocated. 2387 * over allocated.
2388 */ 2388 */
2389 if (IS_ENABLED(CONFIG_NUMA) && 2389 if (IS_ENABLED(CONFIG_NUMA) &&
2390 (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2390 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2391 goto nopage; 2391 goto nopage;
2392 2392
2393 restart: 2393 restart:
2394 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2394 if (!(gfp_mask & __GFP_NO_KSWAPD))
2395 wake_all_kswapd(order, zonelist, high_zoneidx, 2395 wake_all_kswapd(order, zonelist, high_zoneidx,
2396 zone_idx(preferred_zone)); 2396 zone_idx(preferred_zone));
2397 2397
2398 /* 2398 /*
2399 * OK, we're below the kswapd watermark and have kicked background 2399 * OK, we're below the kswapd watermark and have kicked background
2400 * reclaim. Now things get more complex, so set up alloc_flags according 2400 * reclaim. Now things get more complex, so set up alloc_flags according
2401 * to how we want to proceed. 2401 * to how we want to proceed.
2402 */ 2402 */
2403 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2403 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2404 2404
2405 /* 2405 /*
2406 * Find the true preferred zone if the allocation is unconstrained by 2406 * Find the true preferred zone if the allocation is unconstrained by
2407 * cpusets. 2407 * cpusets.
2408 */ 2408 */
2409 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) 2409 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2410 first_zones_zonelist(zonelist, high_zoneidx, NULL, 2410 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2411 &preferred_zone); 2411 &preferred_zone);
2412 2412
2413 rebalance: 2413 rebalance:
2414 /* This is the last chance, in general, before the goto nopage. */ 2414 /* This is the last chance, in general, before the goto nopage. */
2415 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2415 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2416 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2416 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2417 preferred_zone, migratetype); 2417 preferred_zone, migratetype);
2418 if (page) 2418 if (page)
2419 goto got_pg; 2419 goto got_pg;
2420 2420
2421 /* Allocate without watermarks if the context allows */ 2421 /* Allocate without watermarks if the context allows */
2422 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2422 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2423 /* 2423 /*
2424 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds 2424 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2425 * the allocation is high priority and these types of 2425 * the allocation is high priority and these types of
2426 * allocations are system rather than user oriented 2426 * allocations are system rather than user oriented
2427 */ 2427 */
2428 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2428 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2429 2429
2430 page = __alloc_pages_high_priority(gfp_mask, order, 2430 page = __alloc_pages_high_priority(gfp_mask, order,
2431 zonelist, high_zoneidx, nodemask, 2431 zonelist, high_zoneidx, nodemask,
2432 preferred_zone, migratetype); 2432 preferred_zone, migratetype);
2433 if (page) { 2433 if (page) {
2434 goto got_pg; 2434 goto got_pg;
2435 } 2435 }
2436 } 2436 }
2437 2437
2438 /* Atomic allocations - we can't balance anything */ 2438 /* Atomic allocations - we can't balance anything */
2439 if (!wait) 2439 if (!wait)
2440 goto nopage; 2440 goto nopage;
2441 2441
2442 /* Avoid recursion of direct reclaim */ 2442 /* Avoid recursion of direct reclaim */
2443 if (current->flags & PF_MEMALLOC) 2443 if (current->flags & PF_MEMALLOC)
2444 goto nopage; 2444 goto nopage;
2445 2445
2446 /* Avoid allocations with no watermarks from looping endlessly */ 2446 /* Avoid allocations with no watermarks from looping endlessly */
2447 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2447 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2448 goto nopage; 2448 goto nopage;
2449 2449
2450 /* 2450 /*
2451 * Try direct compaction. The first pass is asynchronous. Subsequent 2451 * Try direct compaction. The first pass is asynchronous. Subsequent
2452 * attempts after direct reclaim are synchronous 2452 * attempts after direct reclaim are synchronous
2453 */ 2453 */
2454 page = __alloc_pages_direct_compact(gfp_mask, order, 2454 page = __alloc_pages_direct_compact(gfp_mask, order,
2455 zonelist, high_zoneidx, 2455 zonelist, high_zoneidx,
2456 nodemask, 2456 nodemask,
2457 alloc_flags, preferred_zone, 2457 alloc_flags, preferred_zone,
2458 migratetype, sync_migration, 2458 migratetype, sync_migration,
2459 &contended_compaction, 2459 &contended_compaction,
2460 &deferred_compaction, 2460 &deferred_compaction,
2461 &did_some_progress); 2461 &did_some_progress);
2462 if (page) 2462 if (page)
2463 goto got_pg; 2463 goto got_pg;
2464 sync_migration = true; 2464 sync_migration = true;
2465 2465
2466 /* 2466 /*
2467 * If compaction is deferred for high-order allocations, it is because 2467 * If compaction is deferred for high-order allocations, it is because
2468 * sync compaction recently failed. If this is the case and the caller 2468 * sync compaction recently failed. If this is the case and the caller
2469 * requested a movable allocation that does not heavily disrupt the 2469 * requested a movable allocation that does not heavily disrupt the
2470 * system then fail the allocation instead of entering direct reclaim. 2470 * system then fail the allocation instead of entering direct reclaim.
2471 */ 2471 */
2472 if ((deferred_compaction || contended_compaction) && 2472 if ((deferred_compaction || contended_compaction) &&
2473 (gfp_mask & __GFP_NO_KSWAPD)) 2473 (gfp_mask & __GFP_NO_KSWAPD))
2474 goto nopage; 2474 goto nopage;
2475 2475
2476 /* Try direct reclaim and then allocating */ 2476 /* Try direct reclaim and then allocating */
2477 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2477 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2478 zonelist, high_zoneidx, 2478 zonelist, high_zoneidx,
2479 nodemask, 2479 nodemask,
2480 alloc_flags, preferred_zone, 2480 alloc_flags, preferred_zone,
2481 migratetype, &did_some_progress); 2481 migratetype, &did_some_progress);
2482 if (page) 2482 if (page)
2483 goto got_pg; 2483 goto got_pg;
2484 2484
2485 /* 2485 /*
2486 * If we failed to make any progress reclaiming, then we are 2486 * If we failed to make any progress reclaiming, then we are
2487 * running out of options and have to consider going OOM 2487 * running out of options and have to consider going OOM
2488 */ 2488 */
2489 if (!did_some_progress) { 2489 if (!did_some_progress) {
2490 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2490 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2491 if (oom_killer_disabled) 2491 if (oom_killer_disabled)
2492 goto nopage; 2492 goto nopage;
2493 /* Coredumps can quickly deplete all memory reserves */ 2493 /* Coredumps can quickly deplete all memory reserves */
2494 if ((current->flags & PF_DUMPCORE) && 2494 if ((current->flags & PF_DUMPCORE) &&
2495 !(gfp_mask & __GFP_NOFAIL)) 2495 !(gfp_mask & __GFP_NOFAIL))
2496 goto nopage; 2496 goto nopage;
2497 page = __alloc_pages_may_oom(gfp_mask, order, 2497 page = __alloc_pages_may_oom(gfp_mask, order,
2498 zonelist, high_zoneidx, 2498 zonelist, high_zoneidx,
2499 nodemask, preferred_zone, 2499 nodemask, preferred_zone,
2500 migratetype); 2500 migratetype);
2501 if (page) 2501 if (page)
2502 goto got_pg; 2502 goto got_pg;
2503 2503
2504 if (!(gfp_mask & __GFP_NOFAIL)) { 2504 if (!(gfp_mask & __GFP_NOFAIL)) {
2505 /* 2505 /*
2506 * The oom killer is not called for high-order 2506 * The oom killer is not called for high-order
2507 * allocations that may fail, so if no progress 2507 * allocations that may fail, so if no progress
2508 * is being made, there are no other options and 2508 * is being made, there are no other options and
2509 * retrying is unlikely to help. 2509 * retrying is unlikely to help.
2510 */ 2510 */
2511 if (order > PAGE_ALLOC_COSTLY_ORDER) 2511 if (order > PAGE_ALLOC_COSTLY_ORDER)
2512 goto nopage; 2512 goto nopage;
2513 /* 2513 /*
2514 * The oom killer is not called for lowmem 2514 * The oom killer is not called for lowmem
2515 * allocations to prevent needlessly killing 2515 * allocations to prevent needlessly killing
2516 * innocent tasks. 2516 * innocent tasks.
2517 */ 2517 */
2518 if (high_zoneidx < ZONE_NORMAL) 2518 if (high_zoneidx < ZONE_NORMAL)
2519 goto nopage; 2519 goto nopage;
2520 } 2520 }
2521 2521
2522 goto restart; 2522 goto restart;
2523 } 2523 }
2524 } 2524 }
2525 2525
2526 /* Check if we should retry the allocation */ 2526 /* Check if we should retry the allocation */
2527 pages_reclaimed += did_some_progress; 2527 pages_reclaimed += did_some_progress;
2528 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2528 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2529 pages_reclaimed)) { 2529 pages_reclaimed)) {
2530 /* Wait for some write requests to complete then retry */ 2530 /* Wait for some write requests to complete then retry */
2531 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2531 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2532 goto rebalance; 2532 goto rebalance;
2533 } else { 2533 } else {
2534 /* 2534 /*
2535 * High-order allocations do not necessarily loop after 2535 * High-order allocations do not necessarily loop after
2536 * direct reclaim, and reclaim/compaction depends on compaction 2536 * direct reclaim, and reclaim/compaction depends on compaction
2537 * being called after reclaim, so call it directly if necessary 2537 * being called after reclaim, so call it directly if necessary
2538 */ 2538 */
2539 page = __alloc_pages_direct_compact(gfp_mask, order, 2539 page = __alloc_pages_direct_compact(gfp_mask, order,
2540 zonelist, high_zoneidx, 2540 zonelist, high_zoneidx,
2541 nodemask, 2541 nodemask,
2542 alloc_flags, preferred_zone, 2542 alloc_flags, preferred_zone,
2543 migratetype, sync_migration, 2543 migratetype, sync_migration,
2544 &contended_compaction, 2544 &contended_compaction,
2545 &deferred_compaction, 2545 &deferred_compaction,
2546 &did_some_progress); 2546 &did_some_progress);
2547 if (page) 2547 if (page)
2548 goto got_pg; 2548 goto got_pg;
2549 } 2549 }
2550 2550
2551 nopage: 2551 nopage:
2552 warn_alloc_failed(gfp_mask, order, NULL); 2552 warn_alloc_failed(gfp_mask, order, NULL);
2553 return page; 2553 return page;
2554 got_pg: 2554 got_pg:
2555 if (kmemcheck_enabled) 2555 if (kmemcheck_enabled)
2556 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2556 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2557 2557
2558 return page; 2558 return page;
2559 } 2559 }
2560 2560
2561 /* 2561 /*
2562 * This is the 'heart' of the zoned buddy allocator. 2562 * This is the 'heart' of the zoned buddy allocator.
2563 */ 2563 */
2564 struct page * 2564 struct page *
2565 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2565 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2566 struct zonelist *zonelist, nodemask_t *nodemask) 2566 struct zonelist *zonelist, nodemask_t *nodemask)
2567 { 2567 {
2568 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2568 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2569 struct zone *preferred_zone; 2569 struct zone *preferred_zone;
2570 struct page *page = NULL; 2570 struct page *page = NULL;
2571 int migratetype = allocflags_to_migratetype(gfp_mask); 2571 int migratetype = allocflags_to_migratetype(gfp_mask);
2572 unsigned int cpuset_mems_cookie; 2572 unsigned int cpuset_mems_cookie;
2573 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; 2573 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
2574 struct mem_cgroup *memcg = NULL; 2574 struct mem_cgroup *memcg = NULL;
2575 2575
2576 gfp_mask &= gfp_allowed_mask; 2576 gfp_mask &= gfp_allowed_mask;
2577 2577
2578 lockdep_trace_alloc(gfp_mask); 2578 lockdep_trace_alloc(gfp_mask);
2579 2579
2580 might_sleep_if(gfp_mask & __GFP_WAIT); 2580 might_sleep_if(gfp_mask & __GFP_WAIT);
2581 2581
2582 if (should_fail_alloc_page(gfp_mask, order)) 2582 if (should_fail_alloc_page(gfp_mask, order))
2583 return NULL; 2583 return NULL;
2584 2584
2585 /* 2585 /*
2586 * Check the zones suitable for the gfp_mask contain at least one 2586 * Check the zones suitable for the gfp_mask contain at least one
2587 * valid zone. It's possible to have an empty zonelist as a result 2587 * valid zone. It's possible to have an empty zonelist as a result
2588 * of GFP_THISNODE and a memoryless node 2588 * of GFP_THISNODE and a memoryless node
2589 */ 2589 */
2590 if (unlikely(!zonelist->_zonerefs->zone)) 2590 if (unlikely(!zonelist->_zonerefs->zone))
2591 return NULL; 2591 return NULL;
2592 2592
2593 /* 2593 /*
2594 * Will only have any effect when __GFP_KMEMCG is set. This is 2594 * Will only have any effect when __GFP_KMEMCG is set. This is
2595 * verified in the (always inline) callee 2595 * verified in the (always inline) callee
2596 */ 2596 */
2597 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) 2597 if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
2598 return NULL; 2598 return NULL;
2599 2599
2600 retry_cpuset: 2600 retry_cpuset:
2601 cpuset_mems_cookie = get_mems_allowed(); 2601 cpuset_mems_cookie = get_mems_allowed();
2602 2602
2603 /* The preferred zone is used for statistics later */ 2603 /* The preferred zone is used for statistics later */
2604 first_zones_zonelist(zonelist, high_zoneidx, 2604 first_zones_zonelist(zonelist, high_zoneidx,
2605 nodemask ? : &cpuset_current_mems_allowed, 2605 nodemask ? : &cpuset_current_mems_allowed,
2606 &preferred_zone); 2606 &preferred_zone);
2607 if (!preferred_zone) 2607 if (!preferred_zone)
2608 goto out; 2608 goto out;
2609 2609
2610 #ifdef CONFIG_CMA 2610 #ifdef CONFIG_CMA
2611 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2611 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2612 alloc_flags |= ALLOC_CMA; 2612 alloc_flags |= ALLOC_CMA;
2613 #endif 2613 #endif
2614 /* First allocation attempt */ 2614 /* First allocation attempt */
2615 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2615 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2616 zonelist, high_zoneidx, alloc_flags, 2616 zonelist, high_zoneidx, alloc_flags,
2617 preferred_zone, migratetype); 2617 preferred_zone, migratetype);
2618 if (unlikely(!page)) 2618 if (unlikely(!page))
2619 page = __alloc_pages_slowpath(gfp_mask, order, 2619 page = __alloc_pages_slowpath(gfp_mask, order,
2620 zonelist, high_zoneidx, nodemask, 2620 zonelist, high_zoneidx, nodemask,
2621 preferred_zone, migratetype); 2621 preferred_zone, migratetype);
2622 2622
2623 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2623 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2624 2624
2625 out: 2625 out:
2626 /* 2626 /*
2627 * When updating a task's mems_allowed, it is possible to race with 2627 * When updating a task's mems_allowed, it is possible to race with
2628 * parallel threads in such a way that an allocation can fail while 2628 * parallel threads in such a way that an allocation can fail while
2629 * the mask is being updated. If a page allocation is about to fail, 2629 * the mask is being updated. If a page allocation is about to fail,
2630 * check if the cpuset changed during allocation and if so, retry. 2630 * check if the cpuset changed during allocation and if so, retry.
2631 */ 2631 */
2632 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2632 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2633 goto retry_cpuset; 2633 goto retry_cpuset;
2634 2634
2635 memcg_kmem_commit_charge(page, memcg, order); 2635 memcg_kmem_commit_charge(page, memcg, order);
2636 2636
2637 return page; 2637 return page;
2638 } 2638 }
2639 EXPORT_SYMBOL(__alloc_pages_nodemask); 2639 EXPORT_SYMBOL(__alloc_pages_nodemask);
2640 2640
2641 /* 2641 /*
2642 * Common helper functions. 2642 * Common helper functions.
2643 */ 2643 */
2644 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 2644 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2645 { 2645 {
2646 struct page *page; 2646 struct page *page;
2647 2647
2648 /* 2648 /*
2649 * __get_free_pages() returns a 32-bit address, which cannot represent 2649 * __get_free_pages() returns a 32-bit address, which cannot represent
2650 * a highmem page 2650 * a highmem page
2651 */ 2651 */
2652 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 2652 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2653 2653
2654 page = alloc_pages(gfp_mask, order); 2654 page = alloc_pages(gfp_mask, order);
2655 if (!page) 2655 if (!page)
2656 return 0; 2656 return 0;
2657 return (unsigned long) page_address(page); 2657 return (unsigned long) page_address(page);
2658 } 2658 }
2659 EXPORT_SYMBOL(__get_free_pages); 2659 EXPORT_SYMBOL(__get_free_pages);
2660 2660
2661 unsigned long get_zeroed_page(gfp_t gfp_mask) 2661 unsigned long get_zeroed_page(gfp_t gfp_mask)
2662 { 2662 {
2663 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 2663 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2664 } 2664 }
2665 EXPORT_SYMBOL(get_zeroed_page); 2665 EXPORT_SYMBOL(get_zeroed_page);
2666 2666
2667 void __free_pages(struct page *page, unsigned int order) 2667 void __free_pages(struct page *page, unsigned int order)
2668 { 2668 {
2669 if (put_page_testzero(page)) { 2669 if (put_page_testzero(page)) {
2670 if (order == 0) 2670 if (order == 0)
2671 free_hot_cold_page(page, 0); 2671 free_hot_cold_page(page, 0);
2672 else 2672 else
2673 __free_pages_ok(page, order); 2673 __free_pages_ok(page, order);
2674 } 2674 }
2675 } 2675 }
2676 2676
2677 EXPORT_SYMBOL(__free_pages); 2677 EXPORT_SYMBOL(__free_pages);
2678 2678
2679 void free_pages(unsigned long addr, unsigned int order) 2679 void free_pages(unsigned long addr, unsigned int order)
2680 { 2680 {
2681 if (addr != 0) { 2681 if (addr != 0) {
2682 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2682 VM_BUG_ON(!virt_addr_valid((void *)addr));
2683 __free_pages(virt_to_page((void *)addr), order); 2683 __free_pages(virt_to_page((void *)addr), order);
2684 } 2684 }
2685 } 2685 }
2686 2686
2687 EXPORT_SYMBOL(free_pages); 2687 EXPORT_SYMBOL(free_pages);
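For context, a hypothetical in-kernel caller (e.g. a driver built with the usual <linux/gfp.h> include) might pair the helpers exported above like this. This is a sketch only; example_init() and the buffer sizes are invented for illustration.

	static int example_init(void)
	{
		unsigned long buf, zpage;

		buf = __get_free_pages(GFP_KERNEL, 2);	/* 1 << 2 = 4 contiguous pages */
		if (!buf)
			return -ENOMEM;

		zpage = get_zeroed_page(GFP_KERNEL);	/* one zero-filled page */
		if (!zpage) {
			free_pages(buf, 2);
			return -ENOMEM;
		}

		/* ... use the buffers ... */

		free_page(zpage);
		free_pages(buf, 2);
		return 0;
	}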
2688 2688
2689 /* 2689 /*
2690 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free 2690 * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
2691 * pages allocated with __GFP_KMEMCG. 2691 * pages allocated with __GFP_KMEMCG.
2692 * 2692 *
2693 * Those pages are accounted to a particular memcg, embedded in the 2693 * Those pages are accounted to a particular memcg, embedded in the
2694 * corresponding page_cgroup. To avoid adding a hit in the allocator to search 2694 * corresponding page_cgroup. To avoid adding a hit in the allocator to search
2695 * for that information only to find out that it is NULL for users who have no 2695 * for that information only to find out that it is NULL for users who have no
2696 * interest in that whatsoever, we provide these functions. 2696 * interest in that whatsoever, we provide these functions.
2697 * 2697 *
2698 * The caller knows better which flags it relies on. 2698 * The caller knows better which flags it relies on.
2699 */ 2699 */
2700 void __free_memcg_kmem_pages(struct page *page, unsigned int order) 2700 void __free_memcg_kmem_pages(struct page *page, unsigned int order)
2701 { 2701 {
2702 memcg_kmem_uncharge_pages(page, order); 2702 memcg_kmem_uncharge_pages(page, order);
2703 __free_pages(page, order); 2703 __free_pages(page, order);
2704 } 2704 }
2705 2705
2706 void free_memcg_kmem_pages(unsigned long addr, unsigned int order) 2706 void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
2707 { 2707 {
2708 if (addr != 0) { 2708 if (addr != 0) {
2709 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2709 VM_BUG_ON(!virt_addr_valid((void *)addr));
2710 __free_memcg_kmem_pages(virt_to_page((void *)addr), order); 2710 __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
2711 } 2711 }
2712 } 2712 }
2713 2713
2714 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2714 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2715 { 2715 {
2716 if (addr) { 2716 if (addr) {
2717 unsigned long alloc_end = addr + (PAGE_SIZE << order); 2717 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2718 unsigned long used = addr + PAGE_ALIGN(size); 2718 unsigned long used = addr + PAGE_ALIGN(size);
2719 2719
2720 split_page(virt_to_page((void *)addr), order); 2720 split_page(virt_to_page((void *)addr), order);
2721 while (used < alloc_end) { 2721 while (used < alloc_end) {
2722 free_page(used); 2722 free_page(used);
2723 used += PAGE_SIZE; 2723 used += PAGE_SIZE;
2724 } 2724 }
2725 } 2725 }
2726 return (void *)addr; 2726 return (void *)addr;
2727 } 2727 }
2728 2728
2729 /** 2729 /**
2730 * alloc_pages_exact - allocate an exact number of physically-contiguous pages. 2730 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
2731 * @size: the number of bytes to allocate 2731 * @size: the number of bytes to allocate
2732 * @gfp_mask: GFP flags for the allocation 2732 * @gfp_mask: GFP flags for the allocation
2733 * 2733 *
2734 * This function is similar to alloc_pages(), except that it allocates the 2734 * This function is similar to alloc_pages(), except that it allocates the
2735 * minimum number of pages to satisfy the request. alloc_pages() can only 2735 * minimum number of pages to satisfy the request. alloc_pages() can only
2736 * allocate memory in power-of-two pages. 2736 * allocate memory in power-of-two pages.
2737 * 2737 *
2738 * This function is also limited by MAX_ORDER. 2738 * This function is also limited by MAX_ORDER.
2739 * 2739 *
2740 * Memory allocated by this function must be released by free_pages_exact(). 2740 * Memory allocated by this function must be released by free_pages_exact().
2741 */ 2741 */
2742 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 2742 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2743 { 2743 {
2744 unsigned int order = get_order(size); 2744 unsigned int order = get_order(size);
2745 unsigned long addr; 2745 unsigned long addr;
2746 2746
2747 addr = __get_free_pages(gfp_mask, order); 2747 addr = __get_free_pages(gfp_mask, order);
2748 return make_alloc_exact(addr, order, size); 2748 return make_alloc_exact(addr, order, size);
2749 } 2749 }
2750 EXPORT_SYMBOL(alloc_pages_exact); 2750 EXPORT_SYMBOL(alloc_pages_exact);
2751 2751
2752 /** 2752 /**
2753 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 2753 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2754 * pages on a node. 2754 * pages on a node.
2755 * @nid: the preferred node ID where memory should be allocated 2755 * @nid: the preferred node ID where memory should be allocated
2756 * @size: the number of bytes to allocate 2756 * @size: the number of bytes to allocate
2757 * @gfp_mask: GFP flags for the allocation 2757 * @gfp_mask: GFP flags for the allocation
2758 * 2758 *
2759 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 2759 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2760 * back. 2760 * back.
2761 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2761 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2762 * but is not exact. 2762 * but is not exact.
2763 */ 2763 */
2764 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2764 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2765 { 2765 {
2766 unsigned order = get_order(size); 2766 unsigned order = get_order(size);
2767 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2767 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2768 if (!p) 2768 if (!p)
2769 return NULL; 2769 return NULL;
2770 return make_alloc_exact((unsigned long)page_address(p), order, size); 2770 return make_alloc_exact((unsigned long)page_address(p), order, size);
2771 } 2771 }
2772 EXPORT_SYMBOL(alloc_pages_exact_nid); 2772 EXPORT_SYMBOL(alloc_pages_exact_nid);
2773 2773
2774 /** 2774 /**
2775 * free_pages_exact - release memory allocated via alloc_pages_exact() 2775 * free_pages_exact - release memory allocated via alloc_pages_exact()
2776 * @virt: the value returned by alloc_pages_exact. 2776 * @virt: the value returned by alloc_pages_exact.
2777 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2777 * @size: size of allocation, same value as passed to alloc_pages_exact().
2778 * 2778 *
2779 * Release the memory allocated by a previous call to alloc_pages_exact. 2779 * Release the memory allocated by a previous call to alloc_pages_exact.
2780 */ 2780 */
2781 void free_pages_exact(void *virt, size_t size) 2781 void free_pages_exact(void *virt, size_t size)
2782 { 2782 {
2783 unsigned long addr = (unsigned long)virt; 2783 unsigned long addr = (unsigned long)virt;
2784 unsigned long end = addr + PAGE_ALIGN(size); 2784 unsigned long end = addr + PAGE_ALIGN(size);
2785 2785
2786 while (addr < end) { 2786 while (addr < end) {
2787 free_page(addr); 2787 free_page(addr);
2788 addr += PAGE_SIZE; 2788 addr += PAGE_SIZE;
2789 } 2789 }
2790 } 2790 }
2791 EXPORT_SYMBOL(free_pages_exact); 2791 EXPORT_SYMBOL(free_pages_exact);
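A minimal usage sketch (editorial note, not part of the diff): alloc_pages_exact() and free_pages_exact() are meant to be used as a pair, with the same size passed to both; alloc_pages_exact_nid() takes the same arguments plus a preferred node id. The helpers and the 40 KB size below are hypothetical.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

#define EXAMPLE_BUF_SIZE (40 * 1024)	/* hypothetical: 40 KB, not a power-of-two number of pages */

static void *example_buf;		/* hypothetical buffer pointer */

static int example_alloc(void)
{
	/* with 4 KB pages this trims an order-4 (64 KB) block down to 40 KB */
	example_buf = alloc_pages_exact(EXAMPLE_BUF_SIZE, GFP_KERNEL);
	if (!example_buf)
		return -ENOMEM;
	return 0;
}

static void example_free(void)
{
	/* must pass the same size that was handed to alloc_pages_exact() */
	free_pages_exact(example_buf, EXAMPLE_BUF_SIZE);
}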
2792 2792
2793 static unsigned int nr_free_zone_pages(int offset) 2793 static unsigned int nr_free_zone_pages(int offset)
2794 { 2794 {
2795 struct zoneref *z; 2795 struct zoneref *z;
2796 struct zone *zone; 2796 struct zone *zone;
2797 2797
2798 /* Just pick one node, since fallback list is circular */ 2798 /* Just pick one node, since fallback list is circular */
2799 unsigned int sum = 0; 2799 unsigned int sum = 0;
2800 2800
2801 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2801 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2802 2802
2803 for_each_zone_zonelist(zone, z, zonelist, offset) { 2803 for_each_zone_zonelist(zone, z, zonelist, offset) {
2804 unsigned long size = zone->present_pages; 2804 unsigned long size = zone->present_pages;
2805 unsigned long high = high_wmark_pages(zone); 2805 unsigned long high = high_wmark_pages(zone);
2806 if (size > high) 2806 if (size > high)
2807 sum += size - high; 2807 sum += size - high;
2808 } 2808 }
2809 2809
2810 return sum; 2810 return sum;
2811 } 2811 }
2812 2812
2813 /* 2813 /*
2814 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 2814 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
2815 */ 2815 */
2816 unsigned int nr_free_buffer_pages(void) 2816 unsigned int nr_free_buffer_pages(void)
2817 { 2817 {
2818 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2818 return nr_free_zone_pages(gfp_zone(GFP_USER));
2819 } 2819 }
2820 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2820 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
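A short sketch of how nr_free_buffer_pages() is typically consumed (editorial): a subsystem reads it once at init time to scale its own memory limits. The helper name and the 1/16 ratio below are hypothetical.

#include <linux/swap.h>

static unsigned long example_limit_pages;	/* hypothetical per-subsystem limit */

static void example_tune_limits(void)
{
	/* allow the hypothetical subsystem at most 1/16 of allocatable lowmem */
	example_limit_pages = nr_free_buffer_pages() / 16;
}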
2821 2821
2822 /* 2822 /*
2823 * Amount of free RAM allocatable within all zones 2823 * Amount of free RAM allocatable within all zones
2824 */ 2824 */
2825 unsigned int nr_free_pagecache_pages(void) 2825 unsigned int nr_free_pagecache_pages(void)
2826 { 2826 {
2827 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 2827 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2828 } 2828 }
2829 2829
2830 static inline void show_node(struct zone *zone) 2830 static inline void show_node(struct zone *zone)
2831 { 2831 {
2832 if (IS_ENABLED(CONFIG_NUMA)) 2832 if (IS_ENABLED(CONFIG_NUMA))
2833 printk("Node %d ", zone_to_nid(zone)); 2833 printk("Node %d ", zone_to_nid(zone));
2834 } 2834 }
2835 2835
2836 void si_meminfo(struct sysinfo *val) 2836 void si_meminfo(struct sysinfo *val)
2837 { 2837 {
2838 val->totalram = totalram_pages; 2838 val->totalram = totalram_pages;
2839 val->sharedram = 0; 2839 val->sharedram = 0;
2840 val->freeram = global_page_state(NR_FREE_PAGES); 2840 val->freeram = global_page_state(NR_FREE_PAGES);
2841 val->bufferram = nr_blockdev_pages(); 2841 val->bufferram = nr_blockdev_pages();
2842 val->totalhigh = totalhigh_pages; 2842 val->totalhigh = totalhigh_pages;
2843 val->freehigh = nr_free_highpages(); 2843 val->freehigh = nr_free_highpages();
2844 val->mem_unit = PAGE_SIZE; 2844 val->mem_unit = PAGE_SIZE;
2845 } 2845 }
2846 2846
2847 EXPORT_SYMBOL(si_meminfo); 2847 EXPORT_SYMBOL(si_meminfo);
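A minimal sketch of a si_meminfo() caller (editorial): the ram fields are counted in units of mem_unit (PAGE_SIZE here), so a consumer scales them before reporting. The helper below is hypothetical.

#include <linux/kernel.h>
#include <linux/mm.h>

static void example_report_memory(void)	/* hypothetical */
{
	struct sysinfo si;
	unsigned long total_kb, free_kb;

	si_meminfo(&si);
	/* mem_unit is PAGE_SIZE, so pages * (mem_unit / 1024) gives kB */
	total_kb = si.totalram * (si.mem_unit / 1024);
	free_kb  = si.freeram  * (si.mem_unit / 1024);
	pr_info("total %lu kB, free %lu kB\n", total_kb, free_kb);
}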
2848 2848
2849 #ifdef CONFIG_NUMA 2849 #ifdef CONFIG_NUMA
2850 void si_meminfo_node(struct sysinfo *val, int nid) 2850 void si_meminfo_node(struct sysinfo *val, int nid)
2851 { 2851 {
2852 pg_data_t *pgdat = NODE_DATA(nid); 2852 pg_data_t *pgdat = NODE_DATA(nid);
2853 2853
2854 val->totalram = pgdat->node_present_pages; 2854 val->totalram = pgdat->node_present_pages;
2855 val->freeram = node_page_state(nid, NR_FREE_PAGES); 2855 val->freeram = node_page_state(nid, NR_FREE_PAGES);
2856 #ifdef CONFIG_HIGHMEM 2856 #ifdef CONFIG_HIGHMEM
2857 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 2857 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
2858 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 2858 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
2859 NR_FREE_PAGES); 2859 NR_FREE_PAGES);
2860 #else 2860 #else
2861 val->totalhigh = 0; 2861 val->totalhigh = 0;
2862 val->freehigh = 0; 2862 val->freehigh = 0;
2863 #endif 2863 #endif
2864 val->mem_unit = PAGE_SIZE; 2864 val->mem_unit = PAGE_SIZE;
2865 } 2865 }
2866 #endif 2866 #endif
2867 2867
2868 /* 2868 /*
2869 * Determine whether the node should be displayed or not, depending on whether 2869 * Determine whether the node should be displayed or not, depending on whether
2870 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 2870 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
2871 */ 2871 */
2872 bool skip_free_areas_node(unsigned int flags, int nid) 2872 bool skip_free_areas_node(unsigned int flags, int nid)
2873 { 2873 {
2874 bool ret = false; 2874 bool ret = false;
2875 unsigned int cpuset_mems_cookie; 2875 unsigned int cpuset_mems_cookie;
2876 2876
2877 if (!(flags & SHOW_MEM_FILTER_NODES)) 2877 if (!(flags & SHOW_MEM_FILTER_NODES))
2878 goto out; 2878 goto out;
2879 2879
2880 do { 2880 do {
2881 cpuset_mems_cookie = get_mems_allowed(); 2881 cpuset_mems_cookie = get_mems_allowed();
2882 ret = !node_isset(nid, cpuset_current_mems_allowed); 2882 ret = !node_isset(nid, cpuset_current_mems_allowed);
2883 } while (!put_mems_allowed(cpuset_mems_cookie)); 2883 } while (!put_mems_allowed(cpuset_mems_cookie));
2884 out: 2884 out:
2885 return ret; 2885 return ret;
2886 } 2886 }
2887 2887
2888 #define K(x) ((x) << (PAGE_SHIFT-10)) 2888 #define K(x) ((x) << (PAGE_SHIFT-10))
2889 2889
2890 static void show_migration_types(unsigned char type) 2890 static void show_migration_types(unsigned char type)
2891 { 2891 {
2892 static const char types[MIGRATE_TYPES] = { 2892 static const char types[MIGRATE_TYPES] = {
2893 [MIGRATE_UNMOVABLE] = 'U', 2893 [MIGRATE_UNMOVABLE] = 'U',
2894 [MIGRATE_RECLAIMABLE] = 'E', 2894 [MIGRATE_RECLAIMABLE] = 'E',
2895 [MIGRATE_MOVABLE] = 'M', 2895 [MIGRATE_MOVABLE] = 'M',
2896 [MIGRATE_RESERVE] = 'R', 2896 [MIGRATE_RESERVE] = 'R',
2897 #ifdef CONFIG_CMA 2897 #ifdef CONFIG_CMA
2898 [MIGRATE_CMA] = 'C', 2898 [MIGRATE_CMA] = 'C',
2899 #endif 2899 #endif
2900 [MIGRATE_ISOLATE] = 'I', 2900 [MIGRATE_ISOLATE] = 'I',
2901 }; 2901 };
2902 char tmp[MIGRATE_TYPES + 1]; 2902 char tmp[MIGRATE_TYPES + 1];
2903 char *p = tmp; 2903 char *p = tmp;
2904 int i; 2904 int i;
2905 2905
2906 for (i = 0; i < MIGRATE_TYPES; i++) { 2906 for (i = 0; i < MIGRATE_TYPES; i++) {
2907 if (type & (1 << i)) 2907 if (type & (1 << i))
2908 *p++ = types[i]; 2908 *p++ = types[i];
2909 } 2909 }
2910 2910
2911 *p = '\0'; 2911 *p = '\0';
2912 printk("(%s) ", tmp); 2912 printk("(%s) ", tmp);
2913 } 2913 }
2914 2914
2915 /* 2915 /*
2916 * Show free area list (used inside shift_scroll-lock stuff) 2916 * Show free area list (used inside shift_scroll-lock stuff)
2917 * We also calculate the percentage fragmentation. We do this by counting the 2917 * We also calculate the percentage fragmentation. We do this by counting the
2918 * memory on each free list with the exception of the first item on the list. 2918 * memory on each free list with the exception of the first item on the list.
2919 * Suppresses nodes that are not allowed by current's cpuset if 2919 * Suppresses nodes that are not allowed by current's cpuset if
2920 * SHOW_MEM_FILTER_NODES is passed. 2920 * SHOW_MEM_FILTER_NODES is passed.
2921 */ 2921 */
2922 void show_free_areas(unsigned int filter) 2922 void show_free_areas(unsigned int filter)
2923 { 2923 {
2924 int cpu; 2924 int cpu;
2925 struct zone *zone; 2925 struct zone *zone;
2926 2926
2927 for_each_populated_zone(zone) { 2927 for_each_populated_zone(zone) {
2928 if (skip_free_areas_node(filter, zone_to_nid(zone))) 2928 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2929 continue; 2929 continue;
2930 show_node(zone); 2930 show_node(zone);
2931 printk("%s per-cpu:\n", zone->name); 2931 printk("%s per-cpu:\n", zone->name);
2932 2932
2933 for_each_online_cpu(cpu) { 2933 for_each_online_cpu(cpu) {
2934 struct per_cpu_pageset *pageset; 2934 struct per_cpu_pageset *pageset;
2935 2935
2936 pageset = per_cpu_ptr(zone->pageset, cpu); 2936 pageset = per_cpu_ptr(zone->pageset, cpu);
2937 2937
2938 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 2938 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
2939 cpu, pageset->pcp.high, 2939 cpu, pageset->pcp.high,
2940 pageset->pcp.batch, pageset->pcp.count); 2940 pageset->pcp.batch, pageset->pcp.count);
2941 } 2941 }
2942 } 2942 }
2943 2943
2944 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 2944 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
2945 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 2945 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
2946 " unevictable:%lu" 2946 " unevictable:%lu"
2947 " dirty:%lu writeback:%lu unstable:%lu\n" 2947 " dirty:%lu writeback:%lu unstable:%lu\n"
2948 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 2948 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2949 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 2949 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
2950 " free_cma:%lu\n", 2950 " free_cma:%lu\n",
2951 global_page_state(NR_ACTIVE_ANON), 2951 global_page_state(NR_ACTIVE_ANON),
2952 global_page_state(NR_INACTIVE_ANON), 2952 global_page_state(NR_INACTIVE_ANON),
2953 global_page_state(NR_ISOLATED_ANON), 2953 global_page_state(NR_ISOLATED_ANON),
2954 global_page_state(NR_ACTIVE_FILE), 2954 global_page_state(NR_ACTIVE_FILE),
2955 global_page_state(NR_INACTIVE_FILE), 2955 global_page_state(NR_INACTIVE_FILE),
2956 global_page_state(NR_ISOLATED_FILE), 2956 global_page_state(NR_ISOLATED_FILE),
2957 global_page_state(NR_UNEVICTABLE), 2957 global_page_state(NR_UNEVICTABLE),
2958 global_page_state(NR_FILE_DIRTY), 2958 global_page_state(NR_FILE_DIRTY),
2959 global_page_state(NR_WRITEBACK), 2959 global_page_state(NR_WRITEBACK),
2960 global_page_state(NR_UNSTABLE_NFS), 2960 global_page_state(NR_UNSTABLE_NFS),
2961 global_page_state(NR_FREE_PAGES), 2961 global_page_state(NR_FREE_PAGES),
2962 global_page_state(NR_SLAB_RECLAIMABLE), 2962 global_page_state(NR_SLAB_RECLAIMABLE),
2963 global_page_state(NR_SLAB_UNRECLAIMABLE), 2963 global_page_state(NR_SLAB_UNRECLAIMABLE),
2964 global_page_state(NR_FILE_MAPPED), 2964 global_page_state(NR_FILE_MAPPED),
2965 global_page_state(NR_SHMEM), 2965 global_page_state(NR_SHMEM),
2966 global_page_state(NR_PAGETABLE), 2966 global_page_state(NR_PAGETABLE),
2967 global_page_state(NR_BOUNCE), 2967 global_page_state(NR_BOUNCE),
2968 global_page_state(NR_FREE_CMA_PAGES)); 2968 global_page_state(NR_FREE_CMA_PAGES));
2969 2969
2970 for_each_populated_zone(zone) { 2970 for_each_populated_zone(zone) {
2971 int i; 2971 int i;
2972 2972
2973 if (skip_free_areas_node(filter, zone_to_nid(zone))) 2973 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2974 continue; 2974 continue;
2975 show_node(zone); 2975 show_node(zone);
2976 printk("%s" 2976 printk("%s"
2977 " free:%lukB" 2977 " free:%lukB"
2978 " min:%lukB" 2978 " min:%lukB"
2979 " low:%lukB" 2979 " low:%lukB"
2980 " high:%lukB" 2980 " high:%lukB"
2981 " active_anon:%lukB" 2981 " active_anon:%lukB"
2982 " inactive_anon:%lukB" 2982 " inactive_anon:%lukB"
2983 " active_file:%lukB" 2983 " active_file:%lukB"
2984 " inactive_file:%lukB" 2984 " inactive_file:%lukB"
2985 " unevictable:%lukB" 2985 " unevictable:%lukB"
2986 " isolated(anon):%lukB" 2986 " isolated(anon):%lukB"
2987 " isolated(file):%lukB" 2987 " isolated(file):%lukB"
2988 " present:%lukB" 2988 " present:%lukB"
2989 " managed:%lukB" 2989 " managed:%lukB"
2990 " mlocked:%lukB" 2990 " mlocked:%lukB"
2991 " dirty:%lukB" 2991 " dirty:%lukB"
2992 " writeback:%lukB" 2992 " writeback:%lukB"
2993 " mapped:%lukB" 2993 " mapped:%lukB"
2994 " shmem:%lukB" 2994 " shmem:%lukB"
2995 " slab_reclaimable:%lukB" 2995 " slab_reclaimable:%lukB"
2996 " slab_unreclaimable:%lukB" 2996 " slab_unreclaimable:%lukB"
2997 " kernel_stack:%lukB" 2997 " kernel_stack:%lukB"
2998 " pagetables:%lukB" 2998 " pagetables:%lukB"
2999 " unstable:%lukB" 2999 " unstable:%lukB"
3000 " bounce:%lukB" 3000 " bounce:%lukB"
3001 " free_cma:%lukB" 3001 " free_cma:%lukB"
3002 " writeback_tmp:%lukB" 3002 " writeback_tmp:%lukB"
3003 " pages_scanned:%lu" 3003 " pages_scanned:%lu"
3004 " all_unreclaimable? %s" 3004 " all_unreclaimable? %s"
3005 "\n", 3005 "\n",
3006 zone->name, 3006 zone->name,
3007 K(zone_page_state(zone, NR_FREE_PAGES)), 3007 K(zone_page_state(zone, NR_FREE_PAGES)),
3008 K(min_wmark_pages(zone)), 3008 K(min_wmark_pages(zone)),
3009 K(low_wmark_pages(zone)), 3009 K(low_wmark_pages(zone)),
3010 K(high_wmark_pages(zone)), 3010 K(high_wmark_pages(zone)),
3011 K(zone_page_state(zone, NR_ACTIVE_ANON)), 3011 K(zone_page_state(zone, NR_ACTIVE_ANON)),
3012 K(zone_page_state(zone, NR_INACTIVE_ANON)), 3012 K(zone_page_state(zone, NR_INACTIVE_ANON)),
3013 K(zone_page_state(zone, NR_ACTIVE_FILE)), 3013 K(zone_page_state(zone, NR_ACTIVE_FILE)),
3014 K(zone_page_state(zone, NR_INACTIVE_FILE)), 3014 K(zone_page_state(zone, NR_INACTIVE_FILE)),
3015 K(zone_page_state(zone, NR_UNEVICTABLE)), 3015 K(zone_page_state(zone, NR_UNEVICTABLE)),
3016 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3016 K(zone_page_state(zone, NR_ISOLATED_ANON)),
3017 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3017 K(zone_page_state(zone, NR_ISOLATED_FILE)),
3018 K(zone->present_pages), 3018 K(zone->present_pages),
3019 K(zone->managed_pages), 3019 K(zone->managed_pages),
3020 K(zone_page_state(zone, NR_MLOCK)), 3020 K(zone_page_state(zone, NR_MLOCK)),
3021 K(zone_page_state(zone, NR_FILE_DIRTY)), 3021 K(zone_page_state(zone, NR_FILE_DIRTY)),
3022 K(zone_page_state(zone, NR_WRITEBACK)), 3022 K(zone_page_state(zone, NR_WRITEBACK)),
3023 K(zone_page_state(zone, NR_FILE_MAPPED)), 3023 K(zone_page_state(zone, NR_FILE_MAPPED)),
3024 K(zone_page_state(zone, NR_SHMEM)), 3024 K(zone_page_state(zone, NR_SHMEM)),
3025 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 3025 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
3026 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 3026 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
3027 zone_page_state(zone, NR_KERNEL_STACK) * 3027 zone_page_state(zone, NR_KERNEL_STACK) *
3028 THREAD_SIZE / 1024, 3028 THREAD_SIZE / 1024,
3029 K(zone_page_state(zone, NR_PAGETABLE)), 3029 K(zone_page_state(zone, NR_PAGETABLE)),
3030 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 3030 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
3031 K(zone_page_state(zone, NR_BOUNCE)), 3031 K(zone_page_state(zone, NR_BOUNCE)),
3032 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3032 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3033 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3033 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3034 zone->pages_scanned, 3034 zone->pages_scanned,
3035 (zone->all_unreclaimable ? "yes" : "no") 3035 (zone->all_unreclaimable ? "yes" : "no")
3036 ); 3036 );
3037 printk("lowmem_reserve[]:"); 3037 printk("lowmem_reserve[]:");
3038 for (i = 0; i < MAX_NR_ZONES; i++) 3038 for (i = 0; i < MAX_NR_ZONES; i++)
3039 printk(" %lu", zone->lowmem_reserve[i]); 3039 printk(" %lu", zone->lowmem_reserve[i]);
3040 printk("\n"); 3040 printk("\n");
3041 } 3041 }
3042 3042
3043 for_each_populated_zone(zone) { 3043 for_each_populated_zone(zone) {
3044 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3044 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3045 unsigned char types[MAX_ORDER]; 3045 unsigned char types[MAX_ORDER];
3046 3046
3047 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3047 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3048 continue; 3048 continue;
3049 show_node(zone); 3049 show_node(zone);
3050 printk("%s: ", zone->name); 3050 printk("%s: ", zone->name);
3051 3051
3052 spin_lock_irqsave(&zone->lock, flags); 3052 spin_lock_irqsave(&zone->lock, flags);
3053 for (order = 0; order < MAX_ORDER; order++) { 3053 for (order = 0; order < MAX_ORDER; order++) {
3054 struct free_area *area = &zone->free_area[order]; 3054 struct free_area *area = &zone->free_area[order];
3055 int type; 3055 int type;
3056 3056
3057 nr[order] = area->nr_free; 3057 nr[order] = area->nr_free;
3058 total += nr[order] << order; 3058 total += nr[order] << order;
3059 3059
3060 types[order] = 0; 3060 types[order] = 0;
3061 for (type = 0; type < MIGRATE_TYPES; type++) { 3061 for (type = 0; type < MIGRATE_TYPES; type++) {
3062 if (!list_empty(&area->free_list[type])) 3062 if (!list_empty(&area->free_list[type]))
3063 types[order] |= 1 << type; 3063 types[order] |= 1 << type;
3064 } 3064 }
3065 } 3065 }
3066 spin_unlock_irqrestore(&zone->lock, flags); 3066 spin_unlock_irqrestore(&zone->lock, flags);
3067 for (order = 0; order < MAX_ORDER; order++) { 3067 for (order = 0; order < MAX_ORDER; order++) {
3068 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3068 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3069 if (nr[order]) 3069 if (nr[order])
3070 show_migration_types(types[order]); 3070 show_migration_types(types[order]);
3071 } 3071 }
3072 printk("= %lukB\n", K(total)); 3072 printk("= %lukB\n", K(total));
3073 } 3073 }
3074 3074
3075 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 3075 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
3076 3076
3077 show_swap_cache_info(); 3077 show_swap_cache_info();
3078 } 3078 }
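An editorial sketch of how this dump is usually triggered: callers normally go through show_mem(), which prints a summary around show_free_areas(), passing SHOW_MEM_FILTER_NODES when output should be limited to the current cpuset's nodes. The wrapper below is hypothetical.

#include <linux/mm.h>

static void example_dump_mm_state(void)	/* hypothetical */
{
	/* prints per-zone counters, skipping nodes outside the current cpuset */
	show_mem(SHOW_MEM_FILTER_NODES);
}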
3079 3079
3080 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 3080 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3081 { 3081 {
3082 zoneref->zone = zone; 3082 zoneref->zone = zone;
3083 zoneref->zone_idx = zone_idx(zone); 3083 zoneref->zone_idx = zone_idx(zone);
3084 } 3084 }
3085 3085
3086 /* 3086 /*
3087 * Builds allocation fallback zone lists. 3087 * Builds allocation fallback zone lists.
3088 * 3088 *
3089 * Add all populated zones of a node to the zonelist. 3089 * Add all populated zones of a node to the zonelist.
3090 */ 3090 */
3091 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 3091 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3092 int nr_zones, enum zone_type zone_type) 3092 int nr_zones, enum zone_type zone_type)
3093 { 3093 {
3094 struct zone *zone; 3094 struct zone *zone;
3095 3095
3096 BUG_ON(zone_type >= MAX_NR_ZONES); 3096 BUG_ON(zone_type >= MAX_NR_ZONES);
3097 zone_type++; 3097 zone_type++;
3098 3098
3099 do { 3099 do {
3100 zone_type--; 3100 zone_type--;
3101 zone = pgdat->node_zones + zone_type; 3101 zone = pgdat->node_zones + zone_type;
3102 if (populated_zone(zone)) { 3102 if (populated_zone(zone)) {
3103 zoneref_set_zone(zone, 3103 zoneref_set_zone(zone,
3104 &zonelist->_zonerefs[nr_zones++]); 3104 &zonelist->_zonerefs[nr_zones++]);
3105 check_highest_zone(zone_type); 3105 check_highest_zone(zone_type);
3106 } 3106 }
3107 3107
3108 } while (zone_type); 3108 } while (zone_type);
3109 return nr_zones; 3109 return nr_zones;
3110 } 3110 }
3111 3111
3112 3112
3113 /* 3113 /*
3114 * zonelist_order: 3114 * zonelist_order:
3115 * 0 = automatic detection of better ordering. 3115 * 0 = automatic detection of better ordering.
3116 * 1 = order by ([node] distance, -zonetype) 3116 * 1 = order by ([node] distance, -zonetype)
3117 * 2 = order by (-zonetype, [node] distance) 3117 * 2 = order by (-zonetype, [node] distance)
3118 * 3118 *
3119 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 3119 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
3120 * the same zonelist. So only NUMA can configure this param. 3120 * the same zonelist. So only NUMA can configure this param.
3121 */ 3121 */
3122 #define ZONELIST_ORDER_DEFAULT 0 3122 #define ZONELIST_ORDER_DEFAULT 0
3123 #define ZONELIST_ORDER_NODE 1 3123 #define ZONELIST_ORDER_NODE 1
3124 #define ZONELIST_ORDER_ZONE 2 3124 #define ZONELIST_ORDER_ZONE 2
3125 3125
3126 /* zonelist order in the kernel. 3126 /* zonelist order in the kernel.
3127 * set_zonelist_order() will set this to NODE or ZONE. 3127 * set_zonelist_order() will set this to NODE or ZONE.
3128 */ 3128 */
3129 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 3129 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
3130 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 3130 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
3131 3131
3132 3132
3133 #ifdef CONFIG_NUMA 3133 #ifdef CONFIG_NUMA
3134 /* The value the user specified; may be changed by config */ 3134 /* The value the user specified; may be changed by config */
3135 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3135 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3136 /* string for sysctl */ 3136 /* string for sysctl */
3137 #define NUMA_ZONELIST_ORDER_LEN 16 3137 #define NUMA_ZONELIST_ORDER_LEN 16
3138 char numa_zonelist_order[16] = "default"; 3138 char numa_zonelist_order[16] = "default";
3139 3139
3140 /* 3140 /*
3141 * interface for configuring zonelist ordering. 3141 * interface for configuring zonelist ordering.
3142 * command line option "numa_zonelist_order" 3142 * command line option "numa_zonelist_order"
3143 * = "[dD]efault" - default, automatic configuration. 3143 * = "[dD]efault" - default, automatic configuration.
3144 * = "[nN]ode" - order by node locality, then by zone within node 3144 * = "[nN]ode" - order by node locality, then by zone within node
3145 * = "[zZ]one" - order by zone, then by locality within zone 3145 * = "[zZ]one" - order by zone, then by locality within zone
3146 */ 3146 */
3147 3147
3148 static int __parse_numa_zonelist_order(char *s) 3148 static int __parse_numa_zonelist_order(char *s)
3149 { 3149 {
3150 if (*s == 'd' || *s == 'D') { 3150 if (*s == 'd' || *s == 'D') {
3151 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3151 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3152 } else if (*s == 'n' || *s == 'N') { 3152 } else if (*s == 'n' || *s == 'N') {
3153 user_zonelist_order = ZONELIST_ORDER_NODE; 3153 user_zonelist_order = ZONELIST_ORDER_NODE;
3154 } else if (*s == 'z' || *s == 'Z') { 3154 } else if (*s == 'z' || *s == 'Z') {
3155 user_zonelist_order = ZONELIST_ORDER_ZONE; 3155 user_zonelist_order = ZONELIST_ORDER_ZONE;
3156 } else { 3156 } else {
3157 printk(KERN_WARNING 3157 printk(KERN_WARNING
3158 "Ignoring invalid numa_zonelist_order value: " 3158 "Ignoring invalid numa_zonelist_order value: "
3159 "%s\n", s); 3159 "%s\n", s);
3160 return -EINVAL; 3160 return -EINVAL;
3161 } 3161 }
3162 return 0; 3162 return 0;
3163 } 3163 }
3164 3164
3165 static __init int setup_numa_zonelist_order(char *s) 3165 static __init int setup_numa_zonelist_order(char *s)
3166 { 3166 {
3167 int ret; 3167 int ret;
3168 3168
3169 if (!s) 3169 if (!s)
3170 return 0; 3170 return 0;
3171 3171
3172 ret = __parse_numa_zonelist_order(s); 3172 ret = __parse_numa_zonelist_order(s);
3173 if (ret == 0) 3173 if (ret == 0)
3174 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 3174 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3175 3175
3176 return ret; 3176 return ret;
3177 } 3177 }
3178 early_param("numa_zonelist_order", setup_numa_zonelist_order); 3178 early_param("numa_zonelist_order", setup_numa_zonelist_order);
3179 3179
3180 /* 3180 /*
3181 * sysctl handler for numa_zonelist_order 3181 * sysctl handler for numa_zonelist_order
3182 */ 3182 */
3183 int numa_zonelist_order_handler(ctl_table *table, int write, 3183 int numa_zonelist_order_handler(ctl_table *table, int write,
3184 void __user *buffer, size_t *length, 3184 void __user *buffer, size_t *length,
3185 loff_t *ppos) 3185 loff_t *ppos)
3186 { 3186 {
3187 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 3187 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3188 int ret; 3188 int ret;
3189 static DEFINE_MUTEX(zl_order_mutex); 3189 static DEFINE_MUTEX(zl_order_mutex);
3190 3190
3191 mutex_lock(&zl_order_mutex); 3191 mutex_lock(&zl_order_mutex);
3192 if (write) 3192 if (write)
3193 strcpy(saved_string, (char*)table->data); 3193 strcpy(saved_string, (char*)table->data);
3194 ret = proc_dostring(table, write, buffer, length, ppos); 3194 ret = proc_dostring(table, write, buffer, length, ppos);
3195 if (ret) 3195 if (ret)
3196 goto out; 3196 goto out;
3197 if (write) { 3197 if (write) {
3198 int oldval = user_zonelist_order; 3198 int oldval = user_zonelist_order;
3199 if (__parse_numa_zonelist_order((char*)table->data)) { 3199 if (__parse_numa_zonelist_order((char*)table->data)) {
3200 /* 3200 /*
3201 * bogus value. restore saved string 3201 * bogus value. restore saved string
3202 */ 3202 */
3203 strncpy((char*)table->data, saved_string, 3203 strncpy((char*)table->data, saved_string,
3204 NUMA_ZONELIST_ORDER_LEN); 3204 NUMA_ZONELIST_ORDER_LEN);
3205 user_zonelist_order = oldval; 3205 user_zonelist_order = oldval;
3206 } else if (oldval != user_zonelist_order) { 3206 } else if (oldval != user_zonelist_order) {
3207 mutex_lock(&zonelists_mutex); 3207 mutex_lock(&zonelists_mutex);
3208 build_all_zonelists(NULL, NULL); 3208 build_all_zonelists(NULL, NULL);
3209 mutex_unlock(&zonelists_mutex); 3209 mutex_unlock(&zonelists_mutex);
3210 } 3210 }
3211 } 3211 }
3212 out: 3212 out:
3213 mutex_unlock(&zl_order_mutex); 3213 mutex_unlock(&zl_order_mutex);
3214 return ret; 3214 return ret;
3215 } 3215 }
3216 3216
3217 3217
3218 #define MAX_NODE_LOAD (nr_online_nodes) 3218 #define MAX_NODE_LOAD (nr_online_nodes)
3219 static int node_load[MAX_NUMNODES]; 3219 static int node_load[MAX_NUMNODES];
3220 3220
3221 /** 3221 /**
3222 * find_next_best_node - find the next node that should appear in a given node's fallback list 3222 * find_next_best_node - find the next node that should appear in a given node's fallback list
3223 * @node: node whose fallback list we're appending 3223 * @node: node whose fallback list we're appending
3224 * @used_node_mask: nodemask_t of already used nodes 3224 * @used_node_mask: nodemask_t of already used nodes
3225 * 3225 *
3226 * We use a number of factors to determine which is the next node that should 3226 * We use a number of factors to determine which is the next node that should
3227 * appear on a given node's fallback list. The node should not have appeared 3227 * appear on a given node's fallback list. The node should not have appeared
3228 * already in @node's fallback list, and it should be the next closest node 3228 * already in @node's fallback list, and it should be the next closest node
3229 * according to the distance array (which contains arbitrary distance values 3229 * according to the distance array (which contains arbitrary distance values
3230 * from each node to each node in the system), and should also prefer nodes 3230 * from each node to each node in the system), and should also prefer nodes
3231 * with no CPUs, since presumably they'll have very little allocation pressure 3231 * with no CPUs, since presumably they'll have very little allocation pressure
3232 * on them otherwise. 3232 * on them otherwise.
3233 * It returns -1 if no node is found. 3233 * It returns -1 if no node is found.
3234 */ 3234 */
3235 static int find_next_best_node(int node, nodemask_t *used_node_mask) 3235 static int find_next_best_node(int node, nodemask_t *used_node_mask)
3236 { 3236 {
3237 int n, val; 3237 int n, val;
3238 int min_val = INT_MAX; 3238 int min_val = INT_MAX;
3239 int best_node = -1; 3239 int best_node = -1;
3240 const struct cpumask *tmp = cpumask_of_node(0); 3240 const struct cpumask *tmp = cpumask_of_node(0);
3241 3241
3242 /* Use the local node if we haven't already */ 3242 /* Use the local node if we haven't already */
3243 if (!node_isset(node, *used_node_mask)) { 3243 if (!node_isset(node, *used_node_mask)) {
3244 node_set(node, *used_node_mask); 3244 node_set(node, *used_node_mask);
3245 return node; 3245 return node;
3246 } 3246 }
3247 3247
3248 for_each_node_state(n, N_MEMORY) { 3248 for_each_node_state(n, N_MEMORY) {
3249 3249
3250 /* Don't want a node to appear more than once */ 3250 /* Don't want a node to appear more than once */
3251 if (node_isset(n, *used_node_mask)) 3251 if (node_isset(n, *used_node_mask))
3252 continue; 3252 continue;
3253 3253
3254 /* Use the distance array to find the distance */ 3254 /* Use the distance array to find the distance */
3255 val = node_distance(node, n); 3255 val = node_distance(node, n);
3256 3256
3257 /* Penalize nodes under us ("prefer the next node") */ 3257 /* Penalize nodes under us ("prefer the next node") */
3258 val += (n < node); 3258 val += (n < node);
3259 3259
3260 /* Give preference to headless and unused nodes */ 3260 /* Give preference to headless and unused nodes */
3261 tmp = cpumask_of_node(n); 3261 tmp = cpumask_of_node(n);
3262 if (!cpumask_empty(tmp)) 3262 if (!cpumask_empty(tmp))
3263 val += PENALTY_FOR_NODE_WITH_CPUS; 3263 val += PENALTY_FOR_NODE_WITH_CPUS;
3264 3264
3265 /* Slight preference for less loaded node */ 3265 /* Slight preference for less loaded node */
3266 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 3266 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3267 val += node_load[n]; 3267 val += node_load[n];
3268 3268
3269 if (val < min_val) { 3269 if (val < min_val) {
3270 min_val = val; 3270 min_val = val;
3271 best_node = n; 3271 best_node = n;
3272 } 3272 }
3273 } 3273 }
3274 3274
3275 if (best_node >= 0) 3275 if (best_node >= 0)
3276 node_set(best_node, *used_node_mask); 3276 node_set(best_node, *used_node_mask);
3277 3277
3278 return best_node; 3278 return best_node;
3279 } 3279 }
3280 3280
3281 3281
3282 /* 3282 /*
3283 * Build zonelists ordered by node and zones within node. 3283 * Build zonelists ordered by node and zones within node.
3284 * This results in maximum locality--normal zone overflows into local 3284 * This results in maximum locality--normal zone overflows into local
3285 * DMA zone, if any--but risks exhausting DMA zone. 3285 * DMA zone, if any--but risks exhausting DMA zone.
3286 */ 3286 */
3287 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 3287 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3288 { 3288 {
3289 int j; 3289 int j;
3290 struct zonelist *zonelist; 3290 struct zonelist *zonelist;
3291 3291
3292 zonelist = &pgdat->node_zonelists[0]; 3292 zonelist = &pgdat->node_zonelists[0];
3293 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3293 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3294 ; 3294 ;
3295 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3295 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3296 MAX_NR_ZONES - 1); 3296 MAX_NR_ZONES - 1);
3297 zonelist->_zonerefs[j].zone = NULL; 3297 zonelist->_zonerefs[j].zone = NULL;
3298 zonelist->_zonerefs[j].zone_idx = 0; 3298 zonelist->_zonerefs[j].zone_idx = 0;
3299 } 3299 }
3300 3300
3301 /* 3301 /*
3302 * Build gfp_thisnode zonelists 3302 * Build gfp_thisnode zonelists
3303 */ 3303 */
3304 static void build_thisnode_zonelists(pg_data_t *pgdat) 3304 static void build_thisnode_zonelists(pg_data_t *pgdat)
3305 { 3305 {
3306 int j; 3306 int j;
3307 struct zonelist *zonelist; 3307 struct zonelist *zonelist;
3308 3308
3309 zonelist = &pgdat->node_zonelists[1]; 3309 zonelist = &pgdat->node_zonelists[1];
3310 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3310 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
3311 zonelist->_zonerefs[j].zone = NULL; 3311 zonelist->_zonerefs[j].zone = NULL;
3312 zonelist->_zonerefs[j].zone_idx = 0; 3312 zonelist->_zonerefs[j].zone_idx = 0;
3313 } 3313 }
3314 3314
3315 /* 3315 /*
3316 * Build zonelists ordered by zone and nodes within zones. 3316 * Build zonelists ordered by zone and nodes within zones.
3317 * This results in conserving DMA zone[s] until all Normal memory is 3317 * This results in conserving DMA zone[s] until all Normal memory is
3318 * exhausted, but results in overflowing to remote node while memory 3318 * exhausted, but results in overflowing to remote node while memory
3319 * may still exist in local DMA zone. 3319 * may still exist in local DMA zone.
3320 */ 3320 */
3321 static int node_order[MAX_NUMNODES]; 3321 static int node_order[MAX_NUMNODES];
3322 3322
3323 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 3323 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3324 { 3324 {
3325 int pos, j, node; 3325 int pos, j, node;
3326 int zone_type; /* needs to be signed */ 3326 int zone_type; /* needs to be signed */
3327 struct zone *z; 3327 struct zone *z;
3328 struct zonelist *zonelist; 3328 struct zonelist *zonelist;
3329 3329
3330 zonelist = &pgdat->node_zonelists[0]; 3330 zonelist = &pgdat->node_zonelists[0];
3331 pos = 0; 3331 pos = 0;
3332 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 3332 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3333 for (j = 0; j < nr_nodes; j++) { 3333 for (j = 0; j < nr_nodes; j++) {
3334 node = node_order[j]; 3334 node = node_order[j];
3335 z = &NODE_DATA(node)->node_zones[zone_type]; 3335 z = &NODE_DATA(node)->node_zones[zone_type];
3336 if (populated_zone(z)) { 3336 if (populated_zone(z)) {
3337 zoneref_set_zone(z, 3337 zoneref_set_zone(z,
3338 &zonelist->_zonerefs[pos++]); 3338 &zonelist->_zonerefs[pos++]);
3339 check_highest_zone(zone_type); 3339 check_highest_zone(zone_type);
3340 } 3340 }
3341 } 3341 }
3342 } 3342 }
3343 zonelist->_zonerefs[pos].zone = NULL; 3343 zonelist->_zonerefs[pos].zone = NULL;
3344 zonelist->_zonerefs[pos].zone_idx = 0; 3344 zonelist->_zonerefs[pos].zone_idx = 0;
3345 } 3345 }
3346 3346
3347 static int default_zonelist_order(void) 3347 static int default_zonelist_order(void)
3348 { 3348 {
3349 int nid, zone_type; 3349 int nid, zone_type;
3350 unsigned long low_kmem_size, total_size; 3350 unsigned long low_kmem_size, total_size;
3351 struct zone *z; 3351 struct zone *z;
3352 int average_size; 3352 int average_size;
3353 /* 3353 /*
3354 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system. 3354 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system.
3355 * If they are really small and used heavily, the system can fall 3355 * If they are really small and used heavily, the system can fall
3356 * into OOM very easily. 3356 * into OOM very easily.
3357 * This function detects the ZONE_DMA/DMA32 size and configures the zone order. 3357 * This function detects the ZONE_DMA/DMA32 size and configures the zone order.
3358 */ 3358 */
3359 /* Is there a ZONE_NORMAL? (e.g. ppc has only a DMA zone.) */ 3359 /* Is there a ZONE_NORMAL? (e.g. ppc has only a DMA zone.) */
3360 low_kmem_size = 0; 3360 low_kmem_size = 0;
3361 total_size = 0; 3361 total_size = 0;
3362 for_each_online_node(nid) { 3362 for_each_online_node(nid) {
3363 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3363 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3364 z = &NODE_DATA(nid)->node_zones[zone_type]; 3364 z = &NODE_DATA(nid)->node_zones[zone_type];
3365 if (populated_zone(z)) { 3365 if (populated_zone(z)) {
3366 if (zone_type < ZONE_NORMAL) 3366 if (zone_type < ZONE_NORMAL)
3367 low_kmem_size += z->present_pages; 3367 low_kmem_size += z->present_pages;
3368 total_size += z->present_pages; 3368 total_size += z->present_pages;
3369 } else if (zone_type == ZONE_NORMAL) { 3369 } else if (zone_type == ZONE_NORMAL) {
3370 /* 3370 /*
3371 * If any node has only lowmem, then node order 3371 * If any node has only lowmem, then node order
3372 * is preferred to allow kernel allocations 3372 * is preferred to allow kernel allocations
3373 * locally; otherwise, they can easily infringe 3373 * locally; otherwise, they can easily infringe
3374 * on other nodes when there is an abundance of 3374 * on other nodes when there is an abundance of
3375 * lowmem available to allocate from. 3375 * lowmem available to allocate from.
3376 */ 3376 */
3377 return ZONELIST_ORDER_NODE; 3377 return ZONELIST_ORDER_NODE;
3378 } 3378 }
3379 } 3379 }
3380 } 3380 }
3381 if (!low_kmem_size || /* there is no DMA area. */ 3381 if (!low_kmem_size || /* there is no DMA area. */
3382 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 3382 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3383 return ZONELIST_ORDER_NODE; 3383 return ZONELIST_ORDER_NODE;
3384 /* 3384 /*
3385 * look into each node's config. 3385 * look into each node's config.
3386 * If there is a node whose DMA/DMA32 memory covers a very large part of 3386 * If there is a node whose DMA/DMA32 memory covers a very large part of
3387 * its local memory, NODE_ORDER may be suitable. 3387 * its local memory, NODE_ORDER may be suitable.
3388 */ 3388 */
3389 average_size = total_size / 3389 average_size = total_size /
3390 (nodes_weight(node_states[N_MEMORY]) + 1); 3390 (nodes_weight(node_states[N_MEMORY]) + 1);
3391 for_each_online_node(nid) { 3391 for_each_online_node(nid) {
3392 low_kmem_size = 0; 3392 low_kmem_size = 0;
3393 total_size = 0; 3393 total_size = 0;
3394 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3394 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3395 z = &NODE_DATA(nid)->node_zones[zone_type]; 3395 z = &NODE_DATA(nid)->node_zones[zone_type];
3396 if (populated_zone(z)) { 3396 if (populated_zone(z)) {
3397 if (zone_type < ZONE_NORMAL) 3397 if (zone_type < ZONE_NORMAL)
3398 low_kmem_size += z->present_pages; 3398 low_kmem_size += z->present_pages;
3399 total_size += z->present_pages; 3399 total_size += z->present_pages;
3400 } 3400 }
3401 } 3401 }
3402 if (low_kmem_size && 3402 if (low_kmem_size &&
3403 total_size > average_size && /* ignore small node */ 3403 total_size > average_size && /* ignore small node */
3404 low_kmem_size > total_size * 70/100) 3404 low_kmem_size > total_size * 70/100)
3405 return ZONELIST_ORDER_NODE; 3405 return ZONELIST_ORDER_NODE;
3406 } 3406 }
3407 return ZONELIST_ORDER_ZONE; 3407 return ZONELIST_ORDER_ZONE;
3408 } 3408 }
3409 3409
3410 static void set_zonelist_order(void) 3410 static void set_zonelist_order(void)
3411 { 3411 {
3412 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 3412 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3413 current_zonelist_order = default_zonelist_order(); 3413 current_zonelist_order = default_zonelist_order();
3414 else 3414 else
3415 current_zonelist_order = user_zonelist_order; 3415 current_zonelist_order = user_zonelist_order;
3416 } 3416 }
3417 3417
3418 static void build_zonelists(pg_data_t *pgdat) 3418 static void build_zonelists(pg_data_t *pgdat)
3419 { 3419 {
3420 int j, node, load; 3420 int j, node, load;
3421 enum zone_type i; 3421 enum zone_type i;
3422 nodemask_t used_mask; 3422 nodemask_t used_mask;
3423 int local_node, prev_node; 3423 int local_node, prev_node;
3424 struct zonelist *zonelist; 3424 struct zonelist *zonelist;
3425 int order = current_zonelist_order; 3425 int order = current_zonelist_order;
3426 3426
3427 /* initialize zonelists */ 3427 /* initialize zonelists */
3428 for (i = 0; i < MAX_ZONELISTS; i++) { 3428 for (i = 0; i < MAX_ZONELISTS; i++) {
3429 zonelist = pgdat->node_zonelists + i; 3429 zonelist = pgdat->node_zonelists + i;
3430 zonelist->_zonerefs[0].zone = NULL; 3430 zonelist->_zonerefs[0].zone = NULL;
3431 zonelist->_zonerefs[0].zone_idx = 0; 3431 zonelist->_zonerefs[0].zone_idx = 0;
3432 } 3432 }
3433 3433
3434 /* NUMA-aware ordering of nodes */ 3434 /* NUMA-aware ordering of nodes */
3435 local_node = pgdat->node_id; 3435 local_node = pgdat->node_id;
3436 load = nr_online_nodes; 3436 load = nr_online_nodes;
3437 prev_node = local_node; 3437 prev_node = local_node;
3438 nodes_clear(used_mask); 3438 nodes_clear(used_mask);
3439 3439
3440 memset(node_order, 0, sizeof(node_order)); 3440 memset(node_order, 0, sizeof(node_order));
3441 j = 0; 3441 j = 0;
3442 3442
3443 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3443 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3444 /* 3444 /*
3445 * We don't want to pressure a particular node. 3445 * We don't want to pressure a particular node.
3446 * So we add a penalty to the first node in the same 3446 * So we add a penalty to the first node in the same
3447 * distance group to make it round-robin. 3447 * distance group to make it round-robin.
3448 */ 3448 */
3449 if (node_distance(local_node, node) != 3449 if (node_distance(local_node, node) !=
3450 node_distance(local_node, prev_node)) 3450 node_distance(local_node, prev_node))
3451 node_load[node] = load; 3451 node_load[node] = load;
3452 3452
3453 prev_node = node; 3453 prev_node = node;
3454 load--; 3454 load--;
3455 if (order == ZONELIST_ORDER_NODE) 3455 if (order == ZONELIST_ORDER_NODE)
3456 build_zonelists_in_node_order(pgdat, node); 3456 build_zonelists_in_node_order(pgdat, node);
3457 else 3457 else
3458 node_order[j++] = node; /* remember order */ 3458 node_order[j++] = node; /* remember order */
3459 } 3459 }
3460 3460
3461 if (order == ZONELIST_ORDER_ZONE) { 3461 if (order == ZONELIST_ORDER_ZONE) {
3462 /* calculate node order -- i.e., DMA last! */ 3462 /* calculate node order -- i.e., DMA last! */
3463 build_zonelists_in_zone_order(pgdat, j); 3463 build_zonelists_in_zone_order(pgdat, j);
3464 } 3464 }
3465 3465
3466 build_thisnode_zonelists(pgdat); 3466 build_thisnode_zonelists(pgdat);
3467 } 3467 }
3468 3468
3469 /* Construct the zonelist performance cache - see further mmzone.h */ 3469 /* Construct the zonelist performance cache - see further mmzone.h */
3470 static void build_zonelist_cache(pg_data_t *pgdat) 3470 static void build_zonelist_cache(pg_data_t *pgdat)
3471 { 3471 {
3472 struct zonelist *zonelist; 3472 struct zonelist *zonelist;
3473 struct zonelist_cache *zlc; 3473 struct zonelist_cache *zlc;
3474 struct zoneref *z; 3474 struct zoneref *z;
3475 3475
3476 zonelist = &pgdat->node_zonelists[0]; 3476 zonelist = &pgdat->node_zonelists[0];
3477 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 3477 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3478 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 3478 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3479 for (z = zonelist->_zonerefs; z->zone; z++) 3479 for (z = zonelist->_zonerefs; z->zone; z++)
3480 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 3480 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3481 } 3481 }
3482 3482
3483 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3483 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3484 /* 3484 /*
3485 * Return node id of node used for "local" allocations. 3485 * Return node id of node used for "local" allocations.
3486 * I.e., first node id of first zone in arg node's generic zonelist. 3486 * I.e., first node id of first zone in arg node's generic zonelist.
3487 * Used for initializing percpu 'numa_mem', which is used primarily 3487 * Used for initializing percpu 'numa_mem', which is used primarily
3488 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3488 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3489 */ 3489 */
3490 int local_memory_node(int node) 3490 int local_memory_node(int node)
3491 { 3491 {
3492 struct zone *zone; 3492 struct zone *zone;
3493 3493
3494 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3494 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3495 gfp_zone(GFP_KERNEL), 3495 gfp_zone(GFP_KERNEL),
3496 NULL, 3496 NULL,
3497 &zone); 3497 &zone);
3498 return zone->node; 3498 return zone->node;
3499 } 3499 }
3500 #endif 3500 #endif
3501 3501
3502 #else /* CONFIG_NUMA */ 3502 #else /* CONFIG_NUMA */
3503 3503
3504 static void set_zonelist_order(void) 3504 static void set_zonelist_order(void)
3505 { 3505 {
3506 current_zonelist_order = ZONELIST_ORDER_ZONE; 3506 current_zonelist_order = ZONELIST_ORDER_ZONE;
3507 } 3507 }
3508 3508
3509 static void build_zonelists(pg_data_t *pgdat) 3509 static void build_zonelists(pg_data_t *pgdat)
3510 { 3510 {
3511 int node, local_node; 3511 int node, local_node;
3512 enum zone_type j; 3512 enum zone_type j;
3513 struct zonelist *zonelist; 3513 struct zonelist *zonelist;
3514 3514
3515 local_node = pgdat->node_id; 3515 local_node = pgdat->node_id;
3516 3516
3517 zonelist = &pgdat->node_zonelists[0]; 3517 zonelist = &pgdat->node_zonelists[0];
3518 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3518 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
3519 3519
3520 /* 3520 /*
3521 * Now we build the zonelist so that it contains the zones 3521 * Now we build the zonelist so that it contains the zones
3522 * of all the other nodes. 3522 * of all the other nodes.
3523 * We don't want to pressure a particular node, so when 3523 * We don't want to pressure a particular node, so when
3524 * building the zones for node N, we make sure that the 3524 * building the zones for node N, we make sure that the
3525 * zones coming right after the local ones are those from 3525 * zones coming right after the local ones are those from
3526 * node N+1 (modulo N) 3526 * node N+1 (modulo N)
3527 */ 3527 */
3528 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3528 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3529 if (!node_online(node)) 3529 if (!node_online(node))
3530 continue; 3530 continue;
3531 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3531 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3532 MAX_NR_ZONES - 1); 3532 MAX_NR_ZONES - 1);
3533 } 3533 }
3534 for (node = 0; node < local_node; node++) { 3534 for (node = 0; node < local_node; node++) {
3535 if (!node_online(node)) 3535 if (!node_online(node))
3536 continue; 3536 continue;
3537 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3537 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3538 MAX_NR_ZONES - 1); 3538 MAX_NR_ZONES - 1);
3539 } 3539 }
3540 3540
3541 zonelist->_zonerefs[j].zone = NULL; 3541 zonelist->_zonerefs[j].zone = NULL;
3542 zonelist->_zonerefs[j].zone_idx = 0; 3542 zonelist->_zonerefs[j].zone_idx = 0;
3543 } 3543 }
3544 3544
3545 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3545 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3546 static void build_zonelist_cache(pg_data_t *pgdat) 3546 static void build_zonelist_cache(pg_data_t *pgdat)
3547 { 3547 {
3548 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3548 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3549 } 3549 }
3550 3550
3551 #endif /* CONFIG_NUMA */ 3551 #endif /* CONFIG_NUMA */
3552 3552
3553 /* 3553 /*
3554 * Boot pageset table. One per cpu which is going to be used for all 3554 * Boot pageset table. One per cpu which is going to be used for all
3555 * zones and all nodes. The parameters will be set in such a way 3555 * zones and all nodes. The parameters will be set in such a way
3556 * that an item put on a list will immediately be handed over to 3556 * that an item put on a list will immediately be handed over to
3557 * the buddy list. This is safe since pageset manipulation is done 3557 * the buddy list. This is safe since pageset manipulation is done
3558 * with interrupts disabled. 3558 * with interrupts disabled.
3559 * 3559 *
3560 * The boot_pagesets must be kept even after bootup is complete for 3560 * The boot_pagesets must be kept even after bootup is complete for
3561 * unused processors and/or zones. They do play a role for bootstrapping 3561 * unused processors and/or zones. They do play a role for bootstrapping
3562 * hotplugged processors. 3562 * hotplugged processors.
3563 * 3563 *
3564 * zoneinfo_show() and maybe other functions do 3564 * zoneinfo_show() and maybe other functions do
3565 * not check if the processor is online before following the pageset pointer. 3565 * not check if the processor is online before following the pageset pointer.
3566 * Other parts of the kernel may not check if the zone is available. 3566 * Other parts of the kernel may not check if the zone is available.
3567 */ 3567 */
3568 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3568 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3569 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3569 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3570 static void setup_zone_pageset(struct zone *zone); 3570 static void setup_zone_pageset(struct zone *zone);
3571 3571
3572 /* 3572 /*
3573 * Global mutex to protect against size modification of zonelists 3573 * Global mutex to protect against size modification of zonelists
3574 * as well as to serialize pageset setup for the new populated zone. 3574 * as well as to serialize pageset setup for the new populated zone.
3575 */ 3575 */
3576 DEFINE_MUTEX(zonelists_mutex); 3576 DEFINE_MUTEX(zonelists_mutex);
3577 3577
3578 /* The return value is int just for stop_machine() */ 3578 /* The return value is int just for stop_machine() */
3579 static int __build_all_zonelists(void *data) 3579 static int __build_all_zonelists(void *data)
3580 { 3580 {
3581 int nid; 3581 int nid;
3582 int cpu; 3582 int cpu;
3583 pg_data_t *self = data; 3583 pg_data_t *self = data;
3584 3584
3585 #ifdef CONFIG_NUMA 3585 #ifdef CONFIG_NUMA
3586 memset(node_load, 0, sizeof(node_load)); 3586 memset(node_load, 0, sizeof(node_load));
3587 #endif 3587 #endif
3588 3588
3589 if (self && !node_online(self->node_id)) { 3589 if (self && !node_online(self->node_id)) {
3590 build_zonelists(self); 3590 build_zonelists(self);
3591 build_zonelist_cache(self); 3591 build_zonelist_cache(self);
3592 } 3592 }
3593 3593
3594 for_each_online_node(nid) { 3594 for_each_online_node(nid) {
3595 pg_data_t *pgdat = NODE_DATA(nid); 3595 pg_data_t *pgdat = NODE_DATA(nid);
3596 3596
3597 build_zonelists(pgdat); 3597 build_zonelists(pgdat);
3598 build_zonelist_cache(pgdat); 3598 build_zonelist_cache(pgdat);
3599 } 3599 }
3600 3600
3601 /* 3601 /*
3602 * Initialize the boot_pagesets that are going to be used 3602 * Initialize the boot_pagesets that are going to be used
3603 * for bootstrapping processors. The real pagesets for 3603 * for bootstrapping processors. The real pagesets for
3604 * each zone will be allocated later when the per cpu 3604 * each zone will be allocated later when the per cpu
3605 * allocator is available. 3605 * allocator is available.
3606 * 3606 *
3607 * boot_pagesets are used also for bootstrapping offline 3607 * boot_pagesets are used also for bootstrapping offline
3608 * cpus if the system is already booted because the pagesets 3608 * cpus if the system is already booted because the pagesets
3609 * are needed to initialize allocators on a specific cpu too. 3609 * are needed to initialize allocators on a specific cpu too.
3610 * E.g. the percpu allocator needs the page allocator which 3610 * E.g. the percpu allocator needs the page allocator which
3611 * needs the percpu allocator in order to allocate its pagesets 3611 * needs the percpu allocator in order to allocate its pagesets
3612 * (a chicken-egg dilemma). 3612 * (a chicken-egg dilemma).
3613 */ 3613 */
3614 for_each_possible_cpu(cpu) { 3614 for_each_possible_cpu(cpu) {
3615 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3615 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3616 3616
3617 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3617 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3618 /* 3618 /*
3619 * We now know the "local memory node" for each node-- 3619 * We now know the "local memory node" for each node--
3620 * i.e., the node of the first zone in the generic zonelist. 3620 * i.e., the node of the first zone in the generic zonelist.
3621 * Set up numa_mem percpu variable for on-line cpus. During 3621 * Set up numa_mem percpu variable for on-line cpus. During
3622 * boot, only the boot cpu should be on-line; we'll init the 3622 * boot, only the boot cpu should be on-line; we'll init the
3623 * secondary cpus' numa_mem as they come on-line. During 3623 * secondary cpus' numa_mem as they come on-line. During
3624 * node/memory hotplug, we'll fixup all on-line cpus. 3624 * node/memory hotplug, we'll fixup all on-line cpus.
3625 */ 3625 */
3626 if (cpu_online(cpu)) 3626 if (cpu_online(cpu))
3627 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3627 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3628 #endif 3628 #endif
3629 } 3629 }
3630 3630
3631 return 0; 3631 return 0;
3632 } 3632 }
3633 3633
3634 /* 3634 /*
3635 * Called with zonelists_mutex held always 3635 * Called with zonelists_mutex held always
3636 * unless system_state == SYSTEM_BOOTING. 3636 * unless system_state == SYSTEM_BOOTING.
3637 */ 3637 */
3638 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3638 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3639 { 3639 {
3640 set_zonelist_order(); 3640 set_zonelist_order();
3641 3641
3642 if (system_state == SYSTEM_BOOTING) { 3642 if (system_state == SYSTEM_BOOTING) {
3643 __build_all_zonelists(NULL); 3643 __build_all_zonelists(NULL);
3644 mminit_verify_zonelist(); 3644 mminit_verify_zonelist();
3645 cpuset_init_current_mems_allowed(); 3645 cpuset_init_current_mems_allowed();
3646 } else { 3646 } else {
3647 /* we have to stop all cpus to guarantee there is no user 3647 /* we have to stop all cpus to guarantee there is no user
3648 of the zonelist */ 3648 of the zonelist */
3649 #ifdef CONFIG_MEMORY_HOTPLUG 3649 #ifdef CONFIG_MEMORY_HOTPLUG
3650 if (zone) 3650 if (zone)
3651 setup_zone_pageset(zone); 3651 setup_zone_pageset(zone);
3652 #endif 3652 #endif
3653 stop_machine(__build_all_zonelists, pgdat, NULL); 3653 stop_machine(__build_all_zonelists, pgdat, NULL);
3654 /* cpuset refresh routine should be here */ 3654 /* cpuset refresh routine should be here */
3655 } 3655 }
3656 vm_total_pages = nr_free_pagecache_pages(); 3656 vm_total_pages = nr_free_pagecache_pages();
3657 /* 3657 /*
3658 * Disable grouping by mobility if the number of pages in the 3658 * Disable grouping by mobility if the number of pages in the
3659 * system is too low to allow the mechanism to work. It would be 3659 * system is too low to allow the mechanism to work. It would be
3660 * more accurate, but expensive to check per-zone. This check is 3660 * more accurate, but expensive to check per-zone. This check is
3661 * made on memory-hotadd so a system can start with mobility 3661 * made on memory-hotadd so a system can start with mobility
3662 * disabled and enable it later 3662 * disabled and enable it later
3663 */ 3663 */
3664 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3664 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3665 page_group_by_mobility_disabled = 1; 3665 page_group_by_mobility_disabled = 1;
3666 else 3666 else
3667 page_group_by_mobility_disabled = 0; 3667 page_group_by_mobility_disabled = 0;
3668 3668
3669 printk("Built %i zonelists in %s order, mobility grouping %s. " 3669 printk("Built %i zonelists in %s order, mobility grouping %s. "
3670 "Total pages: %ld\n", 3670 "Total pages: %ld\n",
3671 nr_online_nodes, 3671 nr_online_nodes,
3672 zonelist_order_name[current_zonelist_order], 3672 zonelist_order_name[current_zonelist_order],
3673 page_group_by_mobility_disabled ? "off" : "on", 3673 page_group_by_mobility_disabled ? "off" : "on",
3674 vm_total_pages); 3674 vm_total_pages);
3675 #ifdef CONFIG_NUMA 3675 #ifdef CONFIG_NUMA
3676 printk("Policy zone: %s\n", zone_names[policy_zone]); 3676 printk("Policy zone: %s\n", zone_names[policy_zone]);
3677 #endif 3677 #endif
3678 } 3678 }
3679 3679
3680 /* 3680 /*
3681 * Helper functions to size the waitqueue hash table. 3681 * Helper functions to size the waitqueue hash table.
3682 * Essentially these want to choose hash table sizes sufficiently 3682 * Essentially these want to choose hash table sizes sufficiently
3683 * large so that collisions trying to wait on pages are rare. 3683 * large so that collisions trying to wait on pages are rare.
3684 * But in fact, the number of active page waitqueues on typical 3684 * But in fact, the number of active page waitqueues on typical
3685 * systems is ridiculously low, less than 200. So this is 3685 * systems is ridiculously low, less than 200. So this is
3686 * conservative, even though it seems large. 3686 * conservative, even though it seems large.
3687 * 3687 *
3688 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3688 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3689 * waitqueues, i.e. the size of the waitq table given the number of pages. 3689 * waitqueues, i.e. the size of the waitq table given the number of pages.
3690 */ 3690 */
3691 #define PAGES_PER_WAITQUEUE 256 3691 #define PAGES_PER_WAITQUEUE 256
3692 3692
3693 #ifndef CONFIG_MEMORY_HOTPLUG 3693 #ifndef CONFIG_MEMORY_HOTPLUG
3694 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3694 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3695 { 3695 {
3696 unsigned long size = 1; 3696 unsigned long size = 1;
3697 3697
3698 pages /= PAGES_PER_WAITQUEUE; 3698 pages /= PAGES_PER_WAITQUEUE;
3699 3699
3700 while (size < pages) 3700 while (size < pages)
3701 size <<= 1; 3701 size <<= 1;
3702 3702
3703 /* 3703 /*
3704 * Once we have dozens or even hundreds of threads sleeping 3704 * Once we have dozens or even hundreds of threads sleeping
3705 * on IO we've got bigger problems than wait queue collision. 3705 * on IO we've got bigger problems than wait queue collision.
3706 * Limit the size of the wait table to a reasonable size. 3706 * Limit the size of the wait table to a reasonable size.
3707 */ 3707 */
3708 size = min(size, 4096UL); 3708 size = min(size, 4096UL);
3709 3709
3710 return max(size, 4UL); 3710 return max(size, 4UL);
3711 } 3711 }
3712 #else 3712 #else
3713 /* 3713 /*
3714 * A zone's size might be changed by hot-add, so it is not possible to determine 3714 * A zone's size might be changed by hot-add, so it is not possible to determine
3715 * a suitable size for its wait_table. So we use the maximum size now. 3715 * a suitable size for its wait_table. So we use the maximum size now.
3716 * 3716 *
3717 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3717 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3718 * 3718 *
3719 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3719 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3720 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3720 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3721 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3721 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3722 * 3722 *
3723 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 3723 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
3724 * or more by the traditional way. (See above). It equals: 3724 * or more by the traditional way. (See above). It equals:
3725 * 3725 *
3726 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3726 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3727 * ia64(16K page size) : = ( 8G + 4M)byte. 3727 * ia64(16K page size) : = ( 8G + 4M)byte.
3728 * powerpc (64K page size) : = (32G +16M)byte. 3728 * powerpc (64K page size) : = (32G +16M)byte.
3729 */ 3729 */
3730 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3730 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3731 { 3731 {
3732 return 4096UL; 3732 return 4096UL;
3733 } 3733 }
3734 #endif 3734 #endif
3735 3735
3736 /* 3736 /*
3737 * This is an integer logarithm so that shifts can be used later 3737 * This is an integer logarithm so that shifts can be used later
3738 * to extract the more random high bits from the multiplicative 3738 * to extract the more random high bits from the multiplicative
3739 * hash function before the remainder is taken. 3739 * hash function before the remainder is taken.
3740 */ 3740 */
3741 static inline unsigned long wait_table_bits(unsigned long size) 3741 static inline unsigned long wait_table_bits(unsigned long size)
3742 { 3742 {
3743 return ffz(~size); 3743 return ffz(~size);
3744 } 3744 }
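
The sizing policy above is easier to see in isolation. The following standalone sketch (not kernel code; the 1 GiB zone with 4 KiB pages is assumed purely for illustration) reproduces the !CONFIG_MEMORY_HOTPLUG sizing together with the wait_table_bits() logarithm:

/* Standalone sketch of the waitqueue-table sizing above. */
#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256UL

static unsigned long table_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;			/* smallest power of two >= pages */
	if (size > 4096UL)
		size = 4096UL;			/* cap the table size */
	return size < 4UL ? 4UL : size;		/* never fewer than 4 entries */
}

/* log2 of a power-of-two size, standing in for ffz(~size) */
static unsigned long table_bits(unsigned long size)
{
	unsigned long bits = 0;

	while (size > 1) {
		size >>= 1;
		bits++;
	}
	return bits;
}

int main(void)
{
	/* a 1 GiB zone with 4 KiB pages spans 262144 pages */
	unsigned long entries = table_entries(262144UL);

	printf("entries=%lu bits=%lu\n", entries, table_bits(entries));
	return 0;
}

For that zone size this prints entries=1024 bits=10, i.e. one waitqueue head per 256 pages until the 4096-entry cap is reached.
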
3745 3745
3746 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3746 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3747 3747
3748 /* 3748 /*
3749 * Check if a pageblock contains reserved pages 3749 * Check if a pageblock contains reserved pages
3750 */ 3750 */
3751 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3751 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3752 { 3752 {
3753 unsigned long pfn; 3753 unsigned long pfn;
3754 3754
3755 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3755 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3756 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3756 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3757 return 1; 3757 return 1;
3758 } 3758 }
3759 return 0; 3759 return 0;
3760 } 3760 }
3761 3761
3762 /* 3762 /*
3763 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3763 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3764 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3764 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3765 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3765 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3766 * higher will lead to a bigger reserve which will get freed as contiguous 3766 * higher will lead to a bigger reserve which will get freed as contiguous
3767 * blocks as reclaim kicks in 3767 * blocks as reclaim kicks in
3768 */ 3768 */
3769 static void setup_zone_migrate_reserve(struct zone *zone) 3769 static void setup_zone_migrate_reserve(struct zone *zone)
3770 { 3770 {
3771 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3771 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3772 struct page *page; 3772 struct page *page;
3773 unsigned long block_migratetype; 3773 unsigned long block_migratetype;
3774 int reserve; 3774 int reserve;
3775 3775
3776 /* 3776 /*
3777 * Get the start pfn, end pfn and the number of blocks to reserve 3777 * Get the start pfn, end pfn and the number of blocks to reserve
3778 * We have to be careful to be aligned to pageblock_nr_pages to 3778 * We have to be careful to be aligned to pageblock_nr_pages to
3779 * make sure that we always check pfn_valid for the first page in 3779 * make sure that we always check pfn_valid for the first page in
3780 * the block. 3780 * the block.
3781 */ 3781 */
3782 start_pfn = zone->zone_start_pfn; 3782 start_pfn = zone->zone_start_pfn;
3783 end_pfn = start_pfn + zone->spanned_pages; 3783 end_pfn = start_pfn + zone->spanned_pages;
3784 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3784 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3785 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3785 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3786 pageblock_order; 3786 pageblock_order;
3787 3787
3788 /* 3788 /*
3789 * Reserve blocks are generally in place to help high-order atomic 3789 * Reserve blocks are generally in place to help high-order atomic
3790 * allocations that are short-lived. A min_free_kbytes value that 3790 * allocations that are short-lived. A min_free_kbytes value that
3791 * would result in more than 2 reserve blocks for atomic allocations 3791 * would result in more than 2 reserve blocks for atomic allocations
3792 * is assumed to be in place to help anti-fragmentation for the 3792 * is assumed to be in place to help anti-fragmentation for the
3793 * future allocation of hugepages at runtime. 3793 * future allocation of hugepages at runtime.
3794 */ 3794 */
3795 reserve = min(2, reserve); 3795 reserve = min(2, reserve);
3796 3796
3797 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3797 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3798 if (!pfn_valid(pfn)) 3798 if (!pfn_valid(pfn))
3799 continue; 3799 continue;
3800 page = pfn_to_page(pfn); 3800 page = pfn_to_page(pfn);
3801 3801
3802 /* Watch out for overlapping nodes */ 3802 /* Watch out for overlapping nodes */
3803 if (page_to_nid(page) != zone_to_nid(zone)) 3803 if (page_to_nid(page) != zone_to_nid(zone))
3804 continue; 3804 continue;
3805 3805
3806 block_migratetype = get_pageblock_migratetype(page); 3806 block_migratetype = get_pageblock_migratetype(page);
3807 3807
3808 /* Only test what is necessary when the reserves are not met */ 3808 /* Only test what is necessary when the reserves are not met */
3809 if (reserve > 0) { 3809 if (reserve > 0) {
3810 /* 3810 /*
3811 * Blocks with reserved pages will never be freed, so skip 3811 * Blocks with reserved pages will never be freed, so skip
3812 * them. 3812 * them.
3813 */ 3813 */
3814 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 3814 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3815 if (pageblock_is_reserved(pfn, block_end_pfn)) 3815 if (pageblock_is_reserved(pfn, block_end_pfn))
3816 continue; 3816 continue;
3817 3817
3818 /* If this block is reserved, account for it */ 3818 /* If this block is reserved, account for it */
3819 if (block_migratetype == MIGRATE_RESERVE) { 3819 if (block_migratetype == MIGRATE_RESERVE) {
3820 reserve--; 3820 reserve--;
3821 continue; 3821 continue;
3822 } 3822 }
3823 3823
3824 /* Suitable for reserving if this block is movable */ 3824 /* Suitable for reserving if this block is movable */
3825 if (block_migratetype == MIGRATE_MOVABLE) { 3825 if (block_migratetype == MIGRATE_MOVABLE) {
3826 set_pageblock_migratetype(page, 3826 set_pageblock_migratetype(page,
3827 MIGRATE_RESERVE); 3827 MIGRATE_RESERVE);
3828 move_freepages_block(zone, page, 3828 move_freepages_block(zone, page,
3829 MIGRATE_RESERVE); 3829 MIGRATE_RESERVE);
3830 reserve--; 3830 reserve--;
3831 continue; 3831 continue;
3832 } 3832 }
3833 } 3833 }
3834 3834
3835 /* 3835 /*
3836 * If the reserve is met and this is a previous reserved block, 3836 * If the reserve is met and this is a previous reserved block,
3837 * take it back 3837 * take it back
3838 */ 3838 */
3839 if (block_migratetype == MIGRATE_RESERVE) { 3839 if (block_migratetype == MIGRATE_RESERVE) {
3840 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3840 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3841 move_freepages_block(zone, page, MIGRATE_MOVABLE); 3841 move_freepages_block(zone, page, MIGRATE_MOVABLE);
3842 } 3842 }
3843 } 3843 }
3844 } 3844 }
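
The number of pageblocks the loop above tries to keep as MIGRATE_RESERVE follows directly from min_wmark_pages(zone). A minimal sketch of that arithmetic, assuming a pageblock_order of 9 (2 MiB blocks with 4 KiB pages; an illustrative value, not something fixed by this file):

/* Sketch of the reserve-block count computed above; not kernel code. */
#include <stdio.h>

#define PAGEBLOCK_ORDER     9UL
#define PAGEBLOCK_NR_PAGES  (1UL << PAGEBLOCK_ORDER)

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

static int reserve_blocks(unsigned long min_wmark_pages)
{
	int reserve = (int)(roundup_ul(min_wmark_pages, PAGEBLOCK_NR_PAGES)
			    >> PAGEBLOCK_ORDER);

	/* more than two reserve blocks are assumed to serve hugepages */
	return reserve < 2 ? reserve : 2;
}

int main(void)
{
	/* a 1000-page minimum watermark rounds up to two 512-page blocks */
	printf("reserve=%d\n", reserve_blocks(1000));
	return 0;
}

With a 1000-page minimum watermark this prints reserve=2, the cap described in the comment above.
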
3845 3845
3846 /* 3846 /*
3847 * Initially all pages are reserved - free ones are freed 3847 * Initially all pages are reserved - free ones are freed
3848 * up by free_all_bootmem() once the early boot process is 3848 * up by free_all_bootmem() once the early boot process is
3849 * done. Non-atomic initialization, single-pass. 3849 * done. Non-atomic initialization, single-pass.
3850 */ 3850 */
3851 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 3851 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3852 unsigned long start_pfn, enum memmap_context context) 3852 unsigned long start_pfn, enum memmap_context context)
3853 { 3853 {
3854 struct page *page; 3854 struct page *page;
3855 unsigned long end_pfn = start_pfn + size; 3855 unsigned long end_pfn = start_pfn + size;
3856 unsigned long pfn; 3856 unsigned long pfn;
3857 struct zone *z; 3857 struct zone *z;
3858 3858
3859 if (highest_memmap_pfn < end_pfn - 1) 3859 if (highest_memmap_pfn < end_pfn - 1)
3860 highest_memmap_pfn = end_pfn - 1; 3860 highest_memmap_pfn = end_pfn - 1;
3861 3861
3862 z = &NODE_DATA(nid)->node_zones[zone]; 3862 z = &NODE_DATA(nid)->node_zones[zone];
3863 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3863 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3864 /* 3864 /*
3865 * There can be holes in boot-time mem_map[]s 3865 * There can be holes in boot-time mem_map[]s
3866 * handed to this function. They do not 3866 * handed to this function. They do not
3867 * exist on hotplugged memory. 3867 * exist on hotplugged memory.
3868 */ 3868 */
3869 if (context == MEMMAP_EARLY) { 3869 if (context == MEMMAP_EARLY) {
3870 if (!early_pfn_valid(pfn)) 3870 if (!early_pfn_valid(pfn))
3871 continue; 3871 continue;
3872 if (!early_pfn_in_nid(pfn, nid)) 3872 if (!early_pfn_in_nid(pfn, nid))
3873 continue; 3873 continue;
3874 } 3874 }
3875 page = pfn_to_page(pfn); 3875 page = pfn_to_page(pfn);
3876 set_page_links(page, zone, nid, pfn); 3876 set_page_links(page, zone, nid, pfn);
3877 mminit_verify_page_links(page, zone, nid, pfn); 3877 mminit_verify_page_links(page, zone, nid, pfn);
3878 init_page_count(page); 3878 init_page_count(page);
3879 reset_page_mapcount(page); 3879 reset_page_mapcount(page);
3880 reset_page_last_nid(page); 3880 reset_page_last_nid(page);
3881 SetPageReserved(page); 3881 SetPageReserved(page);
3882 /* 3882 /*
3883 * Mark the block movable so that blocks are reserved for 3883 * Mark the block movable so that blocks are reserved for
3884 * movable allocations at startup. This will force kernel allocations 3884 * movable allocations at startup. This will force kernel allocations
3885 * to reserve their blocks rather than leaking throughout 3885 * to reserve their blocks rather than leaking throughout
3886 * the address space during boot when many long-lived 3886 * the address space during boot when many long-lived
3887 * kernel allocations are made. Later some blocks near 3887 * kernel allocations are made. Later some blocks near
3888 * the start are marked MIGRATE_RESERVE by 3888 * the start are marked MIGRATE_RESERVE by
3889 * setup_zone_migrate_reserve() 3889 * setup_zone_migrate_reserve()
3890 * 3890 *
3891 * The bitmap is created for the zone's valid pfn range, but the memmap 3891 * The bitmap is created for the zone's valid pfn range, but the memmap
3892 * can be created for invalid pages (for alignment), so check 3892 * can be created for invalid pages (for alignment), so check
3893 * here that set_pageblock_migratetype() is not called against a 3893 * here that set_pageblock_migratetype() is not called against a
3894 * pfn outside the zone. 3894 * pfn outside the zone.
3895 */ 3895 */
3896 if ((z->zone_start_pfn <= pfn) 3896 if ((z->zone_start_pfn <= pfn)
3897 && (pfn < z->zone_start_pfn + z->spanned_pages) 3897 && (pfn < z->zone_start_pfn + z->spanned_pages)
3898 && !(pfn & (pageblock_nr_pages - 1))) 3898 && !(pfn & (pageblock_nr_pages - 1)))
3899 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3899 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3900 3900
3901 INIT_LIST_HEAD(&page->lru); 3901 INIT_LIST_HEAD(&page->lru);
3902 #ifdef WANT_PAGE_VIRTUAL 3902 #ifdef WANT_PAGE_VIRTUAL
3903 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 3903 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
3904 if (!is_highmem_idx(zone)) 3904 if (!is_highmem_idx(zone))
3905 set_page_address(page, __va(pfn << PAGE_SHIFT)); 3905 set_page_address(page, __va(pfn << PAGE_SHIFT));
3906 #endif 3906 #endif
3907 } 3907 }
3908 } 3908 }
3909 3909
3910 static void __meminit zone_init_free_lists(struct zone *zone) 3910 static void __meminit zone_init_free_lists(struct zone *zone)
3911 { 3911 {
3912 int order, t; 3912 int order, t;
3913 for_each_migratetype_order(order, t) { 3913 for_each_migratetype_order(order, t) {
3914 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 3914 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
3915 zone->free_area[order].nr_free = 0; 3915 zone->free_area[order].nr_free = 0;
3916 } 3916 }
3917 } 3917 }
3918 3918
3919 #ifndef __HAVE_ARCH_MEMMAP_INIT 3919 #ifndef __HAVE_ARCH_MEMMAP_INIT
3920 #define memmap_init(size, nid, zone, start_pfn) \ 3920 #define memmap_init(size, nid, zone, start_pfn) \
3921 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 3921 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3922 #endif 3922 #endif
3923 3923
3924 static int __meminit zone_batchsize(struct zone *zone) 3924 static int __meminit zone_batchsize(struct zone *zone)
3925 { 3925 {
3926 #ifdef CONFIG_MMU 3926 #ifdef CONFIG_MMU
3927 int batch; 3927 int batch;
3928 3928
3929 /* 3929 /*
3930 * The per-cpu-pages pools are set to around 1/1000th of the 3930 * The per-cpu-pages pools are set to around 1/1000th of the
3931 * size of the zone. But no more than 1/2 of a meg. 3931 * size of the zone. But no more than 1/2 of a meg.
3932 * 3932 *
3933 * OK, so we don't know how big the cache is. So guess. 3933 * OK, so we don't know how big the cache is. So guess.
3934 */ 3934 */
3935 batch = zone->present_pages / 1024; 3935 batch = zone->present_pages / 1024;
3936 if (batch * PAGE_SIZE > 512 * 1024) 3936 if (batch * PAGE_SIZE > 512 * 1024)
3937 batch = (512 * 1024) / PAGE_SIZE; 3937 batch = (512 * 1024) / PAGE_SIZE;
3938 batch /= 4; /* We effectively *= 4 below */ 3938 batch /= 4; /* We effectively *= 4 below */
3939 if (batch < 1) 3939 if (batch < 1)
3940 batch = 1; 3940 batch = 1;
3941 3941
3942 /* 3942 /*
3943 * Clamp the batch to a 2^n - 1 value. Having a power 3943 * Clamp the batch to a 2^n - 1 value. Having a power
3944 * of 2 value was found to be more likely to have 3944 * of 2 value was found to be more likely to have
3945 * suboptimal cache aliasing properties in some cases. 3945 * suboptimal cache aliasing properties in some cases.
3946 * 3946 *
3947 * For example if 2 tasks are alternately allocating 3947 * For example if 2 tasks are alternately allocating
3948 * batches of pages, one task can end up with a lot 3948 * batches of pages, one task can end up with a lot
3949 * of pages of one half of the possible page colors 3949 * of pages of one half of the possible page colors
3950 * and the other with pages of the other colors. 3950 * and the other with pages of the other colors.
3951 */ 3951 */
3952 batch = rounddown_pow_of_two(batch + batch/2) - 1; 3952 batch = rounddown_pow_of_two(batch + batch/2) - 1;
3953 3953
3954 return batch; 3954 return batch;
3955 3955
3956 #else 3956 #else
3957 /* The deferral and batching of frees should be suppressed under NOMMU 3957 /* The deferral and batching of frees should be suppressed under NOMMU
3958 * conditions. 3958 * conditions.
3959 * 3959 *
3960 * The problem is that NOMMU needs to be able to allocate large chunks 3960 * The problem is that NOMMU needs to be able to allocate large chunks
3961 * of contiguous memory as there's no hardware page translation to 3961 * of contiguous memory as there's no hardware page translation to
3962 * assemble apparent contiguous memory from discontiguous pages. 3962 * assemble apparent contiguous memory from discontiguous pages.
3963 * 3963 *
3964 * Queueing large contiguous runs of pages for batching, however, 3964 * Queueing large contiguous runs of pages for batching, however,
3965 * causes the pages to actually be freed in smaller chunks. As there 3965 * causes the pages to actually be freed in smaller chunks. As there
3966 * can be a significant delay between the individual batches being 3966 * can be a significant delay between the individual batches being
3967 * recycled, this leads to the once large chunks of space being 3967 * recycled, this leads to the once large chunks of space being
3968 * fragmented and becoming unavailable for high-order allocations. 3968 * fragmented and becoming unavailable for high-order allocations.
3969 */ 3969 */
3970 return 0; 3970 return 0;
3971 #endif 3971 #endif
3972 } 3972 }
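
A userspace sketch of the CONFIG_MMU branch above may help make the numbers concrete; the 4 KiB page size and the 1 GiB zone are assumptions for the example only:

/* Sketch of the pcp batch sizing above; not kernel code. */
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* round down to a power of two, as rounddown_pow_of_two() does */
static unsigned long rounddown_pow2(unsigned long x)
{
	unsigned long p = 1;

	while (p * 2 <= x)
		p *= 2;
	return p;
}

static int zone_batchsize(unsigned long present_pages)
{
	unsigned long batch = present_pages / 1024;	/* ~1/1000th of the zone */

	if (batch * PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / PAGE_SIZE;	/* cap at half a meg */
	batch /= 4;			/* setup_pageset() later sets pcp->high to 6 * batch */
	if (batch < 1)
		batch = 1;
	/* settle on a 2^n - 1 value to avoid page-color cache aliasing */
	return (int)(rounddown_pow2(batch + batch / 2) - 1);
}

int main(void)
{
	/* a zone with 1 GiB present: 262144 pages */
	printf("batch=%d\n", zone_batchsize(262144UL));
	return 0;
}

For a 262144-page zone this works out to batch=31, a 2^n - 1 value as the comment above requires.
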
3973 3973
3974 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 3974 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
3975 { 3975 {
3976 struct per_cpu_pages *pcp; 3976 struct per_cpu_pages *pcp;
3977 int migratetype; 3977 int migratetype;
3978 3978
3979 memset(p, 0, sizeof(*p)); 3979 memset(p, 0, sizeof(*p));
3980 3980
3981 pcp = &p->pcp; 3981 pcp = &p->pcp;
3982 pcp->count = 0; 3982 pcp->count = 0;
3983 pcp->high = 6 * batch; 3983 pcp->high = 6 * batch;
3984 pcp->batch = max(1UL, 1 * batch); 3984 pcp->batch = max(1UL, 1 * batch);
3985 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 3985 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
3986 INIT_LIST_HEAD(&pcp->lists[migratetype]); 3986 INIT_LIST_HEAD(&pcp->lists[migratetype]);
3987 } 3987 }
3988 3988
3989 /* 3989 /*
3990 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 3990 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
3991 * to the value high for the pageset p. 3991 * to the value high for the pageset p.
3992 */ 3992 */
3993 3993
3994 static void setup_pagelist_highmark(struct per_cpu_pageset *p, 3994 static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3995 unsigned long high) 3995 unsigned long high)
3996 { 3996 {
3997 struct per_cpu_pages *pcp; 3997 struct per_cpu_pages *pcp;
3998 3998
3999 pcp = &p->pcp; 3999 pcp = &p->pcp;
4000 pcp->high = high; 4000 pcp->high = high;
4001 pcp->batch = max(1UL, high/4); 4001 pcp->batch = max(1UL, high/4);
4002 if ((high/4) > (PAGE_SHIFT * 8)) 4002 if ((high/4) > (PAGE_SHIFT * 8))
4003 pcp->batch = PAGE_SHIFT * 8; 4003 pcp->batch = PAGE_SHIFT * 8;
4004 } 4004 }
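
When the percpu_pagelist_fraction sysctl is set, setup_zone_pageset() below feeds zone->present_pages / percpu_pagelist_fraction into this helper. A small sketch of the resulting high/batch pair, assuming a PAGE_SHIFT of 12 (illustrative only):

/* Sketch of the high-water-mark/batch relationship above; not kernel code. */
#include <stdio.h>

#define PAGE_SHIFT 12UL

static void pagelist_highmark(unsigned long high,
			      unsigned long *out_high, unsigned long *out_batch)
{
	unsigned long batch = high / 4 ? high / 4 : 1;	/* max(1, high/4) */

	if (high / 4 > PAGE_SHIFT * 8)
		batch = PAGE_SHIFT * 8;			/* clamp to 96 pages */
	*out_high = high;
	*out_batch = batch;
}

int main(void)
{
	unsigned long high, batch;

	/* a 262144-page zone and percpu_pagelist_fraction == 8 */
	pagelist_highmark(262144UL / 8, &high, &batch);
	printf("high=%lu batch=%lu\n", high, batch);
	return 0;
}

A 262144-page zone with a fraction of 8 yields high=32768 and batch=96, the PAGE_SHIFT * 8 clamp.
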
4005 4005
4006 static void __meminit setup_zone_pageset(struct zone *zone) 4006 static void __meminit setup_zone_pageset(struct zone *zone)
4007 { 4007 {
4008 int cpu; 4008 int cpu;
4009 4009
4010 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4010 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4011 4011
4012 for_each_possible_cpu(cpu) { 4012 for_each_possible_cpu(cpu) {
4013 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4013 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4014 4014
4015 setup_pageset(pcp, zone_batchsize(zone)); 4015 setup_pageset(pcp, zone_batchsize(zone));
4016 4016
4017 if (percpu_pagelist_fraction) 4017 if (percpu_pagelist_fraction)
4018 setup_pagelist_highmark(pcp, 4018 setup_pagelist_highmark(pcp,
4019 (zone->present_pages / 4019 (zone->present_pages /
4020 percpu_pagelist_fraction)); 4020 percpu_pagelist_fraction));
4021 } 4021 }
4022 } 4022 }
4023 4023
4024 /* 4024 /*
4025 * Allocate per cpu pagesets and initialize them. 4025 * Allocate per cpu pagesets and initialize them.
4026 * Before this call only boot pagesets were available. 4026 * Before this call only boot pagesets were available.
4027 */ 4027 */
4028 void __init setup_per_cpu_pageset(void) 4028 void __init setup_per_cpu_pageset(void)
4029 { 4029 {
4030 struct zone *zone; 4030 struct zone *zone;
4031 4031
4032 for_each_populated_zone(zone) 4032 for_each_populated_zone(zone)
4033 setup_zone_pageset(zone); 4033 setup_zone_pageset(zone);
4034 } 4034 }
4035 4035
4036 static noinline __init_refok 4036 static noinline __init_refok
4037 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 4037 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4038 { 4038 {
4039 int i; 4039 int i;
4040 struct pglist_data *pgdat = zone->zone_pgdat; 4040 struct pglist_data *pgdat = zone->zone_pgdat;
4041 size_t alloc_size; 4041 size_t alloc_size;
4042 4042
4043 /* 4043 /*
4044 * The per-page waitqueue mechanism uses hashed waitqueues 4044 * The per-page waitqueue mechanism uses hashed waitqueues
4045 * per zone. 4045 * per zone.
4046 */ 4046 */
4047 zone->wait_table_hash_nr_entries = 4047 zone->wait_table_hash_nr_entries =
4048 wait_table_hash_nr_entries(zone_size_pages); 4048 wait_table_hash_nr_entries(zone_size_pages);
4049 zone->wait_table_bits = 4049 zone->wait_table_bits =
4050 wait_table_bits(zone->wait_table_hash_nr_entries); 4050 wait_table_bits(zone->wait_table_hash_nr_entries);
4051 alloc_size = zone->wait_table_hash_nr_entries 4051 alloc_size = zone->wait_table_hash_nr_entries
4052 * sizeof(wait_queue_head_t); 4052 * sizeof(wait_queue_head_t);
4053 4053
4054 if (!slab_is_available()) { 4054 if (!slab_is_available()) {
4055 zone->wait_table = (wait_queue_head_t *) 4055 zone->wait_table = (wait_queue_head_t *)
4056 alloc_bootmem_node_nopanic(pgdat, alloc_size); 4056 alloc_bootmem_node_nopanic(pgdat, alloc_size);
4057 } else { 4057 } else {
4058 /* 4058 /*
4059 * This case means that a zone whose size was 0 gets new memory 4059 * This case means that a zone whose size was 0 gets new memory
4060 * via memory hot-add. 4060 * via memory hot-add.
4061 * But it may be the case that a new node was hot-added. In 4061 * But it may be the case that a new node was hot-added. In
4062 * this case vmalloc() will not be able to use this new node's 4062 * this case vmalloc() will not be able to use this new node's
4063 * memory - this wait_table must be initialized to use this new 4063 * memory - this wait_table must be initialized to use this new
4064 * node itself as well. 4064 * node itself as well.
4065 * To use this new node's memory, further consideration will be 4065 * To use this new node's memory, further consideration will be
4066 * necessary. 4066 * necessary.
4067 */ 4067 */
4068 zone->wait_table = vmalloc(alloc_size); 4068 zone->wait_table = vmalloc(alloc_size);
4069 } 4069 }
4070 if (!zone->wait_table) 4070 if (!zone->wait_table)
4071 return -ENOMEM; 4071 return -ENOMEM;
4072 4072
4073 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4073 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4074 init_waitqueue_head(zone->wait_table + i); 4074 init_waitqueue_head(zone->wait_table + i);
4075 4075
4076 return 0; 4076 return 0;
4077 } 4077 }
4078 4078
4079 static __meminit void zone_pcp_init(struct zone *zone) 4079 static __meminit void zone_pcp_init(struct zone *zone)
4080 { 4080 {
4081 /* 4081 /*
4082 * per cpu subsystem is not up at this point. The following code 4082 * per cpu subsystem is not up at this point. The following code
4083 * relies on the ability of the linker to provide the 4083 * relies on the ability of the linker to provide the
4084 * offset of a (static) per cpu variable into the per cpu area. 4084 * offset of a (static) per cpu variable into the per cpu area.
4085 */ 4085 */
4086 zone->pageset = &boot_pageset; 4086 zone->pageset = &boot_pageset;
4087 4087
4088 if (zone->present_pages) 4088 if (zone->present_pages)
4089 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4089 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4090 zone->name, zone->present_pages, 4090 zone->name, zone->present_pages,
4091 zone_batchsize(zone)); 4091 zone_batchsize(zone));
4092 } 4092 }
4093 4093
4094 int __meminit init_currently_empty_zone(struct zone *zone, 4094 int __meminit init_currently_empty_zone(struct zone *zone,
4095 unsigned long zone_start_pfn, 4095 unsigned long zone_start_pfn,
4096 unsigned long size, 4096 unsigned long size,
4097 enum memmap_context context) 4097 enum memmap_context context)
4098 { 4098 {
4099 struct pglist_data *pgdat = zone->zone_pgdat; 4099 struct pglist_data *pgdat = zone->zone_pgdat;
4100 int ret; 4100 int ret;
4101 ret = zone_wait_table_init(zone, size); 4101 ret = zone_wait_table_init(zone, size);
4102 if (ret) 4102 if (ret)
4103 return ret; 4103 return ret;
4104 pgdat->nr_zones = zone_idx(zone) + 1; 4104 pgdat->nr_zones = zone_idx(zone) + 1;
4105 4105
4106 zone->zone_start_pfn = zone_start_pfn; 4106 zone->zone_start_pfn = zone_start_pfn;
4107 4107
4108 mminit_dprintk(MMINIT_TRACE, "memmap_init", 4108 mminit_dprintk(MMINIT_TRACE, "memmap_init",
4109 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 4109 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
4110 pgdat->node_id, 4110 pgdat->node_id,
4111 (unsigned long)zone_idx(zone), 4111 (unsigned long)zone_idx(zone),
4112 zone_start_pfn, (zone_start_pfn + size)); 4112 zone_start_pfn, (zone_start_pfn + size));
4113 4113
4114 zone_init_free_lists(zone); 4114 zone_init_free_lists(zone);
4115 4115
4116 return 0; 4116 return 0;
4117 } 4117 }
4118 4118
4119 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4119 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4120 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4120 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
4121 /* 4121 /*
4122 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4122 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
4123 * Architectures may implement their own version but if add_active_range() 4123 * Architectures may implement their own version but if add_active_range()
4124 * was used and there are no special requirements, this is a convenient 4124 * was used and there are no special requirements, this is a convenient
4125 * alternative 4125 * alternative
4126 */ 4126 */
4127 int __meminit __early_pfn_to_nid(unsigned long pfn) 4127 int __meminit __early_pfn_to_nid(unsigned long pfn)
4128 { 4128 {
4129 unsigned long start_pfn, end_pfn; 4129 unsigned long start_pfn, end_pfn;
4130 int i, nid; 4130 int i, nid;
4131 4131
4132 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4132 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4133 if (start_pfn <= pfn && pfn < end_pfn) 4133 if (start_pfn <= pfn && pfn < end_pfn)
4134 return nid; 4134 return nid;
4135 /* This is a memory hole */ 4135 /* This is a memory hole */
4136 return -1; 4136 return -1;
4137 } 4137 }
4138 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4138 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
4139 4139
4140 int __meminit early_pfn_to_nid(unsigned long pfn) 4140 int __meminit early_pfn_to_nid(unsigned long pfn)
4141 { 4141 {
4142 int nid; 4142 int nid;
4143 4143
4144 nid = __early_pfn_to_nid(pfn); 4144 nid = __early_pfn_to_nid(pfn);
4145 if (nid >= 0) 4145 if (nid >= 0)
4146 return nid; 4146 return nid;
4147 /* just returns 0 */ 4147 /* just returns 0 */
4148 return 0; 4148 return 0;
4149 } 4149 }
4150 4150
4151 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4151 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4152 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4152 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4153 { 4153 {
4154 int nid; 4154 int nid;
4155 4155
4156 nid = __early_pfn_to_nid(pfn); 4156 nid = __early_pfn_to_nid(pfn);
4157 if (nid >= 0 && nid != node) 4157 if (nid >= 0 && nid != node)
4158 return false; 4158 return false;
4159 return true; 4159 return true;
4160 } 4160 }
4161 #endif 4161 #endif
4162 4162
4163 /** 4163 /**
4164 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4164 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4165 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4165 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4166 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4166 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4167 * 4167 *
4168 * If an architecture guarantees that all ranges registered with 4168 * If an architecture guarantees that all ranges registered with
4169 * add_active_ranges() contain no holes and may be freed, 4169 * add_active_ranges() contain no holes and may be freed,
4170 * this function may be used instead of calling free_bootmem() manually. 4170 * this function may be used instead of calling free_bootmem() manually.
4171 */ 4171 */
4172 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4172 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4173 { 4173 {
4174 unsigned long start_pfn, end_pfn; 4174 unsigned long start_pfn, end_pfn;
4175 int i, this_nid; 4175 int i, this_nid;
4176 4176
4177 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4177 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4178 start_pfn = min(start_pfn, max_low_pfn); 4178 start_pfn = min(start_pfn, max_low_pfn);
4179 end_pfn = min(end_pfn, max_low_pfn); 4179 end_pfn = min(end_pfn, max_low_pfn);
4180 4180
4181 if (start_pfn < end_pfn) 4181 if (start_pfn < end_pfn)
4182 free_bootmem_node(NODE_DATA(this_nid), 4182 free_bootmem_node(NODE_DATA(this_nid),
4183 PFN_PHYS(start_pfn), 4183 PFN_PHYS(start_pfn),
4184 (end_pfn - start_pfn) << PAGE_SHIFT); 4184 (end_pfn - start_pfn) << PAGE_SHIFT);
4185 } 4185 }
4186 } 4186 }
4187 4187
4188 /** 4188 /**
4189 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4189 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4190 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4190 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4191 * 4191 *
4192 * If an architecture guarantees that all ranges registered with 4192 * If an architecture guarantees that all ranges registered with
4193 * add_active_ranges() contain no holes and may be freed, this 4193 * add_active_ranges() contain no holes and may be freed, this
4194 * function may be used instead of calling memory_present() manually. 4194 * function may be used instead of calling memory_present() manually.
4195 */ 4195 */
4196 void __init sparse_memory_present_with_active_regions(int nid) 4196 void __init sparse_memory_present_with_active_regions(int nid)
4197 { 4197 {
4198 unsigned long start_pfn, end_pfn; 4198 unsigned long start_pfn, end_pfn;
4199 int i, this_nid; 4199 int i, this_nid;
4200 4200
4201 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4201 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4202 memory_present(this_nid, start_pfn, end_pfn); 4202 memory_present(this_nid, start_pfn, end_pfn);
4203 } 4203 }
4204 4204
4205 /** 4205 /**
4206 * get_pfn_range_for_nid - Return the start and end page frames for a node 4206 * get_pfn_range_for_nid - Return the start and end page frames for a node
4207 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4207 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4208 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4208 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4209 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4209 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4210 * 4210 *
4211 * It returns the start and end page frame of a node based on information 4211 * It returns the start and end page frame of a node based on information
4212 * provided by an arch calling add_active_range(). If called for a node 4212 * provided by an arch calling add_active_range(). If called for a node
4213 * with no available memory, a warning is printed and the start and end 4213 * with no available memory, a warning is printed and the start and end
4214 * PFNs will be 0. 4214 * PFNs will be 0.
4215 */ 4215 */
4216 void __meminit get_pfn_range_for_nid(unsigned int nid, 4216 void __meminit get_pfn_range_for_nid(unsigned int nid,
4217 unsigned long *start_pfn, unsigned long *end_pfn) 4217 unsigned long *start_pfn, unsigned long *end_pfn)
4218 { 4218 {
4219 unsigned long this_start_pfn, this_end_pfn; 4219 unsigned long this_start_pfn, this_end_pfn;
4220 int i; 4220 int i;
4221 4221
4222 *start_pfn = -1UL; 4222 *start_pfn = -1UL;
4223 *end_pfn = 0; 4223 *end_pfn = 0;
4224 4224
4225 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4225 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4226 *start_pfn = min(*start_pfn, this_start_pfn); 4226 *start_pfn = min(*start_pfn, this_start_pfn);
4227 *end_pfn = max(*end_pfn, this_end_pfn); 4227 *end_pfn = max(*end_pfn, this_end_pfn);
4228 } 4228 }
4229 4229
4230 if (*start_pfn == -1UL) 4230 if (*start_pfn == -1UL)
4231 *start_pfn = 0; 4231 *start_pfn = 0;
4232 } 4232 }
4233 4233
4234 /* 4234 /*
4235 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4235 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4236 * assumption is made that zones within a node are ordered in monotonically 4236 * assumption is made that zones within a node are ordered in monotonically
4237 * increasing memory addresses so that the "highest" populated zone is used 4237 * increasing memory addresses so that the "highest" populated zone is used
4238 */ 4238 */
4239 static void __init find_usable_zone_for_movable(void) 4239 static void __init find_usable_zone_for_movable(void)
4240 { 4240 {
4241 int zone_index; 4241 int zone_index;
4242 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4242 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4243 if (zone_index == ZONE_MOVABLE) 4243 if (zone_index == ZONE_MOVABLE)
4244 continue; 4244 continue;
4245 4245
4246 if (arch_zone_highest_possible_pfn[zone_index] > 4246 if (arch_zone_highest_possible_pfn[zone_index] >
4247 arch_zone_lowest_possible_pfn[zone_index]) 4247 arch_zone_lowest_possible_pfn[zone_index])
4248 break; 4248 break;
4249 } 4249 }
4250 4250
4251 VM_BUG_ON(zone_index == -1); 4251 VM_BUG_ON(zone_index == -1);
4252 movable_zone = zone_index; 4252 movable_zone = zone_index;
4253 } 4253 }
4254 4254
4255 /* 4255 /*
4256 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4256 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4257 * because it is sized independent of architecture. Unlike the other zones, 4257 * because it is sized independent of architecture. Unlike the other zones,
4258 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4258 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4259 * in each node depending on the size of each node and how evenly kernelcore 4259 * in each node depending on the size of each node and how evenly kernelcore
4260 * is distributed. This helper function adjusts the zone ranges 4260 * is distributed. This helper function adjusts the zone ranges
4261 * provided by the architecture for a given node by using the end of the 4261 * provided by the architecture for a given node by using the end of the
4262 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4262 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4263 * zones within a node are in order of monotonically increasing memory addresses 4263 * zones within a node are in order of monotonically increasing memory addresses
4264 */ 4264 */
4265 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4265 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4266 unsigned long zone_type, 4266 unsigned long zone_type,
4267 unsigned long node_start_pfn, 4267 unsigned long node_start_pfn,
4268 unsigned long node_end_pfn, 4268 unsigned long node_end_pfn,
4269 unsigned long *zone_start_pfn, 4269 unsigned long *zone_start_pfn,
4270 unsigned long *zone_end_pfn) 4270 unsigned long *zone_end_pfn)
4271 { 4271 {
4272 /* Only adjust if ZONE_MOVABLE is on this node */ 4272 /* Only adjust if ZONE_MOVABLE is on this node */
4273 if (zone_movable_pfn[nid]) { 4273 if (zone_movable_pfn[nid]) {
4274 /* Size ZONE_MOVABLE */ 4274 /* Size ZONE_MOVABLE */
4275 if (zone_type == ZONE_MOVABLE) { 4275 if (zone_type == ZONE_MOVABLE) {
4276 *zone_start_pfn = zone_movable_pfn[nid]; 4276 *zone_start_pfn = zone_movable_pfn[nid];
4277 *zone_end_pfn = min(node_end_pfn, 4277 *zone_end_pfn = min(node_end_pfn,
4278 arch_zone_highest_possible_pfn[movable_zone]); 4278 arch_zone_highest_possible_pfn[movable_zone]);
4279 4279
4280 /* Adjust for ZONE_MOVABLE starting within this range */ 4280 /* Adjust for ZONE_MOVABLE starting within this range */
4281 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4281 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4282 *zone_end_pfn > zone_movable_pfn[nid]) { 4282 *zone_end_pfn > zone_movable_pfn[nid]) {
4283 *zone_end_pfn = zone_movable_pfn[nid]; 4283 *zone_end_pfn = zone_movable_pfn[nid];
4284 4284
4285 /* Check if this whole range is within ZONE_MOVABLE */ 4285 /* Check if this whole range is within ZONE_MOVABLE */
4286 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4286 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4287 *zone_start_pfn = *zone_end_pfn; 4287 *zone_start_pfn = *zone_end_pfn;
4288 } 4288 }
4289 } 4289 }
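
The three cases above can be read as: the movable zone itself, a zone that ZONE_MOVABLE starts inside, and a zone entirely handed over to ZONE_MOVABLE. The sketch below walks those cases with plain pfn values; it deliberately simplifies the first case by using the node end pfn instead of the arch_zone_highest_possible_pfn[] clamp, so it is an illustration rather than a drop-in equivalent:

/* Simplified model of the ZONE_MOVABLE adjustment above; not kernel code. */
#include <stdio.h>

static void adjust_for_movable(unsigned long movable_pfn,	/* carve-out start, 0 if none */
			       unsigned long node_end_pfn,
			       int is_movable_zone,
			       unsigned long *zone_start, unsigned long *zone_end)
{
	if (!movable_pfn)
		return;				/* no ZONE_MOVABLE on this node */

	if (is_movable_zone) {
		/* ZONE_MOVABLE runs from the carve-out point to the node end */
		*zone_start = movable_pfn;
		*zone_end = node_end_pfn;
	} else if (*zone_start < movable_pfn && *zone_end > movable_pfn) {
		/* ZONE_MOVABLE starts inside this zone: truncate it */
		*zone_end = movable_pfn;
	} else if (*zone_start >= movable_pfn) {
		/* the whole zone lies in ZONE_MOVABLE: make it empty */
		*zone_start = *zone_end;
	}
}

int main(void)
{
	/* a node spanning pfns 0..1048576 with ZONE_MOVABLE starting at 524288 */
	unsigned long start = 0, end = 1048576;

	adjust_for_movable(524288, 1048576, 0, &start, &end);
	printf("zone now spans pfns %lu-%lu\n", start, end);
	return 0;
}

The non-movable zone is truncated to pfns 0-524288; calling it with is_movable_zone set would instead report 524288-1048576.
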
4290 4290
4291 /* 4291 /*
4292 * Return the number of pages a zone spans in a node, including holes 4292 * Return the number of pages a zone spans in a node, including holes
4293 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4293 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4294 */ 4294 */
4295 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4295 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4296 unsigned long zone_type, 4296 unsigned long zone_type,
4297 unsigned long *ignored) 4297 unsigned long *ignored)
4298 { 4298 {
4299 unsigned long node_start_pfn, node_end_pfn; 4299 unsigned long node_start_pfn, node_end_pfn;
4300 unsigned long zone_start_pfn, zone_end_pfn; 4300 unsigned long zone_start_pfn, zone_end_pfn;
4301 4301
4302 /* Get the start and end of the node and zone */ 4302 /* Get the start and end of the node and zone */
4303 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 4303 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4304 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4304 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4305 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4305 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4306 adjust_zone_range_for_zone_movable(nid, zone_type, 4306 adjust_zone_range_for_zone_movable(nid, zone_type,
4307 node_start_pfn, node_end_pfn, 4307 node_start_pfn, node_end_pfn,
4308 &zone_start_pfn, &zone_end_pfn); 4308 &zone_start_pfn, &zone_end_pfn);
4309 4309
4310 /* Check that this node has pages within the zone's required range */ 4310 /* Check that this node has pages within the zone's required range */
4311 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4311 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4312 return 0; 4312 return 0;
4313 4313
4314 /* Move the zone boundaries inside the node if necessary */ 4314 /* Move the zone boundaries inside the node if necessary */
4315 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4315 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4316 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4316 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4317 4317
4318 /* Return the spanned pages */ 4318 /* Return the spanned pages */
4319 return zone_end_pfn - zone_start_pfn; 4319 return zone_end_pfn - zone_start_pfn;
4320 } 4320 }
4321 4321
4322 /* 4322 /*
4323 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4323 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4324 * then all holes in the requested range will be accounted for. 4324 * then all holes in the requested range will be accounted for.
4325 */ 4325 */
4326 unsigned long __meminit __absent_pages_in_range(int nid, 4326 unsigned long __meminit __absent_pages_in_range(int nid,
4327 unsigned long range_start_pfn, 4327 unsigned long range_start_pfn,
4328 unsigned long range_end_pfn) 4328 unsigned long range_end_pfn)
4329 { 4329 {
4330 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4330 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4331 unsigned long start_pfn, end_pfn; 4331 unsigned long start_pfn, end_pfn;
4332 int i; 4332 int i;
4333 4333
4334 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4334 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4335 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4335 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4336 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4336 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4337 nr_absent -= end_pfn - start_pfn; 4337 nr_absent -= end_pfn - start_pfn;
4338 } 4338 }
4339 return nr_absent; 4339 return nr_absent;
4340 } 4340 }
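
In other words, the hole count is the full span minus every registered present range clamped to that span. A self-contained sketch of the same accounting, with two hard-coded present ranges standing in for the memblock data:

/* Sketch of the hole accounting above; not kernel code. */
#include <stdio.h>

struct pfn_range { unsigned long start, end; };

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

static unsigned long absent_pages(const struct pfn_range *r, int nr,
				  unsigned long range_start, unsigned long range_end)
{
	unsigned long nr_absent = range_end - range_start;
	int i;

	for (i = 0; i < nr; i++) {
		unsigned long s = clamp_ul(r[i].start, range_start, range_end);
		unsigned long e = clamp_ul(r[i].end, range_start, range_end);

		nr_absent -= e - s;	/* present pages are not holes */
	}
	return nr_absent;
}

int main(void)
{
	/* two present ranges inside pfns [0, 1000): 0-300 and 600-1000 */
	struct pfn_range present[] = { { 0, 300 }, { 600, 1000 } };

	printf("holes=%lu\n", absent_pages(present, 2, 0, 1000));
	return 0;
}

Two present ranges of 300 and 400 pages inside a 1000-page span leave holes=300.
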
4341 4341
4342 /** 4342 /**
4343 * absent_pages_in_range - Return number of page frames in holes within a range 4343 * absent_pages_in_range - Return number of page frames in holes within a range
4344 * @start_pfn: The start PFN to start searching for holes 4344 * @start_pfn: The start PFN to start searching for holes
4345 * @end_pfn: The end PFN to stop searching for holes 4345 * @end_pfn: The end PFN to stop searching for holes
4346 * 4346 *
4347 * It returns the number of page frames in memory holes within a range. 4347 * It returns the number of page frames in memory holes within a range.
4348 */ 4348 */
4349 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4349 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4350 unsigned long end_pfn) 4350 unsigned long end_pfn)
4351 { 4351 {
4352 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4352 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4353 } 4353 }
4354 4354
4355 /* Return the number of page frames in holes in a zone on a node */ 4355 /* Return the number of page frames in holes in a zone on a node */
4356 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4356 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4357 unsigned long zone_type, 4357 unsigned long zone_type,
4358 unsigned long *ignored) 4358 unsigned long *ignored)
4359 { 4359 {
4360 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4360 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4361 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4361 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4362 unsigned long node_start_pfn, node_end_pfn; 4362 unsigned long node_start_pfn, node_end_pfn;
4363 unsigned long zone_start_pfn, zone_end_pfn; 4363 unsigned long zone_start_pfn, zone_end_pfn;
4364 4364
4365 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 4365 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4366 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4366 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4367 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4367 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4368 4368
4369 adjust_zone_range_for_zone_movable(nid, zone_type, 4369 adjust_zone_range_for_zone_movable(nid, zone_type,
4370 node_start_pfn, node_end_pfn, 4370 node_start_pfn, node_end_pfn,
4371 &zone_start_pfn, &zone_end_pfn); 4371 &zone_start_pfn, &zone_end_pfn);
4372 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4372 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4373 } 4373 }
4374 4374
4375 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4375 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4376 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4376 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4377 unsigned long zone_type, 4377 unsigned long zone_type,
4378 unsigned long *zones_size) 4378 unsigned long *zones_size)
4379 { 4379 {
4380 return zones_size[zone_type]; 4380 return zones_size[zone_type];
4381 } 4381 }
4382 4382
4383 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4383 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4384 unsigned long zone_type, 4384 unsigned long zone_type,
4385 unsigned long *zholes_size) 4385 unsigned long *zholes_size)
4386 { 4386 {
4387 if (!zholes_size) 4387 if (!zholes_size)
4388 return 0; 4388 return 0;
4389 4389
4390 return zholes_size[zone_type]; 4390 return zholes_size[zone_type];
4391 } 4391 }
4392 4392
4393 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4393 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4394 4394
4395 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4395 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4396 unsigned long *zones_size, unsigned long *zholes_size) 4396 unsigned long *zones_size, unsigned long *zholes_size)
4397 { 4397 {
4398 unsigned long realtotalpages, totalpages = 0; 4398 unsigned long realtotalpages, totalpages = 0;
4399 enum zone_type i; 4399 enum zone_type i;
4400 4400
4401 for (i = 0; i < MAX_NR_ZONES; i++) 4401 for (i = 0; i < MAX_NR_ZONES; i++)
4402 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4402 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4403 zones_size); 4403 zones_size);
4404 pgdat->node_spanned_pages = totalpages; 4404 pgdat->node_spanned_pages = totalpages;
4405 4405
4406 realtotalpages = totalpages; 4406 realtotalpages = totalpages;
4407 for (i = 0; i < MAX_NR_ZONES; i++) 4407 for (i = 0; i < MAX_NR_ZONES; i++)
4408 realtotalpages -= 4408 realtotalpages -=
4409 zone_absent_pages_in_node(pgdat->node_id, i, 4409 zone_absent_pages_in_node(pgdat->node_id, i,
4410 zholes_size); 4410 zholes_size);
4411 pgdat->node_present_pages = realtotalpages; 4411 pgdat->node_present_pages = realtotalpages;
4412 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4412 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4413 realtotalpages); 4413 realtotalpages);
4414 } 4414 }
4415 4415
4416 #ifndef CONFIG_SPARSEMEM 4416 #ifndef CONFIG_SPARSEMEM
4417 /* 4417 /*
4418 * Calculate the size of the zone->blockflags rounded to an unsigned long 4418 * Calculate the size of the zone->blockflags rounded to an unsigned long
4419 * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding 4419 * Start by making sure zonesize is a multiple of pageblock_nr_pages by rounding
4420 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 4420 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
4421 * round what is now in bits to nearest long in bits, then return it in 4421 * round what is now in bits to nearest long in bits, then return it in
4422 * bytes. 4422 * bytes.
4423 */ 4423 */
4424 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) 4424 static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
4425 { 4425 {
4426 unsigned long usemapsize; 4426 unsigned long usemapsize;
4427 4427
4428 zonesize += zone_start_pfn & (pageblock_nr_pages-1); 4428 zonesize += zone_start_pfn & (pageblock_nr_pages-1);
4429 usemapsize = roundup(zonesize, pageblock_nr_pages); 4429 usemapsize = roundup(zonesize, pageblock_nr_pages);
4430 usemapsize = usemapsize >> pageblock_order; 4430 usemapsize = usemapsize >> pageblock_order;
4431 usemapsize *= NR_PAGEBLOCK_BITS; 4431 usemapsize *= NR_PAGEBLOCK_BITS;
4432 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4432 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4433 4433
4434 return usemapsize / 8; 4434 return usemapsize / 8;
4435 } 4435 }
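
The bitmap sizing above boils down to four rounding steps. The sketch below repeats them outside the kernel, assuming pageblock_order 9 and NR_PAGEBLOCK_BITS 4 (illustrative values; the real ones come from the kernel config):

/* Sketch of the usemap_size() arithmetic above; not kernel code. */
#include <stdio.h>

#define PAGEBLOCK_ORDER     9UL
#define PAGEBLOCK_NR_PAGES  (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS   4UL

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

static unsigned long usemap_size(unsigned long zone_start_pfn,
				 unsigned long zonesize)
{
	unsigned long usemapsize;

	/* account for a zone start that is not pageblock aligned */
	zonesize += zone_start_pfn & (PAGEBLOCK_NR_PAGES - 1);
	usemapsize = roundup_ul(zonesize, PAGEBLOCK_NR_PAGES);
	usemapsize >>= PAGEBLOCK_ORDER;		/* number of pageblocks */
	usemapsize *= NR_PAGEBLOCK_BITS;	/* bits of flags per block */
	usemapsize = roundup_ul(usemapsize, 8 * sizeof(unsigned long));

	return usemapsize / 8;			/* bits -> bytes */
}

int main(void)
{
	/* a 262144-page zone starting at pfn 0 */
	printf("usemap bytes=%lu\n", usemap_size(0, 262144UL));
	return 0;
}

A 262144-page zone starting at an aligned pfn needs 512 pageblocks x 4 bits = 2048 bits, i.e. 256 bytes of pageblock flags.
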
4436 4436
4437 static void __init setup_usemap(struct pglist_data *pgdat, 4437 static void __init setup_usemap(struct pglist_data *pgdat,
4438 struct zone *zone, 4438 struct zone *zone,
4439 unsigned long zone_start_pfn, 4439 unsigned long zone_start_pfn,
4440 unsigned long zonesize) 4440 unsigned long zonesize)
4441 { 4441 {
4442 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); 4442 unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
4443 zone->pageblock_flags = NULL; 4443 zone->pageblock_flags = NULL;
4444 if (usemapsize) 4444 if (usemapsize)
4445 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4445 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4446 usemapsize); 4446 usemapsize);
4447 } 4447 }
4448 #else 4448 #else
4449 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, 4449 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4450 unsigned long zone_start_pfn, unsigned long zonesize) {} 4450 unsigned long zone_start_pfn, unsigned long zonesize) {}
4451 #endif /* CONFIG_SPARSEMEM */ 4451 #endif /* CONFIG_SPARSEMEM */
4452 4452
4453 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4453 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4454 4454
4455 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4455 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4456 void __init set_pageblock_order(void) 4456 void __init set_pageblock_order(void)
4457 { 4457 {
4458 unsigned int order; 4458 unsigned int order;
4459 4459
4460 /* Check that pageblock_nr_pages has not already been setup */ 4460 /* Check that pageblock_nr_pages has not already been setup */
4461 if (pageblock_order) 4461 if (pageblock_order)
4462 return; 4462 return;
4463 4463
4464 if (HPAGE_SHIFT > PAGE_SHIFT) 4464 if (HPAGE_SHIFT > PAGE_SHIFT)
4465 order = HUGETLB_PAGE_ORDER; 4465 order = HUGETLB_PAGE_ORDER;
4466 else 4466 else
4467 order = MAX_ORDER - 1; 4467 order = MAX_ORDER - 1;
4468 4468
4469 /* 4469 /*
4470 * Assume the largest contiguous order of interest is a huge page. 4470 * Assume the largest contiguous order of interest is a huge page.
4471 * This value may be variable depending on boot parameters on IA64 and 4471 * This value may be variable depending on boot parameters on IA64 and
4472 * powerpc. 4472 * powerpc.
4473 */ 4473 */
4474 pageblock_order = order; 4474 pageblock_order = order;
4475 } 4475 }
4476 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4476 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4477 4477
4478 /* 4478 /*
4479 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4479 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4480 * is unused as pageblock_order is set at compile-time. See 4480 * is unused as pageblock_order is set at compile-time. See
4481 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4481 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4482 * the kernel config 4482 * the kernel config
4483 */ 4483 */
4484 void __init set_pageblock_order(void) 4484 void __init set_pageblock_order(void)
4485 { 4485 {
4486 } 4486 }
4487 4487
4488 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4488 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4489 4489
4490 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, 4490 static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4491 unsigned long present_pages) 4491 unsigned long present_pages)
4492 { 4492 {
4493 unsigned long pages = spanned_pages; 4493 unsigned long pages = spanned_pages;
4494 4494
4495 /* 4495 /*
4496 * Provide a more accurate estimation if there are holes within 4496 * Provide a more accurate estimation if there are holes within
4497 * the zone and SPARSEMEM is in use. If there are holes within the 4497 * the zone and SPARSEMEM is in use. If there are holes within the
4498 * zone, each populated memory region may cost us one or two extra 4498 * zone, each populated memory region may cost us one or two extra
4499 * memmap pages due to alignment because memmap pages for each 4499 * memmap pages due to alignment because memmap pages for each
4500 * populated region may not be naturally aligned on page boundaries. 4500 * populated region may not be naturally aligned on page boundaries.
4501 * So the (present_pages >> 4) heuristic is a tradeoff for that. 4501 * So the (present_pages >> 4) heuristic is a tradeoff for that.
4502 */ 4502 */
4503 if (spanned_pages > present_pages + (present_pages >> 4) && 4503 if (spanned_pages > present_pages + (present_pages >> 4) &&
4504 IS_ENABLED(CONFIG_SPARSEMEM)) 4504 IS_ENABLED(CONFIG_SPARSEMEM))
4505 pages = present_pages; 4505 pages = present_pages;
4506 4506
4507 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; 4507 return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
4508 } 4508 }
4509 4509
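As a rough userspace sketch of the calc_memmap_size() heuristic just above: the page size (4 KiB) and the size of struct page (64 bytes) are assumptions chosen for illustration, and the IS_ENABLED(CONFIG_SPARSEMEM) check is dropped. For a zone spanning 262144 pages with only 196608 present, the hole exceeds present_pages/16, so the estimate falls back to present_pages.

/* Sketch of the calc_memmap_size() arithmetic; sizes are assumptions. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define STRUCT_PAGE_SIZE 64UL   /* illustrative only */

static unsigned long memmap_size(unsigned long spanned, unsigned long present)
{
        unsigned long pages = spanned;

        /* Fall back to present_pages when the zone is very sparse. */
        if (spanned > present + (present >> 4))
                pages = present;

        return PAGE_ALIGN(pages * STRUCT_PAGE_SIZE) >> PAGE_SHIFT;
}

int main(void)
{
        /* 1 GiB zone (262144 pages) with a 256 MiB hole. */
        printf("memmap pages: %lu\n", memmap_size(262144, 196608));
        return 0;
}
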
4510 /* 4510 /*
4511 * Set up the zone data structures: 4511 * Set up the zone data structures:
4512 * - mark all pages reserved 4512 * - mark all pages reserved
4513 * - mark all memory queues empty 4513 * - mark all memory queues empty
4514 * - clear the memory bitmaps 4514 * - clear the memory bitmaps
4515 * 4515 *
4516 * NOTE: pgdat should get zeroed by caller. 4516 * NOTE: pgdat should get zeroed by caller.
4517 */ 4517 */
4518 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4518 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4519 unsigned long *zones_size, unsigned long *zholes_size) 4519 unsigned long *zones_size, unsigned long *zholes_size)
4520 { 4520 {
4521 enum zone_type j; 4521 enum zone_type j;
4522 int nid = pgdat->node_id; 4522 int nid = pgdat->node_id;
4523 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4523 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4524 int ret; 4524 int ret;
4525 4525
4526 pgdat_resize_init(pgdat); 4526 pgdat_resize_init(pgdat);
4527 #ifdef CONFIG_NUMA_BALANCING 4527 #ifdef CONFIG_NUMA_BALANCING
4528 spin_lock_init(&pgdat->numabalancing_migrate_lock); 4528 spin_lock_init(&pgdat->numabalancing_migrate_lock);
4529 pgdat->numabalancing_migrate_nr_pages = 0; 4529 pgdat->numabalancing_migrate_nr_pages = 0;
4530 pgdat->numabalancing_migrate_next_window = jiffies; 4530 pgdat->numabalancing_migrate_next_window = jiffies;
4531 #endif 4531 #endif
4532 init_waitqueue_head(&pgdat->kswapd_wait); 4532 init_waitqueue_head(&pgdat->kswapd_wait);
4533 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4533 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4534 pgdat_page_cgroup_init(pgdat); 4534 pgdat_page_cgroup_init(pgdat);
4535 4535
4536 for (j = 0; j < MAX_NR_ZONES; j++) { 4536 for (j = 0; j < MAX_NR_ZONES; j++) {
4537 struct zone *zone = pgdat->node_zones + j; 4537 struct zone *zone = pgdat->node_zones + j;
4538 unsigned long size, realsize, freesize, memmap_pages; 4538 unsigned long size, realsize, freesize, memmap_pages;
4539 4539
4540 size = zone_spanned_pages_in_node(nid, j, zones_size); 4540 size = zone_spanned_pages_in_node(nid, j, zones_size);
4541 realsize = freesize = size - zone_absent_pages_in_node(nid, j, 4541 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4542 zholes_size); 4542 zholes_size);
4543 4543
4544 /* 4544 /*
4545 * Adjust freesize so that it accounts for how much memory 4545 * Adjust freesize so that it accounts for how much memory
4546 * is used by this zone for memmap. This affects the watermark 4546 * is used by this zone for memmap. This affects the watermark
4547 * and per-cpu initialisations 4547 * and per-cpu initialisations
4548 */ 4548 */
4549 memmap_pages = calc_memmap_size(size, realsize); 4549 memmap_pages = calc_memmap_size(size, realsize);
4550 if (freesize >= memmap_pages) { 4550 if (freesize >= memmap_pages) {
4551 freesize -= memmap_pages; 4551 freesize -= memmap_pages;
4552 if (memmap_pages) 4552 if (memmap_pages)
4553 printk(KERN_DEBUG 4553 printk(KERN_DEBUG
4554 " %s zone: %lu pages used for memmap\n", 4554 " %s zone: %lu pages used for memmap\n",
4555 zone_names[j], memmap_pages); 4555 zone_names[j], memmap_pages);
4556 } else 4556 } else
4557 printk(KERN_WARNING 4557 printk(KERN_WARNING
4558 " %s zone: %lu pages exceeds freesize %lu\n", 4558 " %s zone: %lu pages exceeds freesize %lu\n",
4559 zone_names[j], memmap_pages, freesize); 4559 zone_names[j], memmap_pages, freesize);
4560 4560
4561 /* Account for reserved pages */ 4561 /* Account for reserved pages */
4562 if (j == 0 && freesize > dma_reserve) { 4562 if (j == 0 && freesize > dma_reserve) {
4563 freesize -= dma_reserve; 4563 freesize -= dma_reserve;
4564 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4564 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4565 zone_names[0], dma_reserve); 4565 zone_names[0], dma_reserve);
4566 } 4566 }
4567 4567
4568 if (!is_highmem_idx(j)) 4568 if (!is_highmem_idx(j))
4569 nr_kernel_pages += freesize; 4569 nr_kernel_pages += freesize;
4570 /* Charge for highmem memmap if there are enough kernel pages */ 4570 /* Charge for highmem memmap if there are enough kernel pages */
4571 else if (nr_kernel_pages > memmap_pages * 2) 4571 else if (nr_kernel_pages > memmap_pages * 2)
4572 nr_kernel_pages -= memmap_pages; 4572 nr_kernel_pages -= memmap_pages;
4573 nr_all_pages += freesize; 4573 nr_all_pages += freesize;
4574 4574
4575 zone->spanned_pages = size; 4575 zone->spanned_pages = size;
4576 zone->present_pages = freesize; 4576 zone->present_pages = freesize;
4577 /* 4577 /*
4578 * Set an approximate value for lowmem here, it will be adjusted 4578 * Set an approximate value for lowmem here, it will be adjusted
4579 * when the bootmem allocator frees pages into the buddy system. 4579 * when the bootmem allocator frees pages into the buddy system.
4580 * And all highmem pages will be managed by the buddy system. 4580 * And all highmem pages will be managed by the buddy system.
4581 */ 4581 */
4582 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; 4582 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4583 #ifdef CONFIG_NUMA 4583 #ifdef CONFIG_NUMA
4584 zone->node = nid; 4584 zone->node = nid;
4585 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) 4585 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4586 / 100; 4586 / 100;
4587 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; 4587 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4588 #endif 4588 #endif
4589 zone->name = zone_names[j]; 4589 zone->name = zone_names[j];
4590 spin_lock_init(&zone->lock); 4590 spin_lock_init(&zone->lock);
4591 spin_lock_init(&zone->lru_lock); 4591 spin_lock_init(&zone->lru_lock);
4592 zone_seqlock_init(zone); 4592 zone_seqlock_init(zone);
4593 zone->zone_pgdat = pgdat; 4593 zone->zone_pgdat = pgdat;
4594 4594
4595 zone_pcp_init(zone); 4595 zone_pcp_init(zone);
4596 lruvec_init(&zone->lruvec); 4596 lruvec_init(&zone->lruvec);
4597 if (!size) 4597 if (!size)
4598 continue; 4598 continue;
4599 4599
4600 set_pageblock_order(); 4600 set_pageblock_order();
4601 setup_usemap(pgdat, zone, zone_start_pfn, size); 4601 setup_usemap(pgdat, zone, zone_start_pfn, size);
4602 ret = init_currently_empty_zone(zone, zone_start_pfn, 4602 ret = init_currently_empty_zone(zone, zone_start_pfn,
4603 size, MEMMAP_EARLY); 4603 size, MEMMAP_EARLY);
4604 BUG_ON(ret); 4604 BUG_ON(ret);
4605 memmap_init(size, nid, j, zone_start_pfn); 4605 memmap_init(size, nid, j, zone_start_pfn);
4606 zone_start_pfn += size; 4606 zone_start_pfn += size;
4607 } 4607 }
4608 } 4608 }
4609 4609
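The freesize bookkeeping in the zone loop above can be followed with invented numbers: spanned pages minus holes gives realsize, the memmap estimate is charged against it, and dma_reserve is additionally charged to zone 0 only. A minimal sketch, assuming those made-up counts:

/* Worked example of the freesize accounting in free_area_init_core();
 * all numbers are invented for illustration. */
#include <stdio.h>

int main(void)
{
        unsigned long size = 262144;        /* spanned pages in the zone */
        unsigned long holes = 4096;         /* absent pages              */
        unsigned long memmap_pages = 4032;  /* from calc_memmap_size()   */
        unsigned long dma_reserve = 1024;   /* only charged to zone 0    */
        unsigned long realsize = size - holes;
        unsigned long freesize = realsize;

        if (freesize >= memmap_pages)
                freesize -= memmap_pages;
        if (freesize > dma_reserve)         /* the j == 0 case */
                freesize -= dma_reserve;

        printf("realsize=%lu freesize=%lu\n", realsize, freesize);
        return 0;
}
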
4610 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4610 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4611 { 4611 {
4612 /* Skip empty nodes */ 4612 /* Skip empty nodes */
4613 if (!pgdat->node_spanned_pages) 4613 if (!pgdat->node_spanned_pages)
4614 return; 4614 return;
4615 4615
4616 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4616 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4617 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4617 /* ia64 gets its own node_mem_map, before this, without bootmem */
4618 if (!pgdat->node_mem_map) { 4618 if (!pgdat->node_mem_map) {
4619 unsigned long size, start, end; 4619 unsigned long size, start, end;
4620 struct page *map; 4620 struct page *map;
4621 4621
4622 /* 4622 /*
4623 * The zone's endpoints aren't required to be MAX_ORDER 4623 * The zone's endpoints aren't required to be MAX_ORDER
4624 * aligned but the node_mem_map endpoints must be in order 4624 * aligned but the node_mem_map endpoints must be in order
4625 * for the buddy allocator to function correctly. 4625 * for the buddy allocator to function correctly.
4626 */ 4626 */
4627 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4627 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4628 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 4628 end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
4629 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4629 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4630 size = (end - start) * sizeof(struct page); 4630 size = (end - start) * sizeof(struct page);
4631 map = alloc_remap(pgdat->node_id, size); 4631 map = alloc_remap(pgdat->node_id, size);
4632 if (!map) 4632 if (!map)
4633 map = alloc_bootmem_node_nopanic(pgdat, size); 4633 map = alloc_bootmem_node_nopanic(pgdat, size);
4634 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4634 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4635 } 4635 }
4636 #ifndef CONFIG_NEED_MULTIPLE_NODES 4636 #ifndef CONFIG_NEED_MULTIPLE_NODES
4637 /* 4637 /*
4638 * With no DISCONTIG, the global mem_map is just set as node 0's 4638 * With no DISCONTIG, the global mem_map is just set as node 0's
4639 */ 4639 */
4640 if (pgdat == NODE_DATA(0)) { 4640 if (pgdat == NODE_DATA(0)) {
4641 mem_map = NODE_DATA(0)->node_mem_map; 4641 mem_map = NODE_DATA(0)->node_mem_map;
4642 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4642 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4643 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4643 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4644 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4644 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4645 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4645 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4646 } 4646 }
4647 #endif 4647 #endif
4648 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4648 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4649 } 4649 }
4650 4650
4651 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4651 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4652 unsigned long node_start_pfn, unsigned long *zholes_size) 4652 unsigned long node_start_pfn, unsigned long *zholes_size)
4653 { 4653 {
4654 pg_data_t *pgdat = NODE_DATA(nid); 4654 pg_data_t *pgdat = NODE_DATA(nid);
4655 4655
4656 /* pg_data_t should be reset to zero when it's allocated */ 4656 /* pg_data_t should be reset to zero when it's allocated */
4657 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 4657 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4658 4658
4659 pgdat->node_id = nid; 4659 pgdat->node_id = nid;
4660 pgdat->node_start_pfn = node_start_pfn; 4660 pgdat->node_start_pfn = node_start_pfn;
4661 init_zone_allows_reclaim(nid); 4661 init_zone_allows_reclaim(nid);
4662 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4662 calculate_node_totalpages(pgdat, zones_size, zholes_size);
4663 4663
4664 alloc_node_mem_map(pgdat); 4664 alloc_node_mem_map(pgdat);
4665 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4665 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4666 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4666 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4667 nid, (unsigned long)pgdat, 4667 nid, (unsigned long)pgdat,
4668 (unsigned long)pgdat->node_mem_map); 4668 (unsigned long)pgdat->node_mem_map);
4669 #endif 4669 #endif
4670 4670
4671 free_area_init_core(pgdat, zones_size, zholes_size); 4671 free_area_init_core(pgdat, zones_size, zholes_size);
4672 } 4672 }
4673 4673
4674 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4674 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4675 4675
4676 #if MAX_NUMNODES > 1 4676 #if MAX_NUMNODES > 1
4677 /* 4677 /*
4678 * Figure out the number of possible node ids. 4678 * Figure out the number of possible node ids.
4679 */ 4679 */
4680 static void __init setup_nr_node_ids(void) 4680 static void __init setup_nr_node_ids(void)
4681 { 4681 {
4682 unsigned int node; 4682 unsigned int node;
4683 unsigned int highest = 0; 4683 unsigned int highest = 0;
4684 4684
4685 for_each_node_mask(node, node_possible_map) 4685 for_each_node_mask(node, node_possible_map)
4686 highest = node; 4686 highest = node;
4687 nr_node_ids = highest + 1; 4687 nr_node_ids = highest + 1;
4688 } 4688 }
4689 #else 4689 #else
4690 static inline void setup_nr_node_ids(void) 4690 static inline void setup_nr_node_ids(void)
4691 { 4691 {
4692 } 4692 }
4693 #endif 4693 #endif
4694 4694
4695 /** 4695 /**
4696 * node_map_pfn_alignment - determine the maximum internode alignment 4696 * node_map_pfn_alignment - determine the maximum internode alignment
4697 * 4697 *
4698 * This function should be called after node map is populated and sorted. 4698 * This function should be called after node map is populated and sorted.
4699 * It calculates the maximum power of two alignment which can distinguish 4699 * It calculates the maximum power of two alignment which can distinguish
4700 * all the nodes. 4700 * all the nodes.
4701 * 4701 *
4702 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4702 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4703 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4703 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4704 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 4704 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4705 * shifted, 1GiB is enough and this function will indicate so. 4705 * shifted, 1GiB is enough and this function will indicate so.
4706 * 4706 *
4707 * This is used to test whether pfn -> nid mapping of the chosen memory 4707 * This is used to test whether pfn -> nid mapping of the chosen memory
4708 * model has fine enough granularity to avoid incorrect mapping for the 4708 * model has fine enough granularity to avoid incorrect mapping for the
4709 * populated node map. 4709 * populated node map.
4710 * 4710 *
4711 * Returns the determined alignment in pfn's. 0 if there is no alignment 4711 * Returns the determined alignment in pfn's. 0 if there is no alignment
4712 * requirement (single node). 4712 * requirement (single node).
4713 */ 4713 */
4714 unsigned long __init node_map_pfn_alignment(void) 4714 unsigned long __init node_map_pfn_alignment(void)
4715 { 4715 {
4716 unsigned long accl_mask = 0, last_end = 0; 4716 unsigned long accl_mask = 0, last_end = 0;
4717 unsigned long start, end, mask; 4717 unsigned long start, end, mask;
4718 int last_nid = -1; 4718 int last_nid = -1;
4719 int i, nid; 4719 int i, nid;
4720 4720
4721 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 4721 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
4722 if (!start || last_nid < 0 || last_nid == nid) { 4722 if (!start || last_nid < 0 || last_nid == nid) {
4723 last_nid = nid; 4723 last_nid = nid;
4724 last_end = end; 4724 last_end = end;
4725 continue; 4725 continue;
4726 } 4726 }
4727 4727
4728 /* 4728 /*
4729 * Start with a mask granular enough to pin-point to the 4729 * Start with a mask granular enough to pin-point to the
4730 * start pfn and tick off bits one-by-one until it becomes 4730 * start pfn and tick off bits one-by-one until it becomes
4731 * too coarse to separate the current node from the last. 4731 * too coarse to separate the current node from the last.
4732 */ 4732 */
4733 mask = ~((1 << __ffs(start)) - 1); 4733 mask = ~((1 << __ffs(start)) - 1);
4734 while (mask && last_end <= (start & (mask << 1))) 4734 while (mask && last_end <= (start & (mask << 1)))
4735 mask <<= 1; 4735 mask <<= 1;
4736 4736
4737 /* accumulate all internode masks */ 4737 /* accumulate all internode masks */
4738 accl_mask |= mask; 4738 accl_mask |= mask;
4739 } 4739 }
4740 4740
4741 /* convert mask to number of pages */ 4741 /* convert mask to number of pages */
4742 return ~accl_mask + 1; 4742 return ~accl_mask + 1;
4743 } 4743 }
4744 4744
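The mask walk in node_map_pfn_alignment() can be exercised in isolation. The sketch below hard-codes a two-node PFN layout (invented values) instead of iterating memblock, and uses the GCC/Clang __builtin_ctzl() in place of the kernel's __ffs(); it mirrors the listed loop, including the fact that last_nid/last_end are only refreshed in the first branch. For node 0 at [0x0, 0x40000) and node 1 starting at 0x48000 it reports an alignment of 0x40000 PFNs.

/* Sketch of the node_map_pfn_alignment() mask walk over a hard-coded
 * two-node layout; the real code iterates the memblock ranges. */
#include <stdio.h>

struct range { unsigned long start, end; int nid; };

int main(void)
{
        struct range map[] = {
                { 0x00000, 0x40000, 0 },
                { 0x48000, 0x88000, 1 },
        };
        unsigned long accl_mask = 0, last_end = 0;
        int last_nid = -1;
        unsigned int i;

        for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
                unsigned long start = map[i].start, end = map[i].end;
                unsigned long mask;
                int nid = map[i].nid;

                if (!start || last_nid < 0 || last_nid == nid) {
                        last_nid = nid;
                        last_end = end;
                        continue;
                }

                /* shrink the mask until it can no longer separate the nodes */
                mask = ~((1UL << __builtin_ctzl(start)) - 1);
                while (mask && last_end <= (start & (mask << 1)))
                        mask <<= 1;

                accl_mask |= mask;
        }

        printf("alignment: %#lx pfns\n", ~accl_mask + 1);
        return 0;
}
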
4745 /* Find the lowest pfn for a node */ 4745 /* Find the lowest pfn for a node */
4746 static unsigned long __init find_min_pfn_for_node(int nid) 4746 static unsigned long __init find_min_pfn_for_node(int nid)
4747 { 4747 {
4748 unsigned long min_pfn = ULONG_MAX; 4748 unsigned long min_pfn = ULONG_MAX;
4749 unsigned long start_pfn; 4749 unsigned long start_pfn;
4750 int i; 4750 int i;
4751 4751
4752 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 4752 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
4753 min_pfn = min(min_pfn, start_pfn); 4753 min_pfn = min(min_pfn, start_pfn);
4754 4754
4755 if (min_pfn == ULONG_MAX) { 4755 if (min_pfn == ULONG_MAX) {
4756 printk(KERN_WARNING 4756 printk(KERN_WARNING
4757 "Could not find start_pfn for node %d\n", nid); 4757 "Could not find start_pfn for node %d\n", nid);
4758 return 0; 4758 return 0;
4759 } 4759 }
4760 4760
4761 return min_pfn; 4761 return min_pfn;
4762 } 4762 }
4763 4763
4764 /** 4764 /**
4765 * find_min_pfn_with_active_regions - Find the minimum PFN registered 4765 * find_min_pfn_with_active_regions - Find the minimum PFN registered
4766 * 4766 *
4767 * It returns the minimum PFN based on information provided via 4767 * It returns the minimum PFN based on information provided via
4768 * add_active_range(). 4768 * add_active_range().
4769 */ 4769 */
4770 unsigned long __init find_min_pfn_with_active_regions(void) 4770 unsigned long __init find_min_pfn_with_active_regions(void)
4771 { 4771 {
4772 return find_min_pfn_for_node(MAX_NUMNODES); 4772 return find_min_pfn_for_node(MAX_NUMNODES);
4773 } 4773 }
4774 4774
4775 /* 4775 /*
4776 * early_calculate_totalpages() 4776 * early_calculate_totalpages()
4777 * Sum pages in active regions for movable zone. 4777 * Sum pages in active regions for movable zone.
4778 * Populate N_MEMORY for calculating usable_nodes. 4778 * Populate N_MEMORY for calculating usable_nodes.
4779 */ 4779 */
4780 static unsigned long __init early_calculate_totalpages(void) 4780 static unsigned long __init early_calculate_totalpages(void)
4781 { 4781 {
4782 unsigned long totalpages = 0; 4782 unsigned long totalpages = 0;
4783 unsigned long start_pfn, end_pfn; 4783 unsigned long start_pfn, end_pfn;
4784 int i, nid; 4784 int i, nid;
4785 4785
4786 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 4786 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
4787 unsigned long pages = end_pfn - start_pfn; 4787 unsigned long pages = end_pfn - start_pfn;
4788 4788
4789 totalpages += pages; 4789 totalpages += pages;
4790 if (pages) 4790 if (pages)
4791 node_set_state(nid, N_MEMORY); 4791 node_set_state(nid, N_MEMORY);
4792 } 4792 }
4793 return totalpages; 4793 return totalpages;
4794 } 4794 }
4795 4795
4796 /* 4796 /*
4797 * Find the PFN the Movable zone begins in each node. Kernel memory 4797 * Find the PFN the Movable zone begins in each node. Kernel memory
4798 * is spread evenly between nodes as long as the nodes have enough 4798 * is spread evenly between nodes as long as the nodes have enough
4799 * memory. When they don't, some nodes will have more kernelcore than 4799 * memory. When they don't, some nodes will have more kernelcore than
4800 * others. 4800 * others.
4801 */ 4801 */
4802 static void __init find_zone_movable_pfns_for_nodes(void) 4802 static void __init find_zone_movable_pfns_for_nodes(void)
4803 { 4803 {
4804 int i, nid; 4804 int i, nid;
4805 unsigned long usable_startpfn; 4805 unsigned long usable_startpfn;
4806 unsigned long kernelcore_node, kernelcore_remaining; 4806 unsigned long kernelcore_node, kernelcore_remaining;
4807 /* save the state before borrow the nodemask */ 4807 /* save the state before borrow the nodemask */
4808 nodemask_t saved_node_state = node_states[N_MEMORY]; 4808 nodemask_t saved_node_state = node_states[N_MEMORY];
4809 unsigned long totalpages = early_calculate_totalpages(); 4809 unsigned long totalpages = early_calculate_totalpages();
4810 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 4810 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
4811 4811
4812 /* 4812 /*
4813 * If movablecore was specified, calculate what size of 4813 * If movablecore was specified, calculate what size of
4814 * kernelcore that corresponds so that memory usable for 4814 * kernelcore that corresponds so that memory usable for
4815 * any allocation type is evenly spread. If both kernelcore 4815 * any allocation type is evenly spread. If both kernelcore
4816 * and movablecore are specified, then the value of kernelcore 4816 * and movablecore are specified, then the value of kernelcore
4817 * will be used for required_kernelcore if it's greater than 4817 * will be used for required_kernelcore if it's greater than
4818 * what movablecore would have allowed. 4818 * what movablecore would have allowed.
4819 */ 4819 */
4820 if (required_movablecore) { 4820 if (required_movablecore) {
4821 unsigned long corepages; 4821 unsigned long corepages;
4822 4822
4823 /* 4823 /*
4824 * Round-up so that ZONE_MOVABLE is at least as large as what 4824 * Round-up so that ZONE_MOVABLE is at least as large as what
4825 * was requested by the user 4825 * was requested by the user
4826 */ 4826 */
4827 required_movablecore = 4827 required_movablecore =
4828 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 4828 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
4829 corepages = totalpages - required_movablecore; 4829 corepages = totalpages - required_movablecore;
4830 4830
4831 required_kernelcore = max(required_kernelcore, corepages); 4831 required_kernelcore = max(required_kernelcore, corepages);
4832 } 4832 }
4833 4833
4834 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 4834 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
4835 if (!required_kernelcore) 4835 if (!required_kernelcore)
4836 goto out; 4836 goto out;
4837 4837
4838 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 4838 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
4839 find_usable_zone_for_movable(); 4839 find_usable_zone_for_movable();
4840 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 4840 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
4841 4841
4842 restart: 4842 restart:
4843 /* Spread kernelcore memory as evenly as possible throughout nodes */ 4843 /* Spread kernelcore memory as evenly as possible throughout nodes */
4844 kernelcore_node = required_kernelcore / usable_nodes; 4844 kernelcore_node = required_kernelcore / usable_nodes;
4845 for_each_node_state(nid, N_MEMORY) { 4845 for_each_node_state(nid, N_MEMORY) {
4846 unsigned long start_pfn, end_pfn; 4846 unsigned long start_pfn, end_pfn;
4847 4847
4848 /* 4848 /*
4849 * Recalculate kernelcore_node if the division per node 4849 * Recalculate kernelcore_node if the division per node
4850 * now exceeds what is necessary to satisfy the requested 4850 * now exceeds what is necessary to satisfy the requested
4851 * amount of memory for the kernel 4851 * amount of memory for the kernel
4852 */ 4852 */
4853 if (required_kernelcore < kernelcore_node) 4853 if (required_kernelcore < kernelcore_node)
4854 kernelcore_node = required_kernelcore / usable_nodes; 4854 kernelcore_node = required_kernelcore / usable_nodes;
4855 4855
4856 /* 4856 /*
4857 * As the map is walked, we track how much memory is usable 4857 * As the map is walked, we track how much memory is usable
4858 * by the kernel using kernelcore_remaining. When it is 4858 * by the kernel using kernelcore_remaining. When it is
4859 * 0, the rest of the node is usable by ZONE_MOVABLE 4859 * 0, the rest of the node is usable by ZONE_MOVABLE
4860 */ 4860 */
4861 kernelcore_remaining = kernelcore_node; 4861 kernelcore_remaining = kernelcore_node;
4862 4862
4863 /* Go through each range of PFNs within this node */ 4863 /* Go through each range of PFNs within this node */
4864 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4864 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4865 unsigned long size_pages; 4865 unsigned long size_pages;
4866 4866
4867 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 4867 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
4868 if (start_pfn >= end_pfn) 4868 if (start_pfn >= end_pfn)
4869 continue; 4869 continue;
4870 4870
4871 /* Account for what is only usable for kernelcore */ 4871 /* Account for what is only usable for kernelcore */
4872 if (start_pfn < usable_startpfn) { 4872 if (start_pfn < usable_startpfn) {
4873 unsigned long kernel_pages; 4873 unsigned long kernel_pages;
4874 kernel_pages = min(end_pfn, usable_startpfn) 4874 kernel_pages = min(end_pfn, usable_startpfn)
4875 - start_pfn; 4875 - start_pfn;
4876 4876
4877 kernelcore_remaining -= min(kernel_pages, 4877 kernelcore_remaining -= min(kernel_pages,
4878 kernelcore_remaining); 4878 kernelcore_remaining);
4879 required_kernelcore -= min(kernel_pages, 4879 required_kernelcore -= min(kernel_pages,
4880 required_kernelcore); 4880 required_kernelcore);
4881 4881
4882 /* Continue if range is now fully accounted */ 4882 /* Continue if range is now fully accounted */
4883 if (end_pfn <= usable_startpfn) { 4883 if (end_pfn <= usable_startpfn) {
4884 4884
4885 /* 4885 /*
4886 * Push zone_movable_pfn to the end so 4886 * Push zone_movable_pfn to the end so
4887 * that if we have to rebalance 4887 * that if we have to rebalance
4888 * kernelcore across nodes, we will 4888 * kernelcore across nodes, we will
4889 * not double account here 4889 * not double account here
4890 */ 4890 */
4891 zone_movable_pfn[nid] = end_pfn; 4891 zone_movable_pfn[nid] = end_pfn;
4892 continue; 4892 continue;
4893 } 4893 }
4894 start_pfn = usable_startpfn; 4894 start_pfn = usable_startpfn;
4895 } 4895 }
4896 4896
4897 /* 4897 /*
4898 * The usable PFN range for ZONE_MOVABLE is from 4898 * The usable PFN range for ZONE_MOVABLE is from
4899 * start_pfn->end_pfn. Calculate size_pages as the 4899 * start_pfn->end_pfn. Calculate size_pages as the
4900 * number of pages used as kernelcore 4900 * number of pages used as kernelcore
4901 */ 4901 */
4902 size_pages = end_pfn - start_pfn; 4902 size_pages = end_pfn - start_pfn;
4903 if (size_pages > kernelcore_remaining) 4903 if (size_pages > kernelcore_remaining)
4904 size_pages = kernelcore_remaining; 4904 size_pages = kernelcore_remaining;
4905 zone_movable_pfn[nid] = start_pfn + size_pages; 4905 zone_movable_pfn[nid] = start_pfn + size_pages;
4906 4906
4907 /* 4907 /*
4908 * Some kernelcore has been met, update counts and 4908 * Some kernelcore has been met, update counts and
4909 * break if the kernelcore for this node has been 4909 * break if the kernelcore for this node has been
4910 * satisfied 4910 * satisfied
4911 */ 4911 */
4912 required_kernelcore -= min(required_kernelcore, 4912 required_kernelcore -= min(required_kernelcore,
4913 size_pages); 4913 size_pages);
4914 kernelcore_remaining -= size_pages; 4914 kernelcore_remaining -= size_pages;
4915 if (!kernelcore_remaining) 4915 if (!kernelcore_remaining)
4916 break; 4916 break;
4917 } 4917 }
4918 } 4918 }
4919 4919
4920 /* 4920 /*
4921 * If there is still required_kernelcore, we do another pass with one 4921 * If there is still required_kernelcore, we do another pass with one
4922 * less node in the count. This will push zone_movable_pfn[nid] further 4922 * less node in the count. This will push zone_movable_pfn[nid] further
4923 * along on the nodes that still have memory until kernelcore is 4923 * along on the nodes that still have memory until kernelcore is
4924 * satisfied 4924 * satisfied
4925 */ 4925 */
4926 usable_nodes--; 4926 usable_nodes--;
4927 if (usable_nodes && required_kernelcore > usable_nodes) 4927 if (usable_nodes && required_kernelcore > usable_nodes)
4928 goto restart; 4928 goto restart;
4929 4929
4930 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 4930 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
4931 for (nid = 0; nid < MAX_NUMNODES; nid++) 4931 for (nid = 0; nid < MAX_NUMNODES; nid++)
4932 zone_movable_pfn[nid] = 4932 zone_movable_pfn[nid] =
4933 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 4933 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4934 4934
4935 out: 4935 out:
4936 /* restore the node_state */ 4936 /* restore the node_state */
4937 node_states[N_MEMORY] = saved_node_state; 4937 node_states[N_MEMORY] = saved_node_state;
4938 } 4938 }
4939 4939
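A deliberately simplified model of the kernelcore spreading above, not the exact kernel loop: two invented nodes, one too small for its even share, so the leftover is pushed onto the remaining node in a second pass (the code above achieves this through the restart label and the usable_nodes-- retry).

/* Toy walk-through of spreading required_kernelcore across two nodes;
 * all page counts are invented. */
#include <stdio.h>

int main(void)
{
        unsigned long node_pages[2] = { 65536, 262144 }; /* usable per node */
        unsigned long required_kernelcore = 262144;      /* requested total */
        unsigned long kernelcore_in_node[2] = { 0, 0 };
        int usable_nodes = 2;

        while (required_kernelcore && usable_nodes) {
                unsigned long share = required_kernelcore / usable_nodes;
                int nid;

                for (nid = 0; nid < 2; nid++) {
                        unsigned long room = node_pages[nid] -
                                             kernelcore_in_node[nid];
                        unsigned long take = share < room ? share : room;

                        if (take > required_kernelcore)
                                take = required_kernelcore;
                        kernelcore_in_node[nid] += take;
                        required_kernelcore -= take;
                }
                usable_nodes--;   /* mimic "restart with one less node" */
        }

        printf("kernelcore: node0=%lu node1=%lu pages (rest is ZONE_MOVABLE)\n",
               kernelcore_in_node[0], kernelcore_in_node[1]);
        return 0;
}
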
4940 /* Any regular or high memory on that node? */ 4940 /* Any regular or high memory on that node? */
4941 static void check_for_memory(pg_data_t *pgdat, int nid) 4941 static void check_for_memory(pg_data_t *pgdat, int nid)
4942 { 4942 {
4943 enum zone_type zone_type; 4943 enum zone_type zone_type;
4944 4944
4945 if (N_MEMORY == N_NORMAL_MEMORY) 4945 if (N_MEMORY == N_NORMAL_MEMORY)
4946 return; 4946 return;
4947 4947
4948 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 4948 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
4949 struct zone *zone = &pgdat->node_zones[zone_type]; 4949 struct zone *zone = &pgdat->node_zones[zone_type];
4950 if (zone->present_pages) { 4950 if (zone->present_pages) {
4951 node_set_state(nid, N_HIGH_MEMORY); 4951 node_set_state(nid, N_HIGH_MEMORY);
4952 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 4952 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
4953 zone_type <= ZONE_NORMAL) 4953 zone_type <= ZONE_NORMAL)
4954 node_set_state(nid, N_NORMAL_MEMORY); 4954 node_set_state(nid, N_NORMAL_MEMORY);
4955 break; 4955 break;
4956 } 4956 }
4957 } 4957 }
4958 } 4958 }
4959 4959
4960 /** 4960 /**
4961 * free_area_init_nodes - Initialise all pg_data_t and zone data 4961 * free_area_init_nodes - Initialise all pg_data_t and zone data
4962 * @max_zone_pfn: an array of max PFNs for each zone 4962 * @max_zone_pfn: an array of max PFNs for each zone
4963 * 4963 *
4964 * This will call free_area_init_node() for each active node in the system. 4964 * This will call free_area_init_node() for each active node in the system.
4965 * Using the page ranges provided by add_active_range(), the size of each 4965 * Using the page ranges provided by add_active_range(), the size of each
4966 * zone in each node and their holes is calculated. If the maximum PFN 4966 * zone in each node and their holes is calculated. If the maximum PFN
4967 * between two adjacent zones match, it is assumed that the zone is empty. 4967 * between two adjacent zones match, it is assumed that the zone is empty.
4968 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 4968 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
4969 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 4969 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
4970 * starts where the previous one ended. For example, ZONE_DMA32 starts 4970 * starts where the previous one ended. For example, ZONE_DMA32 starts
4971 * at arch_max_dma_pfn. 4971 * at arch_max_dma_pfn.
4972 */ 4972 */
4973 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 4973 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4974 { 4974 {
4975 unsigned long start_pfn, end_pfn; 4975 unsigned long start_pfn, end_pfn;
4976 int i, nid; 4976 int i, nid;
4977 4977
4978 /* Record where the zone boundaries are */ 4978 /* Record where the zone boundaries are */
4979 memset(arch_zone_lowest_possible_pfn, 0, 4979 memset(arch_zone_lowest_possible_pfn, 0,
4980 sizeof(arch_zone_lowest_possible_pfn)); 4980 sizeof(arch_zone_lowest_possible_pfn));
4981 memset(arch_zone_highest_possible_pfn, 0, 4981 memset(arch_zone_highest_possible_pfn, 0,
4982 sizeof(arch_zone_highest_possible_pfn)); 4982 sizeof(arch_zone_highest_possible_pfn));
4983 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 4983 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
4984 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 4984 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
4985 for (i = 1; i < MAX_NR_ZONES; i++) { 4985 for (i = 1; i < MAX_NR_ZONES; i++) {
4986 if (i == ZONE_MOVABLE) 4986 if (i == ZONE_MOVABLE)
4987 continue; 4987 continue;
4988 arch_zone_lowest_possible_pfn[i] = 4988 arch_zone_lowest_possible_pfn[i] =
4989 arch_zone_highest_possible_pfn[i-1]; 4989 arch_zone_highest_possible_pfn[i-1];
4990 arch_zone_highest_possible_pfn[i] = 4990 arch_zone_highest_possible_pfn[i] =
4991 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 4991 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
4992 } 4992 }
4993 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 4993 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
4994 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 4994 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
4995 4995
4996 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 4996 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
4997 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 4997 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
4998 find_zone_movable_pfns_for_nodes(); 4998 find_zone_movable_pfns_for_nodes();
4999 4999
5000 /* Print out the zone ranges */ 5000 /* Print out the zone ranges */
5001 printk("Zone ranges:\n"); 5001 printk("Zone ranges:\n");
5002 for (i = 0; i < MAX_NR_ZONES; i++) { 5002 for (i = 0; i < MAX_NR_ZONES; i++) {
5003 if (i == ZONE_MOVABLE) 5003 if (i == ZONE_MOVABLE)
5004 continue; 5004 continue;
5005 printk(KERN_CONT " %-8s ", zone_names[i]); 5005 printk(KERN_CONT " %-8s ", zone_names[i]);
5006 if (arch_zone_lowest_possible_pfn[i] == 5006 if (arch_zone_lowest_possible_pfn[i] ==
5007 arch_zone_highest_possible_pfn[i]) 5007 arch_zone_highest_possible_pfn[i])
5008 printk(KERN_CONT "empty\n"); 5008 printk(KERN_CONT "empty\n");
5009 else 5009 else
5010 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5010 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
5011 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5011 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5012 (arch_zone_highest_possible_pfn[i] 5012 (arch_zone_highest_possible_pfn[i]
5013 << PAGE_SHIFT) - 1); 5013 << PAGE_SHIFT) - 1);
5014 } 5014 }
5015 5015
5016 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5016 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5017 printk("Movable zone start for each node\n"); 5017 printk("Movable zone start for each node\n");
5018 for (i = 0; i < MAX_NUMNODES; i++) { 5018 for (i = 0; i < MAX_NUMNODES; i++) {
5019 if (zone_movable_pfn[i]) 5019 if (zone_movable_pfn[i])
5020 printk(" Node %d: %#010lx\n", i, 5020 printk(" Node %d: %#010lx\n", i,
5021 zone_movable_pfn[i] << PAGE_SHIFT); 5021 zone_movable_pfn[i] << PAGE_SHIFT);
5022 } 5022 }
5023 5023
5024 /* Print out the early node map */ 5024 /* Print out the early node map */
5025 printk("Early memory node ranges\n"); 5025 printk("Early memory node ranges\n");
5026 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5026 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5027 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5027 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5028 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5028 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5029 5029
5030 /* Initialise every node */ 5030 /* Initialise every node */
5031 mminit_verify_pageflags_layout(); 5031 mminit_verify_pageflags_layout();
5032 setup_nr_node_ids(); 5032 setup_nr_node_ids();
5033 for_each_online_node(nid) { 5033 for_each_online_node(nid) {
5034 pg_data_t *pgdat = NODE_DATA(nid); 5034 pg_data_t *pgdat = NODE_DATA(nid);
5035 free_area_init_node(nid, NULL, 5035 free_area_init_node(nid, NULL,
5036 find_min_pfn_for_node(nid), NULL); 5036 find_min_pfn_for_node(nid), NULL);
5037 5037
5038 /* Any memory on that node */ 5038 /* Any memory on that node */
5039 if (pgdat->node_present_pages) 5039 if (pgdat->node_present_pages)
5040 node_set_state(nid, N_MEMORY); 5040 node_set_state(nid, N_MEMORY);
5041 check_for_memory(pgdat, nid); 5041 check_for_memory(pgdat, nid);
5042 } 5042 }
5043 } 5043 }
5044 5044
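The zone-boundary stitching at the top of free_area_init_nodes() amounts to: zone 0 starts at the lowest registered PFN, and every later zone starts where the previous one ended, clamped so its end never moves backwards. A sketch with three invented max_zone_pfn values (the ZONE_MOVABLE special-casing is omitted):

/* Sketch of the zone-boundary derivation from max_zone_pfn[];
 * the three values below are invented. */
#include <stdio.h>

#define NR 3

int main(void)
{
        unsigned long max_zone_pfn[NR] = { 0x1000, 0x100000, 0x440000 };
        unsigned long lo[NR], hi[NR];
        int i;

        lo[0] = 0;                   /* find_min_pfn_with_active_regions() */
        hi[0] = max_zone_pfn[0];
        for (i = 1; i < NR; i++) {
                lo[i] = hi[i - 1];   /* each zone starts where the last ended */
                hi[i] = max_zone_pfn[i] > lo[i] ? max_zone_pfn[i] : lo[i];
        }

        for (i = 0; i < NR; i++)
                printf("zone %d: [%#lx, %#lx)\n", i, lo[i], hi[i]);
        return 0;
}
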
5045 static int __init cmdline_parse_core(char *p, unsigned long *core) 5045 static int __init cmdline_parse_core(char *p, unsigned long *core)
5046 { 5046 {
5047 unsigned long long coremem; 5047 unsigned long long coremem;
5048 if (!p) 5048 if (!p)
5049 return -EINVAL; 5049 return -EINVAL;
5050 5050
5051 coremem = memparse(p, &p); 5051 coremem = memparse(p, &p);
5052 *core = coremem >> PAGE_SHIFT; 5052 *core = coremem >> PAGE_SHIFT;
5053 5053
5054 /* Paranoid check that UL is enough for the coremem value */ 5054 /* Paranoid check that UL is enough for the coremem value */
5055 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 5055 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
5056 5056
5057 return 0; 5057 return 0;
5058 } 5058 }
5059 5059
5060 /* 5060 /*
5061 * kernelcore=size sets the amount of memory for use for allocations that 5061 * kernelcore=size sets the amount of memory for use for allocations that
5062 * cannot be reclaimed or migrated. 5062 * cannot be reclaimed or migrated.
5063 */ 5063 */
5064 static int __init cmdline_parse_kernelcore(char *p) 5064 static int __init cmdline_parse_kernelcore(char *p)
5065 { 5065 {
5066 return cmdline_parse_core(p, &required_kernelcore); 5066 return cmdline_parse_core(p, &required_kernelcore);
5067 } 5067 }
5068 5068
5069 /* 5069 /*
5070 * movablecore=size sets the amount of memory for use for allocations that 5070 * movablecore=size sets the amount of memory for use for allocations that
5071 * can be reclaimed or migrated. 5071 * can be reclaimed or migrated.
5072 */ 5072 */
5073 static int __init cmdline_parse_movablecore(char *p) 5073 static int __init cmdline_parse_movablecore(char *p)
5074 { 5074 {
5075 return cmdline_parse_core(p, &required_movablecore); 5075 return cmdline_parse_core(p, &required_movablecore);
5076 } 5076 }
5077 5077
5078 early_param("kernelcore", cmdline_parse_kernelcore); 5078 early_param("kernelcore", cmdline_parse_kernelcore);
5079 early_param("movablecore", cmdline_parse_movablecore); 5079 early_param("movablecore", cmdline_parse_movablecore);
5080 5080
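For a concrete feel of what cmdline_parse_core() stores: a boot option such as kernelcore=512M is turned into bytes by memparse() and shifted down by PAGE_SHIFT. The sketch below hard-codes the byte value (memparse() is kernel-only) and assumes 4 KiB pages, giving 131072 pages.

/* What cmdline_parse_core() ends up storing for "kernelcore=512M",
 * assuming 4 KiB pages. */
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        unsigned long long coremem = 512ULL << 20;   /* "512M" in bytes */
        unsigned long required_kernelcore =
                (unsigned long)(coremem >> PAGE_SHIFT);

        printf("required_kernelcore = %lu pages\n", required_kernelcore);
        return 0;
}
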
5081 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5081 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5082 5082
5083 /** 5083 /**
5084 * set_dma_reserve - set the specified number of pages reserved in the first zone 5084 * set_dma_reserve - set the specified number of pages reserved in the first zone
5085 * @new_dma_reserve: The number of pages to mark reserved 5085 * @new_dma_reserve: The number of pages to mark reserved
5086 * 5086 *
5087 * The per-cpu batchsize and zone watermarks are determined by present_pages. 5087 * The per-cpu batchsize and zone watermarks are determined by present_pages.
5088 * In the DMA zone, a significant percentage may be consumed by kernel image 5088 * In the DMA zone, a significant percentage may be consumed by kernel image
5089 * and other unfreeable allocations which can skew the watermarks badly. This 5089 * and other unfreeable allocations which can skew the watermarks badly. This
5090 * function may optionally be used to account for unfreeable pages in the 5090 * function may optionally be used to account for unfreeable pages in the
5091 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 5091 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
5092 * smaller per-cpu batchsize. 5092 * smaller per-cpu batchsize.
5093 */ 5093 */
5094 void __init set_dma_reserve(unsigned long new_dma_reserve) 5094 void __init set_dma_reserve(unsigned long new_dma_reserve)
5095 { 5095 {
5096 dma_reserve = new_dma_reserve; 5096 dma_reserve = new_dma_reserve;
5097 } 5097 }
5098 5098
5099 void __init free_area_init(unsigned long *zones_size) 5099 void __init free_area_init(unsigned long *zones_size)
5100 { 5100 {
5101 free_area_init_node(0, zones_size, 5101 free_area_init_node(0, zones_size,
5102 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 5102 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
5103 } 5103 }
5104 5104
5105 static int page_alloc_cpu_notify(struct notifier_block *self, 5105 static int page_alloc_cpu_notify(struct notifier_block *self,
5106 unsigned long action, void *hcpu) 5106 unsigned long action, void *hcpu)
5107 { 5107 {
5108 int cpu = (unsigned long)hcpu; 5108 int cpu = (unsigned long)hcpu;
5109 5109
5110 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 5110 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
5111 lru_add_drain_cpu(cpu); 5111 lru_add_drain_cpu(cpu);
5112 drain_pages(cpu); 5112 drain_pages(cpu);
5113 5113
5114 /* 5114 /*
5115 * Spill the event counters of the dead processor 5115 * Spill the event counters of the dead processor
5116 * into the current processors event counters. 5116 * into the current processors event counters.
5117 * This artificially elevates the count of the current 5117 * This artificially elevates the count of the current
5118 * processor. 5118 * processor.
5119 */ 5119 */
5120 vm_events_fold_cpu(cpu); 5120 vm_events_fold_cpu(cpu);
5121 5121
5122 /* 5122 /*
5123 * Zero the differential counters of the dead processor 5123 * Zero the differential counters of the dead processor
5124 * so that the vm statistics are consistent. 5124 * so that the vm statistics are consistent.
5125 * 5125 *
5126 * This is only okay since the processor is dead and cannot 5126 * This is only okay since the processor is dead and cannot
5127 * race with what we are doing. 5127 * race with what we are doing.
5128 */ 5128 */
5129 refresh_cpu_vm_stats(cpu); 5129 refresh_cpu_vm_stats(cpu);
5130 } 5130 }
5131 return NOTIFY_OK; 5131 return NOTIFY_OK;
5132 } 5132 }
5133 5133
5134 void __init page_alloc_init(void) 5134 void __init page_alloc_init(void)
5135 { 5135 {
5136 hotcpu_notifier(page_alloc_cpu_notify, 0); 5136 hotcpu_notifier(page_alloc_cpu_notify, 0);
5137 } 5137 }
5138 5138
5139 /* 5139 /*
5140 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 5140 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
5141 * or min_free_kbytes changes. 5141 * or min_free_kbytes changes.
5142 */ 5142 */
5143 static void calculate_totalreserve_pages(void) 5143 static void calculate_totalreserve_pages(void)
5144 { 5144 {
5145 struct pglist_data *pgdat; 5145 struct pglist_data *pgdat;
5146 unsigned long reserve_pages = 0; 5146 unsigned long reserve_pages = 0;
5147 enum zone_type i, j; 5147 enum zone_type i, j;
5148 5148
5149 for_each_online_pgdat(pgdat) { 5149 for_each_online_pgdat(pgdat) {
5150 for (i = 0; i < MAX_NR_ZONES; i++) { 5150 for (i = 0; i < MAX_NR_ZONES; i++) {
5151 struct zone *zone = pgdat->node_zones + i; 5151 struct zone *zone = pgdat->node_zones + i;
5152 unsigned long max = 0; 5152 unsigned long max = 0;
5153 5153
5154 /* Find valid and maximum lowmem_reserve in the zone */ 5154 /* Find valid and maximum lowmem_reserve in the zone */
5155 for (j = i; j < MAX_NR_ZONES; j++) { 5155 for (j = i; j < MAX_NR_ZONES; j++) {
5156 if (zone->lowmem_reserve[j] > max) 5156 if (zone->lowmem_reserve[j] > max)
5157 max = zone->lowmem_reserve[j]; 5157 max = zone->lowmem_reserve[j];
5158 } 5158 }
5159 5159
5160 /* we treat the high watermark as reserved pages. */ 5160 /* we treat the high watermark as reserved pages. */
5161 max += high_wmark_pages(zone); 5161 max += high_wmark_pages(zone);
5162 5162
5163 if (max > zone->present_pages) 5163 if (max > zone->present_pages)
5164 max = zone->present_pages; 5164 max = zone->present_pages;
5165 reserve_pages += max; 5165 reserve_pages += max;
5166 /* 5166 /*
5167 * Lowmem reserves are not available to 5167 * Lowmem reserves are not available to
5168 * GFP_HIGHUSER page cache allocations and 5168 * GFP_HIGHUSER page cache allocations and
5169 * kswapd tries to balance zones to their high 5169 * kswapd tries to balance zones to their high
5170 * watermark. As a result, neither should be 5170 * watermark. As a result, neither should be
5171 * regarded as dirtyable memory, to prevent a 5171 * regarded as dirtyable memory, to prevent a
5172 * situation where reclaim has to clean pages 5172 * situation where reclaim has to clean pages
5173 * in order to balance the zones. 5173 * in order to balance the zones.
5174 */ 5174 */
5175 zone->dirty_balance_reserve = max; 5175 zone->dirty_balance_reserve = max;
5176 } 5176 }
5177 } 5177 }
5178 dirty_balance_reserve = reserve_pages; 5178 dirty_balance_reserve = reserve_pages;
5179 totalreserve_pages = reserve_pages; 5179 totalreserve_pages = reserve_pages;
5180 } 5180 }
5181 5181
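Per zone, the reserve counted above is the largest lowmem_reserve[] entry plus the high watermark, clamped to present_pages; these contributions are then summed over all zones. A worked example with invented numbers:

/* One zone's contribution to totalreserve_pages; numbers are invented. */
#include <stdio.h>

int main(void)
{
        unsigned long lowmem_reserve[3] = { 0, 784, 12544 };
        unsigned long high_wmark = 3072;
        unsigned long present_pages = 200000;
        unsigned long max = 0;
        int j;

        for (j = 0; j < 3; j++)
                if (lowmem_reserve[j] > max)
                        max = lowmem_reserve[j];

        max += high_wmark;
        if (max > present_pages)
                max = present_pages;

        printf("this zone contributes %lu reserved pages\n", max);
        return 0;
}
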
5182 /* 5182 /*
5183 * setup_per_zone_lowmem_reserve - called whenever 5183 * setup_per_zone_lowmem_reserve - called whenever
5184 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 5184 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
5185 * has a correct pages reserved value, so an adequate number of 5185 * has a correct pages reserved value, so an adequate number of
5186 * pages are left in the zone after a successful __alloc_pages(). 5186 * pages are left in the zone after a successful __alloc_pages().
5187 */ 5187 */
5188 static void setup_per_zone_lowmem_reserve(void) 5188 static void setup_per_zone_lowmem_reserve(void)
5189 { 5189 {
5190 struct pglist_data *pgdat; 5190 struct pglist_data *pgdat;
5191 enum zone_type j, idx; 5191 enum zone_type j, idx;
5192 5192
5193 for_each_online_pgdat(pgdat) { 5193 for_each_online_pgdat(pgdat) {
5194 for (j = 0; j < MAX_NR_ZONES; j++) { 5194 for (j = 0; j < MAX_NR_ZONES; j++) {
5195 struct zone *zone = pgdat->node_zones + j; 5195 struct zone *zone = pgdat->node_zones + j;
5196 unsigned long present_pages = zone->present_pages; 5196 unsigned long present_pages = zone->present_pages;
5197 5197
5198 zone->lowmem_reserve[j] = 0; 5198 zone->lowmem_reserve[j] = 0;
5199 5199
5200 idx = j; 5200 idx = j;
5201 while (idx) { 5201 while (idx) {
5202 struct zone *lower_zone; 5202 struct zone *lower_zone;
5203 5203
5204 idx--; 5204 idx--;
5205 5205
5206 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5206 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5207 sysctl_lowmem_reserve_ratio[idx] = 1; 5207 sysctl_lowmem_reserve_ratio[idx] = 1;
5208 5208
5209 lower_zone = pgdat->node_zones + idx; 5209 lower_zone = pgdat->node_zones + idx;
5210 lower_zone->lowmem_reserve[j] = present_pages / 5210 lower_zone->lowmem_reserve[j] = present_pages /
5211 sysctl_lowmem_reserve_ratio[idx]; 5211 sysctl_lowmem_reserve_ratio[idx];
5212 present_pages += lower_zone->present_pages; 5212 present_pages += lower_zone->present_pages;
5213 } 5213 }
5214 } 5214 }
5215 } 5215 }
5216 5216
5217 /* update totalreserve_pages */ 5217 /* update totalreserve_pages */
5218 calculate_totalreserve_pages(); 5218 calculate_totalreserve_pages();
5219 } 5219 }
5220 5220
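The inner while loop above fills lowmem_reserve[] so that each lower zone protects present_pages/ratio pages from allocations aimed at a higher zone, with present_pages accumulating as the walk descends. The sketch below uses invented zone sizes and assumes the usual 256/256/32 default ratios:

/* Sketch of the lowmem_reserve[] calculation for three zones
 * (DMA, DMA32, NORMAL); sizes are invented, ratios are assumed defaults. */
#include <stdio.h>

#define NR 3

int main(void)
{
        unsigned long present[NR] = { 3976, 486074, 3604465 };
        int ratio[NR] = { 256, 256, 32 };
        unsigned long reserve[NR][NR] = { { 0 } };
        int j, idx;

        for (j = 0; j < NR; j++) {
                unsigned long pages = present[j];

                for (idx = j - 1; idx >= 0; idx--) {
                        reserve[idx][j] = pages / ratio[idx];
                        pages += present[idx];
                }
        }

        printf("DMA keeps %lu pages out of reach of NORMAL allocations\n",
               reserve[0][NR - 1]);
        return 0;
}
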
5221 static void __setup_per_zone_wmarks(void) 5221 static void __setup_per_zone_wmarks(void)
5222 { 5222 {
5223 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5223 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5224 unsigned long lowmem_pages = 0; 5224 unsigned long lowmem_pages = 0;
5225 struct zone *zone; 5225 struct zone *zone;
5226 unsigned long flags; 5226 unsigned long flags;
5227 5227
5228 /* Calculate total number of !ZONE_HIGHMEM pages */ 5228 /* Calculate total number of !ZONE_HIGHMEM pages */
5229 for_each_zone(zone) { 5229 for_each_zone(zone) {
5230 if (!is_highmem(zone)) 5230 if (!is_highmem(zone))
5231 lowmem_pages += zone->present_pages; 5231 lowmem_pages += zone->present_pages;
5232 } 5232 }
5233 5233
5234 for_each_zone(zone) { 5234 for_each_zone(zone) {
5235 u64 tmp; 5235 u64 tmp;
5236 5236
5237 spin_lock_irqsave(&zone->lock, flags); 5237 spin_lock_irqsave(&zone->lock, flags);
5238 tmp = (u64)pages_min * zone->present_pages; 5238 tmp = (u64)pages_min * zone->present_pages;
5239 do_div(tmp, lowmem_pages); 5239 do_div(tmp, lowmem_pages);
5240 if (is_highmem(zone)) { 5240 if (is_highmem(zone)) {
5241 /* 5241 /*
5242 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5242 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5243 * need highmem pages, so cap pages_min to a small 5243 * need highmem pages, so cap pages_min to a small
5244 * value here. 5244 * value here.
5245 * 5245 *
5246 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5246 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5247 * deltas control async page reclaim, and so should 5247 * deltas control async page reclaim, and so should
5248 * not be capped for highmem. 5248 * not be capped for highmem.
5249 */ 5249 */
5250 int min_pages; 5250 int min_pages;
5251 5251
5252 min_pages = zone->present_pages / 1024; 5252 min_pages = zone->present_pages / 1024;
5253 if (min_pages < SWAP_CLUSTER_MAX) 5253 if (min_pages < SWAP_CLUSTER_MAX)
5254 min_pages = SWAP_CLUSTER_MAX; 5254 min_pages = SWAP_CLUSTER_MAX;
5255 if (min_pages > 128) 5255 if (min_pages > 128)
5256 min_pages = 128; 5256 min_pages = 128;
5257 zone->watermark[WMARK_MIN] = min_pages; 5257 zone->watermark[WMARK_MIN] = min_pages;
5258 } else { 5258 } else {
5259 /* 5259 /*
5260 * If it's a lowmem zone, reserve a number of pages 5260 * If it's a lowmem zone, reserve a number of pages
5261 * proportionate to the zone's size. 5261 * proportionate to the zone's size.
5262 */ 5262 */
5263 zone->watermark[WMARK_MIN] = tmp; 5263 zone->watermark[WMARK_MIN] = tmp;
5264 } 5264 }
5265 5265
5266 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5266 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5267 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5267 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5268 5268
5269 setup_zone_migrate_reserve(zone); 5269 setup_zone_migrate_reserve(zone);
5270 spin_unlock_irqrestore(&zone->lock, flags); 5270 spin_unlock_irqrestore(&zone->lock, flags);
5271 } 5271 }
5272 5272
5273 /* update totalreserve_pages */ 5273 /* update totalreserve_pages */
5274 calculate_totalreserve_pages(); 5274 calculate_totalreserve_pages();
5275 } 5275 }
5276 5276
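For a lowmem zone the arithmetic above reduces to: pages_min = min_free_kbytes >> (PAGE_SHIFT - 10), tmp is that value scaled by the zone's share of total lowmem, WMARK_MIN = tmp, WMARK_LOW = tmp + tmp/4 and WMARK_HIGH = tmp + tmp/2. A worked example assuming 4 KiB pages, min_free_kbytes = 4096 and a single 4 GiB lowmem zone:

/* Worked example of the lowmem watermark arithmetic; inputs are assumed. */
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        unsigned long min_free_kbytes = 4096;
        unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
        unsigned long zone_present = 1048576;   /* this zone, in pages  */
        unsigned long lowmem_pages = 1048576;   /* all !highmem zones   */
        unsigned long long tmp;

        tmp = (unsigned long long)pages_min * zone_present / lowmem_pages;

        printf("WMARK_MIN  = %llu pages\n", tmp);
        printf("WMARK_LOW  = %llu pages\n", tmp + (tmp >> 2));
        printf("WMARK_HIGH = %llu pages\n", tmp + (tmp >> 1));
        return 0;
}
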
5277 /** 5277 /**
5278 * setup_per_zone_wmarks - called when min_free_kbytes changes 5278 * setup_per_zone_wmarks - called when min_free_kbytes changes
5279 * or when memory is hot-{added|removed} 5279 * or when memory is hot-{added|removed}
5280 * 5280 *
5281 * Ensures that the watermark[min,low,high] values for each zone are set 5281 * Ensures that the watermark[min,low,high] values for each zone are set
5282 * correctly with respect to min_free_kbytes. 5282 * correctly with respect to min_free_kbytes.
5283 */ 5283 */
5284 void setup_per_zone_wmarks(void) 5284 void setup_per_zone_wmarks(void)
5285 { 5285 {
5286 mutex_lock(&zonelists_mutex); 5286 mutex_lock(&zonelists_mutex);
5287 __setup_per_zone_wmarks(); 5287 __setup_per_zone_wmarks();
5288 mutex_unlock(&zonelists_mutex); 5288 mutex_unlock(&zonelists_mutex);
5289 } 5289 }
5290 5290
5291 /* 5291 /*
5292 * The inactive anon list should be small enough that the VM never has to 5292 * The inactive anon list should be small enough that the VM never has to
5293 * do too much work, but large enough that each inactive page has a chance 5293 * do too much work, but large enough that each inactive page has a chance
5294 * to be referenced again before it is swapped out. 5294 * to be referenced again before it is swapped out.
5295 * 5295 *
5296 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5296 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5297 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5297 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5298 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5298 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5299 * the anonymous pages are kept on the inactive list. 5299 * the anonymous pages are kept on the inactive list.
5300 * 5300 *
5301 * total target max 5301 * total target max
5302 * memory ratio inactive anon 5302 * memory ratio inactive anon
5303 * ------------------------------------- 5303 * -------------------------------------
5304 * 10MB 1 5MB 5304 * 10MB 1 5MB
5305 * 100MB 1 50MB 5305 * 100MB 1 50MB
5306 * 1GB 3 250MB 5306 * 1GB 3 250MB
5307 * 10GB 10 0.9GB 5307 * 10GB 10 0.9GB
5308 * 100GB 31 3GB 5308 * 100GB 31 3GB
5309 * 1TB 101 10GB 5309 * 1TB 101 10GB
5310 * 10TB 320 32GB 5310 * 10TB 320 32GB
5311 */ 5311 */
5312 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5312 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5313 { 5313 {
5314 unsigned int gb, ratio; 5314 unsigned int gb, ratio;
5315 5315
5316 /* Zone size in gigabytes */ 5316 /* Zone size in gigabytes */
5317 gb = zone->present_pages >> (30 - PAGE_SHIFT); 5317 gb = zone->present_pages >> (30 - PAGE_SHIFT);
5318 if (gb) 5318 if (gb)
5319 ratio = int_sqrt(10 * gb); 5319 ratio = int_sqrt(10 * gb);
5320 else 5320 else
5321 ratio = 1; 5321 ratio = 1;
5322 5322
5323 zone->inactive_ratio = ratio; 5323 zone->inactive_ratio = ratio;
5324 } 5324 }
5325 5325
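The table in the comment above can be reproduced directly from ratio = int_sqrt(10 * gb), falling back to 1 for sub-gigabyte zones. The sketch below uses a naive integer square root in place of the kernel's int_sqrt(); the zone sizes are simply the table rows.

/* Reproduces the inactive_ratio column of the table above. */
#include <stdio.h>

static unsigned long int_sqrt(unsigned long x)
{
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return r;
}

int main(void)
{
        unsigned long sizes_gb[] = { 1, 10, 100, 1024, 10240 };
        unsigned int i;

        for (i = 0; i < sizeof(sizes_gb) / sizeof(sizes_gb[0]); i++) {
                unsigned long ratio = int_sqrt(10 * sizes_gb[i]);

                if (!ratio)
                        ratio = 1;
                printf("%6lu GB -> inactive_ratio %lu\n", sizes_gb[i], ratio);
        }
        return 0;
}
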
5326 static void __meminit setup_per_zone_inactive_ratio(void) 5326 static void __meminit setup_per_zone_inactive_ratio(void)
5327 { 5327 {
5328 struct zone *zone; 5328 struct zone *zone;
5329 5329
5330 for_each_zone(zone) 5330 for_each_zone(zone)
5331 calculate_zone_inactive_ratio(zone); 5331 calculate_zone_inactive_ratio(zone);
5332 } 5332 }
5333 5333
5334 /* 5334 /*
5335 * Initialise min_free_kbytes. 5335 * Initialise min_free_kbytes.
5336 * 5336 *
5337 * For small machines we want it small (128k min). For large machines 5337 * For small machines we want it small (128k min). For large machines
5338 * we want it large (64MB max). But it is not linear, because network 5338 * we want it large (64MB max). But it is not linear, because network
5339 * bandwidth does not increase linearly with machine size. We use 5339 * bandwidth does not increase linearly with machine size. We use
5340 * 5340 *
5341 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5341 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5342 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5342 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5343 * 5343 *
5344 * which yields 5344 * which yields
5345 * 5345 *
5346 * 16MB: 512k 5346 * 16MB: 512k
5347 * 32MB: 724k 5347 * 32MB: 724k
5348 * 64MB: 1024k 5348 * 64MB: 1024k
5349 * 128MB: 1448k 5349 * 128MB: 1448k
5350 * 256MB: 2048k 5350 * 256MB: 2048k
5351 * 512MB: 2896k 5351 * 512MB: 2896k
5352 * 1024MB: 4096k 5352 * 1024MB: 4096k
5353 * 2048MB: 5792k 5353 * 2048MB: 5792k
5354 * 4096MB: 8192k 5354 * 4096MB: 8192k
5355 * 8192MB: 11584k 5355 * 8192MB: 11584k
5356 * 16384MB: 16384k 5356 * 16384MB: 16384k
5357 */ 5357 */
5358 int __meminit init_per_zone_wmark_min(void) 5358 int __meminit init_per_zone_wmark_min(void)
5359 { 5359 {
5360 unsigned long lowmem_kbytes; 5360 unsigned long lowmem_kbytes;
5361 5361
5362 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5362 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5363 5363
5364 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5364 min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5365 if (min_free_kbytes < 128) 5365 if (min_free_kbytes < 128)
5366 min_free_kbytes = 128; 5366 min_free_kbytes = 128;
5367 if (min_free_kbytes > 65536) 5367 if (min_free_kbytes > 65536)
5368 min_free_kbytes = 65536; 5368 min_free_kbytes = 65536;
5369 setup_per_zone_wmarks(); 5369 setup_per_zone_wmarks();
5370 refresh_zone_stat_thresholds(); 5370 refresh_zone_stat_thresholds();
5371 setup_per_zone_lowmem_reserve(); 5371 setup_per_zone_lowmem_reserve();
5372 setup_per_zone_inactive_ratio(); 5372 setup_per_zone_inactive_ratio();
5373 return 0; 5373 return 0;
5374 } 5374 }
5375 module_init(init_per_zone_wmark_min) 5375 module_init(init_per_zone_wmark_min)
5376 5376
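The min_free_kbytes table in the comment above follows from int_sqrt(lowmem_kbytes * 16) clamped to [128, 65536]. The sketch below recomputes a few rows, again with a naive integer square root standing in for the kernel helper.

/* Reproduces part of the min_free_kbytes table above. */
#include <stdio.h>

static unsigned long int_sqrt(unsigned long x)
{
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return r;
}

int main(void)
{
        unsigned long lowmem_mb[] = { 16, 32, 1024, 4096, 16384 };
        unsigned int i;

        for (i = 0; i < sizeof(lowmem_mb) / sizeof(lowmem_mb[0]); i++) {
                unsigned long lowmem_kbytes = lowmem_mb[i] * 1024;
                unsigned long min_free = int_sqrt(lowmem_kbytes * 16);

                if (min_free < 128)
                        min_free = 128;
                if (min_free > 65536)
                        min_free = 65536;
                printf("%6lu MB lowmem -> min_free_kbytes = %lu\n",
                       lowmem_mb[i], min_free);
        }
        return 0;
}
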
5377 /* 5377 /*
5378 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5378 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5379 * that we can call two helper functions whenever min_free_kbytes 5379 * that we can call two helper functions whenever min_free_kbytes
5380 * changes. 5380 * changes.
5381 */ 5381 */
5382 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5382 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5383 void __user *buffer, size_t *length, loff_t *ppos) 5383 void __user *buffer, size_t *length, loff_t *ppos)
5384 { 5384 {
5385 proc_dointvec(table, write, buffer, length, ppos); 5385 proc_dointvec(table, write, buffer, length, ppos);
5386 if (write) 5386 if (write)
5387 setup_per_zone_wmarks(); 5387 setup_per_zone_wmarks();
5388 return 0; 5388 return 0;
5389 } 5389 }
5390 5390
5391 #ifdef CONFIG_NUMA 5391 #ifdef CONFIG_NUMA
5392 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5392 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5393 void __user *buffer, size_t *length, loff_t *ppos) 5393 void __user *buffer, size_t *length, loff_t *ppos)
5394 { 5394 {
5395 struct zone *zone; 5395 struct zone *zone;
5396 int rc; 5396 int rc;
5397 5397
5398 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5398 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5399 if (rc) 5399 if (rc)
5400 return rc; 5400 return rc;
5401 5401
5402 for_each_zone(zone) 5402 for_each_zone(zone)
5403 zone->min_unmapped_pages = (zone->present_pages * 5403 zone->min_unmapped_pages = (zone->present_pages *
5404 sysctl_min_unmapped_ratio) / 100; 5404 sysctl_min_unmapped_ratio) / 100;
5405 return 0; 5405 return 0;
5406 } 5406 }
5407 5407
5408 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5408 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5409 void __user *buffer, size_t *length, loff_t *ppos) 5409 void __user *buffer, size_t *length, loff_t *ppos)
5410 { 5410 {
5411 struct zone *zone; 5411 struct zone *zone;
5412 int rc; 5412 int rc;
5413 5413
5414 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5414 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5415 if (rc) 5415 if (rc)
5416 return rc; 5416 return rc;
5417 5417
5418 for_each_zone(zone) 5418 for_each_zone(zone)
5419 zone->min_slab_pages = (zone->present_pages * 5419 zone->min_slab_pages = (zone->present_pages *
5420 sysctl_min_slab_ratio) / 100; 5420 sysctl_min_slab_ratio) / 100;
5421 return 0; 5421 return 0;
5422 } 5422 }
5423 #endif 5423 #endif
5424 5424
5425 /* 5425 /*
5426 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5426 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5427 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5427 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5428 * whenever sysctl_lowmem_reserve_ratio changes. 5428 * whenever sysctl_lowmem_reserve_ratio changes.
5429 * 5429 *
5430 * The reserve ratio obviously has absolutely no relation with the 5430 * The reserve ratio obviously has absolutely no relation with the
5431 * minimum watermarks. The lowmem reserve ratio can only make sense 5431 * minimum watermarks. The lowmem reserve ratio can only make sense
5432 * as a function of the boot time zone sizes. 5432 * as a function of the boot time zone sizes.
5433 */ 5433 */
5434 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5434 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5435 void __user *buffer, size_t *length, loff_t *ppos) 5435 void __user *buffer, size_t *length, loff_t *ppos)
5436 { 5436 {
5437 proc_dointvec_minmax(table, write, buffer, length, ppos); 5437 proc_dointvec_minmax(table, write, buffer, length, ppos);
5438 setup_per_zone_lowmem_reserve(); 5438 setup_per_zone_lowmem_reserve();
5439 return 0; 5439 return 0;
5440 } 5440 }
5441 5441
5442 /* 5442 /*
5443 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5443 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5444 * cpu. It is the fraction of total pages in each zone that a hot per-cpu pagelist 5444 * cpu. It is the fraction of total pages in each zone that a hot per-cpu pagelist
5445 * can have before it gets flushed back to the buddy allocator. 5445 * can have before it gets flushed back to the buddy allocator.
5446 */ 5446 */
5447 5447
5448 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5448 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5449 void __user *buffer, size_t *length, loff_t *ppos) 5449 void __user *buffer, size_t *length, loff_t *ppos)
5450 { 5450 {
5451 struct zone *zone; 5451 struct zone *zone;
5452 unsigned int cpu; 5452 unsigned int cpu;
5453 int ret; 5453 int ret;
5454 5454
5455 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5455 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5456 if (!write || (ret < 0)) 5456 if (!write || (ret < 0))
5457 return ret; 5457 return ret;
5458 for_each_populated_zone(zone) { 5458 for_each_populated_zone(zone) {
5459 for_each_possible_cpu(cpu) { 5459 for_each_possible_cpu(cpu) {
5460 unsigned long high; 5460 unsigned long high;
5461 high = zone->present_pages / percpu_pagelist_fraction; 5461 high = zone->present_pages / percpu_pagelist_fraction;
5462 setup_pagelist_highmark( 5462 setup_pagelist_highmark(
5463 per_cpu_ptr(zone->pageset, cpu), high); 5463 per_cpu_ptr(zone->pageset, cpu), high);
5464 } 5464 }
5465 } 5465 }
5466 return 0; 5466 return 0;
5467 } 5467 }
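To make the handler above concrete: with percpu_pagelist_fraction set to, say, 8 and a zone of 262144 present pages (1 GB with 4 KB pages), each CPU's pcp->high becomes 262144 / 8 = 32768 pages. The fraction and zone size here are arbitrary illustration values, not defaults.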
5468 5468
5469 int hashdist = HASHDIST_DEFAULT; 5469 int hashdist = HASHDIST_DEFAULT;
5470 5470
5471 #ifdef CONFIG_NUMA 5471 #ifdef CONFIG_NUMA
5472 static int __init set_hashdist(char *str) 5472 static int __init set_hashdist(char *str)
5473 { 5473 {
5474 if (!str) 5474 if (!str)
5475 return 0; 5475 return 0;
5476 hashdist = simple_strtoul(str, &str, 0); 5476 hashdist = simple_strtoul(str, &str, 0);
5477 return 1; 5477 return 1;
5478 } 5478 }
5479 __setup("hashdist=", set_hashdist); 5479 __setup("hashdist=", set_hashdist);
5480 #endif 5480 #endif
5481 5481
5482 /* 5482 /*
5483 * allocate a large system hash table from bootmem 5483 * allocate a large system hash table from bootmem
5484 * - it is assumed that the hash table must contain an exact power-of-2 5484 * - it is assumed that the hash table must contain an exact power-of-2
5485 * quantity of entries 5485 * quantity of entries
5486 * - limit is the number of hash buckets, not the total allocation size 5486 * - limit is the number of hash buckets, not the total allocation size
5487 */ 5487 */
5488 void *__init alloc_large_system_hash(const char *tablename, 5488 void *__init alloc_large_system_hash(const char *tablename,
5489 unsigned long bucketsize, 5489 unsigned long bucketsize,
5490 unsigned long numentries, 5490 unsigned long numentries,
5491 int scale, 5491 int scale,
5492 int flags, 5492 int flags,
5493 unsigned int *_hash_shift, 5493 unsigned int *_hash_shift,
5494 unsigned int *_hash_mask, 5494 unsigned int *_hash_mask,
5495 unsigned long low_limit, 5495 unsigned long low_limit,
5496 unsigned long high_limit) 5496 unsigned long high_limit)
5497 { 5497 {
5498 unsigned long long max = high_limit; 5498 unsigned long long max = high_limit;
5499 unsigned long log2qty, size; 5499 unsigned long log2qty, size;
5500 void *table = NULL; 5500 void *table = NULL;
5501 5501
5502 /* allow the kernel cmdline to have a say */ 5502 /* allow the kernel cmdline to have a say */
5503 if (!numentries) { 5503 if (!numentries) {
5504 /* round applicable memory size up to nearest megabyte */ 5504 /* round applicable memory size up to nearest megabyte */
5505 numentries = nr_kernel_pages; 5505 numentries = nr_kernel_pages;
5506 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 5506 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
5507 numentries >>= 20 - PAGE_SHIFT; 5507 numentries >>= 20 - PAGE_SHIFT;
5508 numentries <<= 20 - PAGE_SHIFT; 5508 numentries <<= 20 - PAGE_SHIFT;
5509 5509
5510 /* limit to 1 bucket per 2^scale bytes of low memory */ 5510 /* limit to 1 bucket per 2^scale bytes of low memory */
5511 if (scale > PAGE_SHIFT) 5511 if (scale > PAGE_SHIFT)
5512 numentries >>= (scale - PAGE_SHIFT); 5512 numentries >>= (scale - PAGE_SHIFT);
5513 else 5513 else
5514 numentries <<= (PAGE_SHIFT - scale); 5514 numentries <<= (PAGE_SHIFT - scale);
5515 5515
5516 /* Make sure we've got at least a 0-order allocation.. */ 5516 /* Make sure we've got at least a 0-order allocation.. */
5517 if (unlikely(flags & HASH_SMALL)) { 5517 if (unlikely(flags & HASH_SMALL)) {
5518 /* Makes no sense without HASH_EARLY */ 5518 /* Makes no sense without HASH_EARLY */
5519 WARN_ON(!(flags & HASH_EARLY)); 5519 WARN_ON(!(flags & HASH_EARLY));
5520 if (!(numentries >> *_hash_shift)) { 5520 if (!(numentries >> *_hash_shift)) {
5521 numentries = 1UL << *_hash_shift; 5521 numentries = 1UL << *_hash_shift;
5522 BUG_ON(!numentries); 5522 BUG_ON(!numentries);
5523 } 5523 }
5524 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5524 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5525 numentries = PAGE_SIZE / bucketsize; 5525 numentries = PAGE_SIZE / bucketsize;
5526 } 5526 }
5527 numentries = roundup_pow_of_two(numentries); 5527 numentries = roundup_pow_of_two(numentries);
5528 5528
5529 /* limit allocation size to 1/16 total memory by default */ 5529 /* limit allocation size to 1/16 total memory by default */
5530 if (max == 0) { 5530 if (max == 0) {
5531 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5531 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5532 do_div(max, bucketsize); 5532 do_div(max, bucketsize);
5533 } 5533 }
5534 max = min(max, 0x80000000ULL); 5534 max = min(max, 0x80000000ULL);
5535 5535
5536 if (numentries < low_limit) 5536 if (numentries < low_limit)
5537 numentries = low_limit; 5537 numentries = low_limit;
5538 if (numentries > max) 5538 if (numentries > max)
5539 numentries = max; 5539 numentries = max;
5540 5540
5541 log2qty = ilog2(numentries); 5541 log2qty = ilog2(numentries);
5542 5542
5543 do { 5543 do {
5544 size = bucketsize << log2qty; 5544 size = bucketsize << log2qty;
5545 if (flags & HASH_EARLY) 5545 if (flags & HASH_EARLY)
5546 table = alloc_bootmem_nopanic(size); 5546 table = alloc_bootmem_nopanic(size);
5547 else if (hashdist) 5547 else if (hashdist)
5548 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5548 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5549 else { 5549 else {
5550 /* 5550 /*
5551 * If bucketsize is not a power-of-two, we may free 5551 * If bucketsize is not a power-of-two, we may free
5552 * some pages at the end of the hash table, which 5552 * some pages at the end of the hash table, which
5553 * alloc_pages_exact() does automatically 5553 * alloc_pages_exact() does automatically
5554 */ 5554 */
5555 if (get_order(size) < MAX_ORDER) { 5555 if (get_order(size) < MAX_ORDER) {
5556 table = alloc_pages_exact(size, GFP_ATOMIC); 5556 table = alloc_pages_exact(size, GFP_ATOMIC);
5557 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5557 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5558 } 5558 }
5559 } 5559 }
5560 } while (!table && size > PAGE_SIZE && --log2qty); 5560 } while (!table && size > PAGE_SIZE && --log2qty);
5561 5561
5562 if (!table) 5562 if (!table)
5563 panic("Failed to allocate %s hash table\n", tablename); 5563 panic("Failed to allocate %s hash table\n", tablename);
5564 5564
5565 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5565 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5566 tablename, 5566 tablename,
5567 (1UL << log2qty), 5567 (1UL << log2qty),
5568 ilog2(size) - PAGE_SHIFT, 5568 ilog2(size) - PAGE_SHIFT,
5569 size); 5569 size);
5570 5570
5571 if (_hash_shift) 5571 if (_hash_shift)
5572 *_hash_shift = log2qty; 5572 *_hash_shift = log2qty;
5573 if (_hash_mask) 5573 if (_hash_mask)
5574 *_hash_mask = (1 << log2qty) - 1; 5574 *_hash_mask = (1 << log2qty) - 1;
5575 5575
5576 return table; 5576 return table;
5577 } 5577 }
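For context, a hypothetical caller sketch showing how the returned table and the _hash_shift/_hash_mask outputs are typically consumed; the names my_table, my_shift, my_mask and the chosen scale are illustrative assumptions, not taken from this file, and the snippet only builds inside a kernel tree:

#include <linux/bootmem.h>
#include <linux/hash.h>
#include <linux/list.h>

static struct hlist_head *my_table;
static unsigned int my_shift;
static unsigned long my_mask;

static void __init my_table_init(void)
{
        my_table = alloc_large_system_hash("my-cache",
                                           sizeof(struct hlist_head),
                                           0,           /* size table from nr_kernel_pages */
                                           14,          /* one bucket per 16 KB of low memory */
                                           0,           /* no HASH_EARLY / HASH_SMALL flags */
                                           &my_shift,
                                           &my_mask,
                                           0, 0);       /* no explicit low/high limits */
}

static inline struct hlist_head *my_bucket(unsigned long key)
{
        /* my_mask is (1 << my_shift) - 1, so either form selects a bucket */
        return &my_table[hash_long(key, my_shift) & my_mask];
}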
5578 5578
5579 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5579 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5580 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5580 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5581 unsigned long pfn) 5581 unsigned long pfn)
5582 { 5582 {
5583 #ifdef CONFIG_SPARSEMEM 5583 #ifdef CONFIG_SPARSEMEM
5584 return __pfn_to_section(pfn)->pageblock_flags; 5584 return __pfn_to_section(pfn)->pageblock_flags;
5585 #else 5585 #else
5586 return zone->pageblock_flags; 5586 return zone->pageblock_flags;
5587 #endif /* CONFIG_SPARSEMEM */ 5587 #endif /* CONFIG_SPARSEMEM */
5588 } 5588 }
5589 5589
5590 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 5590 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5591 { 5591 {
5592 #ifdef CONFIG_SPARSEMEM 5592 #ifdef CONFIG_SPARSEMEM
5593 pfn &= (PAGES_PER_SECTION-1); 5593 pfn &= (PAGES_PER_SECTION-1);
5594 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5594 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5595 #else 5595 #else
5596 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); 5596 pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
5597 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5597 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5598 #endif /* CONFIG_SPARSEMEM */ 5598 #endif /* CONFIG_SPARSEMEM */
5599 } 5599 }
5600 5600
5601 /** 5601 /**
5602 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages 5602 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
5603 * @page: The page within the block of interest 5603 * @page: The page within the block of interest
5604 * @start_bitidx: The first bit of interest to retrieve 5604 * @start_bitidx: The first bit of interest to retrieve
5605 * @end_bitidx: The last bit of interest 5605 * @end_bitidx: The last bit of interest
5606 * returns pageblock_bits flags 5606 * returns pageblock_bits flags
5607 */ 5607 */
5608 unsigned long get_pageblock_flags_group(struct page *page, 5608 unsigned long get_pageblock_flags_group(struct page *page,
5609 int start_bitidx, int end_bitidx) 5609 int start_bitidx, int end_bitidx)
5610 { 5610 {
5611 struct zone *zone; 5611 struct zone *zone;
5612 unsigned long *bitmap; 5612 unsigned long *bitmap;
5613 unsigned long pfn, bitidx; 5613 unsigned long pfn, bitidx;
5614 unsigned long flags = 0; 5614 unsigned long flags = 0;
5615 unsigned long value = 1; 5615 unsigned long value = 1;
5616 5616
5617 zone = page_zone(page); 5617 zone = page_zone(page);
5618 pfn = page_to_pfn(page); 5618 pfn = page_to_pfn(page);
5619 bitmap = get_pageblock_bitmap(zone, pfn); 5619 bitmap = get_pageblock_bitmap(zone, pfn);
5620 bitidx = pfn_to_bitidx(zone, pfn); 5620 bitidx = pfn_to_bitidx(zone, pfn);
5621 5621
5622 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5622 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5623 if (test_bit(bitidx + start_bitidx, bitmap)) 5623 if (test_bit(bitidx + start_bitidx, bitmap))
5624 flags |= value; 5624 flags |= value;
5625 5625
5626 return flags; 5626 return flags;
5627 } 5627 }
5628 5628
5629 /** 5629 /**
5630 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages 5630 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
5631 * @page: The page within the block of interest 5631 * @page: The page within the block of interest
5632 * @start_bitidx: The first bit of interest 5632 * @start_bitidx: The first bit of interest
5633 * @end_bitidx: The last bit of interest 5633 * @end_bitidx: The last bit of interest
5634 * @flags: The flags to set 5634 * @flags: The flags to set
5635 */ 5635 */
5636 void set_pageblock_flags_group(struct page *page, unsigned long flags, 5636 void set_pageblock_flags_group(struct page *page, unsigned long flags,
5637 int start_bitidx, int end_bitidx) 5637 int start_bitidx, int end_bitidx)
5638 { 5638 {
5639 struct zone *zone; 5639 struct zone *zone;
5640 unsigned long *bitmap; 5640 unsigned long *bitmap;
5641 unsigned long pfn, bitidx; 5641 unsigned long pfn, bitidx;
5642 unsigned long value = 1; 5642 unsigned long value = 1;
5643 5643
5644 zone = page_zone(page); 5644 zone = page_zone(page);
5645 pfn = page_to_pfn(page); 5645 pfn = page_to_pfn(page);
5646 bitmap = get_pageblock_bitmap(zone, pfn); 5646 bitmap = get_pageblock_bitmap(zone, pfn);
5647 bitidx = pfn_to_bitidx(zone, pfn); 5647 bitidx = pfn_to_bitidx(zone, pfn);
5648 VM_BUG_ON(pfn < zone->zone_start_pfn); 5648 VM_BUG_ON(pfn < zone->zone_start_pfn);
5649 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); 5649 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
5650 5650
5651 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5651 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5652 if (flags & value) 5652 if (flags & value)
5653 __set_bit(bitidx + start_bitidx, bitmap); 5653 __set_bit(bitidx + start_bitidx, bitmap);
5654 else 5654 else
5655 __clear_bit(bitidx + start_bitidx, bitmap); 5655 __clear_bit(bitidx + start_bitidx, bitmap);
5656 } 5656 }
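The two helpers above pack a small group of flag bits per pageblock into a shared bitmap. A standalone user-space sketch of the same get/set loops follows; the 4-bit-per-pageblock budget (NR_PB_BITS) is assumed purely for illustration:

#include <stdio.h>

#define NR_PB_BITS 4                    /* assumed bits reserved per pageblock */

static unsigned long bits;              /* stand-in for a pageblock bitmap word */

/* mirrors the loop in set_pageblock_flags_group() */
static void set_group(unsigned long bitidx, unsigned long flags, int start, int end)
{
        unsigned long value = 1;

        for (; start <= end; start++, value <<= 1) {
                if (flags & value)
                        bits |= 1UL << (bitidx + start);
                else
                        bits &= ~(1UL << (bitidx + start));
        }
}

/* mirrors the loop in get_pageblock_flags_group() */
static unsigned long get_group(unsigned long bitidx, int start, int end)
{
        unsigned long flags = 0, value = 1;

        for (; start <= end; start++, value <<= 1)
                if (bits & (1UL << (bitidx + start)))
                        flags |= value;
        return flags;
}

int main(void)
{
        set_group(2 * NR_PB_BITS, 5, 0, 2);     /* store 5 in bits 0..2 of the third group */
        printf("%lu\n", get_group(2 * NR_PB_BITS, 0, 2));       /* prints 5 */
        return 0;
}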
5657 5657
5658 /* 5658 /*
5659 * This function checks whether the pageblock includes unmovable pages or not. 5659 * This function checks whether the pageblock includes unmovable pages or not.
5660 * If @count is not zero, it is okay for the range to include up to @count unmovable pages. 5660 * If @count is not zero, it is okay for the range to include up to @count unmovable pages.
5661 * 5661 *
5662 * A PageLRU check without isolation or lru_lock could race, so a 5662 * A PageLRU check without isolation or lru_lock could race, so a
5663 * MIGRATE_MOVABLE block might include unmovable pages. This means you can't 5663 * MIGRATE_MOVABLE block might include unmovable pages. This means you can't
5664 * expect this function to be exact. 5664 * expect this function to be exact.
5665 */ 5665 */
5666 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 5666 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
5667 bool skip_hwpoisoned_pages) 5667 bool skip_hwpoisoned_pages)
5668 { 5668 {
5669 unsigned long pfn, iter, found; 5669 unsigned long pfn, iter, found;
5670 int mt; 5670 int mt;
5671 5671
5672 /* 5672 /*
5673 * To avoid noisy data, lru_add_drain_all() should be called first. 5673 * To avoid noisy data, lru_add_drain_all() should be called first.
5674 * A ZONE_MOVABLE zone never contains unmovable pages. 5674 * A ZONE_MOVABLE zone never contains unmovable pages.
5675 */ 5675 */
5676 if (zone_idx(zone) == ZONE_MOVABLE) 5676 if (zone_idx(zone) == ZONE_MOVABLE)
5677 return false; 5677 return false;
5678 mt = get_pageblock_migratetype(page); 5678 mt = get_pageblock_migratetype(page);
5679 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 5679 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5680 return false; 5680 return false;
5681 5681
5682 pfn = page_to_pfn(page); 5682 pfn = page_to_pfn(page);
5683 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5683 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5684 unsigned long check = pfn + iter; 5684 unsigned long check = pfn + iter;
5685 5685
5686 if (!pfn_valid_within(check)) 5686 if (!pfn_valid_within(check))
5687 continue; 5687 continue;
5688 5688
5689 page = pfn_to_page(check); 5689 page = pfn_to_page(check);
5690 /* 5690 /*
5691 * We can't use page_count without pinning the page 5691 * We can't use page_count without pinning the page
5692 * because another CPU can free the compound page. 5692 * because another CPU can free the compound page.
5693 * This check already skips compound tails of THP 5693 * This check already skips compound tails of THP
5694 * because their page->_count is zero at all times. 5694 * because their page->_count is zero at all times.
5695 */ 5695 */
5696 if (!atomic_read(&page->_count)) { 5696 if (!atomic_read(&page->_count)) {
5697 if (PageBuddy(page)) 5697 if (PageBuddy(page))
5698 iter += (1 << page_order(page)) - 1; 5698 iter += (1 << page_order(page)) - 1;
5699 continue; 5699 continue;
5700 } 5700 }
5701 5701
5702 /* 5702 /*
5703 * The HWPoisoned page may not be in the buddy system, and 5703 * The HWPoisoned page may not be in the buddy system, and
5704 * page_count() is not 0. 5704 * page_count() is not 0.
5705 */ 5705 */
5706 if (skip_hwpoisoned_pages && PageHWPoison(page)) 5706 if (skip_hwpoisoned_pages && PageHWPoison(page))
5707 continue; 5707 continue;
5708 5708
5709 if (!PageLRU(page)) 5709 if (!PageLRU(page))
5710 found++; 5710 found++;
5711 /* 5711 /*
5712 * If there are RECLAIMABLE pages, we need to check them. 5712 * If there are RECLAIMABLE pages, we need to check them.
5713 * But for now, memory offline itself doesn't call shrink_slab() 5713 * But for now, memory offline itself doesn't call shrink_slab()
5714 * and this still needs to be fixed. 5714 * and this still needs to be fixed.
5715 */ 5715 */
5716 /* 5716 /*
5717 * If the page is not RAM, page_count() should be 0, 5717 * If the page is not RAM, page_count() should be 0,
5718 * and we don't need further checks: this is a _used_, non-movable page. 5718 * and we don't need further checks: this is a _used_, non-movable page.
5719 * 5719 *
5720 * The problematic thing here is PG_reserved pages. PG_reserved 5720 * The problematic thing here is PG_reserved pages. PG_reserved
5721 * is set on both memory hole pages and _used_ kernel 5721 * is set on both memory hole pages and _used_ kernel
5722 * pages at boot. 5722 * pages at boot.
5723 */ 5723 */
5724 if (found > count) 5724 if (found > count)
5725 return true; 5725 return true;
5726 } 5726 }
5727 return false; 5727 return false;
5728 } 5728 }
5729 5729
5730 bool is_pageblock_removable_nolock(struct page *page) 5730 bool is_pageblock_removable_nolock(struct page *page)
5731 { 5731 {
5732 struct zone *zone; 5732 struct zone *zone;
5733 unsigned long pfn; 5733 unsigned long pfn;
5734 5734
5735 /* 5735 /*
5736 * We have to be careful here because we are iterating over memory 5736 * We have to be careful here because we are iterating over memory
5737 * sections which are not zone aware so we might end up outside of 5737 * sections which are not zone aware so we might end up outside of
5738 * the zone but still within the section. 5738 * the zone but still within the section.
5739 * We also have to take care with the node: if the node is offline 5739 * We also have to take care with the node: if the node is offline
5740 * its NODE_DATA will be NULL - see page_zone. 5740 * its NODE_DATA will be NULL - see page_zone.
5741 */ 5741 */
5742 if (!node_online(page_to_nid(page))) 5742 if (!node_online(page_to_nid(page)))
5743 return false; 5743 return false;
5744 5744
5745 zone = page_zone(page); 5745 zone = page_zone(page);
5746 pfn = page_to_pfn(page); 5746 pfn = page_to_pfn(page);
5747 if (zone->zone_start_pfn > pfn || 5747 if (zone->zone_start_pfn > pfn ||
5748 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5748 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5749 return false; 5749 return false;
5750 5750
5751 return !has_unmovable_pages(zone, page, 0, true); 5751 return !has_unmovable_pages(zone, page, 0, true);
5752 } 5752 }
5753 5753
5754 #ifdef CONFIG_CMA 5754 #ifdef CONFIG_CMA
5755 5755
5756 static unsigned long pfn_max_align_down(unsigned long pfn) 5756 static unsigned long pfn_max_align_down(unsigned long pfn)
5757 { 5757 {
5758 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 5758 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
5759 pageblock_nr_pages) - 1); 5759 pageblock_nr_pages) - 1);
5760 } 5760 }
5761 5761
5762 static unsigned long pfn_max_align_up(unsigned long pfn) 5762 static unsigned long pfn_max_align_up(unsigned long pfn)
5763 { 5763 {
5764 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 5764 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
5765 pageblock_nr_pages)); 5765 pageblock_nr_pages));
5766 } 5766 }
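For concreteness, assuming MAX_ORDER_NR_PAGES = 1024 and pageblock_nr_pages = 512 (illustrative values, not guaranteed by this file), the mask used by the two helpers above is 1024 - 1, so pfn_max_align_down(0x12345) yields 0x12000 and pfn_max_align_up(0x12345) yields 0x12400.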
5767 5767
5768 /* [start, end) must belong to a single zone. */ 5768 /* [start, end) must belong to a single zone. */
5769 static int __alloc_contig_migrate_range(struct compact_control *cc, 5769 static int __alloc_contig_migrate_range(struct compact_control *cc,
5770 unsigned long start, unsigned long end) 5770 unsigned long start, unsigned long end)
5771 { 5771 {
5772 /* This function is based on compact_zone() from compaction.c. */ 5772 /* This function is based on compact_zone() from compaction.c. */
5773 unsigned long nr_reclaimed; 5773 unsigned long nr_reclaimed;
5774 unsigned long pfn = start; 5774 unsigned long pfn = start;
5775 unsigned int tries = 0; 5775 unsigned int tries = 0;
5776 int ret = 0; 5776 int ret = 0;
5777 5777
5778 migrate_prep(); 5778 migrate_prep();
5779 5779
5780 while (pfn < end || !list_empty(&cc->migratepages)) { 5780 while (pfn < end || !list_empty(&cc->migratepages)) {
5781 if (fatal_signal_pending(current)) { 5781 if (fatal_signal_pending(current)) {
5782 ret = -EINTR; 5782 ret = -EINTR;
5783 break; 5783 break;
5784 } 5784 }
5785 5785
5786 if (list_empty(&cc->migratepages)) { 5786 if (list_empty(&cc->migratepages)) {
5787 cc->nr_migratepages = 0; 5787 cc->nr_migratepages = 0;
5788 pfn = isolate_migratepages_range(cc->zone, cc, 5788 pfn = isolate_migratepages_range(cc->zone, cc,
5789 pfn, end, true); 5789 pfn, end, true);
5790 if (!pfn) { 5790 if (!pfn) {
5791 ret = -EINTR; 5791 ret = -EINTR;
5792 break; 5792 break;
5793 } 5793 }
5794 tries = 0; 5794 tries = 0;
5795 } else if (++tries == 5) { 5795 } else if (++tries == 5) {
5796 ret = ret < 0 ? ret : -EBUSY; 5796 ret = ret < 0 ? ret : -EBUSY;
5797 break; 5797 break;
5798 } 5798 }
5799 5799
5800 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, 5800 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
5801 &cc->migratepages); 5801 &cc->migratepages);
5802 cc->nr_migratepages -= nr_reclaimed; 5802 cc->nr_migratepages -= nr_reclaimed;
5803 5803
5804 ret = migrate_pages(&cc->migratepages, 5804 ret = migrate_pages(&cc->migratepages,
5805 alloc_migrate_target, 5805 alloc_migrate_target,
5806 0, false, MIGRATE_SYNC, 5806 0, false, MIGRATE_SYNC,
5807 MR_CMA); 5807 MR_CMA);
5808 } 5808 }
5809 5809 if (ret < 0) {
5810 putback_movable_pages(&cc->migratepages); 5810 putback_movable_pages(&cc->migratepages);
5811 return ret > 0 ? 0 : ret; 5811 return ret;
5812 }
5813 return 0;
5812 } 5814 }
5813 5815
5814 /** 5816 /**
5815 * alloc_contig_range() -- tries to allocate given range of pages 5817 * alloc_contig_range() -- tries to allocate given range of pages
5816 * @start: start PFN to allocate 5818 * @start: start PFN to allocate
5817 * @end: one-past-the-last PFN to allocate 5819 * @end: one-past-the-last PFN to allocate
5818 * @migratetype: migratetype of the underlying pageblocks (either 5820 * @migratetype: migratetype of the underlying pageblocks (either
5819 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 5821 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
5820 * in range must have the same migratetype and it must 5822 * in range must have the same migratetype and it must
5821 * be either of the two. 5823 * be either of the two.
5822 * 5824 *
5823 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 5825 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
5824 * aligned, however it's the caller's responsibility to guarantee that 5826 * aligned, however it's the caller's responsibility to guarantee that
5825 * we are the only thread that changes migrate type of pageblocks the 5827 * we are the only thread that changes migrate type of pageblocks the
5826 * pages fall in. 5828 * pages fall in.
5827 * 5829 *
5828 * The PFN range must belong to a single zone. 5830 * The PFN range must belong to a single zone.
5829 * 5831 *
5830 * Returns zero on success or negative error code. On success all 5832 * Returns zero on success or negative error code. On success all
5831 * pages which PFN is in [start, end) are allocated for the caller and 5833 * pages which PFN is in [start, end) are allocated for the caller and
5832 * need to be freed with free_contig_range(). 5834 * need to be freed with free_contig_range().
5833 */ 5835 */
5834 int alloc_contig_range(unsigned long start, unsigned long end, 5836 int alloc_contig_range(unsigned long start, unsigned long end,
5835 unsigned migratetype) 5837 unsigned migratetype)
5836 { 5838 {
5837 unsigned long outer_start, outer_end; 5839 unsigned long outer_start, outer_end;
5838 int ret = 0, order; 5840 int ret = 0, order;
5839 5841
5840 struct compact_control cc = { 5842 struct compact_control cc = {
5841 .nr_migratepages = 0, 5843 .nr_migratepages = 0,
5842 .order = -1, 5844 .order = -1,
5843 .zone = page_zone(pfn_to_page(start)), 5845 .zone = page_zone(pfn_to_page(start)),
5844 .sync = true, 5846 .sync = true,
5845 .ignore_skip_hint = true, 5847 .ignore_skip_hint = true,
5846 }; 5848 };
5847 INIT_LIST_HEAD(&cc.migratepages); 5849 INIT_LIST_HEAD(&cc.migratepages);
5848 5850
5849 /* 5851 /*
5850 * What we do here is we mark all pageblocks in range as 5852 * What we do here is we mark all pageblocks in range as
5851 * MIGRATE_ISOLATE. Because pageblock and max order pages may 5853 * MIGRATE_ISOLATE. Because pageblock and max order pages may
5852 * have different sizes, and due to the way the page allocator 5854 * have different sizes, and due to the way the page allocator
5853 * works, we align the range to the bigger of the two sizes so 5855 * works, we align the range to the bigger of the two sizes so
5854 * that the page allocator won't try to merge buddies from 5856 * that the page allocator won't try to merge buddies from
5855 * different pageblocks and change MIGRATE_ISOLATE to some 5857 * different pageblocks and change MIGRATE_ISOLATE to some
5856 * other migration type. 5858 * other migration type.
5857 * 5859 *
5858 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 5860 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
5859 * migrate the pages from an unaligned range (i.e. the pages that 5861 * migrate the pages from an unaligned range (i.e. the pages that
5860 * we are interested in). This will put all the pages in the 5862 * we are interested in). This will put all the pages in the
5861 * range back to the page allocator as MIGRATE_ISOLATE. 5863 * range back to the page allocator as MIGRATE_ISOLATE.
5862 * 5864 *
5863 * When this is done, we take the pages in the range from the page 5865 * When this is done, we take the pages in the range from the page
5864 * allocator, removing them from the buddy system. This way the 5866 * allocator, removing them from the buddy system. This way the
5865 * page allocator will never consider using them. 5867 * page allocator will never consider using them.
5866 * 5868 *
5867 * This lets us mark the pageblocks back as 5869 * This lets us mark the pageblocks back as
5868 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 5870 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
5869 * aligned range but not in the unaligned, original range are 5871 * aligned range but not in the unaligned, original range are
5870 * put back to the page allocator so that the buddy allocator can use them. 5872 * put back to the page allocator so that the buddy allocator can use them.
5871 */ 5873 */
5872 5874
5873 ret = start_isolate_page_range(pfn_max_align_down(start), 5875 ret = start_isolate_page_range(pfn_max_align_down(start),
5874 pfn_max_align_up(end), migratetype, 5876 pfn_max_align_up(end), migratetype,
5875 false); 5877 false);
5876 if (ret) 5878 if (ret)
5877 return ret; 5879 return ret;
5878 5880
5879 ret = __alloc_contig_migrate_range(&cc, start, end); 5881 ret = __alloc_contig_migrate_range(&cc, start, end);
5880 if (ret) 5882 if (ret)
5881 goto done; 5883 goto done;
5882 5884
5883 /* 5885 /*
5884 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 5886 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
5885 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 5887 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
5886 * more, all pages in [start, end) are free in page allocator. 5888 * more, all pages in [start, end) are free in page allocator.
5887 * What we are going to do is to allocate all pages from 5889 * What we are going to do is to allocate all pages from
5888 * [start, end) (that is remove them from page allocator). 5890 * [start, end) (that is remove them from page allocator).
5889 * 5891 *
5890 * The only problem is that pages at the beginning and at the 5892 * The only problem is that pages at the beginning and at the
5891 * end of the interesting range may not be aligned with pages that 5893 * end of the interesting range may not be aligned with pages that
5892 * the page allocator holds, i.e. they can be part of higher order 5894 * the page allocator holds, i.e. they can be part of higher order
5893 * pages. Because of this, we reserve the bigger range and 5895 * pages. Because of this, we reserve the bigger range and
5894 * once this is done free the pages we are not interested in. 5896 * once this is done free the pages we are not interested in.
5895 * 5897 *
5896 * We don't have to hold zone->lock here because the pages are 5898 * We don't have to hold zone->lock here because the pages are
5897 * isolated thus they won't get removed from buddy. 5899 * isolated thus they won't get removed from buddy.
5898 */ 5900 */
5899 5901
5900 lru_add_drain_all(); 5902 lru_add_drain_all();
5901 drain_all_pages(); 5903 drain_all_pages();
5902 5904
5903 order = 0; 5905 order = 0;
5904 outer_start = start; 5906 outer_start = start;
5905 while (!PageBuddy(pfn_to_page(outer_start))) { 5907 while (!PageBuddy(pfn_to_page(outer_start))) {
5906 if (++order >= MAX_ORDER) { 5908 if (++order >= MAX_ORDER) {
5907 ret = -EBUSY; 5909 ret = -EBUSY;
5908 goto done; 5910 goto done;
5909 } 5911 }
5910 outer_start &= ~0UL << order; 5912 outer_start &= ~0UL << order;
5911 } 5913 }
5912 5914
5913 /* Make sure the range is really isolated. */ 5915 /* Make sure the range is really isolated. */
5914 if (test_pages_isolated(outer_start, end, false)) { 5916 if (test_pages_isolated(outer_start, end, false)) {
5915 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 5917 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
5916 outer_start, end); 5918 outer_start, end);
5917 ret = -EBUSY; 5919 ret = -EBUSY;
5918 goto done; 5920 goto done;
5919 } 5921 }
5920 5922
5921 5923
5922 /* Grab isolated pages from freelists. */ 5924 /* Grab isolated pages from freelists. */
5923 outer_end = isolate_freepages_range(&cc, outer_start, end); 5925 outer_end = isolate_freepages_range(&cc, outer_start, end);
5924 if (!outer_end) { 5926 if (!outer_end) {
5925 ret = -EBUSY; 5927 ret = -EBUSY;
5926 goto done; 5928 goto done;
5927 } 5929 }
5928 5930
5929 /* Free head and tail (if any) */ 5931 /* Free head and tail (if any) */
5930 if (start != outer_start) 5932 if (start != outer_start)
5931 free_contig_range(outer_start, start - outer_start); 5933 free_contig_range(outer_start, start - outer_start);
5932 if (end != outer_end) 5934 if (end != outer_end)
5933 free_contig_range(end, outer_end - end); 5935 free_contig_range(end, outer_end - end);
5934 5936
5935 done: 5937 done:
5936 undo_isolate_page_range(pfn_max_align_down(start), 5938 undo_isolate_page_range(pfn_max_align_down(start),
5937 pfn_max_align_up(end), migratetype); 5939 pfn_max_align_up(end), migratetype);
5938 return ret; 5940 return ret;
5939 } 5941 }
5940 5942
5941 void free_contig_range(unsigned long pfn, unsigned nr_pages) 5943 void free_contig_range(unsigned long pfn, unsigned nr_pages)
5942 { 5944 {
5943 unsigned int count = 0; 5945 unsigned int count = 0;
5944 5946
5945 for (; nr_pages--; pfn++) { 5947 for (; nr_pages--; pfn++) {
5946 struct page *page = pfn_to_page(pfn); 5948 struct page *page = pfn_to_page(pfn);
5947 5949
5948 count += page_count(page) != 1; 5950 count += page_count(page) != 1;
5949 __free_page(page); 5951 __free_page(page);
5950 } 5952 }
5951 WARN(count != 0, "%d pages are still in use!\n", count); 5953 WARN(count != 0, "%d pages are still in use!\n", count);
5952 } 5954 }
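A hypothetical caller sketch for the pair of interfaces above; base_pfn, the 16-page size and the use of MIGRATE_CMA are assumptions about how a CMA region might have been set up, not something this file establishes:

#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *grab_contig_block(unsigned long base_pfn)
{
        unsigned long nr_pages = 1UL << 4;      /* 16 pages, arbitrary example size */
        int ret;

        ret = alloc_contig_range(base_pfn, base_pfn + nr_pages, MIGRATE_CMA);
        if (ret)
                return NULL;                    /* range busy or migration failed */
        return pfn_to_page(base_pfn);
}

static void release_contig_block(unsigned long base_pfn)
{
        free_contig_range(base_pfn, 1UL << 4);
}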
5953 #endif 5955 #endif
5954 5956
5955 #ifdef CONFIG_MEMORY_HOTPLUG 5957 #ifdef CONFIG_MEMORY_HOTPLUG
5956 static int __meminit __zone_pcp_update(void *data) 5958 static int __meminit __zone_pcp_update(void *data)
5957 { 5959 {
5958 struct zone *zone = data; 5960 struct zone *zone = data;
5959 int cpu; 5961 int cpu;
5960 unsigned long batch = zone_batchsize(zone), flags; 5962 unsigned long batch = zone_batchsize(zone), flags;
5961 5963
5962 for_each_possible_cpu(cpu) { 5964 for_each_possible_cpu(cpu) {
5963 struct per_cpu_pageset *pset; 5965 struct per_cpu_pageset *pset;
5964 struct per_cpu_pages *pcp; 5966 struct per_cpu_pages *pcp;
5965 5967
5966 pset = per_cpu_ptr(zone->pageset, cpu); 5968 pset = per_cpu_ptr(zone->pageset, cpu);
5967 pcp = &pset->pcp; 5969 pcp = &pset->pcp;
5968 5970
5969 local_irq_save(flags); 5971 local_irq_save(flags);
5970 if (pcp->count > 0) 5972 if (pcp->count > 0)
5971 free_pcppages_bulk(zone, pcp->count, pcp); 5973 free_pcppages_bulk(zone, pcp->count, pcp);
5972 drain_zonestat(zone, pset); 5974 drain_zonestat(zone, pset);
5973 setup_pageset(pset, batch); 5975 setup_pageset(pset, batch);
5974 local_irq_restore(flags); 5976 local_irq_restore(flags);
5975 } 5977 }
5976 return 0; 5978 return 0;
5977 } 5979 }
5978 5980
5979 void __meminit zone_pcp_update(struct zone *zone) 5981 void __meminit zone_pcp_update(struct zone *zone)
5980 { 5982 {
5981 stop_machine(__zone_pcp_update, zone, NULL); 5983 stop_machine(__zone_pcp_update, zone, NULL);
5982 } 5984 }
5983 #endif 5985 #endif
5984 5986
5985 void zone_pcp_reset(struct zone *zone) 5987 void zone_pcp_reset(struct zone *zone)
5986 { 5988 {
5987 unsigned long flags; 5989 unsigned long flags;
5988 int cpu; 5990 int cpu;
5989 struct per_cpu_pageset *pset; 5991 struct per_cpu_pageset *pset;
5990 5992
5991 /* avoid races with drain_pages() */ 5993 /* avoid races with drain_pages() */
5992 local_irq_save(flags); 5994 local_irq_save(flags);
5993 if (zone->pageset != &boot_pageset) { 5995 if (zone->pageset != &boot_pageset) {
5994 for_each_online_cpu(cpu) { 5996 for_each_online_cpu(cpu) {
5995 pset = per_cpu_ptr(zone->pageset, cpu); 5997 pset = per_cpu_ptr(zone->pageset, cpu);
5996 drain_zonestat(zone, pset); 5998 drain_zonestat(zone, pset);
5997 } 5999 }
5998 free_percpu(zone->pageset); 6000 free_percpu(zone->pageset);
5999 zone->pageset = &boot_pageset; 6001 zone->pageset = &boot_pageset;
6000 } 6002 }
6001 local_irq_restore(flags); 6003 local_irq_restore(flags);
6002 } 6004 }
6003 6005
6004 #ifdef CONFIG_MEMORY_HOTREMOVE 6006 #ifdef CONFIG_MEMORY_HOTREMOVE
6005 /* 6007 /*
6006 * All pages in the range must be isolated before calling this. 6008 * All pages in the range must be isolated before calling this.
6007 */ 6009 */
6008 void 6010 void
6009 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 6011 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6010 { 6012 {
6011 struct page *page; 6013 struct page *page;
6012 struct zone *zone; 6014 struct zone *zone;
6013 int order, i; 6015 int order, i;
6014 unsigned long pfn; 6016 unsigned long pfn;
6015 unsigned long flags; 6017 unsigned long flags;
6016 /* find the first valid pfn */ 6018 /* find the first valid pfn */
6017 for (pfn = start_pfn; pfn < end_pfn; pfn++) 6019 for (pfn = start_pfn; pfn < end_pfn; pfn++)
6018 if (pfn_valid(pfn)) 6020 if (pfn_valid(pfn))
6019 break; 6021 break;
6020 if (pfn == end_pfn) 6022 if (pfn == end_pfn)
6021 return; 6023 return;
6022 zone = page_zone(pfn_to_page(pfn)); 6024 zone = page_zone(pfn_to_page(pfn));
6023 spin_lock_irqsave(&zone->lock, flags); 6025 spin_lock_irqsave(&zone->lock, flags);
6024 pfn = start_pfn; 6026 pfn = start_pfn;
6025 while (pfn < end_pfn) { 6027 while (pfn < end_pfn) {
6026 if (!pfn_valid(pfn)) { 6028 if (!pfn_valid(pfn)) {
6027 pfn++; 6029 pfn++;
6028 continue; 6030 continue;
6029 } 6031 }
6030 page = pfn_to_page(pfn); 6032 page = pfn_to_page(pfn);
6031 /* 6033 /*
6032 * The HWPoisoned page may not be in the buddy system, and 6034 * The HWPoisoned page may not be in the buddy system, and
6033 * page_count() is not 0. 6035 * page_count() is not 0.
6034 */ 6036 */
6035 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { 6037 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
6036 pfn++; 6038 pfn++;
6037 SetPageReserved(page); 6039 SetPageReserved(page);
6038 continue; 6040 continue;
6039 } 6041 }
6040 6042
6041 BUG_ON(page_count(page)); 6043 BUG_ON(page_count(page));
6042 BUG_ON(!PageBuddy(page)); 6044 BUG_ON(!PageBuddy(page));
6043 order = page_order(page); 6045 order = page_order(page);
6044 #ifdef CONFIG_DEBUG_VM 6046 #ifdef CONFIG_DEBUG_VM
6045 printk(KERN_INFO "remove from free list %lx %d %lx\n", 6047 printk(KERN_INFO "remove from free list %lx %d %lx\n",
6046 pfn, 1 << order, end_pfn); 6048 pfn, 1 << order, end_pfn);
6047 #endif 6049 #endif
6048 list_del(&page->lru); 6050 list_del(&page->lru);
6049 rmv_page_order(page); 6051 rmv_page_order(page);
6050 zone->free_area[order].nr_free--; 6052 zone->free_area[order].nr_free--;
6051 for (i = 0; i < (1 << order); i++) 6053 for (i = 0; i < (1 << order); i++)
6052 SetPageReserved((page+i)); 6054 SetPageReserved((page+i));
6053 pfn += (1 << order); 6055 pfn += (1 << order);
6054 } 6056 }
6055 spin_unlock_irqrestore(&zone->lock, flags); 6057 spin_unlock_irqrestore(&zone->lock, flags);
6056 } 6058 }
6057 #endif 6059 #endif
6058 6060
6059 #ifdef CONFIG_MEMORY_FAILURE 6061 #ifdef CONFIG_MEMORY_FAILURE
6060 bool is_free_buddy_page(struct page *page) 6062 bool is_free_buddy_page(struct page *page)
6061 { 6063 {
6062 struct zone *zone = page_zone(page); 6064 struct zone *zone = page_zone(page);
6063 unsigned long pfn = page_to_pfn(page); 6065 unsigned long pfn = page_to_pfn(page);
6064 unsigned long flags; 6066 unsigned long flags;
6065 int order; 6067 int order;
6066 6068
6067 spin_lock_irqsave(&zone->lock, flags); 6069 spin_lock_irqsave(&zone->lock, flags);
6068 for (order = 0; order < MAX_ORDER; order++) { 6070 for (order = 0; order < MAX_ORDER; order++) {
6069 struct page *page_head = page - (pfn & ((1 << order) - 1)); 6071 struct page *page_head = page - (pfn & ((1 << order) - 1));
6070 6072
6071 if (PageBuddy(page_head) && page_order(page_head) >= order) 6073 if (PageBuddy(page_head) && page_order(page_head) >= order)
6072 break; 6074 break;
6073 } 6075 }
6074 spin_unlock_irqrestore(&zone->lock, flags); 6076 spin_unlock_irqrestore(&zone->lock, flags);
6075 6077
6076 return order < MAX_ORDER; 6078 return order < MAX_ORDER;
6077 } 6079 }
6078 #endif 6080 #endif
6079 6081
6080 static const struct trace_print_flags pageflag_names[] = { 6082 static const struct trace_print_flags pageflag_names[] = {
6081 {1UL << PG_locked, "locked" }, 6083 {1UL << PG_locked, "locked" },
6082 {1UL << PG_error, "error" }, 6084 {1UL << PG_error, "error" },
6083 {1UL << PG_referenced, "referenced" }, 6085 {1UL << PG_referenced, "referenced" },
6084 {1UL << PG_uptodate, "uptodate" }, 6086 {1UL << PG_uptodate, "uptodate" },
6085 {1UL << PG_dirty, "dirty" }, 6087 {1UL << PG_dirty, "dirty" },
6086 {1UL << PG_lru, "lru" }, 6088 {1UL << PG_lru, "lru" },
6087 {1UL << PG_active, "active" }, 6089 {1UL << PG_active, "active" },
6088 {1UL << PG_slab, "slab" }, 6090 {1UL << PG_slab, "slab" },
6089 {1UL << PG_owner_priv_1, "owner_priv_1" }, 6091 {1UL << PG_owner_priv_1, "owner_priv_1" },
6090 {1UL << PG_arch_1, "arch_1" }, 6092 {1UL << PG_arch_1, "arch_1" },
6091 {1UL << PG_reserved, "reserved" }, 6093 {1UL << PG_reserved, "reserved" },
6092 {1UL << PG_private, "private" }, 6094 {1UL << PG_private, "private" },
6093 {1UL << PG_private_2, "private_2" }, 6095 {1UL << PG_private_2, "private_2" },
6094 {1UL << PG_writeback, "writeback" }, 6096 {1UL << PG_writeback, "writeback" },
6095 #ifdef CONFIG_PAGEFLAGS_EXTENDED 6097 #ifdef CONFIG_PAGEFLAGS_EXTENDED
6096 {1UL << PG_head, "head" }, 6098 {1UL << PG_head, "head" },
6097 {1UL << PG_tail, "tail" }, 6099 {1UL << PG_tail, "tail" },
6098 #else 6100 #else
6099 {1UL << PG_compound, "compound" }, 6101 {1UL << PG_compound, "compound" },
6100 #endif 6102 #endif
6101 {1UL << PG_swapcache, "swapcache" }, 6103 {1UL << PG_swapcache, "swapcache" },
6102 {1UL << PG_mappedtodisk, "mappedtodisk" }, 6104 {1UL << PG_mappedtodisk, "mappedtodisk" },
6103 {1UL << PG_reclaim, "reclaim" }, 6105 {1UL << PG_reclaim, "reclaim" },
6104 {1UL << PG_swapbacked, "swapbacked" }, 6106 {1UL << PG_swapbacked, "swapbacked" },
6105 {1UL << PG_unevictable, "unevictable" }, 6107 {1UL << PG_unevictable, "unevictable" },
6106 #ifdef CONFIG_MMU 6108 #ifdef CONFIG_MMU
6107 {1UL << PG_mlocked, "mlocked" }, 6109 {1UL << PG_mlocked, "mlocked" },
6108 #endif 6110 #endif
6109 #ifdef CONFIG_ARCH_USES_PG_UNCACHED 6111 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
6110 {1UL << PG_uncached, "uncached" }, 6112 {1UL << PG_uncached, "uncached" },
6111 #endif 6113 #endif
6112 #ifdef CONFIG_MEMORY_FAILURE 6114 #ifdef CONFIG_MEMORY_FAILURE
6113 {1UL << PG_hwpoison, "hwpoison" }, 6115 {1UL << PG_hwpoison, "hwpoison" },
6114 #endif 6116 #endif
6115 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6117 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6116 {1UL << PG_compound_lock, "compound_lock" }, 6118 {1UL << PG_compound_lock, "compound_lock" },
6117 #endif 6119 #endif
6118 }; 6120 };
6119 6121
6120 static void dump_page_flags(unsigned long flags) 6122 static void dump_page_flags(unsigned long flags)
6121 { 6123 {
6122 const char *delim = ""; 6124 const char *delim = "";
6123 unsigned long mask; 6125 unsigned long mask;
6124 int i; 6126 int i;
6125 6127
6126 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 6128 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6127 6129
6128 printk(KERN_ALERT "page flags: %#lx(", flags); 6130 printk(KERN_ALERT "page flags: %#lx(", flags);
6129 6131
6130 /* remove zone id */ 6132 /* remove zone id */
6131 flags &= (1UL << NR_PAGEFLAGS) - 1; 6133 flags &= (1UL << NR_PAGEFLAGS) - 1;
6132 6134
6133 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { 6135 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6134 6136
6135 mask = pageflag_names[i].mask; 6137 mask = pageflag_names[i].mask;
6136 if ((flags & mask) != mask) 6138 if ((flags & mask) != mask)
6137 continue; 6139 continue;
6138 6140
6139 flags &= ~mask; 6141 flags &= ~mask;
6140 printk("%s%s", delim, pageflag_names[i].name); 6142 printk("%s%s", delim, pageflag_names[i].name);
6141 delim = "|"; 6143 delim = "|";
6142 } 6144 }
6143 6145
6144 /* check for left over flags */ 6146 /* check for left over flags */
6145 if (flags) 6147 if (flags)
6146 printk("%s%#lx", delim, flags); 6148 printk("%s%#lx", delim, flags);
6147 6149
6148 printk(")\n"); 6150 printk(")\n");
6149 } 6151 }
6150 6152
6151 void dump_page(struct page *page) 6153 void dump_page(struct page *page)
6152 { 6154 {
6153 printk(KERN_ALERT 6155 printk(KERN_ALERT
6154 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6156 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6155 page, atomic_read(&page->_count), page_mapcount(page), 6157 page, atomic_read(&page->_count), page_mapcount(page),
6156 page->mapping, page->index); 6158 page->mapping, page->index);
6157 dump_page_flags(page->flags); 6159 dump_page_flags(page->flags);
6158 mem_cgroup_print_bad_page(page); 6160 mem_cgroup_print_bad_page(page);
6159 } 6161 }
6160 6162