Commit 907aed48f65efeecf91575397e3d79335d93a466

Authored by Mel Gorman
Committed by Linus Torvalds
1 parent b37f1dd0f5

mm: allow PF_MEMALLOC from softirq context

This is needed to allow network softirq packet processing to make use of
PF_MEMALLOC.

Currently softirq context cannot use PF_MEMALLOC because it is not
associated with a task and therefore has no task flags to manipulate -
thus the gfp-to-alloc-flags mapping ignores task flags when in
interrupt (hard or soft) context.

Allowing softirqs to make use of PF_MEMALLOC therefore requires some
trickery.  This patch borrows the task flags from whatever process
happens to be preempted by the softirq.  It then modifies the
gfp-to-alloc-flags mapping so that task flags are no longer excluded in
softirq context, and modifies the softirq code to save, clear and
restore the PF_MEMALLOC flag.
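
As an illustration of the trick, here is a minimal sketch of the
save/clear/restore pattern just described (a sketch based on this
description, not the literal hunks of the commit; the helper and its
placement are illustrative):

    static inline void tsk_restore_flags(struct task_struct *task,
                                         unsigned long orig_flags,
                                         unsigned long flags)
    {
            task->flags &= ~flags;             /* drop the bits being restored */
            task->flags |= orig_flags & flags; /* put back their saved values */
    }

    asmlinkage void __do_softirq(void)
    {
            unsigned long old_flags = current->flags;

            /* Clear PF_MEMALLOC so the preempted task's flag cannot leak in. */
            current->flags &= ~PF_MEMALLOC;

            /* ... run the pending softirq handlers ... */

            /* Restore only PF_MEMALLOC so nothing leaks back out. */
            tsk_restore_flags(current, old_flags, PF_MEMALLOC);
    }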

The save and clear ensure that the preempted task's PF_MEMALLOC flag
does not leak into the softirq.  The restore ensures that a softirq's
PF_MEMALLOC flag cannot leak back into the preempted process.  This
should be safe for the following reasons:

Softirqs can run on multiple CPUs, but the same task should not be
        executing the same softirq code.  Neither should a softirq
        handler be preempted by any other softirq handler, so the
        flags should not leak to an unrelated softirq.

Softirqs re-enable hardware interrupts in __do_softirq(), so they can
        be preempted by hardware interrupts, and PF_MEMALLOC is then
        inherited by the hard IRQ.  However, this is no different from
        a process in reclaim being preempted by a hardirq.  While
        PF_MEMALLOC is set, gfp_to_alloc_flags() distinguishes between
        hard and soft IRQ context and avoids giving a hardirq the
        ALLOC_NO_WATERMARKS flag (see the sketch after this list).

If the softirq is deferred to ksoftirqd then its flags may be used
        instead of a normal task's, but as the softirq cannot be
        preempted, the PF_MEMALLOC flag does not leak to other code by
        accident.
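
As a companion to the second point above, a minimal sketch of the
hard-vs-soft IRQ distinction (the helper name is hypothetical and for
illustration only; the real check is part of gfp_to_alloc_flags() in
mm/page_alloc.c):

    static inline int memalloc_no_watermarks(void)
    {
            if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
                    return ALLOC_NO_WATERMARKS; /* softirq that borrowed PF_MEMALLOC */
            if (!in_interrupt() && (current->flags & PF_MEMALLOC))
                    return ALLOC_NO_WATERMARKS; /* process context under reclaim */
            return 0;                           /* otherwise, respect the watermarks */
    }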

[davem@davemloft.net: Document why PF_MEMALLOC is safe]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: David Miller <davem@davemloft.net>
Cc: Neil Brown <neilb@suse.de>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 21 additions and 1 deletion

include/linux/sched.h
1 #ifndef _LINUX_SCHED_H 1 #ifndef _LINUX_SCHED_H
2 #define _LINUX_SCHED_H 2 #define _LINUX_SCHED_H
3 3
4 /* 4 /*
5 * cloning flags: 5 * cloning flags:
6 */ 6 */
7 #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ 7 #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
8 #define CLONE_VM 0x00000100 /* set if VM shared between processes */ 8 #define CLONE_VM 0x00000100 /* set if VM shared between processes */
9 #define CLONE_FS 0x00000200 /* set if fs info shared between processes */ 9 #define CLONE_FS 0x00000200 /* set if fs info shared between processes */
10 #define CLONE_FILES 0x00000400 /* set if open files shared between processes */ 10 #define CLONE_FILES 0x00000400 /* set if open files shared between processes */
11 #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ 11 #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
12 #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ 12 #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
13 #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ 13 #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
14 #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ 14 #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
15 #define CLONE_THREAD 0x00010000 /* Same thread group? */ 15 #define CLONE_THREAD 0x00010000 /* Same thread group? */
16 #define CLONE_NEWNS 0x00020000 /* New namespace group? */ 16 #define CLONE_NEWNS 0x00020000 /* New namespace group? */
17 #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ 17 #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
18 #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ 18 #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
19 #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ 19 #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
20 #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ 20 #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
21 #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ 21 #define CLONE_DETACHED 0x00400000 /* Unused, ignored */
22 #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ 22 #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
23 #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ 23 #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
24 /* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) 24 /* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
25 and is now available for re-use. */ 25 and is now available for re-use. */
26 #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ 26 #define CLONE_NEWUTS 0x04000000 /* New utsname group? */
27 #define CLONE_NEWIPC 0x08000000 /* New ipcs */ 27 #define CLONE_NEWIPC 0x08000000 /* New ipcs */
28 #define CLONE_NEWUSER 0x10000000 /* New user namespace */ 28 #define CLONE_NEWUSER 0x10000000 /* New user namespace */
29 #define CLONE_NEWPID 0x20000000 /* New pid namespace */ 29 #define CLONE_NEWPID 0x20000000 /* New pid namespace */
30 #define CLONE_NEWNET 0x40000000 /* New network namespace */ 30 #define CLONE_NEWNET 0x40000000 /* New network namespace */
31 #define CLONE_IO 0x80000000 /* Clone io context */ 31 #define CLONE_IO 0x80000000 /* Clone io context */
32 32
33 /* 33 /*
34 * Scheduling policies 34 * Scheduling policies
35 */ 35 */
36 #define SCHED_NORMAL 0 36 #define SCHED_NORMAL 0
37 #define SCHED_FIFO 1 37 #define SCHED_FIFO 1
38 #define SCHED_RR 2 38 #define SCHED_RR 2
39 #define SCHED_BATCH 3 39 #define SCHED_BATCH 3
40 /* SCHED_ISO: reserved but not implemented yet */ 40 /* SCHED_ISO: reserved but not implemented yet */
41 #define SCHED_IDLE 5 41 #define SCHED_IDLE 5
42 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ 42 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
43 #define SCHED_RESET_ON_FORK 0x40000000 43 #define SCHED_RESET_ON_FORK 0x40000000
44 44
45 #ifdef __KERNEL__ 45 #ifdef __KERNEL__
46 46
47 struct sched_param { 47 struct sched_param {
48 int sched_priority; 48 int sched_priority;
49 }; 49 };
50 50
51 #include <asm/param.h> /* for HZ */ 51 #include <asm/param.h> /* for HZ */
52 52
53 #include <linux/capability.h> 53 #include <linux/capability.h>
54 #include <linux/threads.h> 54 #include <linux/threads.h>
55 #include <linux/kernel.h> 55 #include <linux/kernel.h>
56 #include <linux/types.h> 56 #include <linux/types.h>
57 #include <linux/timex.h> 57 #include <linux/timex.h>
58 #include <linux/jiffies.h> 58 #include <linux/jiffies.h>
59 #include <linux/rbtree.h> 59 #include <linux/rbtree.h>
60 #include <linux/thread_info.h> 60 #include <linux/thread_info.h>
61 #include <linux/cpumask.h> 61 #include <linux/cpumask.h>
62 #include <linux/errno.h> 62 #include <linux/errno.h>
63 #include <linux/nodemask.h> 63 #include <linux/nodemask.h>
64 #include <linux/mm_types.h> 64 #include <linux/mm_types.h>
65 65
66 #include <asm/page.h> 66 #include <asm/page.h>
67 #include <asm/ptrace.h> 67 #include <asm/ptrace.h>
68 #include <asm/cputime.h> 68 #include <asm/cputime.h>
69 69
70 #include <linux/smp.h> 70 #include <linux/smp.h>
71 #include <linux/sem.h> 71 #include <linux/sem.h>
72 #include <linux/signal.h> 72 #include <linux/signal.h>
73 #include <linux/compiler.h> 73 #include <linux/compiler.h>
74 #include <linux/completion.h> 74 #include <linux/completion.h>
75 #include <linux/pid.h> 75 #include <linux/pid.h>
76 #include <linux/percpu.h> 76 #include <linux/percpu.h>
77 #include <linux/topology.h> 77 #include <linux/topology.h>
78 #include <linux/proportions.h> 78 #include <linux/proportions.h>
79 #include <linux/seccomp.h> 79 #include <linux/seccomp.h>
80 #include <linux/rcupdate.h> 80 #include <linux/rcupdate.h>
81 #include <linux/rculist.h> 81 #include <linux/rculist.h>
82 #include <linux/rtmutex.h> 82 #include <linux/rtmutex.h>
83 83
84 #include <linux/time.h> 84 #include <linux/time.h>
85 #include <linux/param.h> 85 #include <linux/param.h>
86 #include <linux/resource.h> 86 #include <linux/resource.h>
87 #include <linux/timer.h> 87 #include <linux/timer.h>
88 #include <linux/hrtimer.h> 88 #include <linux/hrtimer.h>
89 #include <linux/task_io_accounting.h> 89 #include <linux/task_io_accounting.h>
90 #include <linux/latencytop.h> 90 #include <linux/latencytop.h>
91 #include <linux/cred.h> 91 #include <linux/cred.h>
92 #include <linux/llist.h> 92 #include <linux/llist.h>
93 #include <linux/uidgid.h> 93 #include <linux/uidgid.h>
94 94
95 #include <asm/processor.h> 95 #include <asm/processor.h>
96 96
97 struct exec_domain; 97 struct exec_domain;
98 struct futex_pi_state; 98 struct futex_pi_state;
99 struct robust_list_head; 99 struct robust_list_head;
100 struct bio_list; 100 struct bio_list;
101 struct fs_struct; 101 struct fs_struct;
102 struct perf_event_context; 102 struct perf_event_context;
103 struct blk_plug; 103 struct blk_plug;
104 104
105 /* 105 /*
106 * List of flags we want to share for kernel threads, 106 * List of flags we want to share for kernel threads,
107 * if only because they are not used by them anyway. 107 * if only because they are not used by them anyway.
108 */ 108 */
109 #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) 109 #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
110 110
111 /* 111 /*
112 * These are the constant used to fake the fixed-point load-average 112 * These are the constant used to fake the fixed-point load-average
113 * counting. Some notes: 113 * counting. Some notes:
114 * - 11 bit fractions expand to 22 bits by the multiplies: this gives 114 * - 11 bit fractions expand to 22 bits by the multiplies: this gives
115 * a load-average precision of 10 bits integer + 11 bits fractional 115 * a load-average precision of 10 bits integer + 11 bits fractional
116 * - if you want to count load-averages more often, you need more 116 * - if you want to count load-averages more often, you need more
117 * precision, or rounding will get you. With 2-second counting freq, 117 * precision, or rounding will get you. With 2-second counting freq,
118 * the EXP_n values would be 1981, 2034 and 2043 if still using only 118 * the EXP_n values would be 1981, 2034 and 2043 if still using only
119 * 11 bit fractions. 119 * 11 bit fractions.
120 */ 120 */
121 extern unsigned long avenrun[]; /* Load averages */ 121 extern unsigned long avenrun[]; /* Load averages */
122 extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); 122 extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
123 123
124 #define FSHIFT 11 /* nr of bits of precision */ 124 #define FSHIFT 11 /* nr of bits of precision */
125 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ 125 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
126 #define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */ 126 #define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */
127 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ 127 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
128 #define EXP_5 2014 /* 1/exp(5sec/5min) */ 128 #define EXP_5 2014 /* 1/exp(5sec/5min) */
129 #define EXP_15 2037 /* 1/exp(5sec/15min) */ 129 #define EXP_15 2037 /* 1/exp(5sec/15min) */
130 130
131 #define CALC_LOAD(load,exp,n) \ 131 #define CALC_LOAD(load,exp,n) \
132 load *= exp; \ 132 load *= exp; \
133 load += n*(FIXED_1-exp); \ 133 load += n*(FIXED_1-exp); \
134 load >>= FSHIFT; 134 load >>= FSHIFT;
135 135
136 extern unsigned long total_forks; 136 extern unsigned long total_forks;
137 extern int nr_threads; 137 extern int nr_threads;
138 DECLARE_PER_CPU(unsigned long, process_counts); 138 DECLARE_PER_CPU(unsigned long, process_counts);
139 extern int nr_processes(void); 139 extern int nr_processes(void);
140 extern unsigned long nr_running(void); 140 extern unsigned long nr_running(void);
141 extern unsigned long nr_uninterruptible(void); 141 extern unsigned long nr_uninterruptible(void);
142 extern unsigned long nr_iowait(void); 142 extern unsigned long nr_iowait(void);
143 extern unsigned long nr_iowait_cpu(int cpu); 143 extern unsigned long nr_iowait_cpu(int cpu);
144 extern unsigned long this_cpu_load(void); 144 extern unsigned long this_cpu_load(void);
145 145
146 146
147 extern void calc_global_load(unsigned long ticks); 147 extern void calc_global_load(unsigned long ticks);
148 extern void update_cpu_load_nohz(void); 148 extern void update_cpu_load_nohz(void);
149 149
150 extern unsigned long get_parent_ip(unsigned long addr); 150 extern unsigned long get_parent_ip(unsigned long addr);
151 151
152 struct seq_file; 152 struct seq_file;
153 struct cfs_rq; 153 struct cfs_rq;
154 struct task_group; 154 struct task_group;
155 #ifdef CONFIG_SCHED_DEBUG 155 #ifdef CONFIG_SCHED_DEBUG
156 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); 156 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
157 extern void proc_sched_set_task(struct task_struct *p); 157 extern void proc_sched_set_task(struct task_struct *p);
158 extern void 158 extern void
159 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); 159 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
160 #else 160 #else
161 static inline void 161 static inline void
162 proc_sched_show_task(struct task_struct *p, struct seq_file *m) 162 proc_sched_show_task(struct task_struct *p, struct seq_file *m)
163 { 163 {
164 } 164 }
165 static inline void proc_sched_set_task(struct task_struct *p) 165 static inline void proc_sched_set_task(struct task_struct *p)
166 { 166 {
167 } 167 }
168 static inline void 168 static inline void
169 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 169 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
170 { 170 {
171 } 171 }
172 #endif 172 #endif
173 173
174 /* 174 /*
175 * Task state bitmask. NOTE! These bits are also 175 * Task state bitmask. NOTE! These bits are also
176 * encoded in fs/proc/array.c: get_task_state(). 176 * encoded in fs/proc/array.c: get_task_state().
177 * 177 *
178 * We have two separate sets of flags: task->state 178 * We have two separate sets of flags: task->state
179 * is about runnability, while task->exit_state are 179 * is about runnability, while task->exit_state are
180 * about the task exiting. Confusing, but this way 180 * about the task exiting. Confusing, but this way
181 * modifying one set can't modify the other one by 181 * modifying one set can't modify the other one by
182 * mistake. 182 * mistake.
183 */ 183 */
184 #define TASK_RUNNING 0 184 #define TASK_RUNNING 0
185 #define TASK_INTERRUPTIBLE 1 185 #define TASK_INTERRUPTIBLE 1
186 #define TASK_UNINTERRUPTIBLE 2 186 #define TASK_UNINTERRUPTIBLE 2
187 #define __TASK_STOPPED 4 187 #define __TASK_STOPPED 4
188 #define __TASK_TRACED 8 188 #define __TASK_TRACED 8
189 /* in tsk->exit_state */ 189 /* in tsk->exit_state */
190 #define EXIT_ZOMBIE 16 190 #define EXIT_ZOMBIE 16
191 #define EXIT_DEAD 32 191 #define EXIT_DEAD 32
192 /* in tsk->state again */ 192 /* in tsk->state again */
193 #define TASK_DEAD 64 193 #define TASK_DEAD 64
194 #define TASK_WAKEKILL 128 194 #define TASK_WAKEKILL 128
195 #define TASK_WAKING 256 195 #define TASK_WAKING 256
196 #define TASK_STATE_MAX 512 196 #define TASK_STATE_MAX 512
197 197
198 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW" 198 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW"
199 199
200 extern char ___assert_task_state[1 - 2*!!( 200 extern char ___assert_task_state[1 - 2*!!(
201 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; 201 sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
202 202
203 /* Convenience macros for the sake of set_task_state */ 203 /* Convenience macros for the sake of set_task_state */
204 #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) 204 #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
205 #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) 205 #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED)
206 #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) 206 #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED)
207 207
208 /* Convenience macros for the sake of wake_up */ 208 /* Convenience macros for the sake of wake_up */
209 #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) 209 #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
210 #define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) 210 #define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
211 211
212 /* get_task_state() */ 212 /* get_task_state() */
213 #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ 213 #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
214 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ 214 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
215 __TASK_TRACED) 215 __TASK_TRACED)
216 216
217 #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) 217 #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
218 #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) 218 #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
219 #define task_is_dead(task) ((task)->exit_state != 0) 219 #define task_is_dead(task) ((task)->exit_state != 0)
220 #define task_is_stopped_or_traced(task) \ 220 #define task_is_stopped_or_traced(task) \
221 ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) 221 ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
222 #define task_contributes_to_load(task) \ 222 #define task_contributes_to_load(task) \
223 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ 223 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
224 (task->flags & PF_FROZEN) == 0) 224 (task->flags & PF_FROZEN) == 0)
225 225
226 #define __set_task_state(tsk, state_value) \ 226 #define __set_task_state(tsk, state_value) \
227 do { (tsk)->state = (state_value); } while (0) 227 do { (tsk)->state = (state_value); } while (0)
228 #define set_task_state(tsk, state_value) \ 228 #define set_task_state(tsk, state_value) \
229 set_mb((tsk)->state, (state_value)) 229 set_mb((tsk)->state, (state_value))
230 230
231 /* 231 /*
232 * set_current_state() includes a barrier so that the write of current->state 232 * set_current_state() includes a barrier so that the write of current->state
233 * is correctly serialised wrt the caller's subsequent test of whether to 233 * is correctly serialised wrt the caller's subsequent test of whether to
234 * actually sleep: 234 * actually sleep:
235 * 235 *
236 * set_current_state(TASK_UNINTERRUPTIBLE); 236 * set_current_state(TASK_UNINTERRUPTIBLE);
237 * if (do_i_need_to_sleep()) 237 * if (do_i_need_to_sleep())
238 * schedule(); 238 * schedule();
239 * 239 *
240 * If the caller does not need such serialisation then use __set_current_state() 240 * If the caller does not need such serialisation then use __set_current_state()
241 */ 241 */
242 #define __set_current_state(state_value) \ 242 #define __set_current_state(state_value) \
243 do { current->state = (state_value); } while (0) 243 do { current->state = (state_value); } while (0)
244 #define set_current_state(state_value) \ 244 #define set_current_state(state_value) \
245 set_mb(current->state, (state_value)) 245 set_mb(current->state, (state_value))
246 246
247 /* Task command name length */ 247 /* Task command name length */
248 #define TASK_COMM_LEN 16 248 #define TASK_COMM_LEN 16
249 249
250 #include <linux/spinlock.h> 250 #include <linux/spinlock.h>
251 251
252 /* 252 /*
253 * This serializes "schedule()" and also protects 253 * This serializes "schedule()" and also protects
254 * the run-queue from deletions/modifications (but 254 * the run-queue from deletions/modifications (but
255 * _adding_ to the beginning of the run-queue has 255 * _adding_ to the beginning of the run-queue has
256 * a separate lock). 256 * a separate lock).
257 */ 257 */
258 extern rwlock_t tasklist_lock; 258 extern rwlock_t tasklist_lock;
259 extern spinlock_t mmlist_lock; 259 extern spinlock_t mmlist_lock;
260 260
261 struct task_struct; 261 struct task_struct;
262 262
263 #ifdef CONFIG_PROVE_RCU 263 #ifdef CONFIG_PROVE_RCU
264 extern int lockdep_tasklist_lock_is_held(void); 264 extern int lockdep_tasklist_lock_is_held(void);
265 #endif /* #ifdef CONFIG_PROVE_RCU */ 265 #endif /* #ifdef CONFIG_PROVE_RCU */
266 266
267 extern void sched_init(void); 267 extern void sched_init(void);
268 extern void sched_init_smp(void); 268 extern void sched_init_smp(void);
269 extern asmlinkage void schedule_tail(struct task_struct *prev); 269 extern asmlinkage void schedule_tail(struct task_struct *prev);
270 extern void init_idle(struct task_struct *idle, int cpu); 270 extern void init_idle(struct task_struct *idle, int cpu);
271 extern void init_idle_bootup_task(struct task_struct *idle); 271 extern void init_idle_bootup_task(struct task_struct *idle);
272 272
273 extern int runqueue_is_locked(int cpu); 273 extern int runqueue_is_locked(int cpu);
274 274
275 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) 275 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
276 extern void select_nohz_load_balancer(int stop_tick); 276 extern void select_nohz_load_balancer(int stop_tick);
277 extern void set_cpu_sd_state_idle(void); 277 extern void set_cpu_sd_state_idle(void);
278 extern int get_nohz_timer_target(void); 278 extern int get_nohz_timer_target(void);
279 #else 279 #else
280 static inline void select_nohz_load_balancer(int stop_tick) { } 280 static inline void select_nohz_load_balancer(int stop_tick) { }
281 static inline void set_cpu_sd_state_idle(void) { } 281 static inline void set_cpu_sd_state_idle(void) { }
282 #endif 282 #endif
283 283
284 /* 284 /*
285 * Only dump TASK_* tasks. (0 for all tasks) 285 * Only dump TASK_* tasks. (0 for all tasks)
286 */ 286 */
287 extern void show_state_filter(unsigned long state_filter); 287 extern void show_state_filter(unsigned long state_filter);
288 288
289 static inline void show_state(void) 289 static inline void show_state(void)
290 { 290 {
291 show_state_filter(0); 291 show_state_filter(0);
292 } 292 }
293 293
294 extern void show_regs(struct pt_regs *); 294 extern void show_regs(struct pt_regs *);
295 295
296 /* 296 /*
297 * TASK is a pointer to the task whose backtrace we want to see (or NULL for current 297 * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
298 * task), SP is the stack pointer of the first frame that should be shown in the back 298 * task), SP is the stack pointer of the first frame that should be shown in the back
299 * trace (or NULL if the entire call-chain of the task should be shown). 299 * trace (or NULL if the entire call-chain of the task should be shown).
300 */ 300 */
301 extern void show_stack(struct task_struct *task, unsigned long *sp); 301 extern void show_stack(struct task_struct *task, unsigned long *sp);
302 302
303 void io_schedule(void); 303 void io_schedule(void);
304 long io_schedule_timeout(long timeout); 304 long io_schedule_timeout(long timeout);
305 305
306 extern void cpu_init (void); 306 extern void cpu_init (void);
307 extern void trap_init(void); 307 extern void trap_init(void);
308 extern void update_process_times(int user); 308 extern void update_process_times(int user);
309 extern void scheduler_tick(void); 309 extern void scheduler_tick(void);
310 310
311 extern void sched_show_task(struct task_struct *p); 311 extern void sched_show_task(struct task_struct *p);
312 312
313 #ifdef CONFIG_LOCKUP_DETECTOR 313 #ifdef CONFIG_LOCKUP_DETECTOR
314 extern void touch_softlockup_watchdog(void); 314 extern void touch_softlockup_watchdog(void);
315 extern void touch_softlockup_watchdog_sync(void); 315 extern void touch_softlockup_watchdog_sync(void);
316 extern void touch_all_softlockup_watchdogs(void); 316 extern void touch_all_softlockup_watchdogs(void);
317 extern int proc_dowatchdog_thresh(struct ctl_table *table, int write, 317 extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
318 void __user *buffer, 318 void __user *buffer,
319 size_t *lenp, loff_t *ppos); 319 size_t *lenp, loff_t *ppos);
320 extern unsigned int softlockup_panic; 320 extern unsigned int softlockup_panic;
321 void lockup_detector_init(void); 321 void lockup_detector_init(void);
322 #else 322 #else
323 static inline void touch_softlockup_watchdog(void) 323 static inline void touch_softlockup_watchdog(void)
324 { 324 {
325 } 325 }
326 static inline void touch_softlockup_watchdog_sync(void) 326 static inline void touch_softlockup_watchdog_sync(void)
327 { 327 {
328 } 328 }
329 static inline void touch_all_softlockup_watchdogs(void) 329 static inline void touch_all_softlockup_watchdogs(void)
330 { 330 {
331 } 331 }
332 static inline void lockup_detector_init(void) 332 static inline void lockup_detector_init(void)
333 { 333 {
334 } 334 }
335 #endif 335 #endif
336 336
337 #if defined(CONFIG_LOCKUP_DETECTOR) && defined(CONFIG_SUSPEND) 337 #if defined(CONFIG_LOCKUP_DETECTOR) && defined(CONFIG_SUSPEND)
338 void lockup_detector_bootcpu_resume(void); 338 void lockup_detector_bootcpu_resume(void);
339 #else 339 #else
340 static inline void lockup_detector_bootcpu_resume(void) 340 static inline void lockup_detector_bootcpu_resume(void)
341 { 341 {
342 } 342 }
343 #endif 343 #endif
344 344
345 #ifdef CONFIG_DETECT_HUNG_TASK 345 #ifdef CONFIG_DETECT_HUNG_TASK
346 extern unsigned int sysctl_hung_task_panic; 346 extern unsigned int sysctl_hung_task_panic;
347 extern unsigned long sysctl_hung_task_check_count; 347 extern unsigned long sysctl_hung_task_check_count;
348 extern unsigned long sysctl_hung_task_timeout_secs; 348 extern unsigned long sysctl_hung_task_timeout_secs;
349 extern unsigned long sysctl_hung_task_warnings; 349 extern unsigned long sysctl_hung_task_warnings;
350 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, 350 extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
351 void __user *buffer, 351 void __user *buffer,
352 size_t *lenp, loff_t *ppos); 352 size_t *lenp, loff_t *ppos);
353 #else 353 #else
354 /* Avoid need for ifdefs elsewhere in the code */ 354 /* Avoid need for ifdefs elsewhere in the code */
355 enum { sysctl_hung_task_timeout_secs = 0 }; 355 enum { sysctl_hung_task_timeout_secs = 0 };
356 #endif 356 #endif
357 357
358 /* Attach to any functions which should be ignored in wchan output. */ 358 /* Attach to any functions which should be ignored in wchan output. */
359 #define __sched __attribute__((__section__(".sched.text"))) 359 #define __sched __attribute__((__section__(".sched.text")))
360 360
361 /* Linker adds these: start and end of __sched functions */ 361 /* Linker adds these: start and end of __sched functions */
362 extern char __sched_text_start[], __sched_text_end[]; 362 extern char __sched_text_start[], __sched_text_end[];
363 363
364 /* Is this address in the __sched functions? */ 364 /* Is this address in the __sched functions? */
365 extern int in_sched_functions(unsigned long addr); 365 extern int in_sched_functions(unsigned long addr);
366 366
367 #define MAX_SCHEDULE_TIMEOUT LONG_MAX 367 #define MAX_SCHEDULE_TIMEOUT LONG_MAX
368 extern signed long schedule_timeout(signed long timeout); 368 extern signed long schedule_timeout(signed long timeout);
369 extern signed long schedule_timeout_interruptible(signed long timeout); 369 extern signed long schedule_timeout_interruptible(signed long timeout);
370 extern signed long schedule_timeout_killable(signed long timeout); 370 extern signed long schedule_timeout_killable(signed long timeout);
371 extern signed long schedule_timeout_uninterruptible(signed long timeout); 371 extern signed long schedule_timeout_uninterruptible(signed long timeout);
372 asmlinkage void schedule(void); 372 asmlinkage void schedule(void);
373 extern void schedule_preempt_disabled(void); 373 extern void schedule_preempt_disabled(void);
374 extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); 374 extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner);
375 375
376 struct nsproxy; 376 struct nsproxy;
377 struct user_namespace; 377 struct user_namespace;
378 378
379 /* 379 /*
380 * Default maximum number of active map areas, this limits the number of vmas 380 * Default maximum number of active map areas, this limits the number of vmas
381 * per mm struct. Users can overwrite this number by sysctl but there is a 381 * per mm struct. Users can overwrite this number by sysctl but there is a
382 * problem. 382 * problem.
383 * 383 *
384 * When a program's coredump is generated as ELF format, a section is created 384 * When a program's coredump is generated as ELF format, a section is created
385 * per a vma. In ELF, the number of sections is represented in unsigned short. 385 * per a vma. In ELF, the number of sections is represented in unsigned short.
386 * This means the number of sections should be smaller than 65535 at coredump. 386 * This means the number of sections should be smaller than 65535 at coredump.
387 * Because the kernel adds some informative sections to a image of program at 387 * Because the kernel adds some informative sections to a image of program at
388 * generating coredump, we need some margin. The number of extra sections is 388 * generating coredump, we need some margin. The number of extra sections is
389 * 1-3 now and depends on arch. We use "5" as safe margin, here. 389 * 1-3 now and depends on arch. We use "5" as safe margin, here.
390 */ 390 */
391 #define MAPCOUNT_ELF_CORE_MARGIN (5) 391 #define MAPCOUNT_ELF_CORE_MARGIN (5)
392 #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) 392 #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
393 393
394 extern int sysctl_max_map_count; 394 extern int sysctl_max_map_count;
395 395
396 #include <linux/aio.h> 396 #include <linux/aio.h>
397 397
398 #ifdef CONFIG_MMU 398 #ifdef CONFIG_MMU
399 extern void arch_pick_mmap_layout(struct mm_struct *mm); 399 extern void arch_pick_mmap_layout(struct mm_struct *mm);
400 extern unsigned long 400 extern unsigned long
401 arch_get_unmapped_area(struct file *, unsigned long, unsigned long, 401 arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
402 unsigned long, unsigned long); 402 unsigned long, unsigned long);
403 extern unsigned long 403 extern unsigned long
404 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, 404 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
405 unsigned long len, unsigned long pgoff, 405 unsigned long len, unsigned long pgoff,
406 unsigned long flags); 406 unsigned long flags);
407 extern void arch_unmap_area(struct mm_struct *, unsigned long); 407 extern void arch_unmap_area(struct mm_struct *, unsigned long);
408 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); 408 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
409 #else 409 #else
410 static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} 410 static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
411 #endif 411 #endif
412 412
413 413
414 extern void set_dumpable(struct mm_struct *mm, int value); 414 extern void set_dumpable(struct mm_struct *mm, int value);
415 extern int get_dumpable(struct mm_struct *mm); 415 extern int get_dumpable(struct mm_struct *mm);
416 416
417 /* get/set_dumpable() values */ 417 /* get/set_dumpable() values */
418 #define SUID_DUMPABLE_DISABLED 0 418 #define SUID_DUMPABLE_DISABLED 0
419 #define SUID_DUMPABLE_ENABLED 1 419 #define SUID_DUMPABLE_ENABLED 1
420 #define SUID_DUMPABLE_SAFE 2 420 #define SUID_DUMPABLE_SAFE 2
421 421
422 /* mm flags */ 422 /* mm flags */
423 /* dumpable bits */ 423 /* dumpable bits */
424 #define MMF_DUMPABLE 0 /* core dump is permitted */ 424 #define MMF_DUMPABLE 0 /* core dump is permitted */
425 #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ 425 #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */
426 426
427 #define MMF_DUMPABLE_BITS 2 427 #define MMF_DUMPABLE_BITS 2
428 #define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) 428 #define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
429 429
430 /* coredump filter bits */ 430 /* coredump filter bits */
431 #define MMF_DUMP_ANON_PRIVATE 2 431 #define MMF_DUMP_ANON_PRIVATE 2
432 #define MMF_DUMP_ANON_SHARED 3 432 #define MMF_DUMP_ANON_SHARED 3
433 #define MMF_DUMP_MAPPED_PRIVATE 4 433 #define MMF_DUMP_MAPPED_PRIVATE 4
434 #define MMF_DUMP_MAPPED_SHARED 5 434 #define MMF_DUMP_MAPPED_SHARED 5
435 #define MMF_DUMP_ELF_HEADERS 6 435 #define MMF_DUMP_ELF_HEADERS 6
436 #define MMF_DUMP_HUGETLB_PRIVATE 7 436 #define MMF_DUMP_HUGETLB_PRIVATE 7
437 #define MMF_DUMP_HUGETLB_SHARED 8 437 #define MMF_DUMP_HUGETLB_SHARED 8
438 438
439 #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS 439 #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
440 #define MMF_DUMP_FILTER_BITS 7 440 #define MMF_DUMP_FILTER_BITS 7
441 #define MMF_DUMP_FILTER_MASK \ 441 #define MMF_DUMP_FILTER_MASK \
442 (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) 442 (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
443 #define MMF_DUMP_FILTER_DEFAULT \ 443 #define MMF_DUMP_FILTER_DEFAULT \
444 ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ 444 ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
445 (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) 445 (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
446 446
447 #ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS 447 #ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
448 # define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) 448 # define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS)
449 #else 449 #else
450 # define MMF_DUMP_MASK_DEFAULT_ELF 0 450 # define MMF_DUMP_MASK_DEFAULT_ELF 0
451 #endif 451 #endif
452 /* leave room for more dump flags */ 452 /* leave room for more dump flags */
453 #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ 453 #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
454 #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ 454 #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */
455 #define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ 455 #define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */
456 456
457 #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) 457 #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
458 458
459 struct sighand_struct { 459 struct sighand_struct {
460 atomic_t count; 460 atomic_t count;
461 struct k_sigaction action[_NSIG]; 461 struct k_sigaction action[_NSIG];
462 spinlock_t siglock; 462 spinlock_t siglock;
463 wait_queue_head_t signalfd_wqh; 463 wait_queue_head_t signalfd_wqh;
464 }; 464 };
465 465
466 struct pacct_struct { 466 struct pacct_struct {
467 int ac_flag; 467 int ac_flag;
468 long ac_exitcode; 468 long ac_exitcode;
469 unsigned long ac_mem; 469 unsigned long ac_mem;
470 cputime_t ac_utime, ac_stime; 470 cputime_t ac_utime, ac_stime;
471 unsigned long ac_minflt, ac_majflt; 471 unsigned long ac_minflt, ac_majflt;
472 }; 472 };
473 473
474 struct cpu_itimer { 474 struct cpu_itimer {
475 cputime_t expires; 475 cputime_t expires;
476 cputime_t incr; 476 cputime_t incr;
477 u32 error; 477 u32 error;
478 u32 incr_error; 478 u32 incr_error;
479 }; 479 };
480 480
481 /** 481 /**
482 * struct task_cputime - collected CPU time counts 482 * struct task_cputime - collected CPU time counts
483 * @utime: time spent in user mode, in &cputime_t units 483 * @utime: time spent in user mode, in &cputime_t units
484 * @stime: time spent in kernel mode, in &cputime_t units 484 * @stime: time spent in kernel mode, in &cputime_t units
485 * @sum_exec_runtime: total time spent on the CPU, in nanoseconds 485 * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
486 * 486 *
487 * This structure groups together three kinds of CPU time that are 487 * This structure groups together three kinds of CPU time that are
488 * tracked for threads and thread groups. Most things considering 488 * tracked for threads and thread groups. Most things considering
489 * CPU time want to group these counts together and treat all three 489 * CPU time want to group these counts together and treat all three
490 * of them in parallel. 490 * of them in parallel.
491 */ 491 */
492 struct task_cputime { 492 struct task_cputime {
493 cputime_t utime; 493 cputime_t utime;
494 cputime_t stime; 494 cputime_t stime;
495 unsigned long long sum_exec_runtime; 495 unsigned long long sum_exec_runtime;
496 }; 496 };
497 /* Alternate field names when used to cache expirations. */ 497 /* Alternate field names when used to cache expirations. */
498 #define prof_exp stime 498 #define prof_exp stime
499 #define virt_exp utime 499 #define virt_exp utime
500 #define sched_exp sum_exec_runtime 500 #define sched_exp sum_exec_runtime
501 501
502 #define INIT_CPUTIME \ 502 #define INIT_CPUTIME \
503 (struct task_cputime) { \ 503 (struct task_cputime) { \
504 .utime = 0, \ 504 .utime = 0, \
505 .stime = 0, \ 505 .stime = 0, \
506 .sum_exec_runtime = 0, \ 506 .sum_exec_runtime = 0, \
507 } 507 }
508 508
509 /* 509 /*
510 * Disable preemption until the scheduler is running. 510 * Disable preemption until the scheduler is running.
511 * Reset by start_kernel()->sched_init()->init_idle(). 511 * Reset by start_kernel()->sched_init()->init_idle().
512 * 512 *
513 * We include PREEMPT_ACTIVE to avoid cond_resched() from working 513 * We include PREEMPT_ACTIVE to avoid cond_resched() from working
514 * before the scheduler is active -- see should_resched(). 514 * before the scheduler is active -- see should_resched().
515 */ 515 */
516 #define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE) 516 #define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE)
517 517
518 /** 518 /**
519 * struct thread_group_cputimer - thread group interval timer counts 519 * struct thread_group_cputimer - thread group interval timer counts
520 * @cputime: thread group interval timers. 520 * @cputime: thread group interval timers.
521 * @running: non-zero when there are timers running and 521 * @running: non-zero when there are timers running and
522 * @cputime receives updates. 522 * @cputime receives updates.
523 * @lock: lock for fields in this struct. 523 * @lock: lock for fields in this struct.
524 * 524 *
525 * This structure contains the version of task_cputime, above, that is 525 * This structure contains the version of task_cputime, above, that is
526 * used for thread group CPU timer calculations. 526 * used for thread group CPU timer calculations.
527 */ 527 */
528 struct thread_group_cputimer { 528 struct thread_group_cputimer {
529 struct task_cputime cputime; 529 struct task_cputime cputime;
530 int running; 530 int running;
531 raw_spinlock_t lock; 531 raw_spinlock_t lock;
532 }; 532 };
533 533
534 #include <linux/rwsem.h> 534 #include <linux/rwsem.h>
535 struct autogroup; 535 struct autogroup;
536 536
537 /* 537 /*
538 * NOTE! "signal_struct" does not have its own 538 * NOTE! "signal_struct" does not have its own
539 * locking, because a shared signal_struct always 539 * locking, because a shared signal_struct always
540 * implies a shared sighand_struct, so locking 540 * implies a shared sighand_struct, so locking
541 * sighand_struct is always a proper superset of 541 * sighand_struct is always a proper superset of
542 * the locking of signal_struct. 542 * the locking of signal_struct.
543 */ 543 */
544 struct signal_struct { 544 struct signal_struct {
545 atomic_t sigcnt; 545 atomic_t sigcnt;
546 atomic_t live; 546 atomic_t live;
547 int nr_threads; 547 int nr_threads;
548 548
549 wait_queue_head_t wait_chldexit; /* for wait4() */ 549 wait_queue_head_t wait_chldexit; /* for wait4() */
550 550
551 /* current thread group signal load-balancing target: */ 551 /* current thread group signal load-balancing target: */
552 struct task_struct *curr_target; 552 struct task_struct *curr_target;
553 553
554 /* shared signal handling: */ 554 /* shared signal handling: */
555 struct sigpending shared_pending; 555 struct sigpending shared_pending;
556 556
557 /* thread group exit support */ 557 /* thread group exit support */
558 int group_exit_code; 558 int group_exit_code;
559 /* overloaded: 559 /* overloaded:
560 * - notify group_exit_task when ->count is equal to notify_count 560 * - notify group_exit_task when ->count is equal to notify_count
561 * - everyone except group_exit_task is stopped during signal delivery 561 * - everyone except group_exit_task is stopped during signal delivery
562 * of fatal signals, group_exit_task processes the signal. 562 * of fatal signals, group_exit_task processes the signal.
563 */ 563 */
564 int notify_count; 564 int notify_count;
565 struct task_struct *group_exit_task; 565 struct task_struct *group_exit_task;
566 566
567 /* thread group stop support, overloads group_exit_code too */ 567 /* thread group stop support, overloads group_exit_code too */
568 int group_stop_count; 568 int group_stop_count;
569 unsigned int flags; /* see SIGNAL_* flags below */ 569 unsigned int flags; /* see SIGNAL_* flags below */
570 570
571 /* 571 /*
572 * PR_SET_CHILD_SUBREAPER marks a process, like a service 572 * PR_SET_CHILD_SUBREAPER marks a process, like a service
573 * manager, to re-parent orphan (double-forking) child processes 573 * manager, to re-parent orphan (double-forking) child processes
574 * to this process instead of 'init'. The service manager is 574 * to this process instead of 'init'. The service manager is
575 * able to receive SIGCHLD signals and is able to investigate 575 * able to receive SIGCHLD signals and is able to investigate
576 * the process until it calls wait(). All children of this 576 * the process until it calls wait(). All children of this
577 * process will inherit a flag if they should look for a 577 * process will inherit a flag if they should look for a
578 * child_subreaper process at exit. 578 * child_subreaper process at exit.
579 */ 579 */
580 unsigned int is_child_subreaper:1; 580 unsigned int is_child_subreaper:1;
581 unsigned int has_child_subreaper:1; 581 unsigned int has_child_subreaper:1;
582 582
583 /* POSIX.1b Interval Timers */ 583 /* POSIX.1b Interval Timers */
584 struct list_head posix_timers; 584 struct list_head posix_timers;
585 585
586 /* ITIMER_REAL timer for the process */ 586 /* ITIMER_REAL timer for the process */
587 struct hrtimer real_timer; 587 struct hrtimer real_timer;
588 struct pid *leader_pid; 588 struct pid *leader_pid;
589 ktime_t it_real_incr; 589 ktime_t it_real_incr;
590 590
591 /* 591 /*
592 * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use 592 * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
593 * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these 593 * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
594 * values are defined to 0 and 1 respectively 594 * values are defined to 0 and 1 respectively
595 */ 595 */
596 struct cpu_itimer it[2]; 596 struct cpu_itimer it[2];
597 597
598 /* 598 /*
599 * Thread group totals for process CPU timers. 599 * Thread group totals for process CPU timers.
600 * See thread_group_cputimer(), et al, for details. 600 * See thread_group_cputimer(), et al, for details.
601 */ 601 */
602 struct thread_group_cputimer cputimer; 602 struct thread_group_cputimer cputimer;
603 603
604 /* Earliest-expiration cache. */ 604 /* Earliest-expiration cache. */
605 struct task_cputime cputime_expires; 605 struct task_cputime cputime_expires;
606 606
607 struct list_head cpu_timers[3]; 607 struct list_head cpu_timers[3];
608 608
609 struct pid *tty_old_pgrp; 609 struct pid *tty_old_pgrp;
610 610
611 /* boolean value for session group leader */ 611 /* boolean value for session group leader */
612 int leader; 612 int leader;
613 613
614 struct tty_struct *tty; /* NULL if no tty */ 614 struct tty_struct *tty; /* NULL if no tty */
615 615
616 #ifdef CONFIG_SCHED_AUTOGROUP 616 #ifdef CONFIG_SCHED_AUTOGROUP
617 struct autogroup *autogroup; 617 struct autogroup *autogroup;
618 #endif 618 #endif
619 /* 619 /*
620 * Cumulative resource counters for dead threads in the group, 620 * Cumulative resource counters for dead threads in the group,
621 * and for reaped dead child processes forked by this group. 621 * and for reaped dead child processes forked by this group.
622 * Live threads maintain their own counters and add to these 622 * Live threads maintain their own counters and add to these
623 * in __exit_signal, except for the group leader. 623 * in __exit_signal, except for the group leader.
624 */ 624 */
625 cputime_t utime, stime, cutime, cstime; 625 cputime_t utime, stime, cutime, cstime;
626 cputime_t gtime; 626 cputime_t gtime;
627 cputime_t cgtime; 627 cputime_t cgtime;
628 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 628 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
629 cputime_t prev_utime, prev_stime; 629 cputime_t prev_utime, prev_stime;
630 #endif 630 #endif
631 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; 631 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
632 unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; 632 unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
633 unsigned long inblock, oublock, cinblock, coublock; 633 unsigned long inblock, oublock, cinblock, coublock;
634 unsigned long maxrss, cmaxrss; 634 unsigned long maxrss, cmaxrss;
635 struct task_io_accounting ioac; 635 struct task_io_accounting ioac;
636 636
637 /* 637 /*
638 * Cumulative ns of schedule CPU time fo dead threads in the 638 * Cumulative ns of schedule CPU time fo dead threads in the
639 * group, not including a zombie group leader, (This only differs 639 * group, not including a zombie group leader, (This only differs
640 * from jiffies_to_ns(utime + stime) if sched_clock uses something 640 * from jiffies_to_ns(utime + stime) if sched_clock uses something
641 * other than jiffies.) 641 * other than jiffies.)
642 */ 642 */
643 unsigned long long sum_sched_runtime; 643 unsigned long long sum_sched_runtime;
644 644
645 /* 645 /*
646 * We don't bother to synchronize most readers of this at all, 646 * We don't bother to synchronize most readers of this at all,
647 * because there is no reader checking a limit that actually needs 647 * because there is no reader checking a limit that actually needs
648 * to get both rlim_cur and rlim_max atomically, and either one 648 * to get both rlim_cur and rlim_max atomically, and either one
649 * alone is a single word that can safely be read normally. 649 * alone is a single word that can safely be read normally.
650 * getrlimit/setrlimit use task_lock(current->group_leader) to 650 * getrlimit/setrlimit use task_lock(current->group_leader) to
651 * protect this instead of the siglock, because they really 651 * protect this instead of the siglock, because they really
652 * have no need to disable irqs. 652 * have no need to disable irqs.
653 */ 653 */
654 struct rlimit rlim[RLIM_NLIMITS]; 654 struct rlimit rlim[RLIM_NLIMITS];
655 655
656 #ifdef CONFIG_BSD_PROCESS_ACCT 656 #ifdef CONFIG_BSD_PROCESS_ACCT
657 struct pacct_struct pacct; /* per-process accounting information */ 657 struct pacct_struct pacct; /* per-process accounting information */
658 #endif 658 #endif
659 #ifdef CONFIG_TASKSTATS 659 #ifdef CONFIG_TASKSTATS
660 struct taskstats *stats; 660 struct taskstats *stats;
661 #endif 661 #endif
662 #ifdef CONFIG_AUDIT 662 #ifdef CONFIG_AUDIT
663 unsigned audit_tty; 663 unsigned audit_tty;
664 struct tty_audit_buf *tty_audit_buf; 664 struct tty_audit_buf *tty_audit_buf;
665 #endif 665 #endif
666 #ifdef CONFIG_CGROUPS 666 #ifdef CONFIG_CGROUPS
667 /* 667 /*
668 * group_rwsem prevents new tasks from entering the threadgroup and 668 * group_rwsem prevents new tasks from entering the threadgroup and
669 * member tasks from exiting,a more specifically, setting of 669 * member tasks from exiting,a more specifically, setting of
670 * PF_EXITING. fork and exit paths are protected with this rwsem 670 * PF_EXITING. fork and exit paths are protected with this rwsem
671 * using threadgroup_change_begin/end(). Users which require 671 * using threadgroup_change_begin/end(). Users which require
672 * threadgroup to remain stable should use threadgroup_[un]lock() 672 * threadgroup to remain stable should use threadgroup_[un]lock()
673 * which also takes care of exec path. Currently, cgroup is the 673 * which also takes care of exec path. Currently, cgroup is the
674 * only user. 674 * only user.
675 */ 675 */
676 struct rw_semaphore group_rwsem; 676 struct rw_semaphore group_rwsem;
677 #endif 677 #endif
678 678
679 int oom_adj; /* OOM kill score adjustment (bit shift) */ 679 int oom_adj; /* OOM kill score adjustment (bit shift) */
680 int oom_score_adj; /* OOM kill score adjustment */ 680 int oom_score_adj; /* OOM kill score adjustment */
681 int oom_score_adj_min; /* OOM kill score adjustment minimum value. 681 int oom_score_adj_min; /* OOM kill score adjustment minimum value.
682 * Only settable by CAP_SYS_RESOURCE. */ 682 * Only settable by CAP_SYS_RESOURCE. */
683 683
684 struct mutex cred_guard_mutex; /* guard against foreign influences on 684 struct mutex cred_guard_mutex; /* guard against foreign influences on
685 * credential calculations 685 * credential calculations
686 * (notably. ptrace) */ 686 * (notably. ptrace) */
687 }; 687 };
688 688
689 /* Context switch must be unlocked if interrupts are to be enabled */ 689 /* Context switch must be unlocked if interrupts are to be enabled */
690 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 690 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
691 # define __ARCH_WANT_UNLOCKED_CTXSW 691 # define __ARCH_WANT_UNLOCKED_CTXSW
692 #endif 692 #endif
693 693
694 /* 694 /*
695 * Bits in flags field of signal_struct. 695 * Bits in flags field of signal_struct.
696 */ 696 */
697 #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ 697 #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */
698 #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ 698 #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */
699 #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ 699 #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */
700 /* 700 /*
701 * Pending notifications to parent. 701 * Pending notifications to parent.
702 */ 702 */
703 #define SIGNAL_CLD_STOPPED 0x00000010 703 #define SIGNAL_CLD_STOPPED 0x00000010
704 #define SIGNAL_CLD_CONTINUED 0x00000020 704 #define SIGNAL_CLD_CONTINUED 0x00000020
705 #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) 705 #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)
706 706
707 #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ 707 #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */
708 708
709 /* If true, all threads except ->group_exit_task have pending SIGKILL */ 709 /* If true, all threads except ->group_exit_task have pending SIGKILL */
710 static inline int signal_group_exit(const struct signal_struct *sig) 710 static inline int signal_group_exit(const struct signal_struct *sig)
711 { 711 {
712 return (sig->flags & SIGNAL_GROUP_EXIT) || 712 return (sig->flags & SIGNAL_GROUP_EXIT) ||
713 (sig->group_exit_task != NULL); 713 (sig->group_exit_task != NULL);
714 } 714 }
715 715
716 /* 716 /*
717 * Some day this will be a full-fledged user tracking system.. 717 * Some day this will be a full-fledged user tracking system..
718 */ 718 */
719 struct user_struct { 719 struct user_struct {
720 atomic_t __count; /* reference count */ 720 atomic_t __count; /* reference count */
721 atomic_t processes; /* How many processes does this user have? */ 721 atomic_t processes; /* How many processes does this user have? */
722 atomic_t files; /* How many open files does this user have? */ 722 atomic_t files; /* How many open files does this user have? */
723 atomic_t sigpending; /* How many pending signals does this user have? */ 723 atomic_t sigpending; /* How many pending signals does this user have? */
724 #ifdef CONFIG_INOTIFY_USER 724 #ifdef CONFIG_INOTIFY_USER
725 atomic_t inotify_watches; /* How many inotify watches does this user have? */ 725 atomic_t inotify_watches; /* How many inotify watches does this user have? */
726 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ 726 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
727 #endif 727 #endif
728 #ifdef CONFIG_FANOTIFY 728 #ifdef CONFIG_FANOTIFY
729 atomic_t fanotify_listeners; 729 atomic_t fanotify_listeners;
730 #endif 730 #endif
731 #ifdef CONFIG_EPOLL 731 #ifdef CONFIG_EPOLL
732 atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ 732 atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
733 #endif 733 #endif
734 #ifdef CONFIG_POSIX_MQUEUE 734 #ifdef CONFIG_POSIX_MQUEUE
735 /* protected by mq_lock */ 735 /* protected by mq_lock */
736 unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ 736 unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
737 #endif 737 #endif
738 unsigned long locked_shm; /* How many pages of mlocked shm ? */ 738 unsigned long locked_shm; /* How many pages of mlocked shm ? */
739 739
740 #ifdef CONFIG_KEYS 740 #ifdef CONFIG_KEYS
741 struct key *uid_keyring; /* UID specific keyring */ 741 struct key *uid_keyring; /* UID specific keyring */
742 struct key *session_keyring; /* UID's default session keyring */ 742 struct key *session_keyring; /* UID's default session keyring */
743 #endif 743 #endif
744 744
745 /* Hash table maintenance information */ 745 /* Hash table maintenance information */
746 struct hlist_node uidhash_node; 746 struct hlist_node uidhash_node;
747 kuid_t uid; 747 kuid_t uid;
748 748
749 #ifdef CONFIG_PERF_EVENTS 749 #ifdef CONFIG_PERF_EVENTS
750 atomic_long_t locked_vm; 750 atomic_long_t locked_vm;
751 #endif 751 #endif
752 }; 752 };
753 753
754 extern int uids_sysfs_init(void); 754 extern int uids_sysfs_init(void);
755 755
756 extern struct user_struct *find_user(kuid_t); 756 extern struct user_struct *find_user(kuid_t);
757 757
758 extern struct user_struct root_user; 758 extern struct user_struct root_user;
759 #define INIT_USER (&root_user) 759 #define INIT_USER (&root_user)
760 760
761 761
762 struct backing_dev_info; 762 struct backing_dev_info;
763 struct reclaim_state; 763 struct reclaim_state;
764 764
765 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 765 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
766 struct sched_info { 766 struct sched_info {
767 /* cumulative counters */ 767 /* cumulative counters */
768 unsigned long pcount; /* # of times run on this cpu */ 768 unsigned long pcount; /* # of times run on this cpu */
769 unsigned long long run_delay; /* time spent waiting on a runqueue */ 769 unsigned long long run_delay; /* time spent waiting on a runqueue */
770 770
771 /* timestamps */ 771 /* timestamps */
772 unsigned long long last_arrival,/* when we last ran on a cpu */ 772 unsigned long long last_arrival,/* when we last ran on a cpu */
773 last_queued; /* when we were last queued to run */ 773 last_queued; /* when we were last queued to run */
774 }; 774 };
775 #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ 775 #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
776 776
777 #ifdef CONFIG_TASK_DELAY_ACCT 777 #ifdef CONFIG_TASK_DELAY_ACCT
778 struct task_delay_info { 778 struct task_delay_info {
779 spinlock_t lock; 779 spinlock_t lock;
780 unsigned int flags; /* Private per-task flags */ 780 unsigned int flags; /* Private per-task flags */
781 781
782 /* For each stat XXX, add following, aligned appropriately 782 /* For each stat XXX, add following, aligned appropriately
783 * 783 *
784 * struct timespec XXX_start, XXX_end; 784 * struct timespec XXX_start, XXX_end;
785 * u64 XXX_delay; 785 * u64 XXX_delay;
786 * u32 XXX_count; 786 * u32 XXX_count;
787 * 787 *
788 * Atomicity of updates to XXX_delay, XXX_count protected by 788 * Atomicity of updates to XXX_delay, XXX_count protected by
789 * single lock above (split into XXX_lock if contention is an issue). 789 * single lock above (split into XXX_lock if contention is an issue).
790 */ 790 */
791 791
792 /* 792 /*
793 * XXX_count is incremented on every XXX operation, the delay 793 * XXX_count is incremented on every XXX operation, the delay
794 * associated with the operation is added to XXX_delay. 794 * associated with the operation is added to XXX_delay.
795 * XXX_delay contains the accumulated delay time in nanoseconds. 795 * XXX_delay contains the accumulated delay time in nanoseconds.
796 */ 796 */
797 struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ 797 struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */
798 u64 blkio_delay; /* wait for sync block io completion */ 798 u64 blkio_delay; /* wait for sync block io completion */
799 u64 swapin_delay; /* wait for swapin block io completion */ 799 u64 swapin_delay; /* wait for swapin block io completion */
800 u32 blkio_count; /* total count of the number of sync block */ 800 u32 blkio_count; /* total count of the number of sync block */
801 /* io operations performed */ 801 /* io operations performed */
802 u32 swapin_count; /* total count of the number of swapin block */ 802 u32 swapin_count; /* total count of the number of swapin block */
803 /* io operations performed */ 803 /* io operations performed */
804 804
805 struct timespec freepages_start, freepages_end; 805 struct timespec freepages_start, freepages_end;
806 u64 freepages_delay; /* wait for memory reclaim */ 806 u64 freepages_delay; /* wait for memory reclaim */
807 u32 freepages_count; /* total count of memory reclaim */ 807 u32 freepages_count; /* total count of memory reclaim */
808 }; 808 };
809 #endif /* CONFIG_TASK_DELAY_ACCT */ 809 #endif /* CONFIG_TASK_DELAY_ACCT */
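/*
 * A worked instance of the XXX pattern documented above (the stat name
 * "thrashing" is hypothetical and only illustrates the convention, it is
 * not a field of this structure):
 *
 *	struct timespec thrashing_start, thrashing_end;
 *	u64 thrashing_delay;	(accumulated delay, in nanoseconds)
 *	u32 thrashing_count;	(number of thrashing operations)
 *
 * The accounting code would record thrashing_start/thrashing_end around
 * each operation while holding the lock above, add the difference to
 * thrashing_delay and increment thrashing_count.
 */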
810 810
811 static inline int sched_info_on(void) 811 static inline int sched_info_on(void)
812 { 812 {
813 #ifdef CONFIG_SCHEDSTATS 813 #ifdef CONFIG_SCHEDSTATS
814 return 1; 814 return 1;
815 #elif defined(CONFIG_TASK_DELAY_ACCT) 815 #elif defined(CONFIG_TASK_DELAY_ACCT)
816 extern int delayacct_on; 816 extern int delayacct_on;
817 return delayacct_on; 817 return delayacct_on;
818 #else 818 #else
819 return 0; 819 return 0;
820 #endif 820 #endif
821 } 821 }
822 822
823 enum cpu_idle_type { 823 enum cpu_idle_type {
824 CPU_IDLE, 824 CPU_IDLE,
825 CPU_NOT_IDLE, 825 CPU_NOT_IDLE,
826 CPU_NEWLY_IDLE, 826 CPU_NEWLY_IDLE,
827 CPU_MAX_IDLE_TYPES 827 CPU_MAX_IDLE_TYPES
828 }; 828 };
829 829
830 /* 830 /*
831 * Increase resolution of nice-level calculations for 64-bit architectures. 831 * Increase resolution of nice-level calculations for 64-bit architectures.
832 * The extra resolution improves shares distribution and load balancing of 832 * The extra resolution improves shares distribution and load balancing of
833 * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup 833 * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
834 * hierarchies, especially on larger systems. This is not a user-visible change 834 * hierarchies, especially on larger systems. This is not a user-visible change
835 * and does not change the user-interface for setting shares/weights. 835 * and does not change the user-interface for setting shares/weights.
836 * 836 *
837 * We increase resolution only if we have enough bits to allow this increased 837 * We increase resolution only if we have enough bits to allow this increased
838 * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution 838 * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
839 * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the 839 * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
840 * increased costs. 840 * increased costs.
841 */ 841 */
842 #if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ 842 #if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
843 # define SCHED_LOAD_RESOLUTION 10 843 # define SCHED_LOAD_RESOLUTION 10
844 # define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) 844 # define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
845 # define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) 845 # define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
846 #else 846 #else
847 # define SCHED_LOAD_RESOLUTION 0 847 # define SCHED_LOAD_RESOLUTION 0
848 # define scale_load(w) (w) 848 # define scale_load(w) (w)
849 # define scale_load_down(w) (w) 849 # define scale_load_down(w) (w)
850 #endif 850 #endif
851 851
852 #define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) 852 #define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
853 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) 853 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
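/*
 * Worked example of the scaling above (the numbers follow directly from
 * the definitions; 1024 is the conventional nice-0 load weight): with the
 * extra resolution compiled out, as it currently is, SCHED_LOAD_RESOLUTION
 * is 0, so scale_load(1024) == 1024 and SCHED_LOAD_SCALE == 1 << 10 == 1024.
 * If the BITS_PER_LONG > 32 branch were re-enabled, SCHED_LOAD_RESOLUTION
 * would be 10, giving scale_load(1024) == 1024 << 10 == 1048576,
 * scale_load_down(1048576) == 1024 and SCHED_LOAD_SCALE == 1 << 20.
 */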
854 854
855 /* 855 /*
856 * Increase resolution of cpu_power calculations 856 * Increase resolution of cpu_power calculations
857 */ 857 */
858 #define SCHED_POWER_SHIFT 10 858 #define SCHED_POWER_SHIFT 10
859 #define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT) 859 #define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT)
860 860
861 /* 861 /*
862 * sched-domains (multiprocessor balancing) declarations: 862 * sched-domains (multiprocessor balancing) declarations:
863 */ 863 */
864 #ifdef CONFIG_SMP 864 #ifdef CONFIG_SMP
865 #define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */ 865 #define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
866 #define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ 866 #define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
867 #define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ 867 #define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
868 #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ 868 #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
869 #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ 869 #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
870 #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ 870 #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
871 #define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */ 871 #define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */
872 #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ 872 #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
873 #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ 873 #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
874 #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ 874 #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
875 #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ 875 #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
876 #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ 876 #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
877 #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ 877 #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
878 878
879 extern int __weak arch_sd_sibiling_asym_packing(void); 879 extern int __weak arch_sd_sibiling_asym_packing(void);
880 880
881 struct sched_group_power { 881 struct sched_group_power {
882 atomic_t ref; 882 atomic_t ref;
883 /* 883 /*
884 * CPU power of this group, SCHED_LOAD_SCALE being max power for a 884 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
885 * single CPU. 885 * single CPU.
886 */ 886 */
887 unsigned int power, power_orig; 887 unsigned int power, power_orig;
888 unsigned long next_update; 888 unsigned long next_update;
889 /* 889 /*
890 * Number of busy cpus in this group. 890 * Number of busy cpus in this group.
891 */ 891 */
892 atomic_t nr_busy_cpus; 892 atomic_t nr_busy_cpus;
893 893
894 unsigned long cpumask[0]; /* iteration mask */ 894 unsigned long cpumask[0]; /* iteration mask */
895 }; 895 };
896 896
897 struct sched_group { 897 struct sched_group {
898 struct sched_group *next; /* Must be a circular list */ 898 struct sched_group *next; /* Must be a circular list */
899 atomic_t ref; 899 atomic_t ref;
900 900
901 unsigned int group_weight; 901 unsigned int group_weight;
902 struct sched_group_power *sgp; 902 struct sched_group_power *sgp;
903 903
904 /* 904 /*
905 * The CPUs this group covers. 905 * The CPUs this group covers.
906 * 906 *
907 * NOTE: this field is variable length. (Allocated dynamically 907 * NOTE: this field is variable length. (Allocated dynamically
908 * by attaching extra space to the end of the structure, 908 * by attaching extra space to the end of the structure,
909 * depending on how many CPUs the kernel has booted up with) 909 * depending on how many CPUs the kernel has booted up with)
910 */ 910 */
911 unsigned long cpumask[0]; 911 unsigned long cpumask[0];
912 }; 912 };
913 913
914 static inline struct cpumask *sched_group_cpus(struct sched_group *sg) 914 static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
915 { 915 {
916 return to_cpumask(sg->cpumask); 916 return to_cpumask(sg->cpumask);
917 } 917 }
918 918
919 /* 919 /*
920 * cpumask masking which cpus in the group are allowed to iterate up the domain 920 * cpumask masking which cpus in the group are allowed to iterate up the domain
921 * tree. 921 * tree.
922 */ 922 */
923 static inline struct cpumask *sched_group_mask(struct sched_group *sg) 923 static inline struct cpumask *sched_group_mask(struct sched_group *sg)
924 { 924 {
925 return to_cpumask(sg->sgp->cpumask); 925 return to_cpumask(sg->sgp->cpumask);
926 } 926 }
927 927
928 /** 928 /**
929 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. 929 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
930 * @group: The group whose first cpu is to be returned. 930 * @group: The group whose first cpu is to be returned.
931 */ 931 */
932 static inline unsigned int group_first_cpu(struct sched_group *group) 932 static inline unsigned int group_first_cpu(struct sched_group *group)
933 { 933 {
934 return cpumask_first(sched_group_cpus(group)); 934 return cpumask_first(sched_group_cpus(group));
935 } 935 }
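/*
 * Minimal usage sketch for the accessors above (the helper name is made
 * up for illustration; for_each_cpu() and printk() come from headers
 * already pulled in elsewhere):
 */
static inline void example_dump_group(struct sched_group *sg)
{
	int cpu;

	/* walk every CPU covered by this balancing group */
	for_each_cpu(cpu, sched_group_cpus(sg))
		printk(KERN_DEBUG "group weight %u, first cpu %u, cpu %d\n",
		       sg->group_weight, group_first_cpu(sg), cpu);
}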
936 936
937 struct sched_domain_attr { 937 struct sched_domain_attr {
938 int relax_domain_level; 938 int relax_domain_level;
939 }; 939 };
940 940
941 #define SD_ATTR_INIT (struct sched_domain_attr) { \ 941 #define SD_ATTR_INIT (struct sched_domain_attr) { \
942 .relax_domain_level = -1, \ 942 .relax_domain_level = -1, \
943 } 943 }
944 944
945 extern int sched_domain_level_max; 945 extern int sched_domain_level_max;
946 946
947 struct sched_domain { 947 struct sched_domain {
948 /* These fields must be setup */ 948 /* These fields must be setup */
949 struct sched_domain *parent; /* top domain must be null terminated */ 949 struct sched_domain *parent; /* top domain must be null terminated */
950 struct sched_domain *child; /* bottom domain must be null terminated */ 950 struct sched_domain *child; /* bottom domain must be null terminated */
951 struct sched_group *groups; /* the balancing groups of the domain */ 951 struct sched_group *groups; /* the balancing groups of the domain */
952 unsigned long min_interval; /* Minimum balance interval ms */ 952 unsigned long min_interval; /* Minimum balance interval ms */
953 unsigned long max_interval; /* Maximum balance interval ms */ 953 unsigned long max_interval; /* Maximum balance interval ms */
954 unsigned int busy_factor; /* less balancing by factor if busy */ 954 unsigned int busy_factor; /* less balancing by factor if busy */
955 unsigned int imbalance_pct; /* No balance until over watermark */ 955 unsigned int imbalance_pct; /* No balance until over watermark */
956 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ 956 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
957 unsigned int busy_idx; 957 unsigned int busy_idx;
958 unsigned int idle_idx; 958 unsigned int idle_idx;
959 unsigned int newidle_idx; 959 unsigned int newidle_idx;
960 unsigned int wake_idx; 960 unsigned int wake_idx;
961 unsigned int forkexec_idx; 961 unsigned int forkexec_idx;
962 unsigned int smt_gain; 962 unsigned int smt_gain;
963 int flags; /* See SD_* */ 963 int flags; /* See SD_* */
964 int level; 964 int level;
965 int idle_buddy; /* cpu assigned to select_idle_sibling() */ 965 int idle_buddy; /* cpu assigned to select_idle_sibling() */
966 966
967 /* Runtime fields. */ 967 /* Runtime fields. */
968 unsigned long last_balance; /* init to jiffies. units in jiffies */ 968 unsigned long last_balance; /* init to jiffies. units in jiffies */
969 unsigned int balance_interval; /* initialise to 1. units in ms. */ 969 unsigned int balance_interval; /* initialise to 1. units in ms. */
970 unsigned int nr_balance_failed; /* initialise to 0 */ 970 unsigned int nr_balance_failed; /* initialise to 0 */
971 971
972 u64 last_update; 972 u64 last_update;
973 973
974 #ifdef CONFIG_SCHEDSTATS 974 #ifdef CONFIG_SCHEDSTATS
975 /* load_balance() stats */ 975 /* load_balance() stats */
976 unsigned int lb_count[CPU_MAX_IDLE_TYPES]; 976 unsigned int lb_count[CPU_MAX_IDLE_TYPES];
977 unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; 977 unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
978 unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; 978 unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
979 unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; 979 unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
980 unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; 980 unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
981 unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; 981 unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
982 unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; 982 unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
983 unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES]; 983 unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
984 984
985 /* Active load balancing */ 985 /* Active load balancing */
986 unsigned int alb_count; 986 unsigned int alb_count;
987 unsigned int alb_failed; 987 unsigned int alb_failed;
988 unsigned int alb_pushed; 988 unsigned int alb_pushed;
989 989
990 /* SD_BALANCE_EXEC stats */ 990 /* SD_BALANCE_EXEC stats */
991 unsigned int sbe_count; 991 unsigned int sbe_count;
992 unsigned int sbe_balanced; 992 unsigned int sbe_balanced;
993 unsigned int sbe_pushed; 993 unsigned int sbe_pushed;
994 994
995 /* SD_BALANCE_FORK stats */ 995 /* SD_BALANCE_FORK stats */
996 unsigned int sbf_count; 996 unsigned int sbf_count;
997 unsigned int sbf_balanced; 997 unsigned int sbf_balanced;
998 unsigned int sbf_pushed; 998 unsigned int sbf_pushed;
999 999
1000 /* try_to_wake_up() stats */ 1000 /* try_to_wake_up() stats */
1001 unsigned int ttwu_wake_remote; 1001 unsigned int ttwu_wake_remote;
1002 unsigned int ttwu_move_affine; 1002 unsigned int ttwu_move_affine;
1003 unsigned int ttwu_move_balance; 1003 unsigned int ttwu_move_balance;
1004 #endif 1004 #endif
1005 #ifdef CONFIG_SCHED_DEBUG 1005 #ifdef CONFIG_SCHED_DEBUG
1006 char *name; 1006 char *name;
1007 #endif 1007 #endif
1008 union { 1008 union {
1009 void *private; /* used during construction */ 1009 void *private; /* used during construction */
1010 struct rcu_head rcu; /* used during destruction */ 1010 struct rcu_head rcu; /* used during destruction */
1011 }; 1011 };
1012 1012
1013 unsigned int span_weight; 1013 unsigned int span_weight;
1014 /* 1014 /*
1015 * Span of all CPUs in this domain. 1015 * Span of all CPUs in this domain.
1016 * 1016 *
1017 * NOTE: this field is variable length. (Allocated dynamically 1017 * NOTE: this field is variable length. (Allocated dynamically
1018 * by attaching extra space to the end of the structure, 1018 * by attaching extra space to the end of the structure,
1019 * depending on how many CPUs the kernel has booted up with) 1019 * depending on how many CPUs the kernel has booted up with)
1020 */ 1020 */
1021 unsigned long span[0]; 1021 unsigned long span[0];
1022 }; 1022 };
1023 1023
1024 static inline struct cpumask *sched_domain_span(struct sched_domain *sd) 1024 static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
1025 { 1025 {
1026 return to_cpumask(sd->span); 1026 return to_cpumask(sd->span);
1027 } 1027 }
1028 1028
1029 extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 1029 extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
1030 struct sched_domain_attr *dattr_new); 1030 struct sched_domain_attr *dattr_new);
1031 1031
1032 /* Allocate an array of sched domains, for partition_sched_domains(). */ 1032 /* Allocate an array of sched domains, for partition_sched_domains(). */
1033 cpumask_var_t *alloc_sched_domains(unsigned int ndoms); 1033 cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
1034 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); 1034 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
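/*
 * Hedged usage sketch for the two helpers above (error paths and the
 * callers' locking rules are omitted; cpu_active_mask comes from
 * <linux/cpumask.h>): build one domain set spanning all active CPUs and
 * install it with partition_sched_domains(), which keeps the array as
 * the current set.
 */
static inline void example_single_partition(void)
{
	cpumask_var_t *doms = alloc_sched_domains(1);

	if (!doms)
		return;
	cpumask_copy(doms[0], cpu_active_mask);
	partition_sched_domains(1, doms, NULL);
}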
1035 1035
1036 /* Test a flag in parent sched domain */ 1036 /* Test a flag in parent sched domain */
1037 static inline int test_sd_parent(struct sched_domain *sd, int flag) 1037 static inline int test_sd_parent(struct sched_domain *sd, int flag)
1038 { 1038 {
1039 if (sd->parent && (sd->parent->flags & flag)) 1039 if (sd->parent && (sd->parent->flags & flag))
1040 return 1; 1040 return 1;
1041 1041
1042 return 0; 1042 return 0;
1043 } 1043 }
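/*
 * Sketch of how the domain hierarchy is typically walked (illustrative
 * helper name; real users do this under rcu_read_lock(), since the tree
 * is freed via the rcu head embedded in struct sched_domain):
 */
static inline int example_count_balanced_levels(struct sched_domain *sd)
{
	int levels = 0;

	for (; sd; sd = sd->parent)	/* bottom up, top domain ends at NULL */
		if (sd->flags & SD_LOAD_BALANCE)
			levels++;
	return levels;
}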
1044 1044
1045 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu); 1045 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu);
1046 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu); 1046 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
1047 1047
1048 bool cpus_share_cache(int this_cpu, int that_cpu); 1048 bool cpus_share_cache(int this_cpu, int that_cpu);
1049 1049
1050 #else /* CONFIG_SMP */ 1050 #else /* CONFIG_SMP */
1051 1051
1052 struct sched_domain_attr; 1052 struct sched_domain_attr;
1053 1053
1054 static inline void 1054 static inline void
1055 partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 1055 partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
1056 struct sched_domain_attr *dattr_new) 1056 struct sched_domain_attr *dattr_new)
1057 { 1057 {
1058 } 1058 }
1059 1059
1060 static inline bool cpus_share_cache(int this_cpu, int that_cpu) 1060 static inline bool cpus_share_cache(int this_cpu, int that_cpu)
1061 { 1061 {
1062 return true; 1062 return true;
1063 } 1063 }
1064 1064
1065 #endif /* !CONFIG_SMP */ 1065 #endif /* !CONFIG_SMP */
1066 1066
1067 1067
1068 struct io_context; /* See blkdev.h */ 1068 struct io_context; /* See blkdev.h */
1069 1069
1070 1070
1071 #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK 1071 #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
1072 extern void prefetch_stack(struct task_struct *t); 1072 extern void prefetch_stack(struct task_struct *t);
1073 #else 1073 #else
1074 static inline void prefetch_stack(struct task_struct *t) { } 1074 static inline void prefetch_stack(struct task_struct *t) { }
1075 #endif 1075 #endif
1076 1076
1077 struct audit_context; /* See audit.c */ 1077 struct audit_context; /* See audit.c */
1078 struct mempolicy; 1078 struct mempolicy;
1079 struct pipe_inode_info; 1079 struct pipe_inode_info;
1080 struct uts_namespace; 1080 struct uts_namespace;
1081 1081
1082 struct rq; 1082 struct rq;
1083 struct sched_domain; 1083 struct sched_domain;
1084 1084
1085 /* 1085 /*
1086 * wake flags 1086 * wake flags
1087 */ 1087 */
1088 #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ 1088 #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
1089 #define WF_FORK 0x02 /* child wakeup after fork */ 1089 #define WF_FORK 0x02 /* child wakeup after fork */
1090 #define WF_MIGRATED 0x04 /* internal use, task got migrated */ 1090 #define WF_MIGRATED 0x04 /* internal use, task got migrated */
1091 1091
1092 #define ENQUEUE_WAKEUP 1 1092 #define ENQUEUE_WAKEUP 1
1093 #define ENQUEUE_HEAD 2 1093 #define ENQUEUE_HEAD 2
1094 #ifdef CONFIG_SMP 1094 #ifdef CONFIG_SMP
1095 #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ 1095 #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
1096 #else 1096 #else
1097 #define ENQUEUE_WAKING 0 1097 #define ENQUEUE_WAKING 0
1098 #endif 1098 #endif
1099 1099
1100 #define DEQUEUE_SLEEP 1 1100 #define DEQUEUE_SLEEP 1
1101 1101
1102 struct sched_class { 1102 struct sched_class {
1103 const struct sched_class *next; 1103 const struct sched_class *next;
1104 1104
1105 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 1105 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1106 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 1106 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1107 void (*yield_task) (struct rq *rq); 1107 void (*yield_task) (struct rq *rq);
1108 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); 1108 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
1109 1109
1110 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1110 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
1111 1111
1112 struct task_struct * (*pick_next_task) (struct rq *rq); 1112 struct task_struct * (*pick_next_task) (struct rq *rq);
1113 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1113 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1114 1114
1115 #ifdef CONFIG_SMP 1115 #ifdef CONFIG_SMP
1116 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 1116 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
1117 1117
1118 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1118 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1119 void (*post_schedule) (struct rq *this_rq); 1119 void (*post_schedule) (struct rq *this_rq);
1120 void (*task_waking) (struct task_struct *task); 1120 void (*task_waking) (struct task_struct *task);
1121 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1121 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
1122 1122
1123 void (*set_cpus_allowed)(struct task_struct *p, 1123 void (*set_cpus_allowed)(struct task_struct *p,
1124 const struct cpumask *newmask); 1124 const struct cpumask *newmask);
1125 1125
1126 void (*rq_online)(struct rq *rq); 1126 void (*rq_online)(struct rq *rq);
1127 void (*rq_offline)(struct rq *rq); 1127 void (*rq_offline)(struct rq *rq);
1128 #endif 1128 #endif
1129 1129
1130 void (*set_curr_task) (struct rq *rq); 1130 void (*set_curr_task) (struct rq *rq);
1131 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 1131 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
1132 void (*task_fork) (struct task_struct *p); 1132 void (*task_fork) (struct task_struct *p);
1133 1133
1134 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1134 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1135 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1135 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1136 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 1136 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1137 int oldprio); 1137 int oldprio);
1138 1138
1139 unsigned int (*get_rr_interval) (struct rq *rq, 1139 unsigned int (*get_rr_interval) (struct rq *rq,
1140 struct task_struct *task); 1140 struct task_struct *task);
1141 1141
1142 #ifdef CONFIG_FAIR_GROUP_SCHED 1142 #ifdef CONFIG_FAIR_GROUP_SCHED
1143 void (*task_move_group) (struct task_struct *p, int on_rq); 1143 void (*task_move_group) (struct task_struct *p, int on_rq);
1144 #endif 1144 #endif
1145 }; 1145 };
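/*
 * Skeleton of what an implementation of the struct above looks like (the
 * names are hypothetical; the real classes live in kernel/sched/ and
 * chain themselves via ->next from highest to lowest scheduling priority):
 */
#if 0	/* illustration only */
static void enqueue_task_demo(struct rq *rq, struct task_struct *p, int flags)
{
	/* add p to this class's runqueue structures; flags carry ENQUEUE_* */
}

static struct task_struct *pick_next_task_demo(struct rq *rq)
{
	return NULL;	/* nothing runnable in this class */
}

static const struct sched_class demo_sched_class = {
	.next		= NULL,	/* would point at the next lower class */
	.enqueue_task	= enqueue_task_demo,
	.pick_next_task	= pick_next_task_demo,
	/* the remaining hooks are omitted for brevity */
};
#endif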
1146 1146
1147 struct load_weight { 1147 struct load_weight {
1148 unsigned long weight, inv_weight; 1148 unsigned long weight, inv_weight;
1149 }; 1149 };
1150 1150
1151 #ifdef CONFIG_SCHEDSTATS 1151 #ifdef CONFIG_SCHEDSTATS
1152 struct sched_statistics { 1152 struct sched_statistics {
1153 u64 wait_start; 1153 u64 wait_start;
1154 u64 wait_max; 1154 u64 wait_max;
1155 u64 wait_count; 1155 u64 wait_count;
1156 u64 wait_sum; 1156 u64 wait_sum;
1157 u64 iowait_count; 1157 u64 iowait_count;
1158 u64 iowait_sum; 1158 u64 iowait_sum;
1159 1159
1160 u64 sleep_start; 1160 u64 sleep_start;
1161 u64 sleep_max; 1161 u64 sleep_max;
1162 s64 sum_sleep_runtime; 1162 s64 sum_sleep_runtime;
1163 1163
1164 u64 block_start; 1164 u64 block_start;
1165 u64 block_max; 1165 u64 block_max;
1166 u64 exec_max; 1166 u64 exec_max;
1167 u64 slice_max; 1167 u64 slice_max;
1168 1168
1169 u64 nr_migrations_cold; 1169 u64 nr_migrations_cold;
1170 u64 nr_failed_migrations_affine; 1170 u64 nr_failed_migrations_affine;
1171 u64 nr_failed_migrations_running; 1171 u64 nr_failed_migrations_running;
1172 u64 nr_failed_migrations_hot; 1172 u64 nr_failed_migrations_hot;
1173 u64 nr_forced_migrations; 1173 u64 nr_forced_migrations;
1174 1174
1175 u64 nr_wakeups; 1175 u64 nr_wakeups;
1176 u64 nr_wakeups_sync; 1176 u64 nr_wakeups_sync;
1177 u64 nr_wakeups_migrate; 1177 u64 nr_wakeups_migrate;
1178 u64 nr_wakeups_local; 1178 u64 nr_wakeups_local;
1179 u64 nr_wakeups_remote; 1179 u64 nr_wakeups_remote;
1180 u64 nr_wakeups_affine; 1180 u64 nr_wakeups_affine;
1181 u64 nr_wakeups_affine_attempts; 1181 u64 nr_wakeups_affine_attempts;
1182 u64 nr_wakeups_passive; 1182 u64 nr_wakeups_passive;
1183 u64 nr_wakeups_idle; 1183 u64 nr_wakeups_idle;
1184 }; 1184 };
1185 #endif 1185 #endif
1186 1186
1187 struct sched_entity { 1187 struct sched_entity {
1188 struct load_weight load; /* for load-balancing */ 1188 struct load_weight load; /* for load-balancing */
1189 struct rb_node run_node; 1189 struct rb_node run_node;
1190 struct list_head group_node; 1190 struct list_head group_node;
1191 unsigned int on_rq; 1191 unsigned int on_rq;
1192 1192
1193 u64 exec_start; 1193 u64 exec_start;
1194 u64 sum_exec_runtime; 1194 u64 sum_exec_runtime;
1195 u64 vruntime; 1195 u64 vruntime;
1196 u64 prev_sum_exec_runtime; 1196 u64 prev_sum_exec_runtime;
1197 1197
1198 u64 nr_migrations; 1198 u64 nr_migrations;
1199 1199
1200 #ifdef CONFIG_SCHEDSTATS 1200 #ifdef CONFIG_SCHEDSTATS
1201 struct sched_statistics statistics; 1201 struct sched_statistics statistics;
1202 #endif 1202 #endif
1203 1203
1204 #ifdef CONFIG_FAIR_GROUP_SCHED 1204 #ifdef CONFIG_FAIR_GROUP_SCHED
1205 struct sched_entity *parent; 1205 struct sched_entity *parent;
1206 /* rq on which this entity is (to be) queued: */ 1206 /* rq on which this entity is (to be) queued: */
1207 struct cfs_rq *cfs_rq; 1207 struct cfs_rq *cfs_rq;
1208 /* rq "owned" by this entity/group: */ 1208 /* rq "owned" by this entity/group: */
1209 struct cfs_rq *my_q; 1209 struct cfs_rq *my_q;
1210 #endif 1210 #endif
1211 }; 1211 };
1212 1212
1213 struct sched_rt_entity { 1213 struct sched_rt_entity {
1214 struct list_head run_list; 1214 struct list_head run_list;
1215 unsigned long timeout; 1215 unsigned long timeout;
1216 unsigned int time_slice; 1216 unsigned int time_slice;
1217 1217
1218 struct sched_rt_entity *back; 1218 struct sched_rt_entity *back;
1219 #ifdef CONFIG_RT_GROUP_SCHED 1219 #ifdef CONFIG_RT_GROUP_SCHED
1220 struct sched_rt_entity *parent; 1220 struct sched_rt_entity *parent;
1221 /* rq on which this entity is (to be) queued: */ 1221 /* rq on which this entity is (to be) queued: */
1222 struct rt_rq *rt_rq; 1222 struct rt_rq *rt_rq;
1223 /* rq "owned" by this entity/group: */ 1223 /* rq "owned" by this entity/group: */
1224 struct rt_rq *my_q; 1224 struct rt_rq *my_q;
1225 #endif 1225 #endif
1226 }; 1226 };
1227 1227
1228 /* 1228 /*
1229 * default timeslice is 100 msecs (used only for SCHED_RR tasks). 1229 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
1230 * Timeslices get refilled after they expire. 1230 * Timeslices get refilled after they expire.
1231 */ 1231 */
1232 #define RR_TIMESLICE (100 * HZ / 1000) 1232 #define RR_TIMESLICE (100 * HZ / 1000)
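/*
 * Worked example: RR_TIMESLICE is expressed in jiffies, so the 100 msec
 * default comes out as 100 jiffies with HZ=1000, 25 with HZ=250 and 10
 * with HZ=100. The expression uses integer division, so the result
 * truncates for HZ values that are not a multiple of 10.
 */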
1233 1233
1234 struct rcu_node; 1234 struct rcu_node;
1235 1235
1236 enum perf_event_task_context { 1236 enum perf_event_task_context {
1237 perf_invalid_context = -1, 1237 perf_invalid_context = -1,
1238 perf_hw_context = 0, 1238 perf_hw_context = 0,
1239 perf_sw_context, 1239 perf_sw_context,
1240 perf_nr_task_contexts, 1240 perf_nr_task_contexts,
1241 }; 1241 };
1242 1242
1243 struct task_struct { 1243 struct task_struct {
1244 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 1244 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
1245 void *stack; 1245 void *stack;
1246 atomic_t usage; 1246 atomic_t usage;
1247 unsigned int flags; /* per process flags, defined below */ 1247 unsigned int flags; /* per process flags, defined below */
1248 unsigned int ptrace; 1248 unsigned int ptrace;
1249 1249
1250 #ifdef CONFIG_SMP 1250 #ifdef CONFIG_SMP
1251 struct llist_node wake_entry; 1251 struct llist_node wake_entry;
1252 int on_cpu; 1252 int on_cpu;
1253 #endif 1253 #endif
1254 int on_rq; 1254 int on_rq;
1255 1255
1256 int prio, static_prio, normal_prio; 1256 int prio, static_prio, normal_prio;
1257 unsigned int rt_priority; 1257 unsigned int rt_priority;
1258 const struct sched_class *sched_class; 1258 const struct sched_class *sched_class;
1259 struct sched_entity se; 1259 struct sched_entity se;
1260 struct sched_rt_entity rt; 1260 struct sched_rt_entity rt;
1261 #ifdef CONFIG_CGROUP_SCHED 1261 #ifdef CONFIG_CGROUP_SCHED
1262 struct task_group *sched_task_group; 1262 struct task_group *sched_task_group;
1263 #endif 1263 #endif
1264 1264
1265 #ifdef CONFIG_PREEMPT_NOTIFIERS 1265 #ifdef CONFIG_PREEMPT_NOTIFIERS
1266 /* list of struct preempt_notifier: */ 1266 /* list of struct preempt_notifier: */
1267 struct hlist_head preempt_notifiers; 1267 struct hlist_head preempt_notifiers;
1268 #endif 1268 #endif
1269 1269
1270 /* 1270 /*
1271 * fpu_counter contains the number of consecutive context switches 1271 * fpu_counter contains the number of consecutive context switches
1272 * during which the FPU is used. If this is over a threshold, the lazy fpu 1272 * during which the FPU is used. If this is over a threshold, the lazy fpu
1273 * saving becomes unlazy to save the trap. This is an unsigned char 1273 * saving becomes unlazy to save the trap. This is an unsigned char
1274 * so that after 256 times the counter wraps and the behavior turns 1274 * so that after 256 times the counter wraps and the behavior turns
1275 * lazy again; this to deal with bursty apps that only use FPU for 1275 * lazy again; this to deal with bursty apps that only use FPU for
1276 * a short time 1276 * a short time
1277 */ 1277 */
1278 unsigned char fpu_counter; 1278 unsigned char fpu_counter;
1279 #ifdef CONFIG_BLK_DEV_IO_TRACE 1279 #ifdef CONFIG_BLK_DEV_IO_TRACE
1280 unsigned int btrace_seq; 1280 unsigned int btrace_seq;
1281 #endif 1281 #endif
1282 1282
1283 unsigned int policy; 1283 unsigned int policy;
1284 int nr_cpus_allowed; 1284 int nr_cpus_allowed;
1285 cpumask_t cpus_allowed; 1285 cpumask_t cpus_allowed;
1286 1286
1287 #ifdef CONFIG_PREEMPT_RCU 1287 #ifdef CONFIG_PREEMPT_RCU
1288 int rcu_read_lock_nesting; 1288 int rcu_read_lock_nesting;
1289 char rcu_read_unlock_special; 1289 char rcu_read_unlock_special;
1290 struct list_head rcu_node_entry; 1290 struct list_head rcu_node_entry;
1291 #endif /* #ifdef CONFIG_PREEMPT_RCU */ 1291 #endif /* #ifdef CONFIG_PREEMPT_RCU */
1292 #ifdef CONFIG_TREE_PREEMPT_RCU 1292 #ifdef CONFIG_TREE_PREEMPT_RCU
1293 struct rcu_node *rcu_blocked_node; 1293 struct rcu_node *rcu_blocked_node;
1294 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1294 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1295 #ifdef CONFIG_RCU_BOOST 1295 #ifdef CONFIG_RCU_BOOST
1296 struct rt_mutex *rcu_boost_mutex; 1296 struct rt_mutex *rcu_boost_mutex;
1297 #endif /* #ifdef CONFIG_RCU_BOOST */ 1297 #endif /* #ifdef CONFIG_RCU_BOOST */
1298 1298
1299 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1299 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1300 struct sched_info sched_info; 1300 struct sched_info sched_info;
1301 #endif 1301 #endif
1302 1302
1303 struct list_head tasks; 1303 struct list_head tasks;
1304 #ifdef CONFIG_SMP 1304 #ifdef CONFIG_SMP
1305 struct plist_node pushable_tasks; 1305 struct plist_node pushable_tasks;
1306 #endif 1306 #endif
1307 1307
1308 struct mm_struct *mm, *active_mm; 1308 struct mm_struct *mm, *active_mm;
1309 #ifdef CONFIG_COMPAT_BRK 1309 #ifdef CONFIG_COMPAT_BRK
1310 unsigned brk_randomized:1; 1310 unsigned brk_randomized:1;
1311 #endif 1311 #endif
1312 #if defined(SPLIT_RSS_COUNTING) 1312 #if defined(SPLIT_RSS_COUNTING)
1313 struct task_rss_stat rss_stat; 1313 struct task_rss_stat rss_stat;
1314 #endif 1314 #endif
1315 /* task state */ 1315 /* task state */
1316 int exit_state; 1316 int exit_state;
1317 int exit_code, exit_signal; 1317 int exit_code, exit_signal;
1318 int pdeath_signal; /* The signal sent when the parent dies */ 1318 int pdeath_signal; /* The signal sent when the parent dies */
1319 unsigned int jobctl; /* JOBCTL_*, siglock protected */ 1319 unsigned int jobctl; /* JOBCTL_*, siglock protected */
1320 /* ??? */ 1320 /* ??? */
1321 unsigned int personality; 1321 unsigned int personality;
1322 unsigned did_exec:1; 1322 unsigned did_exec:1;
1323 unsigned in_execve:1; /* Tell the LSMs that the process is doing an 1323 unsigned in_execve:1; /* Tell the LSMs that the process is doing an
1324 * execve */ 1324 * execve */
1325 unsigned in_iowait:1; 1325 unsigned in_iowait:1;
1326 1326
1327 /* task may not gain privileges */ 1327 /* task may not gain privileges */
1328 unsigned no_new_privs:1; 1328 unsigned no_new_privs:1;
1329 1329
1330 /* Revert to default priority/policy when forking */ 1330 /* Revert to default priority/policy when forking */
1331 unsigned sched_reset_on_fork:1; 1331 unsigned sched_reset_on_fork:1;
1332 unsigned sched_contributes_to_load:1; 1332 unsigned sched_contributes_to_load:1;
1333 1333
1334 pid_t pid; 1334 pid_t pid;
1335 pid_t tgid; 1335 pid_t tgid;
1336 1336
1337 #ifdef CONFIG_CC_STACKPROTECTOR 1337 #ifdef CONFIG_CC_STACKPROTECTOR
1338 /* Canary value for the -fstack-protector gcc feature */ 1338 /* Canary value for the -fstack-protector gcc feature */
1339 unsigned long stack_canary; 1339 unsigned long stack_canary;
1340 #endif 1340 #endif
1341 /* 1341 /*
1342 * pointers to (original) parent process, youngest child, younger sibling, 1342 * pointers to (original) parent process, youngest child, younger sibling,
1343 * older sibling, respectively. (p->father can be replaced with 1343 * older sibling, respectively. (p->father can be replaced with
1344 * p->real_parent->pid) 1344 * p->real_parent->pid)
1345 */ 1345 */
1346 struct task_struct __rcu *real_parent; /* real parent process */ 1346 struct task_struct __rcu *real_parent; /* real parent process */
1347 struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */ 1347 struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
1348 /* 1348 /*
1349 * children/sibling forms the list of my natural children 1349 * children/sibling forms the list of my natural children
1350 */ 1350 */
1351 struct list_head children; /* list of my children */ 1351 struct list_head children; /* list of my children */
1352 struct list_head sibling; /* linkage in my parent's children list */ 1352 struct list_head sibling; /* linkage in my parent's children list */
1353 struct task_struct *group_leader; /* threadgroup leader */ 1353 struct task_struct *group_leader; /* threadgroup leader */
1354 1354
1355 /* 1355 /*
1356 * ptraced is the list of tasks this task is using ptrace on. 1356 * ptraced is the list of tasks this task is using ptrace on.
1357 * This includes both natural children and PTRACE_ATTACH targets. 1357 * This includes both natural children and PTRACE_ATTACH targets.
1358 * p->ptrace_entry is p's link on the p->parent->ptraced list. 1358 * p->ptrace_entry is p's link on the p->parent->ptraced list.
1359 */ 1359 */
1360 struct list_head ptraced; 1360 struct list_head ptraced;
1361 struct list_head ptrace_entry; 1361 struct list_head ptrace_entry;
1362 1362
1363 /* PID/PID hash table linkage. */ 1363 /* PID/PID hash table linkage. */
1364 struct pid_link pids[PIDTYPE_MAX]; 1364 struct pid_link pids[PIDTYPE_MAX];
1365 struct list_head thread_group; 1365 struct list_head thread_group;
1366 1366
1367 struct completion *vfork_done; /* for vfork() */ 1367 struct completion *vfork_done; /* for vfork() */
1368 int __user *set_child_tid; /* CLONE_CHILD_SETTID */ 1368 int __user *set_child_tid; /* CLONE_CHILD_SETTID */
1369 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ 1369 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
1370 1370
1371 cputime_t utime, stime, utimescaled, stimescaled; 1371 cputime_t utime, stime, utimescaled, stimescaled;
1372 cputime_t gtime; 1372 cputime_t gtime;
1373 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 1373 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
1374 cputime_t prev_utime, prev_stime; 1374 cputime_t prev_utime, prev_stime;
1375 #endif 1375 #endif
1376 unsigned long nvcsw, nivcsw; /* context switch counts */ 1376 unsigned long nvcsw, nivcsw; /* context switch counts */
1377 struct timespec start_time; /* monotonic time */ 1377 struct timespec start_time; /* monotonic time */
1378 struct timespec real_start_time; /* boot based time */ 1378 struct timespec real_start_time; /* boot based time */
1379 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ 1379 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
1380 unsigned long min_flt, maj_flt; 1380 unsigned long min_flt, maj_flt;
1381 1381
1382 struct task_cputime cputime_expires; 1382 struct task_cputime cputime_expires;
1383 struct list_head cpu_timers[3]; 1383 struct list_head cpu_timers[3];
1384 1384
1385 /* process credentials */ 1385 /* process credentials */
1386 const struct cred __rcu *real_cred; /* objective and real subjective task 1386 const struct cred __rcu *real_cred; /* objective and real subjective task
1387 * credentials (COW) */ 1387 * credentials (COW) */
1388 const struct cred __rcu *cred; /* effective (overridable) subjective task 1388 const struct cred __rcu *cred; /* effective (overridable) subjective task
1389 * credentials (COW) */ 1389 * credentials (COW) */
1390 char comm[TASK_COMM_LEN]; /* executable name excluding path 1390 char comm[TASK_COMM_LEN]; /* executable name excluding path
1391 - access with [gs]et_task_comm (which lock 1391 - access with [gs]et_task_comm (which lock
1392 it with task_lock()) 1392 it with task_lock())
1393 - initialized normally by setup_new_exec */ 1393 - initialized normally by setup_new_exec */
1394 /* file system info */ 1394 /* file system info */
1395 int link_count, total_link_count; 1395 int link_count, total_link_count;
1396 #ifdef CONFIG_SYSVIPC 1396 #ifdef CONFIG_SYSVIPC
1397 /* ipc stuff */ 1397 /* ipc stuff */
1398 struct sysv_sem sysvsem; 1398 struct sysv_sem sysvsem;
1399 #endif 1399 #endif
1400 #ifdef CONFIG_DETECT_HUNG_TASK 1400 #ifdef CONFIG_DETECT_HUNG_TASK
1401 /* hung task detection */ 1401 /* hung task detection */
1402 unsigned long last_switch_count; 1402 unsigned long last_switch_count;
1403 #endif 1403 #endif
1404 /* CPU-specific state of this task */ 1404 /* CPU-specific state of this task */
1405 struct thread_struct thread; 1405 struct thread_struct thread;
1406 /* filesystem information */ 1406 /* filesystem information */
1407 struct fs_struct *fs; 1407 struct fs_struct *fs;
1408 /* open file information */ 1408 /* open file information */
1409 struct files_struct *files; 1409 struct files_struct *files;
1410 /* namespaces */ 1410 /* namespaces */
1411 struct nsproxy *nsproxy; 1411 struct nsproxy *nsproxy;
1412 /* signal handlers */ 1412 /* signal handlers */
1413 struct signal_struct *signal; 1413 struct signal_struct *signal;
1414 struct sighand_struct *sighand; 1414 struct sighand_struct *sighand;
1415 1415
1416 sigset_t blocked, real_blocked; 1416 sigset_t blocked, real_blocked;
1417 sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ 1417 sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
1418 struct sigpending pending; 1418 struct sigpending pending;
1419 1419
1420 unsigned long sas_ss_sp; 1420 unsigned long sas_ss_sp;
1421 size_t sas_ss_size; 1421 size_t sas_ss_size;
1422 int (*notifier)(void *priv); 1422 int (*notifier)(void *priv);
1423 void *notifier_data; 1423 void *notifier_data;
1424 sigset_t *notifier_mask; 1424 sigset_t *notifier_mask;
1425 struct callback_head *task_works; 1425 struct callback_head *task_works;
1426 1426
1427 struct audit_context *audit_context; 1427 struct audit_context *audit_context;
1428 #ifdef CONFIG_AUDITSYSCALL 1428 #ifdef CONFIG_AUDITSYSCALL
1429 uid_t loginuid; 1429 uid_t loginuid;
1430 unsigned int sessionid; 1430 unsigned int sessionid;
1431 #endif 1431 #endif
1432 struct seccomp seccomp; 1432 struct seccomp seccomp;
1433 1433
1434 /* Thread group tracking */ 1434 /* Thread group tracking */
1435 u32 parent_exec_id; 1435 u32 parent_exec_id;
1436 u32 self_exec_id; 1436 u32 self_exec_id;
1437 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, 1437 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
1438 * mempolicy */ 1438 * mempolicy */
1439 spinlock_t alloc_lock; 1439 spinlock_t alloc_lock;
1440 1440
1441 /* Protection of the PI data structures: */ 1441 /* Protection of the PI data structures: */
1442 raw_spinlock_t pi_lock; 1442 raw_spinlock_t pi_lock;
1443 1443
1444 #ifdef CONFIG_RT_MUTEXES 1444 #ifdef CONFIG_RT_MUTEXES
1445 /* PI waiters blocked on a rt_mutex held by this task */ 1445 /* PI waiters blocked on a rt_mutex held by this task */
1446 struct plist_head pi_waiters; 1446 struct plist_head pi_waiters;
1447 /* Deadlock detection and priority inheritance handling */ 1447 /* Deadlock detection and priority inheritance handling */
1448 struct rt_mutex_waiter *pi_blocked_on; 1448 struct rt_mutex_waiter *pi_blocked_on;
1449 #endif 1449 #endif
1450 1450
1451 #ifdef CONFIG_DEBUG_MUTEXES 1451 #ifdef CONFIG_DEBUG_MUTEXES
1452 /* mutex deadlock detection */ 1452 /* mutex deadlock detection */
1453 struct mutex_waiter *blocked_on; 1453 struct mutex_waiter *blocked_on;
1454 #endif 1454 #endif
1455 #ifdef CONFIG_TRACE_IRQFLAGS 1455 #ifdef CONFIG_TRACE_IRQFLAGS
1456 unsigned int irq_events; 1456 unsigned int irq_events;
1457 unsigned long hardirq_enable_ip; 1457 unsigned long hardirq_enable_ip;
1458 unsigned long hardirq_disable_ip; 1458 unsigned long hardirq_disable_ip;
1459 unsigned int hardirq_enable_event; 1459 unsigned int hardirq_enable_event;
1460 unsigned int hardirq_disable_event; 1460 unsigned int hardirq_disable_event;
1461 int hardirqs_enabled; 1461 int hardirqs_enabled;
1462 int hardirq_context; 1462 int hardirq_context;
1463 unsigned long softirq_disable_ip; 1463 unsigned long softirq_disable_ip;
1464 unsigned long softirq_enable_ip; 1464 unsigned long softirq_enable_ip;
1465 unsigned int softirq_disable_event; 1465 unsigned int softirq_disable_event;
1466 unsigned int softirq_enable_event; 1466 unsigned int softirq_enable_event;
1467 int softirqs_enabled; 1467 int softirqs_enabled;
1468 int softirq_context; 1468 int softirq_context;
1469 #endif 1469 #endif
1470 #ifdef CONFIG_LOCKDEP 1470 #ifdef CONFIG_LOCKDEP
1471 # define MAX_LOCK_DEPTH 48UL 1471 # define MAX_LOCK_DEPTH 48UL
1472 u64 curr_chain_key; 1472 u64 curr_chain_key;
1473 int lockdep_depth; 1473 int lockdep_depth;
1474 unsigned int lockdep_recursion; 1474 unsigned int lockdep_recursion;
1475 struct held_lock held_locks[MAX_LOCK_DEPTH]; 1475 struct held_lock held_locks[MAX_LOCK_DEPTH];
1476 gfp_t lockdep_reclaim_gfp; 1476 gfp_t lockdep_reclaim_gfp;
1477 #endif 1477 #endif
1478 1478
1479 /* journalling filesystem info */ 1479 /* journalling filesystem info */
1480 void *journal_info; 1480 void *journal_info;
1481 1481
1482 /* stacked block device info */ 1482 /* stacked block device info */
1483 struct bio_list *bio_list; 1483 struct bio_list *bio_list;
1484 1484
1485 #ifdef CONFIG_BLOCK 1485 #ifdef CONFIG_BLOCK
1486 /* stack plugging */ 1486 /* stack plugging */
1487 struct blk_plug *plug; 1487 struct blk_plug *plug;
1488 #endif 1488 #endif
1489 1489
1490 /* VM state */ 1490 /* VM state */
1491 struct reclaim_state *reclaim_state; 1491 struct reclaim_state *reclaim_state;
1492 1492
1493 struct backing_dev_info *backing_dev_info; 1493 struct backing_dev_info *backing_dev_info;
1494 1494
1495 struct io_context *io_context; 1495 struct io_context *io_context;
1496 1496
1497 unsigned long ptrace_message; 1497 unsigned long ptrace_message;
1498 siginfo_t *last_siginfo; /* For ptrace use. */ 1498 siginfo_t *last_siginfo; /* For ptrace use. */
1499 struct task_io_accounting ioac; 1499 struct task_io_accounting ioac;
1500 #if defined(CONFIG_TASK_XACCT) 1500 #if defined(CONFIG_TASK_XACCT)
1501 u64 acct_rss_mem1; /* accumulated rss usage */ 1501 u64 acct_rss_mem1; /* accumulated rss usage */
1502 u64 acct_vm_mem1; /* accumulated virtual memory usage */ 1502 u64 acct_vm_mem1; /* accumulated virtual memory usage */
1503 cputime_t acct_timexpd; /* stime + utime since last update */ 1503 cputime_t acct_timexpd; /* stime + utime since last update */
1504 #endif 1504 #endif
1505 #ifdef CONFIG_CPUSETS 1505 #ifdef CONFIG_CPUSETS
1506 nodemask_t mems_allowed; /* Protected by alloc_lock */ 1506 nodemask_t mems_allowed; /* Protected by alloc_lock */
1507 seqcount_t mems_allowed_seq; /* Sequence no to catch updates */ 1507 seqcount_t mems_allowed_seq; /* Sequence no to catch updates */
1508 int cpuset_mem_spread_rotor; 1508 int cpuset_mem_spread_rotor;
1509 int cpuset_slab_spread_rotor; 1509 int cpuset_slab_spread_rotor;
1510 #endif 1510 #endif
1511 #ifdef CONFIG_CGROUPS 1511 #ifdef CONFIG_CGROUPS
1512 /* Control Group info protected by css_set_lock */ 1512 /* Control Group info protected by css_set_lock */
1513 struct css_set __rcu *cgroups; 1513 struct css_set __rcu *cgroups;
1514 /* cg_list protected by css_set_lock and tsk->alloc_lock */ 1514 /* cg_list protected by css_set_lock and tsk->alloc_lock */
1515 struct list_head cg_list; 1515 struct list_head cg_list;
1516 #endif 1516 #endif
1517 #ifdef CONFIG_FUTEX 1517 #ifdef CONFIG_FUTEX
1518 struct robust_list_head __user *robust_list; 1518 struct robust_list_head __user *robust_list;
1519 #ifdef CONFIG_COMPAT 1519 #ifdef CONFIG_COMPAT
1520 struct compat_robust_list_head __user *compat_robust_list; 1520 struct compat_robust_list_head __user *compat_robust_list;
1521 #endif 1521 #endif
1522 struct list_head pi_state_list; 1522 struct list_head pi_state_list;
1523 struct futex_pi_state *pi_state_cache; 1523 struct futex_pi_state *pi_state_cache;
1524 #endif 1524 #endif
1525 #ifdef CONFIG_PERF_EVENTS 1525 #ifdef CONFIG_PERF_EVENTS
1526 struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; 1526 struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
1527 struct mutex perf_event_mutex; 1527 struct mutex perf_event_mutex;
1528 struct list_head perf_event_list; 1528 struct list_head perf_event_list;
1529 #endif 1529 #endif
1530 #ifdef CONFIG_NUMA 1530 #ifdef CONFIG_NUMA
1531 struct mempolicy *mempolicy; /* Protected by alloc_lock */ 1531 struct mempolicy *mempolicy; /* Protected by alloc_lock */
1532 short il_next; 1532 short il_next;
1533 short pref_node_fork; 1533 short pref_node_fork;
1534 #endif 1534 #endif
1535 struct rcu_head rcu; 1535 struct rcu_head rcu;
1536 1536
1537 /* 1537 /*
1538 * cache last used pipe for splice 1538 * cache last used pipe for splice
1539 */ 1539 */
1540 struct pipe_inode_info *splice_pipe; 1540 struct pipe_inode_info *splice_pipe;
1541 #ifdef CONFIG_TASK_DELAY_ACCT 1541 #ifdef CONFIG_TASK_DELAY_ACCT
1542 struct task_delay_info *delays; 1542 struct task_delay_info *delays;
1543 #endif 1543 #endif
1544 #ifdef CONFIG_FAULT_INJECTION 1544 #ifdef CONFIG_FAULT_INJECTION
1545 int make_it_fail; 1545 int make_it_fail;
1546 #endif 1546 #endif
1547 /* 1547 /*
1548 * when (nr_dirtied >= nr_dirtied_pause), it's time to call 1548 * when (nr_dirtied >= nr_dirtied_pause), it's time to call
1549 * balance_dirty_pages() for some dirty throttling pause 1549 * balance_dirty_pages() for some dirty throttling pause
1550 */ 1550 */
1551 int nr_dirtied; 1551 int nr_dirtied;
1552 int nr_dirtied_pause; 1552 int nr_dirtied_pause;
1553 unsigned long dirty_paused_when; /* start of a write-and-pause period */ 1553 unsigned long dirty_paused_when; /* start of a write-and-pause period */
1554 1554
1555 #ifdef CONFIG_LATENCYTOP 1555 #ifdef CONFIG_LATENCYTOP
1556 int latency_record_count; 1556 int latency_record_count;
1557 struct latency_record latency_record[LT_SAVECOUNT]; 1557 struct latency_record latency_record[LT_SAVECOUNT];
1558 #endif 1558 #endif
1559 /* 1559 /*
1560 * time slack values; these are used to round up poll() and 1560 * time slack values; these are used to round up poll() and
1561 * select() etc timeout values. These are in nanoseconds. 1561 * select() etc timeout values. These are in nanoseconds.
1562 */ 1562 */
1563 unsigned long timer_slack_ns; 1563 unsigned long timer_slack_ns;
1564 unsigned long default_timer_slack_ns; 1564 unsigned long default_timer_slack_ns;
1565 1565
1566 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 1566 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
1567 /* Index of current stored address in ret_stack */ 1567 /* Index of current stored address in ret_stack */
1568 int curr_ret_stack; 1568 int curr_ret_stack;
1569 /* Stack of return addresses for return function tracing */ 1569 /* Stack of return addresses for return function tracing */
1570 struct ftrace_ret_stack *ret_stack; 1570 struct ftrace_ret_stack *ret_stack;
1571 /* time stamp for last schedule */ 1571 /* time stamp for last schedule */
1572 unsigned long long ftrace_timestamp; 1572 unsigned long long ftrace_timestamp;
1573 /* 1573 /*
1574 * Number of functions that haven't been traced 1574 * Number of functions that haven't been traced
1575 * because of depth overrun. 1575 * because of depth overrun.
1576 */ 1576 */
1577 atomic_t trace_overrun; 1577 atomic_t trace_overrun;
1578 /* Pause for the tracing */ 1578 /* Pause for the tracing */
1579 atomic_t tracing_graph_pause; 1579 atomic_t tracing_graph_pause;
1580 #endif 1580 #endif
1581 #ifdef CONFIG_TRACING 1581 #ifdef CONFIG_TRACING
1582 /* state flags for use by tracers */ 1582 /* state flags for use by tracers */
1583 unsigned long trace; 1583 unsigned long trace;
1584 /* bitmask and counter of trace recursion */ 1584 /* bitmask and counter of trace recursion */
1585 unsigned long trace_recursion; 1585 unsigned long trace_recursion;
1586 #endif /* CONFIG_TRACING */ 1586 #endif /* CONFIG_TRACING */
1587 #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ 1587 #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
1588 struct memcg_batch_info { 1588 struct memcg_batch_info {
1589 int do_batch; /* incremented when batch uncharge started */ 1589 int do_batch; /* incremented when batch uncharge started */
1590 struct mem_cgroup *memcg; /* target memcg of uncharge */ 1590 struct mem_cgroup *memcg; /* target memcg of uncharge */
1591 unsigned long nr_pages; /* uncharged usage */ 1591 unsigned long nr_pages; /* uncharged usage */
1592 unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ 1592 unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
1593 } memcg_batch; 1593 } memcg_batch;
1594 #endif 1594 #endif
1595 #ifdef CONFIG_HAVE_HW_BREAKPOINT 1595 #ifdef CONFIG_HAVE_HW_BREAKPOINT
1596 atomic_t ptrace_bp_refcnt; 1596 atomic_t ptrace_bp_refcnt;
1597 #endif 1597 #endif
1598 #ifdef CONFIG_UPROBES 1598 #ifdef CONFIG_UPROBES
1599 struct uprobe_task *utask; 1599 struct uprobe_task *utask;
1600 #endif 1600 #endif
1601 }; 1601 };
1602 1602
1603 /* Future-safe accessor for struct task_struct's cpus_allowed. */ 1603 /* Future-safe accessor for struct task_struct's cpus_allowed. */
1604 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) 1604 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
1605 1605
1606 /* 1606 /*
1607 * Priority of a process goes from 0..MAX_PRIO-1, valid RT 1607 * Priority of a process goes from 0..MAX_PRIO-1, valid RT
1608 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH 1608 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
1609 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority 1609 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
1610 * values are inverted: lower p->prio value means higher priority. 1610 * values are inverted: lower p->prio value means higher priority.
1611 * 1611 *
1612 * The MAX_USER_RT_PRIO value allows the actual maximum 1612 * The MAX_USER_RT_PRIO value allows the actual maximum
1613 * RT priority to be separate from the value exported to 1613 * RT priority to be separate from the value exported to
1614 * user-space. This allows kernel threads to set their 1614 * user-space. This allows kernel threads to set their
1615 * priority to a value higher than any user task. Note: 1615 * priority to a value higher than any user task. Note:
1616 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. 1616 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
1617 */ 1617 */
1618 1618
1619 #define MAX_USER_RT_PRIO 100 1619 #define MAX_USER_RT_PRIO 100
1620 #define MAX_RT_PRIO MAX_USER_RT_PRIO 1620 #define MAX_RT_PRIO MAX_USER_RT_PRIO
1621 1621
1622 #define MAX_PRIO (MAX_RT_PRIO + 40) 1622 #define MAX_PRIO (MAX_RT_PRIO + 40)
1623 #define DEFAULT_PRIO (MAX_RT_PRIO + 20) 1623 #define DEFAULT_PRIO (MAX_RT_PRIO + 20)
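/*
 * Worked out, the comment above fixes the following layout: prio 0..99
 * (MAX_RT_PRIO-1) is the realtime range and prio 100..139 the
 * SCHED_NORMAL/SCHED_BATCH range; within the latter, nice -20 maps to
 * 100, nice 0 to DEFAULT_PRIO (120) and nice +19 to 139 (MAX_PRIO-1).
 * Lower numbers always mean higher priority.
 */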
1624 1624
1625 static inline int rt_prio(int prio) 1625 static inline int rt_prio(int prio)
1626 { 1626 {
1627 if (unlikely(prio < MAX_RT_PRIO)) 1627 if (unlikely(prio < MAX_RT_PRIO))
1628 return 1; 1628 return 1;
1629 return 0; 1629 return 0;
1630 } 1630 }
1631 1631
1632 static inline int rt_task(struct task_struct *p) 1632 static inline int rt_task(struct task_struct *p)
1633 { 1633 {
1634 return rt_prio(p->prio); 1634 return rt_prio(p->prio);
1635 } 1635 }
1636 1636
1637 static inline struct pid *task_pid(struct task_struct *task) 1637 static inline struct pid *task_pid(struct task_struct *task)
1638 { 1638 {
1639 return task->pids[PIDTYPE_PID].pid; 1639 return task->pids[PIDTYPE_PID].pid;
1640 } 1640 }
1641 1641
1642 static inline struct pid *task_tgid(struct task_struct *task) 1642 static inline struct pid *task_tgid(struct task_struct *task)
1643 { 1643 {
1644 return task->group_leader->pids[PIDTYPE_PID].pid; 1644 return task->group_leader->pids[PIDTYPE_PID].pid;
1645 } 1645 }
1646 1646
1647 /* 1647 /*
1648 * Without tasklist or rcu lock it is not safe to dereference 1648 * Without tasklist or rcu lock it is not safe to dereference
1649 * the result of task_pgrp/task_session even if task == current, 1649 * the result of task_pgrp/task_session even if task == current,
1650 * we can race with another thread doing sys_setsid/sys_setpgid. 1650 * we can race with another thread doing sys_setsid/sys_setpgid.
1651 */ 1651 */
1652 static inline struct pid *task_pgrp(struct task_struct *task) 1652 static inline struct pid *task_pgrp(struct task_struct *task)
1653 { 1653 {
1654 return task->group_leader->pids[PIDTYPE_PGID].pid; 1654 return task->group_leader->pids[PIDTYPE_PGID].pid;
1655 } 1655 }
1656 1656
1657 static inline struct pid *task_session(struct task_struct *task) 1657 static inline struct pid *task_session(struct task_struct *task)
1658 { 1658 {
1659 return task->group_leader->pids[PIDTYPE_SID].pid; 1659 return task->group_leader->pids[PIDTYPE_SID].pid;
1660 } 1660 }
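/*
 * Minimal sketch of the locking rule spelled out above (hypothetical
 * helper; pid_vnr() is declared in <linux/pid.h>): hold rcu_read_lock()
 * across both the lookup and the use of the returned struct pid.
 */
static inline pid_t example_read_pgrp(struct task_struct *task)
{
	pid_t nr;

	rcu_read_lock();
	nr = pid_vnr(task_pgrp(task));
	rcu_read_unlock();
	return nr;
}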
1661 1661
1662 struct pid_namespace; 1662 struct pid_namespace;
1663 1663
1664 /* 1664 /*
1665 * the helpers to get the task's different pids as they are seen 1665 * the helpers to get the task's different pids as they are seen
1666 * from various namespaces 1666 * from various namespaces
1667 * 1667 *
1668 * task_xid_nr() : global id, i.e. the id seen from the init namespace; 1668 * task_xid_nr() : global id, i.e. the id seen from the init namespace;
1669 * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of 1669 * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of
1670 * current. 1670 * current.
1671 * task_xid_nr_ns() : id seen from the ns specified; 1671 * task_xid_nr_ns() : id seen from the ns specified;
1672 * 1672 *
1673 * set_task_vxid() : assigns a virtual id to a task; 1673 * set_task_vxid() : assigns a virtual id to a task;
1674 * 1674 *
1675 * see also pid_nr() etc in include/linux/pid.h 1675 * see also pid_nr() etc in include/linux/pid.h
1676 */ 1676 */
1677 pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, 1677 pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
1678 struct pid_namespace *ns); 1678 struct pid_namespace *ns);
1679 1679
1680 static inline pid_t task_pid_nr(struct task_struct *tsk) 1680 static inline pid_t task_pid_nr(struct task_struct *tsk)
1681 { 1681 {
1682 return tsk->pid; 1682 return tsk->pid;
1683 } 1683 }
1684 1684
1685 static inline pid_t task_pid_nr_ns(struct task_struct *tsk, 1685 static inline pid_t task_pid_nr_ns(struct task_struct *tsk,
1686 struct pid_namespace *ns) 1686 struct pid_namespace *ns)
1687 { 1687 {
1688 return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); 1688 return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
1689 } 1689 }
1690 1690
1691 static inline pid_t task_pid_vnr(struct task_struct *tsk) 1691 static inline pid_t task_pid_vnr(struct task_struct *tsk)
1692 { 1692 {
1693 return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); 1693 return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
1694 } 1694 }
1695 1695
1696 1696
1697 static inline pid_t task_tgid_nr(struct task_struct *tsk) 1697 static inline pid_t task_tgid_nr(struct task_struct *tsk)
1698 { 1698 {
1699 return tsk->tgid; 1699 return tsk->tgid;
1700 } 1700 }
1701 1701
1702 pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); 1702 pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
1703 1703
1704 static inline pid_t task_tgid_vnr(struct task_struct *tsk) 1704 static inline pid_t task_tgid_vnr(struct task_struct *tsk)
1705 { 1705 {
1706 return pid_vnr(task_tgid(tsk)); 1706 return pid_vnr(task_tgid(tsk));
1707 } 1707 }
1708 1708
1709 1709
1710 static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, 1710 static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
1711 struct pid_namespace *ns) 1711 struct pid_namespace *ns)
1712 { 1712 {
1713 return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); 1713 return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
1714 } 1714 }
1715 1715
1716 static inline pid_t task_pgrp_vnr(struct task_struct *tsk) 1716 static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
1717 { 1717 {
1718 return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); 1718 return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
1719 } 1719 }
1720 1720
1721 1721
1722 static inline pid_t task_session_nr_ns(struct task_struct *tsk, 1722 static inline pid_t task_session_nr_ns(struct task_struct *tsk,
1723 struct pid_namespace *ns) 1723 struct pid_namespace *ns)
1724 { 1724 {
1725 return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); 1725 return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
1726 } 1726 }
1727 1727
1728 static inline pid_t task_session_vnr(struct task_struct *tsk) 1728 static inline pid_t task_session_vnr(struct task_struct *tsk)
1729 { 1729 {
1730 return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); 1730 return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
1731 } 1731 }
1732 1732
1733 /* obsolete, do not use */ 1733 /* obsolete, do not use */
1734 static inline pid_t task_pgrp_nr(struct task_struct *tsk) 1734 static inline pid_t task_pgrp_nr(struct task_struct *tsk)
1735 { 1735 {
1736 return task_pgrp_nr_ns(tsk, &init_pid_ns); 1736 return task_pgrp_nr_ns(tsk, &init_pid_ns);
1737 } 1737 }
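
To make the *_nr/*_vnr split documented above concrete, a task can report both views of its own id; a hedged sketch (function name and message are illustrative only), where the two values differ whenever the task runs inside a non-init pid namespace:

#include <linux/sched.h>
#include <linux/printk.h>

static void report_own_pids(void)
{
        pid_t global_id   = task_pid_nr(current);       /* id as seen from the init namespace */
        pid_t ns_local_id = task_pid_vnr(current);      /* id as seen from current's own namespace */

        pr_info("pid %d globally, %d in my pid namespace\n", global_id, ns_local_id);
}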
1738 1738
1739 /** 1739 /**
1740 * pid_alive - check that a task structure is not stale 1740 * pid_alive - check that a task structure is not stale
1741 * @p: Task structure to be checked. 1741 * @p: Task structure to be checked.
1742 * 1742 *
1743 * Test if a process is not yet dead (at most zombie state) 1743 * Test if a process is not yet dead (at most zombie state)
1744 * If pid_alive fails, then pointers within the task structure 1744 * If pid_alive fails, then pointers within the task structure
1745 * can be stale and must not be dereferenced. 1745 * can be stale and must not be dereferenced.
1746 */ 1746 */
1747 static inline int pid_alive(struct task_struct *p) 1747 static inline int pid_alive(struct task_struct *p)
1748 { 1748 {
1749 return p->pids[PIDTYPE_PID].pid != NULL; 1749 return p->pids[PIDTYPE_PID].pid != NULL;
1750 } 1750 }
1751 1751
1752 /** 1752 /**
1753 * is_global_init - check if a task structure is init 1753 * is_global_init - check if a task structure is init
1754 * @tsk: Task structure to be checked. 1754 * @tsk: Task structure to be checked.
1755 * 1755 *
1756 * Check if a task structure is the first user space task the kernel created. 1756 * Check if a task structure is the first user space task the kernel created.
1757 */ 1757 */
1758 static inline int is_global_init(struct task_struct *tsk) 1758 static inline int is_global_init(struct task_struct *tsk)
1759 { 1759 {
1760 return tsk->pid == 1; 1760 return tsk->pid == 1;
1761 } 1761 }
1762 1762
1763 /* 1763 /*
1764 * is_container_init: 1764 * is_container_init:
1765 * check whether the task is init in its own pid namespace. 1765 * check whether the task is init in its own pid namespace.
1766 */ 1766 */
1767 extern int is_container_init(struct task_struct *tsk); 1767 extern int is_container_init(struct task_struct *tsk);
1768 1768
1769 extern struct pid *cad_pid; 1769 extern struct pid *cad_pid;
1770 1770
1771 extern void free_task(struct task_struct *tsk); 1771 extern void free_task(struct task_struct *tsk);
1772 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) 1772 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
1773 1773
1774 extern void __put_task_struct(struct task_struct *t); 1774 extern void __put_task_struct(struct task_struct *t);
1775 1775
1776 static inline void put_task_struct(struct task_struct *t) 1776 static inline void put_task_struct(struct task_struct *t)
1777 { 1777 {
1778 if (atomic_dec_and_test(&t->usage)) 1778 if (atomic_dec_and_test(&t->usage))
1779 __put_task_struct(t); 1779 __put_task_struct(t);
1780 } 1780 }
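
get_task_struct() and put_task_struct() above form the task structure's reference count; a minimal sketch of the usual pairing (function name illustrative), assuming the caller found tsk under a lock or RCU before pinning it:

static void use_task_later(struct task_struct *tsk)
{
        get_task_struct(tsk);   /* pin the task so it cannot be freed underneath us */

        /* ... tsk may be dereferenced here, even after the original lock is dropped ... */

        put_task_struct(tsk);   /* drop the pin; the final put calls __put_task_struct() */
}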
1781 1781
1782 extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); 1782 extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
1783 extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st); 1783 extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
1784 1784
1785 /* 1785 /*
1786 * Per process flags 1786 * Per process flags
1787 */ 1787 */
1788 #define PF_EXITING 0x00000004 /* getting shut down */ 1788 #define PF_EXITING 0x00000004 /* getting shut down */
1789 #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ 1789 #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
1790 #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ 1790 #define PF_VCPU 0x00000010 /* I'm a virtual CPU */
1791 #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ 1791 #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
1792 #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ 1792 #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
1793 #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ 1793 #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */
1794 #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ 1794 #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
1795 #define PF_DUMPCORE 0x00000200 /* dumped core */ 1795 #define PF_DUMPCORE 0x00000200 /* dumped core */
1796 #define PF_SIGNALED 0x00000400 /* killed by a signal */ 1796 #define PF_SIGNALED 0x00000400 /* killed by a signal */
1797 #define PF_MEMALLOC 0x00000800 /* Allocating memory */ 1797 #define PF_MEMALLOC 0x00000800 /* Allocating memory */
1798 #define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */ 1798 #define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */
1799 #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ 1799 #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */
1800 #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ 1800 #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */
1801 #define PF_FROZEN 0x00010000 /* frozen for system suspend */ 1801 #define PF_FROZEN 0x00010000 /* frozen for system suspend */
1802 #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ 1802 #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
1803 #define PF_KSWAPD 0x00040000 /* I am kswapd */ 1803 #define PF_KSWAPD 0x00040000 /* I am kswapd */
1804 #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ 1804 #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
1805 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ 1805 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */
1806 #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ 1806 #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
1807 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ 1807 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
1808 #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ 1808 #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
1809 #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ 1809 #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
1810 #define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ 1810 #define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */
1811 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ 1811 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
1812 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ 1812 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
1813 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ 1813 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
1814 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ 1814 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
1815 1815
1816 /* 1816 /*
1817 * Only the _current_ task can read/write to tsk->flags, but other 1817 * Only the _current_ task can read/write to tsk->flags, but other
1818 * tasks can access tsk->flags in readonly mode for example 1818 * tasks can access tsk->flags in readonly mode for example
1819 * with tsk_used_math (like during threaded core dumping). 1819 * with tsk_used_math (like during threaded core dumping).
1820 * There is however an exception to this rule during ptrace 1820 * There is however an exception to this rule during ptrace
1821 * or during fork: the ptracer task is allowed to write to the 1821 * or during fork: the ptracer task is allowed to write to the
1822 * child->flags of its traced child (same goes for fork, the parent 1822 * child->flags of its traced child (same goes for fork, the parent
1823 * can write to the child->flags), because we're guaranteed the 1823 * can write to the child->flags), because we're guaranteed the
1824 * child is not running and in turn not changing child->flags 1824 * child is not running and in turn not changing child->flags
1825 * at the same time the parent does it. 1825 * at the same time the parent does it.
1826 */ 1826 */
1827 #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) 1827 #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
1828 #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) 1828 #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
1829 #define clear_used_math() clear_stopped_child_used_math(current) 1829 #define clear_used_math() clear_stopped_child_used_math(current)
1830 #define set_used_math() set_stopped_child_used_math(current) 1830 #define set_used_math() set_stopped_child_used_math(current)
1831 #define conditional_stopped_child_used_math(condition, child) \ 1831 #define conditional_stopped_child_used_math(condition, child) \
1832 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) 1832 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
1833 #define conditional_used_math(condition) \ 1833 #define conditional_used_math(condition) \
1834 conditional_stopped_child_used_math(condition, current) 1834 conditional_stopped_child_used_math(condition, current)
1835 #define copy_to_stopped_child_used_math(child) \ 1835 #define copy_to_stopped_child_used_math(child) \
1836 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) 1836 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
1837 /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ 1837 /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
1838 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) 1838 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
1839 #define used_math() tsk_used_math(current) 1839 #define used_math() tsk_used_math(current)
1840 1840
1841 /* 1841 /*
1842 * task->jobctl flags 1842 * task->jobctl flags
1843 */ 1843 */
1844 #define JOBCTL_STOP_SIGMASK 0xffff /* signr of the last group stop */ 1844 #define JOBCTL_STOP_SIGMASK 0xffff /* signr of the last group stop */
1845 1845
1846 #define JOBCTL_STOP_DEQUEUED_BIT 16 /* stop signal dequeued */ 1846 #define JOBCTL_STOP_DEQUEUED_BIT 16 /* stop signal dequeued */
1847 #define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */ 1847 #define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */
1848 #define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */ 1848 #define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */
1849 #define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */ 1849 #define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */
1850 #define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */ 1850 #define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */
1851 #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ 1851 #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */
1852 #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ 1852 #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */
1853 1853
1854 #define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT) 1854 #define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT)
1855 #define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT) 1855 #define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT)
1856 #define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT) 1856 #define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT)
1857 #define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT) 1857 #define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT)
1858 #define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT) 1858 #define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT)
1859 #define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) 1859 #define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT)
1860 #define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT) 1860 #define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT)
1861 1861
1862 #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) 1862 #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
1863 #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) 1863 #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
1864 1864
1865 extern bool task_set_jobctl_pending(struct task_struct *task, 1865 extern bool task_set_jobctl_pending(struct task_struct *task,
1866 unsigned int mask); 1866 unsigned int mask);
1867 extern void task_clear_jobctl_trapping(struct task_struct *task); 1867 extern void task_clear_jobctl_trapping(struct task_struct *task);
1868 extern void task_clear_jobctl_pending(struct task_struct *task, 1868 extern void task_clear_jobctl_pending(struct task_struct *task,
1869 unsigned int mask); 1869 unsigned int mask);
1870 1870
1871 #ifdef CONFIG_PREEMPT_RCU 1871 #ifdef CONFIG_PREEMPT_RCU
1872 1872
1873 #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ 1873 #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
1874 #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ 1874 #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
1875 1875
1876 static inline void rcu_copy_process(struct task_struct *p) 1876 static inline void rcu_copy_process(struct task_struct *p)
1877 { 1877 {
1878 p->rcu_read_lock_nesting = 0; 1878 p->rcu_read_lock_nesting = 0;
1879 p->rcu_read_unlock_special = 0; 1879 p->rcu_read_unlock_special = 0;
1880 #ifdef CONFIG_TREE_PREEMPT_RCU 1880 #ifdef CONFIG_TREE_PREEMPT_RCU
1881 p->rcu_blocked_node = NULL; 1881 p->rcu_blocked_node = NULL;
1882 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1882 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1883 #ifdef CONFIG_RCU_BOOST 1883 #ifdef CONFIG_RCU_BOOST
1884 p->rcu_boost_mutex = NULL; 1884 p->rcu_boost_mutex = NULL;
1885 #endif /* #ifdef CONFIG_RCU_BOOST */ 1885 #endif /* #ifdef CONFIG_RCU_BOOST */
1886 INIT_LIST_HEAD(&p->rcu_node_entry); 1886 INIT_LIST_HEAD(&p->rcu_node_entry);
1887 } 1887 }
1888 1888
1889 #else 1889 #else
1890 1890
1891 static inline void rcu_copy_process(struct task_struct *p) 1891 static inline void rcu_copy_process(struct task_struct *p)
1892 { 1892 {
1893 } 1893 }
1894 1894
1895 #endif 1895 #endif
1896 1896
1897 static inline void tsk_restore_flags(struct task_struct *task,
1898 unsigned long orig_flags, unsigned long flags)
1899 {
1900 task->flags &= ~flags;
1901 task->flags |= orig_flags & flags;
1902 }
1903
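The helper added above clears the bits named in flags and then copies back only those bits from orig_flags, so every other task flag is left exactly as it was. As a minimal sketch (the function name is illustrative, not part of this patch), a caller that must temporarily run with PF_MEMALLOC cleared could use it like this:

static void run_without_memalloc(void)
{
        unsigned long old_flags = current->flags;       /* snapshot the caller's flags */

        current->flags &= ~PF_MEMALLOC;                 /* drop only the bit of interest */

        /* ... code that must not see PF_MEMALLOC runs here ... */

        /* put PF_MEMALLOC back to its saved state; all other bits keep their current values */
        tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}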
1897 #ifdef CONFIG_SMP 1904 #ifdef CONFIG_SMP
1898 extern void do_set_cpus_allowed(struct task_struct *p, 1905 extern void do_set_cpus_allowed(struct task_struct *p,
1899 const struct cpumask *new_mask); 1906 const struct cpumask *new_mask);
1900 1907
1901 extern int set_cpus_allowed_ptr(struct task_struct *p, 1908 extern int set_cpus_allowed_ptr(struct task_struct *p,
1902 const struct cpumask *new_mask); 1909 const struct cpumask *new_mask);
1903 #else 1910 #else
1904 static inline void do_set_cpus_allowed(struct task_struct *p, 1911 static inline void do_set_cpus_allowed(struct task_struct *p,
1905 const struct cpumask *new_mask) 1912 const struct cpumask *new_mask)
1906 { 1913 {
1907 } 1914 }
1908 static inline int set_cpus_allowed_ptr(struct task_struct *p, 1915 static inline int set_cpus_allowed_ptr(struct task_struct *p,
1909 const struct cpumask *new_mask) 1916 const struct cpumask *new_mask)
1910 { 1917 {
1911 if (!cpumask_test_cpu(0, new_mask)) 1918 if (!cpumask_test_cpu(0, new_mask))
1912 return -EINVAL; 1919 return -EINVAL;
1913 return 0; 1920 return 0;
1914 } 1921 }
1915 #endif 1922 #endif
1916 1923
1917 #ifdef CONFIG_NO_HZ 1924 #ifdef CONFIG_NO_HZ
1918 void calc_load_enter_idle(void); 1925 void calc_load_enter_idle(void);
1919 void calc_load_exit_idle(void); 1926 void calc_load_exit_idle(void);
1920 #else 1927 #else
1921 static inline void calc_load_enter_idle(void) { } 1928 static inline void calc_load_enter_idle(void) { }
1922 static inline void calc_load_exit_idle(void) { } 1929 static inline void calc_load_exit_idle(void) { }
1923 #endif /* CONFIG_NO_HZ */ 1930 #endif /* CONFIG_NO_HZ */
1924 1931
1925 #ifndef CONFIG_CPUMASK_OFFSTACK 1932 #ifndef CONFIG_CPUMASK_OFFSTACK
1926 static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) 1933 static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
1927 { 1934 {
1928 return set_cpus_allowed_ptr(p, &new_mask); 1935 return set_cpus_allowed_ptr(p, &new_mask);
1929 } 1936 }
1930 #endif 1937 #endif
1931 1938
1932 /* 1939 /*
1933 * Do not use outside of architecture code which knows its limitations. 1940 * Do not use outside of architecture code which knows its limitations.
1934 * 1941 *
1935 * sched_clock() has no promise of monotonicity or bounded drift between 1942 * sched_clock() has no promise of monotonicity or bounded drift between
1936 * CPUs; its use (which you should not) requires disabling IRQs. 1943 * CPUs; its use (which you should not) requires disabling IRQs.
1937 * 1944 *
1938 * Please use one of the three interfaces below. 1945 * Please use one of the three interfaces below.
1939 */ 1946 */
1940 extern unsigned long long notrace sched_clock(void); 1947 extern unsigned long long notrace sched_clock(void);
1941 /* 1948 /*
1942 * See the comment in kernel/sched/clock.c 1949 * See the comment in kernel/sched/clock.c
1943 */ 1950 */
1944 extern u64 cpu_clock(int cpu); 1951 extern u64 cpu_clock(int cpu);
1945 extern u64 local_clock(void); 1952 extern u64 local_clock(void);
1946 extern u64 sched_clock_cpu(int cpu); 1953 extern u64 sched_clock_cpu(int cpu);
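
Of the three interfaces above, local_clock() is the usual choice for ad-hoc timing; a hedged sketch (illustrative only), remembering that the value is in nanoseconds and only comparable on the CPU it was read on:

static u64 time_one_section(void)
{
        u64 start = local_clock();      /* nanoseconds, valid on this CPU only */

        /* ... section being timed ... */

        return local_clock() - start;   /* elapsed nanoseconds */
}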
1947 1954
1948 1955
1949 extern void sched_clock_init(void); 1956 extern void sched_clock_init(void);
1950 1957
1951 #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 1958 #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
1952 static inline void sched_clock_tick(void) 1959 static inline void sched_clock_tick(void)
1953 { 1960 {
1954 } 1961 }
1955 1962
1956 static inline void sched_clock_idle_sleep_event(void) 1963 static inline void sched_clock_idle_sleep_event(void)
1957 { 1964 {
1958 } 1965 }
1959 1966
1960 static inline void sched_clock_idle_wakeup_event(u64 delta_ns) 1967 static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
1961 { 1968 {
1962 } 1969 }
1963 #else 1970 #else
1964 /* 1971 /*
1965 * Architectures can set this to 1 if they have specified 1972 * Architectures can set this to 1 if they have specified
1966 * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, 1973 * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
1967 * but then during bootup it turns out that sched_clock() 1974 * but then during bootup it turns out that sched_clock()
1968 * is reliable after all: 1975 * is reliable after all:
1969 */ 1976 */
1970 extern int sched_clock_stable; 1977 extern int sched_clock_stable;
1971 1978
1972 extern void sched_clock_tick(void); 1979 extern void sched_clock_tick(void);
1973 extern void sched_clock_idle_sleep_event(void); 1980 extern void sched_clock_idle_sleep_event(void);
1974 extern void sched_clock_idle_wakeup_event(u64 delta_ns); 1981 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
1975 #endif 1982 #endif
1976 1983
1977 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 1984 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
1978 /* 1985 /*
1979 * An i/f to runtime opt-in for irq time accounting based off of sched_clock. 1986 * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
1980 * The reason for this explicit opt-in is not to have perf penalty with 1987 * The reason for this explicit opt-in is not to have perf penalty with
1981 * slow sched_clocks. 1988 * slow sched_clocks.
1982 */ 1989 */
1983 extern void enable_sched_clock_irqtime(void); 1990 extern void enable_sched_clock_irqtime(void);
1984 extern void disable_sched_clock_irqtime(void); 1991 extern void disable_sched_clock_irqtime(void);
1985 #else 1992 #else
1986 static inline void enable_sched_clock_irqtime(void) {} 1993 static inline void enable_sched_clock_irqtime(void) {}
1987 static inline void disable_sched_clock_irqtime(void) {} 1994 static inline void disable_sched_clock_irqtime(void) {}
1988 #endif 1995 #endif
1989 1996
1990 extern unsigned long long 1997 extern unsigned long long
1991 task_sched_runtime(struct task_struct *task); 1998 task_sched_runtime(struct task_struct *task);
1992 1999
1993 /* sched_exec is called by processes performing an exec */ 2000 /* sched_exec is called by processes performing an exec */
1994 #ifdef CONFIG_SMP 2001 #ifdef CONFIG_SMP
1995 extern void sched_exec(void); 2002 extern void sched_exec(void);
1996 #else 2003 #else
1997 #define sched_exec() {} 2004 #define sched_exec() {}
1998 #endif 2005 #endif
1999 2006
2000 extern void sched_clock_idle_sleep_event(void); 2007 extern void sched_clock_idle_sleep_event(void);
2001 extern void sched_clock_idle_wakeup_event(u64 delta_ns); 2008 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
2002 2009
2003 #ifdef CONFIG_HOTPLUG_CPU 2010 #ifdef CONFIG_HOTPLUG_CPU
2004 extern void idle_task_exit(void); 2011 extern void idle_task_exit(void);
2005 #else 2012 #else
2006 static inline void idle_task_exit(void) {} 2013 static inline void idle_task_exit(void) {}
2007 #endif 2014 #endif
2008 2015
2009 #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 2016 #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
2010 extern void wake_up_idle_cpu(int cpu); 2017 extern void wake_up_idle_cpu(int cpu);
2011 #else 2018 #else
2012 static inline void wake_up_idle_cpu(int cpu) { } 2019 static inline void wake_up_idle_cpu(int cpu) { }
2013 #endif 2020 #endif
2014 2021
2015 extern unsigned int sysctl_sched_latency; 2022 extern unsigned int sysctl_sched_latency;
2016 extern unsigned int sysctl_sched_min_granularity; 2023 extern unsigned int sysctl_sched_min_granularity;
2017 extern unsigned int sysctl_sched_wakeup_granularity; 2024 extern unsigned int sysctl_sched_wakeup_granularity;
2018 extern unsigned int sysctl_sched_child_runs_first; 2025 extern unsigned int sysctl_sched_child_runs_first;
2019 2026
2020 enum sched_tunable_scaling { 2027 enum sched_tunable_scaling {
2021 SCHED_TUNABLESCALING_NONE, 2028 SCHED_TUNABLESCALING_NONE,
2022 SCHED_TUNABLESCALING_LOG, 2029 SCHED_TUNABLESCALING_LOG,
2023 SCHED_TUNABLESCALING_LINEAR, 2030 SCHED_TUNABLESCALING_LINEAR,
2024 SCHED_TUNABLESCALING_END, 2031 SCHED_TUNABLESCALING_END,
2025 }; 2032 };
2026 extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; 2033 extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
2027 2034
2028 #ifdef CONFIG_SCHED_DEBUG 2035 #ifdef CONFIG_SCHED_DEBUG
2029 extern unsigned int sysctl_sched_migration_cost; 2036 extern unsigned int sysctl_sched_migration_cost;
2030 extern unsigned int sysctl_sched_nr_migrate; 2037 extern unsigned int sysctl_sched_nr_migrate;
2031 extern unsigned int sysctl_sched_time_avg; 2038 extern unsigned int sysctl_sched_time_avg;
2032 extern unsigned int sysctl_timer_migration; 2039 extern unsigned int sysctl_timer_migration;
2033 extern unsigned int sysctl_sched_shares_window; 2040 extern unsigned int sysctl_sched_shares_window;
2034 2041
2035 int sched_proc_update_handler(struct ctl_table *table, int write, 2042 int sched_proc_update_handler(struct ctl_table *table, int write,
2036 void __user *buffer, size_t *length, 2043 void __user *buffer, size_t *length,
2037 loff_t *ppos); 2044 loff_t *ppos);
2038 #endif 2045 #endif
2039 #ifdef CONFIG_SCHED_DEBUG 2046 #ifdef CONFIG_SCHED_DEBUG
2040 static inline unsigned int get_sysctl_timer_migration(void) 2047 static inline unsigned int get_sysctl_timer_migration(void)
2041 { 2048 {
2042 return sysctl_timer_migration; 2049 return sysctl_timer_migration;
2043 } 2050 }
2044 #else 2051 #else
2045 static inline unsigned int get_sysctl_timer_migration(void) 2052 static inline unsigned int get_sysctl_timer_migration(void)
2046 { 2053 {
2047 return 1; 2054 return 1;
2048 } 2055 }
2049 #endif 2056 #endif
2050 extern unsigned int sysctl_sched_rt_period; 2057 extern unsigned int sysctl_sched_rt_period;
2051 extern int sysctl_sched_rt_runtime; 2058 extern int sysctl_sched_rt_runtime;
2052 2059
2053 int sched_rt_handler(struct ctl_table *table, int write, 2060 int sched_rt_handler(struct ctl_table *table, int write,
2054 void __user *buffer, size_t *lenp, 2061 void __user *buffer, size_t *lenp,
2055 loff_t *ppos); 2062 loff_t *ppos);
2056 2063
2057 #ifdef CONFIG_SCHED_AUTOGROUP 2064 #ifdef CONFIG_SCHED_AUTOGROUP
2058 extern unsigned int sysctl_sched_autogroup_enabled; 2065 extern unsigned int sysctl_sched_autogroup_enabled;
2059 2066
2060 extern void sched_autogroup_create_attach(struct task_struct *p); 2067 extern void sched_autogroup_create_attach(struct task_struct *p);
2061 extern void sched_autogroup_detach(struct task_struct *p); 2068 extern void sched_autogroup_detach(struct task_struct *p);
2062 extern void sched_autogroup_fork(struct signal_struct *sig); 2069 extern void sched_autogroup_fork(struct signal_struct *sig);
2063 extern void sched_autogroup_exit(struct signal_struct *sig); 2070 extern void sched_autogroup_exit(struct signal_struct *sig);
2064 #ifdef CONFIG_PROC_FS 2071 #ifdef CONFIG_PROC_FS
2065 extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); 2072 extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
2066 extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); 2073 extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice);
2067 #endif 2074 #endif
2068 #else 2075 #else
2069 static inline void sched_autogroup_create_attach(struct task_struct *p) { } 2076 static inline void sched_autogroup_create_attach(struct task_struct *p) { }
2070 static inline void sched_autogroup_detach(struct task_struct *p) { } 2077 static inline void sched_autogroup_detach(struct task_struct *p) { }
2071 static inline void sched_autogroup_fork(struct signal_struct *sig) { } 2078 static inline void sched_autogroup_fork(struct signal_struct *sig) { }
2072 static inline void sched_autogroup_exit(struct signal_struct *sig) { } 2079 static inline void sched_autogroup_exit(struct signal_struct *sig) { }
2073 #endif 2080 #endif
2074 2081
2075 #ifdef CONFIG_CFS_BANDWIDTH 2082 #ifdef CONFIG_CFS_BANDWIDTH
2076 extern unsigned int sysctl_sched_cfs_bandwidth_slice; 2083 extern unsigned int sysctl_sched_cfs_bandwidth_slice;
2077 #endif 2084 #endif
2078 2085
2079 #ifdef CONFIG_RT_MUTEXES 2086 #ifdef CONFIG_RT_MUTEXES
2080 extern int rt_mutex_getprio(struct task_struct *p); 2087 extern int rt_mutex_getprio(struct task_struct *p);
2081 extern void rt_mutex_setprio(struct task_struct *p, int prio); 2088 extern void rt_mutex_setprio(struct task_struct *p, int prio);
2082 extern void rt_mutex_adjust_pi(struct task_struct *p); 2089 extern void rt_mutex_adjust_pi(struct task_struct *p);
2083 static inline bool tsk_is_pi_blocked(struct task_struct *tsk) 2090 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
2084 { 2091 {
2085 return tsk->pi_blocked_on != NULL; 2092 return tsk->pi_blocked_on != NULL;
2086 } 2093 }
2087 #else 2094 #else
2088 static inline int rt_mutex_getprio(struct task_struct *p) 2095 static inline int rt_mutex_getprio(struct task_struct *p)
2089 { 2096 {
2090 return p->normal_prio; 2097 return p->normal_prio;
2091 } 2098 }
2092 # define rt_mutex_adjust_pi(p) do { } while (0) 2099 # define rt_mutex_adjust_pi(p) do { } while (0)
2093 static inline bool tsk_is_pi_blocked(struct task_struct *tsk) 2100 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
2094 { 2101 {
2095 return false; 2102 return false;
2096 } 2103 }
2097 #endif 2104 #endif
2098 2105
2099 extern bool yield_to(struct task_struct *p, bool preempt); 2106 extern bool yield_to(struct task_struct *p, bool preempt);
2100 extern void set_user_nice(struct task_struct *p, long nice); 2107 extern void set_user_nice(struct task_struct *p, long nice);
2101 extern int task_prio(const struct task_struct *p); 2108 extern int task_prio(const struct task_struct *p);
2102 extern int task_nice(const struct task_struct *p); 2109 extern int task_nice(const struct task_struct *p);
2103 extern int can_nice(const struct task_struct *p, const int nice); 2110 extern int can_nice(const struct task_struct *p, const int nice);
2104 extern int task_curr(const struct task_struct *p); 2111 extern int task_curr(const struct task_struct *p);
2105 extern int idle_cpu(int cpu); 2112 extern int idle_cpu(int cpu);
2106 extern int sched_setscheduler(struct task_struct *, int, 2113 extern int sched_setscheduler(struct task_struct *, int,
2107 const struct sched_param *); 2114 const struct sched_param *);
2108 extern int sched_setscheduler_nocheck(struct task_struct *, int, 2115 extern int sched_setscheduler_nocheck(struct task_struct *, int,
2109 const struct sched_param *); 2116 const struct sched_param *);
2110 extern struct task_struct *idle_task(int cpu); 2117 extern struct task_struct *idle_task(int cpu);
2111 /** 2118 /**
2112 * is_idle_task - is the specified task an idle task? 2119 * is_idle_task - is the specified task an idle task?
2113 * @p: the task in question. 2120 * @p: the task in question.
2114 */ 2121 */
2115 static inline bool is_idle_task(const struct task_struct *p) 2122 static inline bool is_idle_task(const struct task_struct *p)
2116 { 2123 {
2117 return p->pid == 0; 2124 return p->pid == 0;
2118 } 2125 }
2119 extern struct task_struct *curr_task(int cpu); 2126 extern struct task_struct *curr_task(int cpu);
2120 extern void set_curr_task(int cpu, struct task_struct *p); 2127 extern void set_curr_task(int cpu, struct task_struct *p);
2121 2128
2122 void yield(void); 2129 void yield(void);
2123 2130
2124 /* 2131 /*
2125 * The default (Linux) execution domain. 2132 * The default (Linux) execution domain.
2126 */ 2133 */
2127 extern struct exec_domain default_exec_domain; 2134 extern struct exec_domain default_exec_domain;
2128 2135
2129 union thread_union { 2136 union thread_union {
2130 struct thread_info thread_info; 2137 struct thread_info thread_info;
2131 unsigned long stack[THREAD_SIZE/sizeof(long)]; 2138 unsigned long stack[THREAD_SIZE/sizeof(long)];
2132 }; 2139 };
2133 2140
2134 #ifndef __HAVE_ARCH_KSTACK_END 2141 #ifndef __HAVE_ARCH_KSTACK_END
2135 static inline int kstack_end(void *addr) 2142 static inline int kstack_end(void *addr)
2136 { 2143 {
2137 /* Reliable end of stack detection: 2144 /* Reliable end of stack detection:
2138 * Some APM bios versions misalign the stack 2145 * Some APM bios versions misalign the stack
2139 */ 2146 */
2140 return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); 2147 return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
2141 } 2148 }
2142 #endif 2149 #endif
2143 2150
2144 extern union thread_union init_thread_union; 2151 extern union thread_union init_thread_union;
2145 extern struct task_struct init_task; 2152 extern struct task_struct init_task;
2146 2153
2147 extern struct mm_struct init_mm; 2154 extern struct mm_struct init_mm;
2148 2155
2149 extern struct pid_namespace init_pid_ns; 2156 extern struct pid_namespace init_pid_ns;
2150 2157
2151 /* 2158 /*
2152 * find a task by one of its numerical ids 2159 * find a task by one of its numerical ids
2153 * 2160 *
2154 * find_task_by_pid_ns(): 2161 * find_task_by_pid_ns():
2155 * finds a task by its pid in the specified namespace 2162 * finds a task by its pid in the specified namespace
2156 * find_task_by_vpid(): 2163 * find_task_by_vpid():
2157 * finds a task by its virtual pid 2164 * finds a task by its virtual pid
2158 * 2165 *
2159 * see also find_vpid() etc in include/linux/pid.h 2166 * see also find_vpid() etc in include/linux/pid.h
2160 */ 2167 */
2161 2168
2162 extern struct task_struct *find_task_by_vpid(pid_t nr); 2169 extern struct task_struct *find_task_by_vpid(pid_t nr);
2163 extern struct task_struct *find_task_by_pid_ns(pid_t nr, 2170 extern struct task_struct *find_task_by_pid_ns(pid_t nr,
2164 struct pid_namespace *ns); 2171 struct pid_namespace *ns);
2165 2172
2166 extern void __set_special_pids(struct pid *pid); 2173 extern void __set_special_pids(struct pid *pid);
2167 2174
2168 /* per-UID process charging. */ 2175 /* per-UID process charging. */
2169 extern struct user_struct * alloc_uid(kuid_t); 2176 extern struct user_struct * alloc_uid(kuid_t);
2170 static inline struct user_struct *get_uid(struct user_struct *u) 2177 static inline struct user_struct *get_uid(struct user_struct *u)
2171 { 2178 {
2172 atomic_inc(&u->__count); 2179 atomic_inc(&u->__count);
2173 return u; 2180 return u;
2174 } 2181 }
2175 extern void free_uid(struct user_struct *); 2182 extern void free_uid(struct user_struct *);
2176 2183
2177 #include <asm/current.h> 2184 #include <asm/current.h>
2178 2185
2179 extern void xtime_update(unsigned long ticks); 2186 extern void xtime_update(unsigned long ticks);
2180 2187
2181 extern int wake_up_state(struct task_struct *tsk, unsigned int state); 2188 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
2182 extern int wake_up_process(struct task_struct *tsk); 2189 extern int wake_up_process(struct task_struct *tsk);
2183 extern void wake_up_new_task(struct task_struct *tsk); 2190 extern void wake_up_new_task(struct task_struct *tsk);
2184 #ifdef CONFIG_SMP 2191 #ifdef CONFIG_SMP
2185 extern void kick_process(struct task_struct *tsk); 2192 extern void kick_process(struct task_struct *tsk);
2186 #else 2193 #else
2187 static inline void kick_process(struct task_struct *tsk) { } 2194 static inline void kick_process(struct task_struct *tsk) { }
2188 #endif 2195 #endif
2189 extern void sched_fork(struct task_struct *p); 2196 extern void sched_fork(struct task_struct *p);
2190 extern void sched_dead(struct task_struct *p); 2197 extern void sched_dead(struct task_struct *p);
2191 2198
2192 extern void proc_caches_init(void); 2199 extern void proc_caches_init(void);
2193 extern void flush_signals(struct task_struct *); 2200 extern void flush_signals(struct task_struct *);
2194 extern void __flush_signals(struct task_struct *); 2201 extern void __flush_signals(struct task_struct *);
2195 extern void ignore_signals(struct task_struct *); 2202 extern void ignore_signals(struct task_struct *);
2196 extern void flush_signal_handlers(struct task_struct *, int force_default); 2203 extern void flush_signal_handlers(struct task_struct *, int force_default);
2197 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); 2204 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
2198 2205
2199 static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) 2206 static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
2200 { 2207 {
2201 unsigned long flags; 2208 unsigned long flags;
2202 int ret; 2209 int ret;
2203 2210
2204 spin_lock_irqsave(&tsk->sighand->siglock, flags); 2211 spin_lock_irqsave(&tsk->sighand->siglock, flags);
2205 ret = dequeue_signal(tsk, mask, info); 2212 ret = dequeue_signal(tsk, mask, info);
2206 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 2213 spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
2207 2214
2208 return ret; 2215 return ret;
2209 } 2216 }
2210 2217
2211 extern void block_all_signals(int (*notifier)(void *priv), void *priv, 2218 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
2212 sigset_t *mask); 2219 sigset_t *mask);
2213 extern void unblock_all_signals(void); 2220 extern void unblock_all_signals(void);
2214 extern void release_task(struct task_struct * p); 2221 extern void release_task(struct task_struct * p);
2215 extern int send_sig_info(int, struct siginfo *, struct task_struct *); 2222 extern int send_sig_info(int, struct siginfo *, struct task_struct *);
2216 extern int force_sigsegv(int, struct task_struct *); 2223 extern int force_sigsegv(int, struct task_struct *);
2217 extern int force_sig_info(int, struct siginfo *, struct task_struct *); 2224 extern int force_sig_info(int, struct siginfo *, struct task_struct *);
2218 extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); 2225 extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
2219 extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); 2226 extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
2220 extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *, 2227 extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *,
2221 const struct cred *, u32); 2228 const struct cred *, u32);
2222 extern int kill_pgrp(struct pid *pid, int sig, int priv); 2229 extern int kill_pgrp(struct pid *pid, int sig, int priv);
2223 extern int kill_pid(struct pid *pid, int sig, int priv); 2230 extern int kill_pid(struct pid *pid, int sig, int priv);
2224 extern int kill_proc_info(int, struct siginfo *, pid_t); 2231 extern int kill_proc_info(int, struct siginfo *, pid_t);
2225 extern __must_check bool do_notify_parent(struct task_struct *, int); 2232 extern __must_check bool do_notify_parent(struct task_struct *, int);
2226 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); 2233 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
2227 extern void force_sig(int, struct task_struct *); 2234 extern void force_sig(int, struct task_struct *);
2228 extern int send_sig(int, struct task_struct *, int); 2235 extern int send_sig(int, struct task_struct *, int);
2229 extern int zap_other_threads(struct task_struct *p); 2236 extern int zap_other_threads(struct task_struct *p);
2230 extern struct sigqueue *sigqueue_alloc(void); 2237 extern struct sigqueue *sigqueue_alloc(void);
2231 extern void sigqueue_free(struct sigqueue *); 2238 extern void sigqueue_free(struct sigqueue *);
2232 extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); 2239 extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group);
2233 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); 2240 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
2234 extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); 2241 extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long);
2235 2242
2236 static inline void restore_saved_sigmask(void) 2243 static inline void restore_saved_sigmask(void)
2237 { 2244 {
2238 if (test_and_clear_restore_sigmask()) 2245 if (test_and_clear_restore_sigmask())
2239 __set_current_blocked(&current->saved_sigmask); 2246 __set_current_blocked(&current->saved_sigmask);
2240 } 2247 }
2241 2248
2242 static inline sigset_t *sigmask_to_save(void) 2249 static inline sigset_t *sigmask_to_save(void)
2243 { 2250 {
2244 sigset_t *res = &current->blocked; 2251 sigset_t *res = &current->blocked;
2245 if (unlikely(test_restore_sigmask())) 2252 if (unlikely(test_restore_sigmask()))
2246 res = &current->saved_sigmask; 2253 res = &current->saved_sigmask;
2247 return res; 2254 return res;
2248 } 2255 }
2249 2256
2250 static inline int kill_cad_pid(int sig, int priv) 2257 static inline int kill_cad_pid(int sig, int priv)
2251 { 2258 {
2252 return kill_pid(cad_pid, sig, priv); 2259 return kill_pid(cad_pid, sig, priv);
2253 } 2260 }
2254 2261
2255 /* These can be the second arg to send_sig_info/send_group_sig_info. */ 2262 /* These can be the second arg to send_sig_info/send_group_sig_info. */
2256 #define SEND_SIG_NOINFO ((struct siginfo *) 0) 2263 #define SEND_SIG_NOINFO ((struct siginfo *) 0)
2257 #define SEND_SIG_PRIV ((struct siginfo *) 1) 2264 #define SEND_SIG_PRIV ((struct siginfo *) 1)
2258 #define SEND_SIG_FORCED ((struct siginfo *) 2) 2265 #define SEND_SIG_FORCED ((struct siginfo *) 2)
2259 2266
2260 /* 2267 /*
2261 * True if we are on the alternate signal stack. 2268 * True if we are on the alternate signal stack.
2262 */ 2269 */
2263 static inline int on_sig_stack(unsigned long sp) 2270 static inline int on_sig_stack(unsigned long sp)
2264 { 2271 {
2265 #ifdef CONFIG_STACK_GROWSUP 2272 #ifdef CONFIG_STACK_GROWSUP
2266 return sp >= current->sas_ss_sp && 2273 return sp >= current->sas_ss_sp &&
2267 sp - current->sas_ss_sp < current->sas_ss_size; 2274 sp - current->sas_ss_sp < current->sas_ss_size;
2268 #else 2275 #else
2269 return sp > current->sas_ss_sp && 2276 return sp > current->sas_ss_sp &&
2270 sp - current->sas_ss_sp <= current->sas_ss_size; 2277 sp - current->sas_ss_sp <= current->sas_ss_size;
2271 #endif 2278 #endif
2272 } 2279 }
2273 2280
2274 static inline int sas_ss_flags(unsigned long sp) 2281 static inline int sas_ss_flags(unsigned long sp)
2275 { 2282 {
2276 return (current->sas_ss_size == 0 ? SS_DISABLE 2283 return (current->sas_ss_size == 0 ? SS_DISABLE
2277 : on_sig_stack(sp) ? SS_ONSTACK : 0); 2284 : on_sig_stack(sp) ? SS_ONSTACK : 0);
2278 } 2285 }
2279 2286
2280 /* 2287 /*
2281 * Routines for handling mm_structs 2288 * Routines for handling mm_structs
2282 */ 2289 */
2283 extern struct mm_struct * mm_alloc(void); 2290 extern struct mm_struct * mm_alloc(void);
2284 2291
2285 /* mmdrop drops the mm and the page tables */ 2292 /* mmdrop drops the mm and the page tables */
2286 extern void __mmdrop(struct mm_struct *); 2293 extern void __mmdrop(struct mm_struct *);
2287 static inline void mmdrop(struct mm_struct * mm) 2294 static inline void mmdrop(struct mm_struct * mm)
2288 { 2295 {
2289 if (unlikely(atomic_dec_and_test(&mm->mm_count))) 2296 if (unlikely(atomic_dec_and_test(&mm->mm_count)))
2290 __mmdrop(mm); 2297 __mmdrop(mm);
2291 } 2298 }
2292 2299
2293 /* mmput gets rid of the mappings and all user-space */ 2300 /* mmput gets rid of the mappings and all user-space */
2294 extern void mmput(struct mm_struct *); 2301 extern void mmput(struct mm_struct *);
2295 /* Grab a reference to a task's mm, if it is not already going away */ 2302 /* Grab a reference to a task's mm, if it is not already going away */
2296 extern struct mm_struct *get_task_mm(struct task_struct *task); 2303 extern struct mm_struct *get_task_mm(struct task_struct *task);
2297 /* 2304 /*
2298 * Grab a reference to a task's mm, if it is not already going away 2305 * Grab a reference to a task's mm, if it is not already going away
2299 * and ptrace_may_access with the mode parameter passed to it 2306 * and ptrace_may_access with the mode parameter passed to it
2300 * succeeds. 2307 * succeeds.
2301 */ 2308 */
2302 extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); 2309 extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
2303 /* Remove the current task's stale references to the old mm_struct */ 2310 /* Remove the current task's stale references to the old mm_struct */
2304 extern void mm_release(struct task_struct *, struct mm_struct *); 2311 extern void mm_release(struct task_struct *, struct mm_struct *);
2305 /* Allocate a new mm structure and copy contents from tsk->mm */ 2312 /* Allocate a new mm structure and copy contents from tsk->mm */
2306 extern struct mm_struct *dup_mm(struct task_struct *tsk); 2313 extern struct mm_struct *dup_mm(struct task_struct *tsk);
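
A minimal sketch of how get_task_mm() pairs with mmput() (function name illustrative), assuming the caller only needs the mm to stay around while it looks at it:

static void inspect_mm(struct task_struct *task)
{
        struct mm_struct *mm = get_task_mm(task);       /* NULL if the mm is already being torn down */

        if (!mm)
                return;

        /* ... read mm fields here under mm's own locking rules ... */

        mmput(mm);      /* drop the reference taken by get_task_mm() */
}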
2307 2314
2308 extern int copy_thread(unsigned long, unsigned long, unsigned long, 2315 extern int copy_thread(unsigned long, unsigned long, unsigned long,
2309 struct task_struct *, struct pt_regs *); 2316 struct task_struct *, struct pt_regs *);
2310 extern void flush_thread(void); 2317 extern void flush_thread(void);
2311 extern void exit_thread(void); 2318 extern void exit_thread(void);
2312 2319
2313 extern void exit_files(struct task_struct *); 2320 extern void exit_files(struct task_struct *);
2314 extern void __cleanup_sighand(struct sighand_struct *); 2321 extern void __cleanup_sighand(struct sighand_struct *);
2315 2322
2316 extern void exit_itimers(struct signal_struct *); 2323 extern void exit_itimers(struct signal_struct *);
2317 extern void flush_itimer_signals(void); 2324 extern void flush_itimer_signals(void);
2318 2325
2319 extern void do_group_exit(int); 2326 extern void do_group_exit(int);
2320 2327
2321 extern void daemonize(const char *, ...); 2328 extern void daemonize(const char *, ...);
2322 extern int allow_signal(int); 2329 extern int allow_signal(int);
2323 extern int disallow_signal(int); 2330 extern int disallow_signal(int);
2324 2331
2325 extern int do_execve(const char *, 2332 extern int do_execve(const char *,
2326 const char __user * const __user *, 2333 const char __user * const __user *,
2327 const char __user * const __user *, struct pt_regs *); 2334 const char __user * const __user *, struct pt_regs *);
2328 extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); 2335 extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
2329 struct task_struct *fork_idle(int); 2336 struct task_struct *fork_idle(int);
2330 2337
2331 extern void set_task_comm(struct task_struct *tsk, char *from); 2338 extern void set_task_comm(struct task_struct *tsk, char *from);
2332 extern char *get_task_comm(char *to, struct task_struct *tsk); 2339 extern char *get_task_comm(char *to, struct task_struct *tsk);
2333 2340
2334 #ifdef CONFIG_SMP 2341 #ifdef CONFIG_SMP
2335 void scheduler_ipi(void); 2342 void scheduler_ipi(void);
2336 extern unsigned long wait_task_inactive(struct task_struct *, long match_state); 2343 extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
2337 #else 2344 #else
2338 static inline void scheduler_ipi(void) { } 2345 static inline void scheduler_ipi(void) { }
2339 static inline unsigned long wait_task_inactive(struct task_struct *p, 2346 static inline unsigned long wait_task_inactive(struct task_struct *p,
2340 long match_state) 2347 long match_state)
2341 { 2348 {
2342 return 1; 2349 return 1;
2343 } 2350 }
2344 #endif 2351 #endif
2345 2352
2346 #define next_task(p) \ 2353 #define next_task(p) \
2347 list_entry_rcu((p)->tasks.next, struct task_struct, tasks) 2354 list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
2348 2355
2349 #define for_each_process(p) \ 2356 #define for_each_process(p) \
2350 for (p = &init_task ; (p = next_task(p)) != &init_task ; ) 2357 for (p = &init_task ; (p = next_task(p)) != &init_task ; )
2351 2358
2352 extern bool current_is_single_threaded(void); 2359 extern bool current_is_single_threaded(void);
2353 2360
2354 /* 2361 /*
2355 * Careful: do_each_thread/while_each_thread is a double loop so 2362 * Careful: do_each_thread/while_each_thread is a double loop so
2356 * 'break' will not work as expected - use goto instead. 2363 * 'break' will not work as expected - use goto instead.
2357 */ 2364 */
2358 #define do_each_thread(g, t) \ 2365 #define do_each_thread(g, t) \
2359 for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do 2366 for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
2360 2367
2361 #define while_each_thread(g, t) \ 2368 #define while_each_thread(g, t) \
2362 while ((t = next_thread(t)) != g) 2369 while ((t = next_thread(t)) != g)
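
A hedged sketch of walking every thread with these macros, using goto rather than break as the comment above warns, and assuming the caller takes tasklist_lock for the walk (names are illustrative):

static struct task_struct *find_any_kthread(void)
{
        struct task_struct *g, *t, *found = NULL;

        read_lock(&tasklist_lock);
        do_each_thread(g, t) {
                if (t->flags & PF_KTHREAD) {
                        found = t;
                        goto out;       /* 'break' would only leave the inner loop */
                }
        } while_each_thread(g, t);
out:
        read_unlock(&tasklist_lock);
        return found;   /* illustrative: the task is not pinned here */
}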
2363 2370
2364 static inline int get_nr_threads(struct task_struct *tsk) 2371 static inline int get_nr_threads(struct task_struct *tsk)
2365 { 2372 {
2366 return tsk->signal->nr_threads; 2373 return tsk->signal->nr_threads;
2367 } 2374 }
2368 2375
2369 static inline bool thread_group_leader(struct task_struct *p) 2376 static inline bool thread_group_leader(struct task_struct *p)
2370 { 2377 {
2371 return p->exit_signal >= 0; 2378 return p->exit_signal >= 0;
2372 } 2379 }
2373 2380
2374 /* Due to the insanities of de_thread it is possible for a process 2381 /* Due to the insanities of de_thread it is possible for a process
2375 * to have the pid of the thread group leader without actually being 2382 * to have the pid of the thread group leader without actually being
2376 * the thread group leader. For iteration through the pids in proc 2383 * the thread group leader. For iteration through the pids in proc
2377 * all we care about is that we have a task with the appropriate 2384 * all we care about is that we have a task with the appropriate
2378 * pid, we don't actually care if we have the right task. 2385 * pid, we don't actually care if we have the right task.
2379 */ 2386 */
2380 static inline int has_group_leader_pid(struct task_struct *p) 2387 static inline int has_group_leader_pid(struct task_struct *p)
2381 { 2388 {
2382 return p->pid == p->tgid; 2389 return p->pid == p->tgid;
2383 } 2390 }
2384 2391
2385 static inline 2392 static inline
2386 int same_thread_group(struct task_struct *p1, struct task_struct *p2) 2393 int same_thread_group(struct task_struct *p1, struct task_struct *p2)
2387 { 2394 {
2388 return p1->tgid == p2->tgid; 2395 return p1->tgid == p2->tgid;
2389 } 2396 }
2390 2397
2391 static inline struct task_struct *next_thread(const struct task_struct *p) 2398 static inline struct task_struct *next_thread(const struct task_struct *p)
2392 { 2399 {
2393 return list_entry_rcu(p->thread_group.next, 2400 return list_entry_rcu(p->thread_group.next,
2394 struct task_struct, thread_group); 2401 struct task_struct, thread_group);
2395 } 2402 }
2396 2403
2397 static inline int thread_group_empty(struct task_struct *p) 2404 static inline int thread_group_empty(struct task_struct *p)
2398 { 2405 {
2399 return list_empty(&p->thread_group); 2406 return list_empty(&p->thread_group);
2400 } 2407 }
2401 2408
2402 #define delay_group_leader(p) \ 2409 #define delay_group_leader(p) \
2403 (thread_group_leader(p) && !thread_group_empty(p)) 2410 (thread_group_leader(p) && !thread_group_empty(p))
2404 2411
2405 /* 2412 /*
2406 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring 2413 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
2407 * subscriptions and synchronises with wait4(). Also used in procfs. Also 2414 * subscriptions and synchronises with wait4(). Also used in procfs. Also
2408 * pins the final release of task.io_context. Also protects ->cpuset and 2415 * pins the final release of task.io_context. Also protects ->cpuset and
2409 * ->cgroup.subsys[]. And ->vfork_done. 2416 * ->cgroup.subsys[]. And ->vfork_done.
2410 * 2417 *
2411 * Nests both inside and outside of read_lock(&tasklist_lock). 2418 * Nests both inside and outside of read_lock(&tasklist_lock).
2412 * It must not be nested with write_lock_irq(&tasklist_lock), 2419 * It must not be nested with write_lock_irq(&tasklist_lock),
2413 * neither inside nor outside. 2420 * neither inside nor outside.
2414 */ 2421 */
2415 static inline void task_lock(struct task_struct *p) 2422 static inline void task_lock(struct task_struct *p)
2416 { 2423 {
2417 spin_lock(&p->alloc_lock); 2424 spin_lock(&p->alloc_lock);
2418 } 2425 }
2419 2426
2420 static inline void task_unlock(struct task_struct *p) 2427 static inline void task_unlock(struct task_struct *p)
2421 { 2428 {
2422 spin_unlock(&p->alloc_lock); 2429 spin_unlock(&p->alloc_lock);
2423 } 2430 }
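
A minimal sketch of bracketing one of the fields listed in the comment above with task_lock()/task_unlock() (function name illustrative; buf is assumed to hold TASK_COMM_LEN bytes); get_task_comm(), declared further down, wraps essentially this pattern:

static void copy_task_comm(struct task_struct *p, char *buf)
{
        task_lock(p);           /* p->comm cannot change while the lock is held */
        strncpy(buf, p->comm, TASK_COMM_LEN);
        buf[TASK_COMM_LEN - 1] = '\0';
        task_unlock(p);
}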
2424 2431
2425 extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, 2432 extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
2426 unsigned long *flags); 2433 unsigned long *flags);
2427 2434
2428 static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk, 2435 static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
2429 unsigned long *flags) 2436 unsigned long *flags)
2430 { 2437 {
2431 struct sighand_struct *ret; 2438 struct sighand_struct *ret;
2432 2439
2433 ret = __lock_task_sighand(tsk, flags); 2440 ret = __lock_task_sighand(tsk, flags);
2434 (void)__cond_lock(&tsk->sighand->siglock, ret); 2441 (void)__cond_lock(&tsk->sighand->siglock, ret);
2435 return ret; 2442 return ret;
2436 } 2443 }
2437 2444
2438 static inline void unlock_task_sighand(struct task_struct *tsk, 2445 static inline void unlock_task_sighand(struct task_struct *tsk,
2439 unsigned long *flags) 2446 unsigned long *flags)
2440 { 2447 {
2441 spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); 2448 spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
2442 } 2449 }
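
A hedged sketch of the conditional-lock pattern these helpers implement (function name illustrative); lock_task_sighand() returns NULL once the task has lost its sighand, i.e. it is already exiting:

static void with_siglock(struct task_struct *tsk)
{
        unsigned long flags;

        if (!lock_task_sighand(tsk, &flags))
                return;         /* task is past exit; there is nothing to lock */

        /* ... tsk->sighand->siglock is held here with interrupts disabled ... */

        unlock_task_sighand(tsk, &flags);
}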
2443 2450
2444 #ifdef CONFIG_CGROUPS 2451 #ifdef CONFIG_CGROUPS
2445 static inline void threadgroup_change_begin(struct task_struct *tsk) 2452 static inline void threadgroup_change_begin(struct task_struct *tsk)
2446 { 2453 {
2447 down_read(&tsk->signal->group_rwsem); 2454 down_read(&tsk->signal->group_rwsem);
2448 } 2455 }
2449 static inline void threadgroup_change_end(struct task_struct *tsk) 2456 static inline void threadgroup_change_end(struct task_struct *tsk)
2450 { 2457 {
2451 up_read(&tsk->signal->group_rwsem); 2458 up_read(&tsk->signal->group_rwsem);
2452 } 2459 }
2453 2460
2454 /** 2461 /**
2455 * threadgroup_lock - lock threadgroup 2462 * threadgroup_lock - lock threadgroup
2456 * @tsk: member task of the threadgroup to lock 2463 * @tsk: member task of the threadgroup to lock
2457 * 2464 *
2458 * Lock the threadgroup @tsk belongs to. No new task is allowed to enter 2465 * Lock the threadgroup @tsk belongs to. No new task is allowed to enter
2459 * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or 2466 * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
2460 * perform exec. This is useful for cases where the threadgroup needs to 2467 * perform exec. This is useful for cases where the threadgroup needs to
2461 * stay stable across blockable operations. 2468 * stay stable across blockable operations.
2462 * 2469 *
2463 * fork and exit paths explicitly call threadgroup_change_{begin|end}() for 2470 * fork and exit paths explicitly call threadgroup_change_{begin|end}() for
2464 * synchronization. While held, no new task will be added to threadgroup 2471 * synchronization. While held, no new task will be added to threadgroup
2465 * and no existing live task will have its PF_EXITING set. 2472 * and no existing live task will have its PF_EXITING set.
2466 * 2473 *
2467 * During exec, a task goes and puts its thread group through unusual 2474 * During exec, a task goes and puts its thread group through unusual
2468 * changes. After de-threading, exclusive access is assumed to resources 2475 * changes. After de-threading, exclusive access is assumed to resources
2469 * which are usually shared by tasks in the same group - e.g. sighand may 2476 * which are usually shared by tasks in the same group - e.g. sighand may
2470 * be replaced with a new one. Also, the exec'ing task takes over group 2477 * be replaced with a new one. Also, the exec'ing task takes over group
2471 * leader role including its pid. Exclude these changes while locked by 2478 * leader role including its pid. Exclude these changes while locked by
2472 * grabbing cred_guard_mutex which is used to synchronize exec path. 2479 * grabbing cred_guard_mutex which is used to synchronize exec path.
2473 */ 2480 */
2474 static inline void threadgroup_lock(struct task_struct *tsk) 2481 static inline void threadgroup_lock(struct task_struct *tsk)
2475 { 2482 {
2476 /* 2483 /*
2477 * exec uses exit for de-threading, nesting group_rwsem inside 2484 * exec uses exit for de-threading, nesting group_rwsem inside
2478 * cred_guard_mutex. Grab cred_guard_mutex first. 2485 * cred_guard_mutex. Grab cred_guard_mutex first.
2479 */ 2486 */
2480 mutex_lock(&tsk->signal->cred_guard_mutex); 2487 mutex_lock(&tsk->signal->cred_guard_mutex);
2481 down_write(&tsk->signal->group_rwsem); 2488 down_write(&tsk->signal->group_rwsem);
2482 } 2489 }
2483 2490
2484 /** 2491 /**
2485 * threadgroup_unlock - unlock threadgroup 2492 * threadgroup_unlock - unlock threadgroup
2486 * @tsk: member task of the threadgroup to unlock 2493 * @tsk: member task of the threadgroup to unlock
2487 * 2494 *
2488 * Reverse threadgroup_lock(). 2495 * Reverse threadgroup_lock().
2489 */ 2496 */
2490 static inline void threadgroup_unlock(struct task_struct *tsk) 2497 static inline void threadgroup_unlock(struct task_struct *tsk)
2491 { 2498 {
2492 up_write(&tsk->signal->group_rwsem); 2499 up_write(&tsk->signal->group_rwsem);
2493 mutex_unlock(&tsk->signal->cred_guard_mutex); 2500 mutex_unlock(&tsk->signal->cred_guard_mutex);
2494 } 2501 }
2495 #else 2502 #else
2496 static inline void threadgroup_change_begin(struct task_struct *tsk) {} 2503 static inline void threadgroup_change_begin(struct task_struct *tsk) {}
2497 static inline void threadgroup_change_end(struct task_struct *tsk) {} 2504 static inline void threadgroup_change_end(struct task_struct *tsk) {}
2498 static inline void threadgroup_lock(struct task_struct *tsk) {} 2505 static inline void threadgroup_lock(struct task_struct *tsk) {}
2499 static inline void threadgroup_unlock(struct task_struct *tsk) {} 2506 static inline void threadgroup_unlock(struct task_struct *tsk) {}
2500 #endif 2507 #endif
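Editor's note: as a hedged sketch of the intended pattern (hypothetical helper; the notable consumer at this point in history is cgroup attach), walking every thread of a group without racing against fork, exit or exec looks roughly like this:

	/* Sketch: visit each live thread of @leader's group.  With
	 * threadgroup_lock() held no thread can be added, set PF_EXITING,
	 * or take over leadership via exec while we iterate.  @fn must not
	 * sleep, since the walk also sits inside rcu_read_lock(). */
	static void visit_threadgroup(struct task_struct *leader,
				      void (*fn)(struct task_struct *))
	{
		struct task_struct *t = leader;

		threadgroup_lock(leader);
		rcu_read_lock();		/* next_thread() walks an RCU list */
		do {
			fn(t);
		} while_each_thread(leader, t);
		rcu_read_unlock();
		threadgroup_unlock(leader);
	}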
2501 2508
2502 #ifndef __HAVE_THREAD_FUNCTIONS 2509 #ifndef __HAVE_THREAD_FUNCTIONS
2503 2510
2504 #define task_thread_info(task) ((struct thread_info *)(task)->stack) 2511 #define task_thread_info(task) ((struct thread_info *)(task)->stack)
2505 #define task_stack_page(task) ((task)->stack) 2512 #define task_stack_page(task) ((task)->stack)
2506 2513
2507 static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) 2514 static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
2508 { 2515 {
2509 *task_thread_info(p) = *task_thread_info(org); 2516 *task_thread_info(p) = *task_thread_info(org);
2510 task_thread_info(p)->task = p; 2517 task_thread_info(p)->task = p;
2511 } 2518 }
2512 2519
2513 static inline unsigned long *end_of_stack(struct task_struct *p) 2520 static inline unsigned long *end_of_stack(struct task_struct *p)
2514 { 2521 {
2515 return (unsigned long *)(task_thread_info(p) + 1); 2522 return (unsigned long *)(task_thread_info(p) + 1);
2516 } 2523 }
2517 2524
2518 #endif 2525 #endif
2519 2526
2520 static inline int object_is_on_stack(void *obj) 2527 static inline int object_is_on_stack(void *obj)
2521 { 2528 {
2522 void *stack = task_stack_page(current); 2529 void *stack = task_stack_page(current);
2523 2530
2524 return (obj >= stack) && (obj < (stack + THREAD_SIZE)); 2531 return (obj >= stack) && (obj < (stack + THREAD_SIZE));
2525 } 2532 }
2526 2533
2527 extern void thread_info_cache_init(void); 2534 extern void thread_info_cache_init(void);
2528 2535
2529 #ifdef CONFIG_DEBUG_STACK_USAGE 2536 #ifdef CONFIG_DEBUG_STACK_USAGE
2530 static inline unsigned long stack_not_used(struct task_struct *p) 2537 static inline unsigned long stack_not_used(struct task_struct *p)
2531 { 2538 {
2532 unsigned long *n = end_of_stack(p); 2539 unsigned long *n = end_of_stack(p);
2533 2540
2534 do { /* Skip over canary */ 2541 do { /* Skip over canary */
2535 n++; 2542 n++;
2536 } while (!*n); 2543 } while (!*n);
2537 2544
2538 return (unsigned long)n - (unsigned long)end_of_stack(p); 2545 return (unsigned long)n - (unsigned long)end_of_stack(p);
2539 } 2546 }
2540 #endif 2547 #endif
2541 2548
2542 /* set thread flags in other task's structures 2549 /* set thread flags in other task's structures
2543 * - see asm/thread_info.h for TIF_xxxx flags available 2550 * - see asm/thread_info.h for TIF_xxxx flags available
2544 */ 2551 */
2545 static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) 2552 static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
2546 { 2553 {
2547 set_ti_thread_flag(task_thread_info(tsk), flag); 2554 set_ti_thread_flag(task_thread_info(tsk), flag);
2548 } 2555 }
2549 2556
2550 static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) 2557 static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
2551 { 2558 {
2552 clear_ti_thread_flag(task_thread_info(tsk), flag); 2559 clear_ti_thread_flag(task_thread_info(tsk), flag);
2553 } 2560 }
2554 2561
2555 static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) 2562 static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
2556 { 2563 {
2557 return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); 2564 return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
2558 } 2565 }
2559 2566
2560 static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) 2567 static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
2561 { 2568 {
2562 return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); 2569 return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
2563 } 2570 }
2564 2571
2565 static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) 2572 static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
2566 { 2573 {
2567 return test_ti_thread_flag(task_thread_info(tsk), flag); 2574 return test_ti_thread_flag(task_thread_info(tsk), flag);
2568 } 2575 }
2569 2576
2570 static inline void set_tsk_need_resched(struct task_struct *tsk) 2577 static inline void set_tsk_need_resched(struct task_struct *tsk)
2571 { 2578 {
2572 set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); 2579 set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
2573 } 2580 }
2574 2581
2575 static inline void clear_tsk_need_resched(struct task_struct *tsk) 2582 static inline void clear_tsk_need_resched(struct task_struct *tsk)
2576 { 2583 {
2577 clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); 2584 clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
2578 } 2585 }
2579 2586
2580 static inline int test_tsk_need_resched(struct task_struct *tsk) 2587 static inline int test_tsk_need_resched(struct task_struct *tsk)
2581 { 2588 {
2582 return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); 2589 return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
2583 } 2590 }
2584 2591
2585 static inline int restart_syscall(void) 2592 static inline int restart_syscall(void)
2586 { 2593 {
2587 set_tsk_thread_flag(current, TIF_SIGPENDING); 2594 set_tsk_thread_flag(current, TIF_SIGPENDING);
2588 return -ERESTARTNOINTR; 2595 return -ERESTARTNOINTR;
2589 } 2596 }
2590 2597
2591 static inline int signal_pending(struct task_struct *p) 2598 static inline int signal_pending(struct task_struct *p)
2592 { 2599 {
2593 return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); 2600 return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
2594 } 2601 }
2595 2602
2596 static inline int __fatal_signal_pending(struct task_struct *p) 2603 static inline int __fatal_signal_pending(struct task_struct *p)
2597 { 2604 {
2598 return unlikely(sigismember(&p->pending.signal, SIGKILL)); 2605 return unlikely(sigismember(&p->pending.signal, SIGKILL));
2599 } 2606 }
2600 2607
2601 static inline int fatal_signal_pending(struct task_struct *p) 2608 static inline int fatal_signal_pending(struct task_struct *p)
2602 { 2609 {
2603 return signal_pending(p) && __fatal_signal_pending(p); 2610 return signal_pending(p) && __fatal_signal_pending(p);
2604 } 2611 }
2605 2612
2606 static inline int signal_pending_state(long state, struct task_struct *p) 2613 static inline int signal_pending_state(long state, struct task_struct *p)
2607 { 2614 {
2608 if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) 2615 if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
2609 return 0; 2616 return 0;
2610 if (!signal_pending(p)) 2617 if (!signal_pending(p))
2611 return 0; 2618 return 0;
2612 2619
2613 return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); 2620 return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
2614 } 2621 }
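Editor's note: these helpers are typically consumed from a hand-rolled wait loop; a minimal sketch of the interruptible form (hypothetical condition and function name):

	/* Sketch: sleep until *done becomes true, bailing out with
	 * -ERESTARTSYS if a signal arrives first. */
	static int wait_until_done(int *done)
	{
		int ret = 0;

		for (;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (*done)
				break;
			if (signal_pending(current)) {
				ret = -ERESTARTSYS;
				break;
			}
			schedule();
		}
		__set_current_state(TASK_RUNNING);
		return ret;
	}

A TASK_KILLABLE variant would test fatal_signal_pending() instead, which is exactly the distinction signal_pending_state() encodes for the scheduler.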
2615 2622
2616 static inline int need_resched(void) 2623 static inline int need_resched(void)
2617 { 2624 {
2618 return unlikely(test_thread_flag(TIF_NEED_RESCHED)); 2625 return unlikely(test_thread_flag(TIF_NEED_RESCHED));
2619 } 2626 }
2620 2627
2621 /* 2628 /*
2622 * cond_resched() and cond_resched_lock(): latency reduction via 2629 * cond_resched() and cond_resched_lock(): latency reduction via
2623 * explicit rescheduling in places that are safe. The return 2630 * explicit rescheduling in places that are safe. The return
2624 * value indicates whether a reschedule was done in fact. 2631 * value indicates whether a reschedule was done in fact.
2625 * cond_resched_lock() will drop the spinlock before scheduling, 2632 * cond_resched_lock() will drop the spinlock before scheduling,
2626 * cond_resched_softirq() will enable bhs before scheduling. 2633 * cond_resched_softirq() will enable bhs before scheduling.
2627 */ 2634 */
2628 extern int _cond_resched(void); 2635 extern int _cond_resched(void);
2629 2636
2630 #define cond_resched() ({ \ 2637 #define cond_resched() ({ \
2631 __might_sleep(__FILE__, __LINE__, 0); \ 2638 __might_sleep(__FILE__, __LINE__, 0); \
2632 _cond_resched(); \ 2639 _cond_resched(); \
2633 }) 2640 })
2634 2641
2635 extern int __cond_resched_lock(spinlock_t *lock); 2642 extern int __cond_resched_lock(spinlock_t *lock);
2636 2643
2637 #ifdef CONFIG_PREEMPT_COUNT 2644 #ifdef CONFIG_PREEMPT_COUNT
2638 #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET 2645 #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET
2639 #else 2646 #else
2640 #define PREEMPT_LOCK_OFFSET 0 2647 #define PREEMPT_LOCK_OFFSET 0
2641 #endif 2648 #endif
2642 2649
2643 #define cond_resched_lock(lock) ({ \ 2650 #define cond_resched_lock(lock) ({ \
2644 __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ 2651 __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
2645 __cond_resched_lock(lock); \ 2652 __cond_resched_lock(lock); \
2646 }) 2653 })
2647 2654
2648 extern int __cond_resched_softirq(void); 2655 extern int __cond_resched_softirq(void);
2649 2656
2650 #define cond_resched_softirq() ({ \ 2657 #define cond_resched_softirq() ({ \
2651 __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ 2658 __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
2652 __cond_resched_softirq(); \ 2659 __cond_resched_softirq(); \
2653 }) 2660 })
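Editor's note: a brief sketch of the plain process-context form (hypothetical helper); cond_resched() is the common case, while cond_resched_lock() additionally drops and retakes the given spinlock when it does reschedule:

	/* Sketch: bound the scheduling latency of a long page-zeroing loop. */
	static void zero_pages(struct page **pages, unsigned long nr)
	{
		unsigned long i;

		for (i = 0; i < nr; i++) {
			clear_highpage(pages[i]);
			cond_resched();	/* may sleep; __might_sleep() checks the context */
		}
	}

Inside a spinlocked walk the same spot would call cond_resched_lock(lock) and treat a nonzero return as "the lock was dropped, revalidate the data structure before continuing".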
2654 2661
2655 /* 2662 /*
2656 * Does a critical section need to be broken due to another 2663 * Does a critical section need to be broken due to another
2657 * task waiting?: (technically does not depend on CONFIG_PREEMPT, 2664 * task waiting?: (technically does not depend on CONFIG_PREEMPT,
2658 * but a general need for low latency) 2665 * but a general need for low latency)
2659 */ 2666 */
2660 static inline int spin_needbreak(spinlock_t *lock) 2667 static inline int spin_needbreak(spinlock_t *lock)
2661 { 2668 {
2662 #ifdef CONFIG_PREEMPT 2669 #ifdef CONFIG_PREEMPT
2663 return spin_is_contended(lock); 2670 return spin_is_contended(lock);
2664 #else 2671 #else
2665 return 0; 2672 return 0;
2666 #endif 2673 #endif
2667 } 2674 }
2668 2675
2669 /* 2676 /*
2670 * Thread group CPU time accounting. 2677 * Thread group CPU time accounting.
2671 */ 2678 */
2672 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); 2679 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
2673 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); 2680 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
2674 2681
2675 static inline void thread_group_cputime_init(struct signal_struct *sig) 2682 static inline void thread_group_cputime_init(struct signal_struct *sig)
2676 { 2683 {
2677 raw_spin_lock_init(&sig->cputimer.lock); 2684 raw_spin_lock_init(&sig->cputimer.lock);
2678 } 2685 }
2679 2686
2680 /* 2687 /*
2681 * Reevaluate whether the task has signals pending delivery. 2688 * Reevaluate whether the task has signals pending delivery.
2682 * Wake the task if so. 2689 * Wake the task if so.
2683 * This is required every time the blocked sigset_t changes. 2690 * This is required every time the blocked sigset_t changes.
2684 * callers must hold sighand->siglock. 2691 * callers must hold sighand->siglock.
2685 */ 2692 */
2686 extern void recalc_sigpending_and_wake(struct task_struct *t); 2693 extern void recalc_sigpending_and_wake(struct task_struct *t);
2687 extern void recalc_sigpending(void); 2694 extern void recalc_sigpending(void);
2688 2695
2689 extern void signal_wake_up(struct task_struct *t, int resume_stopped); 2696 extern void signal_wake_up(struct task_struct *t, int resume_stopped);
2690 2697
2691 /* 2698 /*
2692 * Wrappers for p->thread_info->cpu access. No-op on UP. 2699 * Wrappers for p->thread_info->cpu access. No-op on UP.
2693 */ 2700 */
2694 #ifdef CONFIG_SMP 2701 #ifdef CONFIG_SMP
2695 2702
2696 static inline unsigned int task_cpu(const struct task_struct *p) 2703 static inline unsigned int task_cpu(const struct task_struct *p)
2697 { 2704 {
2698 return task_thread_info(p)->cpu; 2705 return task_thread_info(p)->cpu;
2699 } 2706 }
2700 2707
2701 extern void set_task_cpu(struct task_struct *p, unsigned int cpu); 2708 extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
2702 2709
2703 #else 2710 #else
2704 2711
2705 static inline unsigned int task_cpu(const struct task_struct *p) 2712 static inline unsigned int task_cpu(const struct task_struct *p)
2706 { 2713 {
2707 return 0; 2714 return 0;
2708 } 2715 }
2709 2716
2710 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) 2717 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
2711 { 2718 {
2712 } 2719 }
2713 2720
2714 #endif /* CONFIG_SMP */ 2721 #endif /* CONFIG_SMP */
2715 2722
2716 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); 2723 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
2717 extern long sched_getaffinity(pid_t pid, struct cpumask *mask); 2724 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
2718 2725
2719 extern void normalize_rt_tasks(void); 2726 extern void normalize_rt_tasks(void);
2720 2727
2721 #ifdef CONFIG_CGROUP_SCHED 2728 #ifdef CONFIG_CGROUP_SCHED
2722 2729
2723 extern struct task_group root_task_group; 2730 extern struct task_group root_task_group;
2724 2731
2725 extern struct task_group *sched_create_group(struct task_group *parent); 2732 extern struct task_group *sched_create_group(struct task_group *parent);
2726 extern void sched_destroy_group(struct task_group *tg); 2733 extern void sched_destroy_group(struct task_group *tg);
2727 extern void sched_move_task(struct task_struct *tsk); 2734 extern void sched_move_task(struct task_struct *tsk);
2728 #ifdef CONFIG_FAIR_GROUP_SCHED 2735 #ifdef CONFIG_FAIR_GROUP_SCHED
2729 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); 2736 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
2730 extern unsigned long sched_group_shares(struct task_group *tg); 2737 extern unsigned long sched_group_shares(struct task_group *tg);
2731 #endif 2738 #endif
2732 #ifdef CONFIG_RT_GROUP_SCHED 2739 #ifdef CONFIG_RT_GROUP_SCHED
2733 extern int sched_group_set_rt_runtime(struct task_group *tg, 2740 extern int sched_group_set_rt_runtime(struct task_group *tg,
2734 long rt_runtime_us); 2741 long rt_runtime_us);
2735 extern long sched_group_rt_runtime(struct task_group *tg); 2742 extern long sched_group_rt_runtime(struct task_group *tg);
2736 extern int sched_group_set_rt_period(struct task_group *tg, 2743 extern int sched_group_set_rt_period(struct task_group *tg,
2737 long rt_period_us); 2744 long rt_period_us);
2738 extern long sched_group_rt_period(struct task_group *tg); 2745 extern long sched_group_rt_period(struct task_group *tg);
2739 extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); 2746 extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
2740 #endif 2747 #endif
2741 #endif /* CONFIG_CGROUP_SCHED */ 2748 #endif /* CONFIG_CGROUP_SCHED */
2742 2749
2743 extern int task_can_switch_user(struct user_struct *up, 2750 extern int task_can_switch_user(struct user_struct *up,
2744 struct task_struct *tsk); 2751 struct task_struct *tsk);
2745 2752
2746 #ifdef CONFIG_TASK_XACCT 2753 #ifdef CONFIG_TASK_XACCT
2747 static inline void add_rchar(struct task_struct *tsk, ssize_t amt) 2754 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
2748 { 2755 {
2749 tsk->ioac.rchar += amt; 2756 tsk->ioac.rchar += amt;
2750 } 2757 }
2751 2758
2752 static inline void add_wchar(struct task_struct *tsk, ssize_t amt) 2759 static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
2753 { 2760 {
2754 tsk->ioac.wchar += amt; 2761 tsk->ioac.wchar += amt;
2755 } 2762 }
2756 2763
2757 static inline void inc_syscr(struct task_struct *tsk) 2764 static inline void inc_syscr(struct task_struct *tsk)
2758 { 2765 {
2759 tsk->ioac.syscr++; 2766 tsk->ioac.syscr++;
2760 } 2767 }
2761 2768
2762 static inline void inc_syscw(struct task_struct *tsk) 2769 static inline void inc_syscw(struct task_struct *tsk)
2763 { 2770 {
2764 tsk->ioac.syscw++; 2771 tsk->ioac.syscw++;
2765 } 2772 }
2766 #else 2773 #else
2767 static inline void add_rchar(struct task_struct *tsk, ssize_t amt) 2774 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
2768 { 2775 {
2769 } 2776 }
2770 2777
2771 static inline void add_wchar(struct task_struct *tsk, ssize_t amt) 2778 static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
2772 { 2779 {
2773 } 2780 }
2774 2781
2775 static inline void inc_syscr(struct task_struct *tsk) 2782 static inline void inc_syscr(struct task_struct *tsk)
2776 { 2783 {
2777 } 2784 }
2778 2785
2779 static inline void inc_syscw(struct task_struct *tsk) 2786 static inline void inc_syscw(struct task_struct *tsk)
2780 { 2787 {
2781 } 2788 }
2782 #endif 2789 #endif
2783 2790
2784 #ifndef TASK_SIZE_OF 2791 #ifndef TASK_SIZE_OF
2785 #define TASK_SIZE_OF(tsk) TASK_SIZE 2792 #define TASK_SIZE_OF(tsk) TASK_SIZE
2786 #endif 2793 #endif
2787 2794
2788 #ifdef CONFIG_MM_OWNER 2795 #ifdef CONFIG_MM_OWNER
2789 extern void mm_update_next_owner(struct mm_struct *mm); 2796 extern void mm_update_next_owner(struct mm_struct *mm);
2790 extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); 2797 extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
2791 #else 2798 #else
2792 static inline void mm_update_next_owner(struct mm_struct *mm) 2799 static inline void mm_update_next_owner(struct mm_struct *mm)
2793 { 2800 {
2794 } 2801 }
2795 2802
2796 static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) 2803 static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
2797 { 2804 {
2798 } 2805 }
2799 #endif /* CONFIG_MM_OWNER */ 2806 #endif /* CONFIG_MM_OWNER */
2800 2807
2801 static inline unsigned long task_rlimit(const struct task_struct *tsk, 2808 static inline unsigned long task_rlimit(const struct task_struct *tsk,
2802 unsigned int limit) 2809 unsigned int limit)
2803 { 2810 {
2804 return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur); 2811 return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
2805 } 2812 }
2806 2813
2807 static inline unsigned long task_rlimit_max(const struct task_struct *tsk, 2814 static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
2808 unsigned int limit) 2815 unsigned int limit)
2809 { 2816 {
2810 return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max); 2817 return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max);
2811 } 2818 }
2812 2819
2813 static inline unsigned long rlimit(unsigned int limit) 2820 static inline unsigned long rlimit(unsigned int limit)
2814 { 2821 {
2815 return task_rlimit(current, limit); 2822 return task_rlimit(current, limit);
2816 } 2823 }
2817 2824
2818 static inline unsigned long rlimit_max(unsigned int limit) 2825 static inline unsigned long rlimit_max(unsigned int limit)
2819 { 2826 {
2820 return task_rlimit_max(current, limit); 2827 return task_rlimit_max(current, limit);
2821 } 2828 }
2822 2829
2823 #endif /* __KERNEL__ */ 2830 #endif /* __KERNEL__ */
2824 2831
2825 #endif 2832 #endif
2826 2833
1 /* 1 /*
2 * linux/kernel/softirq.c 2 * linux/kernel/softirq.c
3 * 3 *
4 * Copyright (C) 1992 Linus Torvalds 4 * Copyright (C) 1992 Linus Torvalds
5 * 5 *
6 * Distribute under GPLv2. 6 * Distribute under GPLv2.
7 * 7 *
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
9 * 9 *
10 * Remote softirq infrastructure is by Jens Axboe. 10 * Remote softirq infrastructure is by Jens Axboe.
11 */ 11 */
12 12
13 #include <linux/export.h> 13 #include <linux/export.h>
14 #include <linux/kernel_stat.h> 14 #include <linux/kernel_stat.h>
15 #include <linux/interrupt.h> 15 #include <linux/interrupt.h>
16 #include <linux/init.h> 16 #include <linux/init.h>
17 #include <linux/mm.h> 17 #include <linux/mm.h>
18 #include <linux/notifier.h> 18 #include <linux/notifier.h>
19 #include <linux/percpu.h> 19 #include <linux/percpu.h>
20 #include <linux/cpu.h> 20 #include <linux/cpu.h>
21 #include <linux/freezer.h> 21 #include <linux/freezer.h>
22 #include <linux/kthread.h> 22 #include <linux/kthread.h>
23 #include <linux/rcupdate.h> 23 #include <linux/rcupdate.h>
24 #include <linux/ftrace.h> 24 #include <linux/ftrace.h>
25 #include <linux/smp.h> 25 #include <linux/smp.h>
26 #include <linux/tick.h> 26 #include <linux/tick.h>
27 27
28 #define CREATE_TRACE_POINTS 28 #define CREATE_TRACE_POINTS
29 #include <trace/events/irq.h> 29 #include <trace/events/irq.h>
30 30
31 #include <asm/irq.h> 31 #include <asm/irq.h>
32 /* 32 /*
33 - No shared variables, all the data are CPU local. 33 - No shared variables, all the data are CPU local.
34 - If a softirq needs serialization, let it serialize itself 34 - If a softirq needs serialization, let it serialize itself
35 by its own spinlocks. 35 by its own spinlocks.
36 - Even if softirq is serialized, only local cpu is marked for 36 - Even if softirq is serialized, only local cpu is marked for
37 execution. Hence, we get something sort of weak cpu binding. 37 execution. Hence, we get something sort of weak cpu binding.
38 Though it is still not clear whether it will result in better 38 Though it is still not clear whether it will result in better
39 locality or not. 39 locality or not.
40 40
41 Examples: 41 Examples:
42 - NET RX softirq. It is multithreaded and does not require 42 - NET RX softirq. It is multithreaded and does not require
43 any global serialization. 43 any global serialization.
44 - NET TX softirq. It kicks software netdevice queues, hence 44 - NET TX softirq. It kicks software netdevice queues, hence
45 it is logically serialized per device, but this serialization 45 it is logically serialized per device, but this serialization
46 is invisible to common code. 46 is invisible to common code.
47 - Tasklets: serialized wrt itself. 47 - Tasklets: serialized wrt itself.
48 */ 48 */
49 49
50 #ifndef __ARCH_IRQ_STAT 50 #ifndef __ARCH_IRQ_STAT
51 irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; 51 irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
52 EXPORT_SYMBOL(irq_stat); 52 EXPORT_SYMBOL(irq_stat);
53 #endif 53 #endif
54 54
55 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; 55 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
56 56
57 DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57 DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59 char *softirq_to_name[NR_SOFTIRQS] = { 59 char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62 }; 62 };
63 63
64 /* 64 /*
65 * we cannot loop indefinitely here to avoid userspace starvation, 65 * we cannot loop indefinitely here to avoid userspace starvation,
66 * but we also don't want to introduce a worst case 1/HZ latency 66 * but we also don't want to introduce a worst case 1/HZ latency
67 * to the pending events, so let the scheduler balance 67 * to the pending events, so let the scheduler balance
68 * the softirq load for us. 68 * the softirq load for us.
69 */ 69 */
70 static void wakeup_softirqd(void) 70 static void wakeup_softirqd(void)
71 { 71 {
72 /* Interrupts are disabled: no need to stop preemption */ 72 /* Interrupts are disabled: no need to stop preemption */
73 struct task_struct *tsk = __this_cpu_read(ksoftirqd); 73 struct task_struct *tsk = __this_cpu_read(ksoftirqd);
74 74
75 if (tsk && tsk->state != TASK_RUNNING) 75 if (tsk && tsk->state != TASK_RUNNING)
76 wake_up_process(tsk); 76 wake_up_process(tsk);
77 } 77 }
78 78
79 /* 79 /*
80 * preempt_count and SOFTIRQ_OFFSET usage: 80 * preempt_count and SOFTIRQ_OFFSET usage:
81 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving 81 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
82 * softirq processing. 82 * softirq processing.
83 * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) 83 * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
84 * on local_bh_disable or local_bh_enable. 84 * on local_bh_disable or local_bh_enable.
85 * This lets us distinguish between whether we are currently processing 85 * This lets us distinguish between whether we are currently processing
86 * softirq and whether we just have bh disabled. 86 * softirq and whether we just have bh disabled.
87 */ 87 */
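Editor's note: for reference, the consumers of this distinction are the context helpers in include/linux/hardirq.h, roughly as follows (paraphrased, not part of this patch):

	/* in_softirq() is nonzero both while handlers run (SOFTIRQ_OFFSET step)
	 * and inside a local_bh_disable() section (SOFTIRQ_DISABLE_OFFSET step);
	 * in_serving_softirq() isolates the "actually processing" case. */
	#define in_softirq()		(softirq_count())
	#define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)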
88 88
89 /* 89 /*
90 * This one is for softirq.c-internal use, 90 * This one is for softirq.c-internal use,
91 * where hardirqs are disabled legitimately: 91 * where hardirqs are disabled legitimately:
92 */ 92 */
93 #ifdef CONFIG_TRACE_IRQFLAGS 93 #ifdef CONFIG_TRACE_IRQFLAGS
94 static void __local_bh_disable(unsigned long ip, unsigned int cnt) 94 static void __local_bh_disable(unsigned long ip, unsigned int cnt)
95 { 95 {
96 unsigned long flags; 96 unsigned long flags;
97 97
98 WARN_ON_ONCE(in_irq()); 98 WARN_ON_ONCE(in_irq());
99 99
100 raw_local_irq_save(flags); 100 raw_local_irq_save(flags);
101 /* 101 /*
102 * The preempt tracer hooks into add_preempt_count and will break 102 * The preempt tracer hooks into add_preempt_count and will break
103 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET 103 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
104 * is set and before current->softirq_enabled is cleared. 104 * is set and before current->softirq_enabled is cleared.
105 * We must manually increment preempt_count here and manually 105 * We must manually increment preempt_count here and manually
106 * call the trace_preempt_off later. 106 * call the trace_preempt_off later.
107 */ 107 */
108 preempt_count() += cnt; 108 preempt_count() += cnt;
109 /* 109 /*
110 * Were softirqs turned off above: 110 * Were softirqs turned off above:
111 */ 111 */
112 if (softirq_count() == cnt) 112 if (softirq_count() == cnt)
113 trace_softirqs_off(ip); 113 trace_softirqs_off(ip);
114 raw_local_irq_restore(flags); 114 raw_local_irq_restore(flags);
115 115
116 if (preempt_count() == cnt) 116 if (preempt_count() == cnt)
117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
118 } 118 }
119 #else /* !CONFIG_TRACE_IRQFLAGS */ 119 #else /* !CONFIG_TRACE_IRQFLAGS */
120 static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) 120 static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
121 { 121 {
122 add_preempt_count(cnt); 122 add_preempt_count(cnt);
123 barrier(); 123 barrier();
124 } 124 }
125 #endif /* CONFIG_TRACE_IRQFLAGS */ 125 #endif /* CONFIG_TRACE_IRQFLAGS */
126 126
127 void local_bh_disable(void) 127 void local_bh_disable(void)
128 { 128 {
129 __local_bh_disable((unsigned long)__builtin_return_address(0), 129 __local_bh_disable((unsigned long)__builtin_return_address(0),
130 SOFTIRQ_DISABLE_OFFSET); 130 SOFTIRQ_DISABLE_OFFSET);
131 } 131 }
132 132
133 EXPORT_SYMBOL(local_bh_disable); 133 EXPORT_SYMBOL(local_bh_disable);
134 134
135 static void __local_bh_enable(unsigned int cnt) 135 static void __local_bh_enable(unsigned int cnt)
136 { 136 {
137 WARN_ON_ONCE(in_irq()); 137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled()); 138 WARN_ON_ONCE(!irqs_disabled());
139 139
140 if (softirq_count() == cnt) 140 if (softirq_count() == cnt)
141 trace_softirqs_on((unsigned long)__builtin_return_address(0)); 141 trace_softirqs_on((unsigned long)__builtin_return_address(0));
142 sub_preempt_count(cnt); 142 sub_preempt_count(cnt);
143 } 143 }
144 144
145 /* 145 /*
146 * Special-case - softirqs can safely be enabled in 146 * Special-case - softirqs can safely be enabled in
147 * cond_resched_softirq(), or by __do_softirq(), 147 * cond_resched_softirq(), or by __do_softirq(),
148 * without processing still-pending softirqs: 148 * without processing still-pending softirqs:
149 */ 149 */
150 void _local_bh_enable(void) 150 void _local_bh_enable(void)
151 { 151 {
152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); 152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
153 } 153 }
154 154
155 EXPORT_SYMBOL(_local_bh_enable); 155 EXPORT_SYMBOL(_local_bh_enable);
156 156
157 static inline void _local_bh_enable_ip(unsigned long ip) 157 static inline void _local_bh_enable_ip(unsigned long ip)
158 { 158 {
159 WARN_ON_ONCE(in_irq() || irqs_disabled()); 159 WARN_ON_ONCE(in_irq() || irqs_disabled());
160 #ifdef CONFIG_TRACE_IRQFLAGS 160 #ifdef CONFIG_TRACE_IRQFLAGS
161 local_irq_disable(); 161 local_irq_disable();
162 #endif 162 #endif
163 /* 163 /*
164 * Are softirqs going to be turned on now: 164 * Are softirqs going to be turned on now:
165 */ 165 */
166 if (softirq_count() == SOFTIRQ_DISABLE_OFFSET) 166 if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
167 trace_softirqs_on(ip); 167 trace_softirqs_on(ip);
168 /* 168 /*
169 * Keep preemption disabled until we are done with 169 * Keep preemption disabled until we are done with
170 * softirq processing: 170 * softirq processing:
171 */ 171 */
172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); 172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
173 173
174 if (unlikely(!in_interrupt() && local_softirq_pending())) 174 if (unlikely(!in_interrupt() && local_softirq_pending()))
175 do_softirq(); 175 do_softirq();
176 176
177 dec_preempt_count(); 177 dec_preempt_count();
178 #ifdef CONFIG_TRACE_IRQFLAGS 178 #ifdef CONFIG_TRACE_IRQFLAGS
179 local_irq_enable(); 179 local_irq_enable();
180 #endif 180 #endif
181 preempt_check_resched(); 181 preempt_check_resched();
182 } 182 }
183 183
184 void local_bh_enable(void) 184 void local_bh_enable(void)
185 { 185 {
186 _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); 186 _local_bh_enable_ip((unsigned long)__builtin_return_address(0));
187 } 187 }
188 EXPORT_SYMBOL(local_bh_enable); 188 EXPORT_SYMBOL(local_bh_enable);
189 189
190 void local_bh_enable_ip(unsigned long ip) 190 void local_bh_enable_ip(unsigned long ip)
191 { 191 {
192 _local_bh_enable_ip(ip); 192 _local_bh_enable_ip(ip);
193 } 193 }
194 EXPORT_SYMBOL(local_bh_enable_ip); 194 EXPORT_SYMBOL(local_bh_enable_ip);
195 195
196 /* 196 /*
197 * We restart softirq processing MAX_SOFTIRQ_RESTART times, 197 * We restart softirq processing MAX_SOFTIRQ_RESTART times,
198 * and we fall back to softirqd after that. 198 * and we fall back to softirqd after that.
199 * 199 *
200 * This number has been established via experimentation. 200 * This number has been established via experimentation.
201 * The two things to balance are latency and fairness - 201 * The two things to balance are latency and fairness -
202 * we want to handle softirqs as soon as possible, but they 202 * we want to handle softirqs as soon as possible, but they
203 * should not be able to lock up the box. 203 * should not be able to lock up the box.
204 */ 204 */
205 #define MAX_SOFTIRQ_RESTART 10 205 #define MAX_SOFTIRQ_RESTART 10
206 206
207 asmlinkage void __do_softirq(void) 207 asmlinkage void __do_softirq(void)
208 { 208 {
209 struct softirq_action *h; 209 struct softirq_action *h;
210 __u32 pending; 210 __u32 pending;
211 int max_restart = MAX_SOFTIRQ_RESTART; 211 int max_restart = MAX_SOFTIRQ_RESTART;
212 int cpu; 212 int cpu;
213 unsigned long old_flags = current->flags;
213 214
215 /*
216 * Mask out PF_MEMALLOC as the current task context is borrowed for the
217 * softirq. A softirq handler such as network RX might set PF_MEMALLOC
218 * again if the socket is related to swap.
219 */
220 current->flags &= ~PF_MEMALLOC;
221
214 pending = local_softirq_pending(); 222 pending = local_softirq_pending();
215 account_system_vtime(current); 223 account_system_vtime(current);
216 224
217 __local_bh_disable((unsigned long)__builtin_return_address(0), 225 __local_bh_disable((unsigned long)__builtin_return_address(0),
218 SOFTIRQ_OFFSET); 226 SOFTIRQ_OFFSET);
219 lockdep_softirq_enter(); 227 lockdep_softirq_enter();
220 228
221 cpu = smp_processor_id(); 229 cpu = smp_processor_id();
222 restart: 230 restart:
223 /* Reset the pending bitmask before enabling irqs */ 231 /* Reset the pending bitmask before enabling irqs */
224 set_softirq_pending(0); 232 set_softirq_pending(0);
225 233
226 local_irq_enable(); 234 local_irq_enable();
227 235
228 h = softirq_vec; 236 h = softirq_vec;
229 237
230 do { 238 do {
231 if (pending & 1) { 239 if (pending & 1) {
232 unsigned int vec_nr = h - softirq_vec; 240 unsigned int vec_nr = h - softirq_vec;
233 int prev_count = preempt_count(); 241 int prev_count = preempt_count();
234 242
235 kstat_incr_softirqs_this_cpu(vec_nr); 243 kstat_incr_softirqs_this_cpu(vec_nr);
236 244
237 trace_softirq_entry(vec_nr); 245 trace_softirq_entry(vec_nr);
238 h->action(h); 246 h->action(h);
239 trace_softirq_exit(vec_nr); 247 trace_softirq_exit(vec_nr);
240 if (unlikely(prev_count != preempt_count())) { 248 if (unlikely(prev_count != preempt_count())) {
241 printk(KERN_ERR "huh, entered softirq %u %s %p" 249 printk(KERN_ERR "huh, entered softirq %u %s %p"
242 "with preempt_count %08x," 250 "with preempt_count %08x,"
243 " exited with %08x?\n", vec_nr, 251 " exited with %08x?\n", vec_nr,
244 softirq_to_name[vec_nr], h->action, 252 softirq_to_name[vec_nr], h->action,
245 prev_count, preempt_count()); 253 prev_count, preempt_count());
246 preempt_count() = prev_count; 254 preempt_count() = prev_count;
247 } 255 }
248 256
249 rcu_bh_qs(cpu); 257 rcu_bh_qs(cpu);
250 } 258 }
251 h++; 259 h++;
252 pending >>= 1; 260 pending >>= 1;
253 } while (pending); 261 } while (pending);
254 262
255 local_irq_disable(); 263 local_irq_disable();
256 264
257 pending = local_softirq_pending(); 265 pending = local_softirq_pending();
258 if (pending && --max_restart) 266 if (pending && --max_restart)
259 goto restart; 267 goto restart;
260 268
261 if (pending) 269 if (pending)
262 wakeup_softirqd(); 270 wakeup_softirqd();
263 271
264 lockdep_softirq_exit(); 272 lockdep_softirq_exit();
265 273
266 account_system_vtime(current); 274 account_system_vtime(current);
267 __local_bh_enable(SOFTIRQ_OFFSET); 275 __local_bh_enable(SOFTIRQ_OFFSET);
276 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
268 } 277 }
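Editor's note: the save/clear/restore added above leans on tsk_restore_flags() from the parent commit b37f1dd0f5; its approximate shape (a sketch, the authoritative definition lives in include/linux/sched.h) is:

	/* Sketch: put back only the bits named in @flags (here PF_MEMALLOC),
	 * exactly as the preempted task had them before the softirq ran. */
	static inline void tsk_restore_flags(struct task_struct *task,
					     unsigned long orig_flags, unsigned long flags)
	{
		task->flags &= ~flags;
		task->flags |= orig_flags & flags;
	}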
269 278
270 #ifndef __ARCH_HAS_DO_SOFTIRQ 279 #ifndef __ARCH_HAS_DO_SOFTIRQ
271 280
272 asmlinkage void do_softirq(void) 281 asmlinkage void do_softirq(void)
273 { 282 {
274 __u32 pending; 283 __u32 pending;
275 unsigned long flags; 284 unsigned long flags;
276 285
277 if (in_interrupt()) 286 if (in_interrupt())
278 return; 287 return;
279 288
280 local_irq_save(flags); 289 local_irq_save(flags);
281 290
282 pending = local_softirq_pending(); 291 pending = local_softirq_pending();
283 292
284 if (pending) 293 if (pending)
285 __do_softirq(); 294 __do_softirq();
286 295
287 local_irq_restore(flags); 296 local_irq_restore(flags);
288 } 297 }
289 298
290 #endif 299 #endif
291 300
292 /* 301 /*
293 * Enter an interrupt context. 302 * Enter an interrupt context.
294 */ 303 */
295 void irq_enter(void) 304 void irq_enter(void)
296 { 305 {
297 int cpu = smp_processor_id(); 306 int cpu = smp_processor_id();
298 307
299 rcu_irq_enter(); 308 rcu_irq_enter();
300 if (is_idle_task(current) && !in_interrupt()) { 309 if (is_idle_task(current) && !in_interrupt()) {
301 /* 310 /*
302 * Prevent raise_softirq from needlessly waking up ksoftirqd 311 * Prevent raise_softirq from needlessly waking up ksoftirqd
303 * here, as softirq will be serviced on return from interrupt. 312 * here, as softirq will be serviced on return from interrupt.
304 */ 313 */
305 local_bh_disable(); 314 local_bh_disable();
306 tick_check_idle(cpu); 315 tick_check_idle(cpu);
307 _local_bh_enable(); 316 _local_bh_enable();
308 } 317 }
309 318
310 __irq_enter(); 319 __irq_enter();
311 } 320 }
312 321
313 static inline void invoke_softirq(void) 322 static inline void invoke_softirq(void)
314 { 323 {
315 if (!force_irqthreads) { 324 if (!force_irqthreads) {
316 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 325 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
317 __do_softirq(); 326 __do_softirq();
318 #else 327 #else
319 do_softirq(); 328 do_softirq();
320 #endif 329 #endif
321 } else { 330 } else {
322 __local_bh_disable((unsigned long)__builtin_return_address(0), 331 __local_bh_disable((unsigned long)__builtin_return_address(0),
323 SOFTIRQ_OFFSET); 332 SOFTIRQ_OFFSET);
324 wakeup_softirqd(); 333 wakeup_softirqd();
325 __local_bh_enable(SOFTIRQ_OFFSET); 334 __local_bh_enable(SOFTIRQ_OFFSET);
326 } 335 }
327 } 336 }
328 337
329 /* 338 /*
330 * Exit an interrupt context. Process softirqs if needed and possible: 339 * Exit an interrupt context. Process softirqs if needed and possible:
331 */ 340 */
332 void irq_exit(void) 341 void irq_exit(void)
333 { 342 {
334 account_system_vtime(current); 343 account_system_vtime(current);
335 trace_hardirq_exit(); 344 trace_hardirq_exit();
336 sub_preempt_count(IRQ_EXIT_OFFSET); 345 sub_preempt_count(IRQ_EXIT_OFFSET);
337 if (!in_interrupt() && local_softirq_pending()) 346 if (!in_interrupt() && local_softirq_pending())
338 invoke_softirq(); 347 invoke_softirq();
339 348
340 #ifdef CONFIG_NO_HZ 349 #ifdef CONFIG_NO_HZ
341 /* Make sure that timer wheel updates are propagated */ 350 /* Make sure that timer wheel updates are propagated */
342 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 351 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
343 tick_nohz_irq_exit(); 352 tick_nohz_irq_exit();
344 #endif 353 #endif
345 rcu_irq_exit(); 354 rcu_irq_exit();
346 sched_preempt_enable_no_resched(); 355 sched_preempt_enable_no_resched();
347 } 356 }
348 357
349 /* 358 /*
350 * This function must run with irqs disabled! 359 * This function must run with irqs disabled!
351 */ 360 */
352 inline void raise_softirq_irqoff(unsigned int nr) 361 inline void raise_softirq_irqoff(unsigned int nr)
353 { 362 {
354 __raise_softirq_irqoff(nr); 363 __raise_softirq_irqoff(nr);
355 364
356 /* 365 /*
357 * If we're in an interrupt or softirq, we're done 366 * If we're in an interrupt or softirq, we're done
358 * (this also catches softirq-disabled code). We will 367 * (this also catches softirq-disabled code). We will
359 * actually run the softirq once we return from 368 * actually run the softirq once we return from
360 * the irq or softirq. 369 * the irq or softirq.
361 * 370 *
362 * Otherwise we wake up ksoftirqd to make sure we 371 * Otherwise we wake up ksoftirqd to make sure we
363 * schedule the softirq soon. 372 * schedule the softirq soon.
364 */ 373 */
365 if (!in_interrupt()) 374 if (!in_interrupt())
366 wakeup_softirqd(); 375 wakeup_softirqd();
367 } 376 }
368 377
369 void raise_softirq(unsigned int nr) 378 void raise_softirq(unsigned int nr)
370 { 379 {
371 unsigned long flags; 380 unsigned long flags;
372 381
373 local_irq_save(flags); 382 local_irq_save(flags);
374 raise_softirq_irqoff(nr); 383 raise_softirq_irqoff(nr);
375 local_irq_restore(flags); 384 local_irq_restore(flags);
376 } 385 }
377 386
378 void __raise_softirq_irqoff(unsigned int nr) 387 void __raise_softirq_irqoff(unsigned int nr)
379 { 388 {
380 trace_softirq_raise(nr); 389 trace_softirq_raise(nr);
381 or_softirq_pending(1UL << nr); 390 or_softirq_pending(1UL << nr);
382 } 391 }
383 392
384 void open_softirq(int nr, void (*action)(struct softirq_action *)) 393 void open_softirq(int nr, void (*action)(struct softirq_action *))
385 { 394 {
386 softirq_vec[nr].action = action; 395 softirq_vec[nr].action = action;
387 } 396 }
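Editor's note: a hedged sketch of how a subsystem wires these two calls together. MY_SOFTIRQ and the helper names are hypothetical; real slots are the fixed entries of the NR_SOFTIRQS enum such as NET_RX_SOFTIRQ:

	/* Sketch: register an action for a (hypothetical) softirq slot at
	 * init time, then raise it from interrupt paths to defer work. */
	static void my_softirq_action(struct softirq_action *h)
	{
		/* runs with hardirqs enabled, bottom halves marked as active */
	}

	static void my_subsys_init(void)
	{
		open_softirq(MY_SOFTIRQ, my_softirq_action);	/* MY_SOFTIRQ: hypothetical */
	}

	static void my_device_event(void)
	{
		raise_softirq(MY_SOFTIRQ);	/* or raise_softirq_irqoff() with irqs off */
	}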
388 397
389 /* 398 /*
390 * Tasklets 399 * Tasklets
391 */ 400 */
392 struct tasklet_head 401 struct tasklet_head
393 { 402 {
394 struct tasklet_struct *head; 403 struct tasklet_struct *head;
395 struct tasklet_struct **tail; 404 struct tasklet_struct **tail;
396 }; 405 };
397 406
398 static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); 407 static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
399 static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); 408 static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
400 409
401 void __tasklet_schedule(struct tasklet_struct *t) 410 void __tasklet_schedule(struct tasklet_struct *t)
402 { 411 {
403 unsigned long flags; 412 unsigned long flags;
404 413
405 local_irq_save(flags); 414 local_irq_save(flags);
406 t->next = NULL; 415 t->next = NULL;
407 *__this_cpu_read(tasklet_vec.tail) = t; 416 *__this_cpu_read(tasklet_vec.tail) = t;
408 __this_cpu_write(tasklet_vec.tail, &(t->next)); 417 __this_cpu_write(tasklet_vec.tail, &(t->next));
409 raise_softirq_irqoff(TASKLET_SOFTIRQ); 418 raise_softirq_irqoff(TASKLET_SOFTIRQ);
410 local_irq_restore(flags); 419 local_irq_restore(flags);
411 } 420 }
412 421
413 EXPORT_SYMBOL(__tasklet_schedule); 422 EXPORT_SYMBOL(__tasklet_schedule);
414 423
415 void __tasklet_hi_schedule(struct tasklet_struct *t) 424 void __tasklet_hi_schedule(struct tasklet_struct *t)
416 { 425 {
417 unsigned long flags; 426 unsigned long flags;
418 427
419 local_irq_save(flags); 428 local_irq_save(flags);
420 t->next = NULL; 429 t->next = NULL;
421 *__this_cpu_read(tasklet_hi_vec.tail) = t; 430 *__this_cpu_read(tasklet_hi_vec.tail) = t;
422 __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); 431 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
423 raise_softirq_irqoff(HI_SOFTIRQ); 432 raise_softirq_irqoff(HI_SOFTIRQ);
424 local_irq_restore(flags); 433 local_irq_restore(flags);
425 } 434 }
426 435
427 EXPORT_SYMBOL(__tasklet_hi_schedule); 436 EXPORT_SYMBOL(__tasklet_hi_schedule);
428 437
429 void __tasklet_hi_schedule_first(struct tasklet_struct *t) 438 void __tasklet_hi_schedule_first(struct tasklet_struct *t)
430 { 439 {
431 BUG_ON(!irqs_disabled()); 440 BUG_ON(!irqs_disabled());
432 441
433 t->next = __this_cpu_read(tasklet_hi_vec.head); 442 t->next = __this_cpu_read(tasklet_hi_vec.head);
434 __this_cpu_write(tasklet_hi_vec.head, t); 443 __this_cpu_write(tasklet_hi_vec.head, t);
435 __raise_softirq_irqoff(HI_SOFTIRQ); 444 __raise_softirq_irqoff(HI_SOFTIRQ);
436 } 445 }
437 446
438 EXPORT_SYMBOL(__tasklet_hi_schedule_first); 447 EXPORT_SYMBOL(__tasklet_hi_schedule_first);
439 448
440 static void tasklet_action(struct softirq_action *a) 449 static void tasklet_action(struct softirq_action *a)
441 { 450 {
442 struct tasklet_struct *list; 451 struct tasklet_struct *list;
443 452
444 local_irq_disable(); 453 local_irq_disable();
445 list = __this_cpu_read(tasklet_vec.head); 454 list = __this_cpu_read(tasklet_vec.head);
446 __this_cpu_write(tasklet_vec.head, NULL); 455 __this_cpu_write(tasklet_vec.head, NULL);
447 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); 456 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
448 local_irq_enable(); 457 local_irq_enable();
449 458
450 while (list) { 459 while (list) {
451 struct tasklet_struct *t = list; 460 struct tasklet_struct *t = list;
452 461
453 list = list->next; 462 list = list->next;
454 463
455 if (tasklet_trylock(t)) { 464 if (tasklet_trylock(t)) {
456 if (!atomic_read(&t->count)) { 465 if (!atomic_read(&t->count)) {
457 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) 466 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
458 BUG(); 467 BUG();
459 t->func(t->data); 468 t->func(t->data);
460 tasklet_unlock(t); 469 tasklet_unlock(t);
461 continue; 470 continue;
462 } 471 }
463 tasklet_unlock(t); 472 tasklet_unlock(t);
464 } 473 }
465 474
466 local_irq_disable(); 475 local_irq_disable();
467 t->next = NULL; 476 t->next = NULL;
468 *__this_cpu_read(tasklet_vec.tail) = t; 477 *__this_cpu_read(tasklet_vec.tail) = t;
469 __this_cpu_write(tasklet_vec.tail, &(t->next)); 478 __this_cpu_write(tasklet_vec.tail, &(t->next));
470 __raise_softirq_irqoff(TASKLET_SOFTIRQ); 479 __raise_softirq_irqoff(TASKLET_SOFTIRQ);
471 local_irq_enable(); 480 local_irq_enable();
472 } 481 }
473 } 482 }
474 483
475 static void tasklet_hi_action(struct softirq_action *a) 484 static void tasklet_hi_action(struct softirq_action *a)
476 { 485 {
477 struct tasklet_struct *list; 486 struct tasklet_struct *list;
478 487
479 local_irq_disable(); 488 local_irq_disable();
480 list = __this_cpu_read(tasklet_hi_vec.head); 489 list = __this_cpu_read(tasklet_hi_vec.head);
481 __this_cpu_write(tasklet_hi_vec.head, NULL); 490 __this_cpu_write(tasklet_hi_vec.head, NULL);
482 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); 491 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
483 local_irq_enable(); 492 local_irq_enable();
484 493
485 while (list) { 494 while (list) {
486 struct tasklet_struct *t = list; 495 struct tasklet_struct *t = list;
487 496
488 list = list->next; 497 list = list->next;
489 498
490 if (tasklet_trylock(t)) { 499 if (tasklet_trylock(t)) {
491 if (!atomic_read(&t->count)) { 500 if (!atomic_read(&t->count)) {
492 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) 501 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
493 BUG(); 502 BUG();
494 t->func(t->data); 503 t->func(t->data);
495 tasklet_unlock(t); 504 tasklet_unlock(t);
496 continue; 505 continue;
497 } 506 }
498 tasklet_unlock(t); 507 tasklet_unlock(t);
499 } 508 }
500 509
501 local_irq_disable(); 510 local_irq_disable();
502 t->next = NULL; 511 t->next = NULL;
503 *__this_cpu_read(tasklet_hi_vec.tail) = t; 512 *__this_cpu_read(tasklet_hi_vec.tail) = t;
504 __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); 513 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
505 __raise_softirq_irqoff(HI_SOFTIRQ); 514 __raise_softirq_irqoff(HI_SOFTIRQ);
506 local_irq_enable(); 515 local_irq_enable();
507 } 516 }
508 } 517 }
509 518
510 519
511 void tasklet_init(struct tasklet_struct *t, 520 void tasklet_init(struct tasklet_struct *t,
512 void (*func)(unsigned long), unsigned long data) 521 void (*func)(unsigned long), unsigned long data)
513 { 522 {
514 t->next = NULL; 523 t->next = NULL;
515 t->state = 0; 524 t->state = 0;
516 atomic_set(&t->count, 0); 525 atomic_set(&t->count, 0);
517 t->func = func; 526 t->func = func;
518 t->data = data; 527 t->data = data;
519 } 528 }
520 529
521 EXPORT_SYMBOL(tasklet_init); 530 EXPORT_SYMBOL(tasklet_init);
522 531
523 void tasklet_kill(struct tasklet_struct *t) 532 void tasklet_kill(struct tasklet_struct *t)
524 { 533 {
525 if (in_interrupt()) 534 if (in_interrupt())
526 printk("Attempt to kill tasklet from interrupt\n"); 535 printk("Attempt to kill tasklet from interrupt\n");
527 536
528 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { 537 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
529 do { 538 do {
530 yield(); 539 yield();
531 } while (test_bit(TASKLET_STATE_SCHED, &t->state)); 540 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
532 } 541 }
533 tasklet_unlock_wait(t); 542 tasklet_unlock_wait(t);
534 clear_bit(TASKLET_STATE_SCHED, &t->state); 543 clear_bit(TASKLET_STATE_SCHED, &t->state);
535 } 544 }
536 545
537 EXPORT_SYMBOL(tasklet_kill); 546 EXPORT_SYMBOL(tasklet_kill);
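Editor's note: a minimal driver-side sketch of the tasklet API above (all my_* names are hypothetical):

	/* Sketch: defer per-interrupt work to TASKLET_SOFTIRQ context. */
	static void my_tasklet_fn(unsigned long data)
	{
		/* bottom-half work for the device identified by @data goes here */
	}

	static struct tasklet_struct my_tasklet;

	static void my_setup(void *dev)
	{
		tasklet_init(&my_tasklet, my_tasklet_fn, (unsigned long)dev);
	}

	static void my_hardirq_path(void)
	{
		tasklet_schedule(&my_tasklet);	/* queues onto this CPU's tasklet_vec */
	}

	static void my_teardown(void)
	{
		tasklet_kill(&my_tasklet);	/* never from interrupt context, see above */
	}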
538 547
539 /* 548 /*
540 * tasklet_hrtimer 549 * tasklet_hrtimer
541 */ 550 */
542 551
543 /* 552 /*
544 * The trampoline is called when the hrtimer expires. It schedules a tasklet 553 * The trampoline is called when the hrtimer expires. It schedules a tasklet
545 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended 554 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
546 * hrtimer callback, but from softirq context. 555 * hrtimer callback, but from softirq context.
547 */ 556 */
548 static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) 557 static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
549 { 558 {
550 struct tasklet_hrtimer *ttimer = 559 struct tasklet_hrtimer *ttimer =
551 container_of(timer, struct tasklet_hrtimer, timer); 560 container_of(timer, struct tasklet_hrtimer, timer);
552 561
553 tasklet_hi_schedule(&ttimer->tasklet); 562 tasklet_hi_schedule(&ttimer->tasklet);
554 return HRTIMER_NORESTART; 563 return HRTIMER_NORESTART;
555 } 564 }
556 565
557 /* 566 /*
558 * Helper function which calls the hrtimer callback from 567 * Helper function which calls the hrtimer callback from
559 * tasklet/softirq context 568 * tasklet/softirq context
560 */ 569 */
561 static void __tasklet_hrtimer_trampoline(unsigned long data) 570 static void __tasklet_hrtimer_trampoline(unsigned long data)
562 { 571 {
563 struct tasklet_hrtimer *ttimer = (void *)data; 572 struct tasklet_hrtimer *ttimer = (void *)data;
564 enum hrtimer_restart restart; 573 enum hrtimer_restart restart;
565 574
566 restart = ttimer->function(&ttimer->timer); 575 restart = ttimer->function(&ttimer->timer);
567 if (restart != HRTIMER_NORESTART) 576 if (restart != HRTIMER_NORESTART)
568 hrtimer_restart(&ttimer->timer); 577 hrtimer_restart(&ttimer->timer);
569 } 578 }
570 579
571 /** 580 /**
572 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks 581 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
573 * @ttimer: tasklet_hrtimer which is initialized 582 * @ttimer: tasklet_hrtimer which is initialized
574 * @function: hrtimer callback function which gets called from softirq context 583 * @function: hrtimer callback function which gets called from softirq context
575 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) 584 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
576 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) 585 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
577 */ 586 */
578 void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, 587 void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
579 enum hrtimer_restart (*function)(struct hrtimer *), 588 enum hrtimer_restart (*function)(struct hrtimer *),
580 clockid_t which_clock, enum hrtimer_mode mode) 589 clockid_t which_clock, enum hrtimer_mode mode)
581 { 590 {
582 hrtimer_init(&ttimer->timer, which_clock, mode); 591 hrtimer_init(&ttimer->timer, which_clock, mode);
583 ttimer->timer.function = __hrtimer_tasklet_trampoline; 592 ttimer->timer.function = __hrtimer_tasklet_trampoline;
584 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, 593 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
585 (unsigned long)ttimer); 594 (unsigned long)ttimer);
586 ttimer->function = function; 595 ttimer->function = function;
587 } 596 }
588 EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); 597 EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
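Editor's note: a hedged usage sketch (hypothetical names; tasklet_hrtimer_start() is the companion helper in include/linux/interrupt.h). It arms a callback that fires 10ms later in HI_SOFTIRQ context via the trampolines above:

	static enum hrtimer_restart my_softirq_timer_fn(struct hrtimer *timer)
	{
		/* invoked from tasklet/softirq context, not hardirq context */
		return HRTIMER_NORESTART;
	}

	static struct tasklet_hrtimer my_ttimer;

	static void my_arm_timer(void)
	{
		tasklet_hrtimer_init(&my_ttimer, my_softirq_timer_fn,
				     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		tasklet_hrtimer_start(&my_ttimer, ktime_set(0, 10 * NSEC_PER_MSEC),
				      HRTIMER_MODE_REL);
	}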
589 598
590 /* 599 /*
591 * Remote softirq bits 600 * Remote softirq bits
592 */ 601 */
593 602
594 DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); 603 DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
595 EXPORT_PER_CPU_SYMBOL(softirq_work_list); 604 EXPORT_PER_CPU_SYMBOL(softirq_work_list);
596 605
597 static void __local_trigger(struct call_single_data *cp, int softirq) 606 static void __local_trigger(struct call_single_data *cp, int softirq)
598 { 607 {
599 struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); 608 struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
600 609
601 list_add_tail(&cp->list, head); 610 list_add_tail(&cp->list, head);
602 611
603 /* Trigger the softirq only if the list was previously empty. */ 612 /* Trigger the softirq only if the list was previously empty. */
604 if (head->next == &cp->list) 613 if (head->next == &cp->list)
605 raise_softirq_irqoff(softirq); 614 raise_softirq_irqoff(softirq);
606 } 615 }
607 616
608 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS 617 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
609 static void remote_softirq_receive(void *data) 618 static void remote_softirq_receive(void *data)
610 { 619 {
611 struct call_single_data *cp = data; 620 struct call_single_data *cp = data;
612 unsigned long flags; 621 unsigned long flags;
613 int softirq; 622 int softirq;
614 623
615 softirq = cp->priv; 624 softirq = cp->priv;
616 625
617 local_irq_save(flags); 626 local_irq_save(flags);
618 __local_trigger(cp, softirq); 627 __local_trigger(cp, softirq);
619 local_irq_restore(flags); 628 local_irq_restore(flags);
620 } 629 }
621 630
622 static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) 631 static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
623 { 632 {
624 if (cpu_online(cpu)) { 633 if (cpu_online(cpu)) {
625 cp->func = remote_softirq_receive; 634 cp->func = remote_softirq_receive;
626 cp->info = cp; 635 cp->info = cp;
627 cp->flags = 0; 636 cp->flags = 0;
628 cp->priv = softirq; 637 cp->priv = softirq;
629 638
630 __smp_call_function_single(cpu, cp, 0); 639 __smp_call_function_single(cpu, cp, 0);
631 return 0; 640 return 0;
632 } 641 }
633 return 1; 642 return 1;
634 } 643 }
635 #else /* CONFIG_USE_GENERIC_SMP_HELPERS */ 644 #else /* CONFIG_USE_GENERIC_SMP_HELPERS */
636 static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) 645 static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
637 { 646 {
638 return 1; 647 return 1;
639 } 648 }
640 #endif 649 #endif
641 650
642 /** 651 /**
643 * __send_remote_softirq - try to schedule softirq work on a remote cpu 652 * __send_remote_softirq - try to schedule softirq work on a remote cpu
644 * @cp: private SMP call function data area 653 * @cp: private SMP call function data area
645 * @cpu: the remote cpu 654 * @cpu: the remote cpu
646 * @this_cpu: the currently executing cpu 655 * @this_cpu: the currently executing cpu
647 * @softirq: the softirq for the work 656 * @softirq: the softirq for the work
648 * 657 *
649 * Attempt to schedule softirq work on a remote cpu. If this cannot be 658 * Attempt to schedule softirq work on a remote cpu. If this cannot be
650 * done, the work is instead queued up on the local cpu. 659 * done, the work is instead queued up on the local cpu.
651 * 660 *
652 * Interrupts must be disabled. 661 * Interrupts must be disabled.
653 */ 662 */
654 void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) 663 void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
655 { 664 {
656 if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) 665 if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
657 __local_trigger(cp, softirq); 666 __local_trigger(cp, softirq);
658 } 667 }
659 EXPORT_SYMBOL(__send_remote_softirq); 668 EXPORT_SYMBOL(__send_remote_softirq);
660 669
661 /** 670 /**
662 * send_remote_softirq - try to schedule softirq work on a remote cpu 671 * send_remote_softirq - try to schedule softirq work on a remote cpu
663 * @cp: private SMP call function data area 672 * @cp: private SMP call function data area
664 * @cpu: the remote cpu 673 * @cpu: the remote cpu
665 * @softirq: the softirq for the work 674 * @softirq: the softirq for the work
666 * 675 *
667 * Like __send_remote_softirq except that disabling interrupts and 676 * Like __send_remote_softirq except that disabling interrupts and
668 * computing the current cpu is done for the caller. 677 * computing the current cpu is done for the caller.
669 */ 678 */
670 void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) 679 void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
671 { 680 {
672 unsigned long flags; 681 unsigned long flags;
673 int this_cpu; 682 int this_cpu;
674 683
675 local_irq_save(flags); 684 local_irq_save(flags);
676 this_cpu = smp_processor_id(); 685 this_cpu = smp_processor_id();
677 __send_remote_softirq(cp, cpu, this_cpu, softirq); 686 __send_remote_softirq(cp, cpu, this_cpu, softirq);
678 local_irq_restore(flags); 687 local_irq_restore(flags);
679 } 688 }
680 EXPORT_SYMBOL(send_remote_softirq); 689 EXPORT_SYMBOL(send_remote_softirq);
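The two exported helpers above only queue the call_single_data on the target CPU's softirq_work_list and raise the softirq there; the softirq action itself is expected to drain that per-cpu list. A hedged sketch of a client, not part of this commit: MY_SOFTIRQ, struct my_work and my_softirq_action are invented names (a real user would need its own slot in the softirq enum, registered with open_softirq(MY_SOFTIRQ, my_softirq_action)); only the call_single_data plumbing mirrors the code above.

	#include <linux/interrupt.h>
	#include <linux/smp.h>
	#include <linux/list.h>
	#include <linux/percpu.h>

	struct my_work {
		struct call_single_data csd;	/* must stay valid until the action runs */
		int payload;
	};

	/* Producer: run @w on @target_cpu, or locally if that CPU is offline. */
	static void my_queue_on(struct my_work *w, int target_cpu)
	{
		send_remote_softirq(&w->csd, target_cpu, MY_SOFTIRQ);
	}

	/* Consumer: the softirq action drains this CPU's softirq_work_list entry. */
	static void my_softirq_action(struct softirq_action *h)
	{
		LIST_HEAD(local);
		struct list_head *head;

		local_irq_disable();
		head = &__get_cpu_var(softirq_work_list[MY_SOFTIRQ]);
		list_splice_init(head, &local);
		local_irq_enable();

		while (!list_empty(&local)) {
			struct my_work *w = list_first_entry(&local, struct my_work,
							     csd.list);

			list_del(&w->csd.list);
			/* process w->payload here */
		}
	}

Only the embedded list pointers travel to the remote CPU, so the structure holding the call_single_data must not be freed or reused until the action has taken it off the list.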
681 690
682 static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, 691 static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
683 unsigned long action, void *hcpu) 692 unsigned long action, void *hcpu)
684 { 693 {
685 /* 694 /*
686 * If a CPU goes away, splice its entries to the current CPU 695 * If a CPU goes away, splice its entries to the current CPU
687 * and trigger a run of the softirq 696 * and trigger a run of the softirq
688 */ 697 */
689 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 698 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
690 int cpu = (unsigned long) hcpu; 699 int cpu = (unsigned long) hcpu;
691 int i; 700 int i;
692 701
693 local_irq_disable(); 702 local_irq_disable();
694 for (i = 0; i < NR_SOFTIRQS; i++) { 703 for (i = 0; i < NR_SOFTIRQS; i++) {
695 struct list_head *head = &per_cpu(softirq_work_list[i], cpu); 704 struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
696 struct list_head *local_head; 705 struct list_head *local_head;
697 706
698 if (list_empty(head)) 707 if (list_empty(head))
699 continue; 708 continue;
700 709
701 local_head = &__get_cpu_var(softirq_work_list[i]); 710 local_head = &__get_cpu_var(softirq_work_list[i]);
702 list_splice_init(head, local_head); 711 list_splice_init(head, local_head);
703 raise_softirq_irqoff(i); 712 raise_softirq_irqoff(i);
704 } 713 }
705 local_irq_enable(); 714 local_irq_enable();
706 } 715 }
707 716
708 return NOTIFY_OK; 717 return NOTIFY_OK;
709 } 718 }
710 719
711 static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { 720 static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = {
712 .notifier_call = remote_softirq_cpu_notify, 721 .notifier_call = remote_softirq_cpu_notify,
713 }; 722 };
714 723
715 void __init softirq_init(void) 724 void __init softirq_init(void)
716 { 725 {
717 int cpu; 726 int cpu;
718 727
719 for_each_possible_cpu(cpu) { 728 for_each_possible_cpu(cpu) {
720 int i; 729 int i;
721 730
722 per_cpu(tasklet_vec, cpu).tail = 731 per_cpu(tasklet_vec, cpu).tail =
723 &per_cpu(tasklet_vec, cpu).head; 732 &per_cpu(tasklet_vec, cpu).head;
724 per_cpu(tasklet_hi_vec, cpu).tail = 733 per_cpu(tasklet_hi_vec, cpu).tail =
725 &per_cpu(tasklet_hi_vec, cpu).head; 734 &per_cpu(tasklet_hi_vec, cpu).head;
726 for (i = 0; i < NR_SOFTIRQS; i++) 735 for (i = 0; i < NR_SOFTIRQS; i++)
727 INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); 736 INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
728 } 737 }
729 738
730 register_hotcpu_notifier(&remote_softirq_cpu_notifier); 739 register_hotcpu_notifier(&remote_softirq_cpu_notifier);
731 740
732 open_softirq(TASKLET_SOFTIRQ, tasklet_action); 741 open_softirq(TASKLET_SOFTIRQ, tasklet_action);
733 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 742 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
734 } 743 }
735 744
736 static int run_ksoftirqd(void * __bind_cpu) 745 static int run_ksoftirqd(void * __bind_cpu)
737 { 746 {
738 set_current_state(TASK_INTERRUPTIBLE); 747 set_current_state(TASK_INTERRUPTIBLE);
739 748
740 while (!kthread_should_stop()) { 749 while (!kthread_should_stop()) {
741 preempt_disable(); 750 preempt_disable();
742 if (!local_softirq_pending()) { 751 if (!local_softirq_pending()) {
743 schedule_preempt_disabled(); 752 schedule_preempt_disabled();
744 } 753 }
745 754
746 __set_current_state(TASK_RUNNING); 755 __set_current_state(TASK_RUNNING);
747 756
748 while (local_softirq_pending()) { 757 while (local_softirq_pending()) {
749 /* Preempt disable stops cpu going offline. 758 /* Preempt disable stops cpu going offline.
750 If already offline, we'll be on wrong CPU: 759 If already offline, we'll be on wrong CPU:
751 don't process */ 760 don't process */
752 if (cpu_is_offline((long)__bind_cpu)) 761 if (cpu_is_offline((long)__bind_cpu))
753 goto wait_to_die; 762 goto wait_to_die;
754 local_irq_disable(); 763 local_irq_disable();
755 if (local_softirq_pending()) 764 if (local_softirq_pending())
756 __do_softirq(); 765 __do_softirq();
757 local_irq_enable(); 766 local_irq_enable();
758 sched_preempt_enable_no_resched(); 767 sched_preempt_enable_no_resched();
759 cond_resched(); 768 cond_resched();
760 preempt_disable(); 769 preempt_disable();
761 rcu_note_context_switch((long)__bind_cpu); 770 rcu_note_context_switch((long)__bind_cpu);
762 } 771 }
763 preempt_enable(); 772 preempt_enable();
764 set_current_state(TASK_INTERRUPTIBLE); 773 set_current_state(TASK_INTERRUPTIBLE);
765 } 774 }
766 __set_current_state(TASK_RUNNING); 775 __set_current_state(TASK_RUNNING);
767 return 0; 776 return 0;
768 777
769 wait_to_die: 778 wait_to_die:
770 preempt_enable(); 779 preempt_enable();
771 /* Wait for kthread_stop */ 780 /* Wait for kthread_stop */
772 set_current_state(TASK_INTERRUPTIBLE); 781 set_current_state(TASK_INTERRUPTIBLE);
773 while (!kthread_should_stop()) { 782 while (!kthread_should_stop()) {
774 schedule(); 783 schedule();
775 set_current_state(TASK_INTERRUPTIBLE); 784 set_current_state(TASK_INTERRUPTIBLE);
776 } 785 }
777 __set_current_state(TASK_RUNNING); 786 __set_current_state(TASK_RUNNING);
778 return 0; 787 return 0;
779 } 788 }
780 789
781 #ifdef CONFIG_HOTPLUG_CPU 790 #ifdef CONFIG_HOTPLUG_CPU
782 /* 791 /*
783 * tasklet_kill_immediate is called to remove a tasklet which can already be 792 * tasklet_kill_immediate is called to remove a tasklet which can already be
784 * scheduled for execution on @cpu. 793 * scheduled for execution on @cpu.
785 * 794 *
786 * Unlike tasklet_kill, this function removes the tasklet 795 * Unlike tasklet_kill, this function removes the tasklet
787 * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state. 796 * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state.
788 * 797 *
789 * When this function is called, @cpu must be in the CPU_DEAD state. 798 * When this function is called, @cpu must be in the CPU_DEAD state.
790 */ 799 */
791 void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) 800 void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
792 { 801 {
793 struct tasklet_struct **i; 802 struct tasklet_struct **i;
794 803
795 BUG_ON(cpu_online(cpu)); 804 BUG_ON(cpu_online(cpu));
796 BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state)); 805 BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));
797 806
798 if (!test_bit(TASKLET_STATE_SCHED, &t->state)) 807 if (!test_bit(TASKLET_STATE_SCHED, &t->state))
799 return; 808 return;
800 809
801 /* CPU is dead, so no lock needed. */ 810 /* CPU is dead, so no lock needed. */
802 for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { 811 for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) {
803 if (*i == t) { 812 if (*i == t) {
804 *i = t->next; 813 *i = t->next;
805 /* If this was the tail element, move the tail ptr */ 814 /* If this was the tail element, move the tail ptr */
806 if (*i == NULL) 815 if (*i == NULL)
807 per_cpu(tasklet_vec, cpu).tail = i; 816 per_cpu(tasklet_vec, cpu).tail = i;
808 return; 817 return;
809 } 818 }
810 } 819 }
811 BUG(); 820 BUG();
812 } 821 }
813 822
814 static void takeover_tasklets(unsigned int cpu) 823 static void takeover_tasklets(unsigned int cpu)
815 { 824 {
816 /* CPU is dead, so no lock needed. */ 825 /* CPU is dead, so no lock needed. */
817 local_irq_disable(); 826 local_irq_disable();
818 827
819 /* Find end, append list for that CPU. */ 828 /* Find end, append list for that CPU. */
820 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { 829 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
821 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; 830 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
822 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); 831 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
823 per_cpu(tasklet_vec, cpu).head = NULL; 832 per_cpu(tasklet_vec, cpu).head = NULL;
824 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; 833 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
825 } 834 }
826 raise_softirq_irqoff(TASKLET_SOFTIRQ); 835 raise_softirq_irqoff(TASKLET_SOFTIRQ);
827 836
828 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { 837 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
829 *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head; 838 *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head;
830 __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail); 839 __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail);
831 per_cpu(tasklet_hi_vec, cpu).head = NULL; 840 per_cpu(tasklet_hi_vec, cpu).head = NULL;
832 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; 841 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
833 } 842 }
834 raise_softirq_irqoff(HI_SOFTIRQ); 843 raise_softirq_irqoff(HI_SOFTIRQ);
835 844
836 local_irq_enable(); 845 local_irq_enable();
837 } 846 }
838 #endif /* CONFIG_HOTPLUG_CPU */ 847 #endif /* CONFIG_HOTPLUG_CPU */
839 848
840 static int __cpuinit cpu_callback(struct notifier_block *nfb, 849 static int __cpuinit cpu_callback(struct notifier_block *nfb,
841 unsigned long action, 850 unsigned long action,
842 void *hcpu) 851 void *hcpu)
843 { 852 {
844 int hotcpu = (unsigned long)hcpu; 853 int hotcpu = (unsigned long)hcpu;
845 struct task_struct *p; 854 struct task_struct *p;
846 855
847 switch (action) { 856 switch (action) {
848 case CPU_UP_PREPARE: 857 case CPU_UP_PREPARE:
849 case CPU_UP_PREPARE_FROZEN: 858 case CPU_UP_PREPARE_FROZEN:
850 p = kthread_create_on_node(run_ksoftirqd, 859 p = kthread_create_on_node(run_ksoftirqd,
851 hcpu, 860 hcpu,
852 cpu_to_node(hotcpu), 861 cpu_to_node(hotcpu),
853 "ksoftirqd/%d", hotcpu); 862 "ksoftirqd/%d", hotcpu);
854 if (IS_ERR(p)) { 863 if (IS_ERR(p)) {
855 printk("ksoftirqd for %i failed\n", hotcpu); 864 printk("ksoftirqd for %i failed\n", hotcpu);
856 return notifier_from_errno(PTR_ERR(p)); 865 return notifier_from_errno(PTR_ERR(p));
857 } 866 }
858 kthread_bind(p, hotcpu); 867 kthread_bind(p, hotcpu);
859 per_cpu(ksoftirqd, hotcpu) = p; 868 per_cpu(ksoftirqd, hotcpu) = p;
860 break; 869 break;
861 case CPU_ONLINE: 870 case CPU_ONLINE:
862 case CPU_ONLINE_FROZEN: 871 case CPU_ONLINE_FROZEN:
863 wake_up_process(per_cpu(ksoftirqd, hotcpu)); 872 wake_up_process(per_cpu(ksoftirqd, hotcpu));
864 break; 873 break;
865 #ifdef CONFIG_HOTPLUG_CPU 874 #ifdef CONFIG_HOTPLUG_CPU
866 case CPU_UP_CANCELED: 875 case CPU_UP_CANCELED:
867 case CPU_UP_CANCELED_FROZEN: 876 case CPU_UP_CANCELED_FROZEN:
868 if (!per_cpu(ksoftirqd, hotcpu)) 877 if (!per_cpu(ksoftirqd, hotcpu))
869 break; 878 break;
870 /* Unbind so it can run. Fall thru. */ 879 /* Unbind so it can run. Fall thru. */
871 kthread_bind(per_cpu(ksoftirqd, hotcpu), 880 kthread_bind(per_cpu(ksoftirqd, hotcpu),
872 cpumask_any(cpu_online_mask)); 881 cpumask_any(cpu_online_mask));
873 case CPU_DEAD: 882 case CPU_DEAD:
874 case CPU_DEAD_FROZEN: { 883 case CPU_DEAD_FROZEN: {
875 static const struct sched_param param = { 884 static const struct sched_param param = {
876 .sched_priority = MAX_RT_PRIO-1 885 .sched_priority = MAX_RT_PRIO-1
877 }; 886 };
878 887
879 p = per_cpu(ksoftirqd, hotcpu); 888 p = per_cpu(ksoftirqd, hotcpu);
880 per_cpu(ksoftirqd, hotcpu) = NULL; 889 per_cpu(ksoftirqd, hotcpu) = NULL;
881 sched_setscheduler_nocheck(p, SCHED_FIFO, &param); 890 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
882 kthread_stop(p); 891 kthread_stop(p);
883 takeover_tasklets(hotcpu); 892 takeover_tasklets(hotcpu);
884 break; 893 break;
885 } 894 }
886 #endif /* CONFIG_HOTPLUG_CPU */ 895 #endif /* CONFIG_HOTPLUG_CPU */
887 } 896 }
888 return NOTIFY_OK; 897 return NOTIFY_OK;
889 } 898 }
890 899
891 static struct notifier_block __cpuinitdata cpu_nfb = { 900 static struct notifier_block __cpuinitdata cpu_nfb = {
892 .notifier_call = cpu_callback 901 .notifier_call = cpu_callback
893 }; 902 };
894 903
895 static __init int spawn_ksoftirqd(void) 904 static __init int spawn_ksoftirqd(void)
896 { 905 {
897 void *cpu = (void *)(long)smp_processor_id(); 906 void *cpu = (void *)(long)smp_processor_id();
898 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 907 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
899 908
900 BUG_ON(err != NOTIFY_OK); 909 BUG_ON(err != NOTIFY_OK);
901 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 910 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
902 register_cpu_notifier(&cpu_nfb); 911 register_cpu_notifier(&cpu_nfb);
903 return 0; 912 return 0;
904 } 913 }
905 early_initcall(spawn_ksoftirqd); 914 early_initcall(spawn_ksoftirqd);
906 915
907 /* 916 /*
908 * [ These __weak aliases are kept in a separate compilation unit, so that 917 * [ These __weak aliases are kept in a separate compilation unit, so that
909 * GCC does not inline them incorrectly. ] 918 * GCC does not inline them incorrectly. ]
910 */ 919 */
911 920
912 int __init __weak early_irq_init(void) 921 int __init __weak early_irq_init(void)
913 { 922 {
914 return 0; 923 return 0;
915 } 924 }
916 925
917 #ifdef CONFIG_GENERIC_HARDIRQS 926 #ifdef CONFIG_GENERIC_HARDIRQS
918 int __init __weak arch_probe_nr_irqs(void) 927 int __init __weak arch_probe_nr_irqs(void)
919 { 928 {
920 return NR_IRQS_LEGACY; 929 return NR_IRQS_LEGACY;
921 } 930 }
922 931
923 int __init __weak arch_early_irq_init(void) 932 int __init __weak arch_early_irq_init(void)
924 { 933 {
925 return 0; 934 return 0;
926 } 935 }
927 #endif 936 #endif
928 937
mm/page_alloc.c
1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/jiffies.h> 22 #include <linux/jiffies.h>
23 #include <linux/bootmem.h> 23 #include <linux/bootmem.h>
24 #include <linux/memblock.h> 24 #include <linux/memblock.h>
25 #include <linux/compiler.h> 25 #include <linux/compiler.h>
26 #include <linux/kernel.h> 26 #include <linux/kernel.h>
27 #include <linux/kmemcheck.h> 27 #include <linux/kmemcheck.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/suspend.h> 29 #include <linux/suspend.h>
30 #include <linux/pagevec.h> 30 #include <linux/pagevec.h>
31 #include <linux/blkdev.h> 31 #include <linux/blkdev.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/ratelimit.h> 33 #include <linux/ratelimit.h>
34 #include <linux/oom.h> 34 #include <linux/oom.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/topology.h> 36 #include <linux/topology.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/cpu.h> 38 #include <linux/cpu.h>
39 #include <linux/cpuset.h> 39 #include <linux/cpuset.h>
40 #include <linux/memory_hotplug.h> 40 #include <linux/memory_hotplug.h>
41 #include <linux/nodemask.h> 41 #include <linux/nodemask.h>
42 #include <linux/vmalloc.h> 42 #include <linux/vmalloc.h>
43 #include <linux/vmstat.h> 43 #include <linux/vmstat.h>
44 #include <linux/mempolicy.h> 44 #include <linux/mempolicy.h>
45 #include <linux/stop_machine.h> 45 #include <linux/stop_machine.h>
46 #include <linux/sort.h> 46 #include <linux/sort.h>
47 #include <linux/pfn.h> 47 #include <linux/pfn.h>
48 #include <linux/backing-dev.h> 48 #include <linux/backing-dev.h>
49 #include <linux/fault-inject.h> 49 #include <linux/fault-inject.h>
50 #include <linux/page-isolation.h> 50 #include <linux/page-isolation.h>
51 #include <linux/page_cgroup.h> 51 #include <linux/page_cgroup.h>
52 #include <linux/debugobjects.h> 52 #include <linux/debugobjects.h>
53 #include <linux/kmemleak.h> 53 #include <linux/kmemleak.h>
54 #include <linux/compaction.h> 54 #include <linux/compaction.h>
55 #include <trace/events/kmem.h> 55 #include <trace/events/kmem.h>
56 #include <linux/ftrace_event.h> 56 #include <linux/ftrace_event.h>
57 #include <linux/memcontrol.h> 57 #include <linux/memcontrol.h>
58 #include <linux/prefetch.h> 58 #include <linux/prefetch.h>
59 #include <linux/migrate.h> 59 #include <linux/migrate.h>
60 #include <linux/page-debug-flags.h> 60 #include <linux/page-debug-flags.h>
61 61
62 #include <asm/tlbflush.h> 62 #include <asm/tlbflush.h>
63 #include <asm/div64.h> 63 #include <asm/div64.h>
64 #include "internal.h" 64 #include "internal.h"
65 65
66 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 66 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
67 DEFINE_PER_CPU(int, numa_node); 67 DEFINE_PER_CPU(int, numa_node);
68 EXPORT_PER_CPU_SYMBOL(numa_node); 68 EXPORT_PER_CPU_SYMBOL(numa_node);
69 #endif 69 #endif
70 70
71 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 71 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
72 /* 72 /*
73 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 73 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
74 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. 74 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
75 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() 75 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
76 * defined in <linux/topology.h>. 76 * defined in <linux/topology.h>.
77 */ 77 */
78 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 78 DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
79 EXPORT_PER_CPU_SYMBOL(_numa_mem_); 79 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
80 #endif 80 #endif
81 81
82 /* 82 /*
83 * Array of node states. 83 * Array of node states.
84 */ 84 */
85 nodemask_t node_states[NR_NODE_STATES] __read_mostly = { 85 nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
86 [N_POSSIBLE] = NODE_MASK_ALL, 86 [N_POSSIBLE] = NODE_MASK_ALL,
87 [N_ONLINE] = { { [0] = 1UL } }, 87 [N_ONLINE] = { { [0] = 1UL } },
88 #ifndef CONFIG_NUMA 88 #ifndef CONFIG_NUMA
89 [N_NORMAL_MEMORY] = { { [0] = 1UL } }, 89 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
90 #ifdef CONFIG_HIGHMEM 90 #ifdef CONFIG_HIGHMEM
91 [N_HIGH_MEMORY] = { { [0] = 1UL } }, 91 [N_HIGH_MEMORY] = { { [0] = 1UL } },
92 #endif 92 #endif
93 [N_CPU] = { { [0] = 1UL } }, 93 [N_CPU] = { { [0] = 1UL } },
94 #endif /* NUMA */ 94 #endif /* NUMA */
95 }; 95 };
96 EXPORT_SYMBOL(node_states); 96 EXPORT_SYMBOL(node_states);
97 97
98 unsigned long totalram_pages __read_mostly; 98 unsigned long totalram_pages __read_mostly;
99 unsigned long totalreserve_pages __read_mostly; 99 unsigned long totalreserve_pages __read_mostly;
100 /* 100 /*
101 * When calculating the number of globally allowed dirty pages, there 101 * When calculating the number of globally allowed dirty pages, there
102 * is a certain number of per-zone reserves that should not be 102 * is a certain number of per-zone reserves that should not be
103 * considered dirtyable memory. This is the sum of those reserves 103 * considered dirtyable memory. This is the sum of those reserves
104 * over all existing zones that contribute dirtyable memory. 104 * over all existing zones that contribute dirtyable memory.
105 */ 105 */
106 unsigned long dirty_balance_reserve __read_mostly; 106 unsigned long dirty_balance_reserve __read_mostly;
107 107
108 int percpu_pagelist_fraction; 108 int percpu_pagelist_fraction;
109 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 109 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
110 110
111 #ifdef CONFIG_PM_SLEEP 111 #ifdef CONFIG_PM_SLEEP
112 /* 112 /*
113 * The following functions are used by the suspend/hibernate code to temporarily 113 * The following functions are used by the suspend/hibernate code to temporarily
114 * change gfp_allowed_mask in order to avoid using I/O during memory allocations 114 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
115 * while devices are suspended. To avoid races with the suspend/hibernate code, 115 * while devices are suspended. To avoid races with the suspend/hibernate code,
116 * they should always be called with pm_mutex held (gfp_allowed_mask also should 116 * they should always be called with pm_mutex held (gfp_allowed_mask also should
117 * only be modified with pm_mutex held, unless the suspend/hibernate code is 117 * only be modified with pm_mutex held, unless the suspend/hibernate code is
118 * guaranteed not to run in parallel with that modification). 118 * guaranteed not to run in parallel with that modification).
119 */ 119 */
120 120
121 static gfp_t saved_gfp_mask; 121 static gfp_t saved_gfp_mask;
122 122
123 void pm_restore_gfp_mask(void) 123 void pm_restore_gfp_mask(void)
124 { 124 {
125 WARN_ON(!mutex_is_locked(&pm_mutex)); 125 WARN_ON(!mutex_is_locked(&pm_mutex));
126 if (saved_gfp_mask) { 126 if (saved_gfp_mask) {
127 gfp_allowed_mask = saved_gfp_mask; 127 gfp_allowed_mask = saved_gfp_mask;
128 saved_gfp_mask = 0; 128 saved_gfp_mask = 0;
129 } 129 }
130 } 130 }
131 131
132 void pm_restrict_gfp_mask(void) 132 void pm_restrict_gfp_mask(void)
133 { 133 {
134 WARN_ON(!mutex_is_locked(&pm_mutex)); 134 WARN_ON(!mutex_is_locked(&pm_mutex));
135 WARN_ON(saved_gfp_mask); 135 WARN_ON(saved_gfp_mask);
136 saved_gfp_mask = gfp_allowed_mask; 136 saved_gfp_mask = gfp_allowed_mask;
137 gfp_allowed_mask &= ~GFP_IOFS; 137 gfp_allowed_mask &= ~GFP_IOFS;
138 } 138 }
139 139
140 bool pm_suspended_storage(void) 140 bool pm_suspended_storage(void)
141 { 141 {
142 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) 142 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
143 return false; 143 return false;
144 return true; 144 return true;
145 } 145 }
146 #endif /* CONFIG_PM_SLEEP */ 146 #endif /* CONFIG_PM_SLEEP */
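To make the locking rule in the comment above concrete, a hedged sketch of how a caller would bracket the restrict/restore pair: my_write_image() is hypothetical, while lock_system_sleep()/unlock_system_sleep() are the existing wrappers that take and release pm_mutex.

	#include <linux/suspend.h>
	#include <linux/gfp.h>

	/* Hypothetical caller; illustrates only the required bracketing. */
	static int my_hibernate_step(int (*my_write_image)(void))
	{
		int error;

		lock_system_sleep();		/* acquires pm_mutex */
		pm_restrict_gfp_mask();		/* clears __GFP_IO and __GFP_FS in gfp_allowed_mask */
		error = my_write_image();	/* allocations here cannot recurse into I/O or the FS */
		pm_restore_gfp_mask();
		unlock_system_sleep();
		return error;
	}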
147 147
148 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 148 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
149 int pageblock_order __read_mostly; 149 int pageblock_order __read_mostly;
150 #endif 150 #endif
151 151
152 static void __free_pages_ok(struct page *page, unsigned int order); 152 static void __free_pages_ok(struct page *page, unsigned int order);
153 153
154 /* 154 /*
155 * results with 256, 32 in the lowmem_reserve sysctl: 155 * results with 256, 32 in the lowmem_reserve sysctl:
156 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 156 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
157 * 1G machine -> (16M dma, 784M normal, 224M high) 157 * 1G machine -> (16M dma, 784M normal, 224M high)
158 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 158 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
159 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 159 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
160 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA 160 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
161 * 161 *
162 * TBD: should special case ZONE_DMA32 machines here - in those we normally 162 * TBD: should special case ZONE_DMA32 machines here - in those we normally
163 * don't need any ZONE_NORMAL reservation 163 * don't need any ZONE_NORMAL reservation
164 */ 164 */
165 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 165 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
166 #ifdef CONFIG_ZONE_DMA 166 #ifdef CONFIG_ZONE_DMA
167 256, 167 256,
168 #endif 168 #endif
169 #ifdef CONFIG_ZONE_DMA32 169 #ifdef CONFIG_ZONE_DMA32
170 256, 170 256,
171 #endif 171 #endif
172 #ifdef CONFIG_HIGHMEM 172 #ifdef CONFIG_HIGHMEM
173 32, 173 32,
174 #endif 174 #endif
175 32, 175 32,
176 }; 176 };
177 177
178 EXPORT_SYMBOL(totalram_pages); 178 EXPORT_SYMBOL(totalram_pages);
179 179
180 static char * const zone_names[MAX_NR_ZONES] = { 180 static char * const zone_names[MAX_NR_ZONES] = {
181 #ifdef CONFIG_ZONE_DMA 181 #ifdef CONFIG_ZONE_DMA
182 "DMA", 182 "DMA",
183 #endif 183 #endif
184 #ifdef CONFIG_ZONE_DMA32 184 #ifdef CONFIG_ZONE_DMA32
185 "DMA32", 185 "DMA32",
186 #endif 186 #endif
187 "Normal", 187 "Normal",
188 #ifdef CONFIG_HIGHMEM 188 #ifdef CONFIG_HIGHMEM
189 "HighMem", 189 "HighMem",
190 #endif 190 #endif
191 "Movable", 191 "Movable",
192 }; 192 };
193 193
194 int min_free_kbytes = 1024; 194 int min_free_kbytes = 1024;
195 195
196 static unsigned long __meminitdata nr_kernel_pages; 196 static unsigned long __meminitdata nr_kernel_pages;
197 static unsigned long __meminitdata nr_all_pages; 197 static unsigned long __meminitdata nr_all_pages;
198 static unsigned long __meminitdata dma_reserve; 198 static unsigned long __meminitdata dma_reserve;
199 199
200 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 200 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
201 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 201 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
202 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 202 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
203 static unsigned long __initdata required_kernelcore; 203 static unsigned long __initdata required_kernelcore;
204 static unsigned long __initdata required_movablecore; 204 static unsigned long __initdata required_movablecore;
205 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 205 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
206 206
207 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 207 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
208 int movable_zone; 208 int movable_zone;
209 EXPORT_SYMBOL(movable_zone); 209 EXPORT_SYMBOL(movable_zone);
210 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 210 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
211 211
212 #if MAX_NUMNODES > 1 212 #if MAX_NUMNODES > 1
213 int nr_node_ids __read_mostly = MAX_NUMNODES; 213 int nr_node_ids __read_mostly = MAX_NUMNODES;
214 int nr_online_nodes __read_mostly = 1; 214 int nr_online_nodes __read_mostly = 1;
215 EXPORT_SYMBOL(nr_node_ids); 215 EXPORT_SYMBOL(nr_node_ids);
216 EXPORT_SYMBOL(nr_online_nodes); 216 EXPORT_SYMBOL(nr_online_nodes);
217 #endif 217 #endif
218 218
219 int page_group_by_mobility_disabled __read_mostly; 219 int page_group_by_mobility_disabled __read_mostly;
220 220
221 /* 221 /*
222 * NOTE: 222 * NOTE:
223 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. 223 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
224 * Instead, use {un}set_pageblock_isolate. 224 * Instead, use {un}set_pageblock_isolate.
225 */ 225 */
226 void set_pageblock_migratetype(struct page *page, int migratetype) 226 void set_pageblock_migratetype(struct page *page, int migratetype)
227 { 227 {
228 228
229 if (unlikely(page_group_by_mobility_disabled)) 229 if (unlikely(page_group_by_mobility_disabled))
230 migratetype = MIGRATE_UNMOVABLE; 230 migratetype = MIGRATE_UNMOVABLE;
231 231
232 set_pageblock_flags_group(page, (unsigned long)migratetype, 232 set_pageblock_flags_group(page, (unsigned long)migratetype,
233 PB_migrate, PB_migrate_end); 233 PB_migrate, PB_migrate_end);
234 } 234 }
235 235
236 bool oom_killer_disabled __read_mostly; 236 bool oom_killer_disabled __read_mostly;
237 237
238 #ifdef CONFIG_DEBUG_VM 238 #ifdef CONFIG_DEBUG_VM
239 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 239 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
240 { 240 {
241 int ret = 0; 241 int ret = 0;
242 unsigned seq; 242 unsigned seq;
243 unsigned long pfn = page_to_pfn(page); 243 unsigned long pfn = page_to_pfn(page);
244 244
245 do { 245 do {
246 seq = zone_span_seqbegin(zone); 246 seq = zone_span_seqbegin(zone);
247 if (pfn >= zone->zone_start_pfn + zone->spanned_pages) 247 if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
248 ret = 1; 248 ret = 1;
249 else if (pfn < zone->zone_start_pfn) 249 else if (pfn < zone->zone_start_pfn)
250 ret = 1; 250 ret = 1;
251 } while (zone_span_seqretry(zone, seq)); 251 } while (zone_span_seqretry(zone, seq));
252 252
253 return ret; 253 return ret;
254 } 254 }
255 255
256 static int page_is_consistent(struct zone *zone, struct page *page) 256 static int page_is_consistent(struct zone *zone, struct page *page)
257 { 257 {
258 if (!pfn_valid_within(page_to_pfn(page))) 258 if (!pfn_valid_within(page_to_pfn(page)))
259 return 0; 259 return 0;
260 if (zone != page_zone(page)) 260 if (zone != page_zone(page))
261 return 0; 261 return 0;
262 262
263 return 1; 263 return 1;
264 } 264 }
265 /* 265 /*
266 * Temporary debugging check for pages not lying within a given zone. 266 * Temporary debugging check for pages not lying within a given zone.
267 */ 267 */
268 static int bad_range(struct zone *zone, struct page *page) 268 static int bad_range(struct zone *zone, struct page *page)
269 { 269 {
270 if (page_outside_zone_boundaries(zone, page)) 270 if (page_outside_zone_boundaries(zone, page))
271 return 1; 271 return 1;
272 if (!page_is_consistent(zone, page)) 272 if (!page_is_consistent(zone, page))
273 return 1; 273 return 1;
274 274
275 return 0; 275 return 0;
276 } 276 }
277 #else 277 #else
278 static inline int bad_range(struct zone *zone, struct page *page) 278 static inline int bad_range(struct zone *zone, struct page *page)
279 { 279 {
280 return 0; 280 return 0;
281 } 281 }
282 #endif 282 #endif
283 283
284 static void bad_page(struct page *page) 284 static void bad_page(struct page *page)
285 { 285 {
286 static unsigned long resume; 286 static unsigned long resume;
287 static unsigned long nr_shown; 287 static unsigned long nr_shown;
288 static unsigned long nr_unshown; 288 static unsigned long nr_unshown;
289 289
290 /* Don't complain about poisoned pages */ 290 /* Don't complain about poisoned pages */
291 if (PageHWPoison(page)) { 291 if (PageHWPoison(page)) {
292 reset_page_mapcount(page); /* remove PageBuddy */ 292 reset_page_mapcount(page); /* remove PageBuddy */
293 return; 293 return;
294 } 294 }
295 295
296 /* 296 /*
297 * Allow a burst of 60 reports, then keep quiet for that minute; 297 * Allow a burst of 60 reports, then keep quiet for that minute;
298 * or allow a steady drip of one report per second. 298 * or allow a steady drip of one report per second.
299 */ 299 */
300 if (nr_shown == 60) { 300 if (nr_shown == 60) {
301 if (time_before(jiffies, resume)) { 301 if (time_before(jiffies, resume)) {
302 nr_unshown++; 302 nr_unshown++;
303 goto out; 303 goto out;
304 } 304 }
305 if (nr_unshown) { 305 if (nr_unshown) {
306 printk(KERN_ALERT 306 printk(KERN_ALERT
307 "BUG: Bad page state: %lu messages suppressed\n", 307 "BUG: Bad page state: %lu messages suppressed\n",
308 nr_unshown); 308 nr_unshown);
309 nr_unshown = 0; 309 nr_unshown = 0;
310 } 310 }
311 nr_shown = 0; 311 nr_shown = 0;
312 } 312 }
313 if (nr_shown++ == 0) 313 if (nr_shown++ == 0)
314 resume = jiffies + 60 * HZ; 314 resume = jiffies + 60 * HZ;
315 315
316 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 316 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
317 current->comm, page_to_pfn(page)); 317 current->comm, page_to_pfn(page));
318 dump_page(page); 318 dump_page(page);
319 319
320 print_modules(); 320 print_modules();
321 dump_stack(); 321 dump_stack();
322 out: 322 out:
323 /* Leave bad fields for debug, except PageBuddy could make trouble */ 323 /* Leave bad fields for debug, except PageBuddy could make trouble */
324 reset_page_mapcount(page); /* remove PageBuddy */ 324 reset_page_mapcount(page); /* remove PageBuddy */
325 add_taint(TAINT_BAD_PAGE); 325 add_taint(TAINT_BAD_PAGE);
326 } 326 }
327 327
328 /* 328 /*
329 * Higher-order pages are called "compound pages". They are structured thusly: 329 * Higher-order pages are called "compound pages". They are structured thusly:
330 * 330 *
331 * The first PAGE_SIZE page is called the "head page". 331 * The first PAGE_SIZE page is called the "head page".
332 * 332 *
333 * The remaining PAGE_SIZE pages are called "tail pages". 333 * The remaining PAGE_SIZE pages are called "tail pages".
334 * 334 *
335 * All pages have PG_compound set. All tail pages have their ->first_page 335 * All pages have PG_compound set. All tail pages have their ->first_page
336 * pointing at the head page. 336 * pointing at the head page.
337 * 337 *
338 * The first tail page's ->lru.next holds the address of the compound page's 338 * The first tail page's ->lru.next holds the address of the compound page's
339 * put_page() function. Its ->lru.prev holds the order of allocation. 339 * put_page() function. Its ->lru.prev holds the order of allocation.
340 * This usage means that zero-order pages may not be compound. 340 * This usage means that zero-order pages may not be compound.
341 */ 341 */
342 342
343 static void free_compound_page(struct page *page) 343 static void free_compound_page(struct page *page)
344 { 344 {
345 __free_pages_ok(page, compound_order(page)); 345 __free_pages_ok(page, compound_order(page));
346 } 346 }
347 347
348 void prep_compound_page(struct page *page, unsigned long order) 348 void prep_compound_page(struct page *page, unsigned long order)
349 { 349 {
350 int i; 350 int i;
351 int nr_pages = 1 << order; 351 int nr_pages = 1 << order;
352 352
353 set_compound_page_dtor(page, free_compound_page); 353 set_compound_page_dtor(page, free_compound_page);
354 set_compound_order(page, order); 354 set_compound_order(page, order);
355 __SetPageHead(page); 355 __SetPageHead(page);
356 for (i = 1; i < nr_pages; i++) { 356 for (i = 1; i < nr_pages; i++) {
357 struct page *p = page + i; 357 struct page *p = page + i;
358 __SetPageTail(p); 358 __SetPageTail(p);
359 set_page_count(p, 0); 359 set_page_count(p, 0);
360 p->first_page = page; 360 p->first_page = page;
361 } 361 }
362 } 362 }
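For example, prep_compound_page(page, 2) covers four pages: page itself becomes the head (PG_head set, the order and the destructor stored on it), while page+1 .. page+3 each get PG_tail, a zero reference count, and ->first_page pointing back at the head.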
363 363
364 /* update __split_huge_page_refcount if you change this function */ 364 /* update __split_huge_page_refcount if you change this function */
365 static int destroy_compound_page(struct page *page, unsigned long order) 365 static int destroy_compound_page(struct page *page, unsigned long order)
366 { 366 {
367 int i; 367 int i;
368 int nr_pages = 1 << order; 368 int nr_pages = 1 << order;
369 int bad = 0; 369 int bad = 0;
370 370
371 if (unlikely(compound_order(page) != order) || 371 if (unlikely(compound_order(page) != order) ||
372 unlikely(!PageHead(page))) { 372 unlikely(!PageHead(page))) {
373 bad_page(page); 373 bad_page(page);
374 bad++; 374 bad++;
375 } 375 }
376 376
377 __ClearPageHead(page); 377 __ClearPageHead(page);
378 378
379 for (i = 1; i < nr_pages; i++) { 379 for (i = 1; i < nr_pages; i++) {
380 struct page *p = page + i; 380 struct page *p = page + i;
381 381
382 if (unlikely(!PageTail(p) || (p->first_page != page))) { 382 if (unlikely(!PageTail(p) || (p->first_page != page))) {
383 bad_page(page); 383 bad_page(page);
384 bad++; 384 bad++;
385 } 385 }
386 __ClearPageTail(p); 386 __ClearPageTail(p);
387 } 387 }
388 388
389 return bad; 389 return bad;
390 } 390 }
391 391
392 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 392 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
393 { 393 {
394 int i; 394 int i;
395 395
396 /* 396 /*
397 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 397 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
398 * and __GFP_HIGHMEM from hard or soft interrupt context. 398 * and __GFP_HIGHMEM from hard or soft interrupt context.
399 */ 399 */
400 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 400 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
401 for (i = 0; i < (1 << order); i++) 401 for (i = 0; i < (1 << order); i++)
402 clear_highpage(page + i); 402 clear_highpage(page + i);
403 } 403 }
404 404
405 #ifdef CONFIG_DEBUG_PAGEALLOC 405 #ifdef CONFIG_DEBUG_PAGEALLOC
406 unsigned int _debug_guardpage_minorder; 406 unsigned int _debug_guardpage_minorder;
407 407
408 static int __init debug_guardpage_minorder_setup(char *buf) 408 static int __init debug_guardpage_minorder_setup(char *buf)
409 { 409 {
410 unsigned long res; 410 unsigned long res;
411 411
412 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { 412 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
413 printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); 413 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
414 return 0; 414 return 0;
415 } 415 }
416 _debug_guardpage_minorder = res; 416 _debug_guardpage_minorder = res;
417 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); 417 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
418 return 0; 418 return 0;
419 } 419 }
420 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 420 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
421 421
422 static inline void set_page_guard_flag(struct page *page) 422 static inline void set_page_guard_flag(struct page *page)
423 { 423 {
424 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 424 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
425 } 425 }
426 426
427 static inline void clear_page_guard_flag(struct page *page) 427 static inline void clear_page_guard_flag(struct page *page)
428 { 428 {
429 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 429 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
430 } 430 }
431 #else 431 #else
432 static inline void set_page_guard_flag(struct page *page) { } 432 static inline void set_page_guard_flag(struct page *page) { }
433 static inline void clear_page_guard_flag(struct page *page) { } 433 static inline void clear_page_guard_flag(struct page *page) { }
434 #endif 434 #endif
435 435
436 static inline void set_page_order(struct page *page, int order) 436 static inline void set_page_order(struct page *page, int order)
437 { 437 {
438 set_page_private(page, order); 438 set_page_private(page, order);
439 __SetPageBuddy(page); 439 __SetPageBuddy(page);
440 } 440 }
441 441
442 static inline void rmv_page_order(struct page *page) 442 static inline void rmv_page_order(struct page *page)
443 { 443 {
444 __ClearPageBuddy(page); 444 __ClearPageBuddy(page);
445 set_page_private(page, 0); 445 set_page_private(page, 0);
446 } 446 }
447 447
448 /* 448 /*
449 * Locate the struct page for both the matching buddy in our 449 * Locate the struct page for both the matching buddy in our
450 * pair (buddy1) and the combined O(n+1) page they form (page). 450 * pair (buddy1) and the combined O(n+1) page they form (page).
451 * 451 *
452 * 1) Any buddy B1 will have an order O twin B2 which satisfies 452 * 1) Any buddy B1 will have an order O twin B2 which satisfies
453 * the following equation: 453 * the following equation:
454 * B2 = B1 ^ (1 << O) 454 * B2 = B1 ^ (1 << O)
455 * For example, if the starting buddy (buddy2) is #8 its order 455 * For example, if the starting buddy (buddy2) is #8 its order
456 * 1 buddy is #10: 456 * 1 buddy is #10:
457 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 457 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
458 * 458 *
459 * 2) Any buddy B will have an order O+1 parent P which 459 * 2) Any buddy B will have an order O+1 parent P which
460 * satisfies the following equation: 460 * satisfies the following equation:
461 * P = B & ~(1 << O) 461 * P = B & ~(1 << O)
462 * 462 *
463 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 463 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
464 */ 464 */
465 static inline unsigned long 465 static inline unsigned long
466 __find_buddy_index(unsigned long page_idx, unsigned int order) 466 __find_buddy_index(unsigned long page_idx, unsigned int order)
467 { 467 {
468 return page_idx ^ (1 << order); 468 return page_idx ^ (1 << order);
469 } 469 }
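The index relations in the comment above can be checked with plain integer arithmetic. The following is a userspace illustration only (not kernel code); find_buddy_index() is a local copy of the helper above.

	#include <assert.h>
	#include <stdio.h>

	/* Same arithmetic as __find_buddy_index(), lifted out for illustration. */
	static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
	{
		return page_idx ^ (1UL << order);
	}

	int main(void)
	{
		unsigned long b1 = 8;
		unsigned int order = 1;
		unsigned long b2 = find_buddy_index(b1, order);	/* 8 ^ 2 = 10 */
		unsigned long parent = b1 & ~(1UL << order);	/* P = B & ~(1 << O) = 8 */

		assert(b2 == 10);
		assert(parent == (b2 & ~(1UL << order)));	/* both buddies share the parent */
		assert(parent == (b1 & b2));			/* combined_idx = buddy_idx & page_idx */
		printf("order-%u buddy of %lu is %lu; combined block starts at %lu\n",
		       order, b1, b2, parent);
		return 0;
	}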
470 470
471 /* 471 /*
472 * This function checks whether a page is free && is the buddy 472 * This function checks whether a page is free && is the buddy
473 * we can coalesce a page and its buddy if 473 * we can coalesce a page and its buddy if
474 * (a) the buddy is not in a hole && 474 * (a) the buddy is not in a hole &&
475 * (b) the buddy is in the buddy system && 475 * (b) the buddy is in the buddy system &&
476 * (c) a page and its buddy have the same order && 476 * (c) a page and its buddy have the same order &&
477 * (d) a page and its buddy are in the same zone. 477 * (d) a page and its buddy are in the same zone.
478 * 478 *
479 * For recording whether a page is in the buddy system, we set ->_mapcount -2. 479 * For recording whether a page is in the buddy system, we set ->_mapcount -2.
480 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. 480 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
481 * 481 *
482 * For recording page's order, we use page_private(page). 482 * For recording page's order, we use page_private(page).
483 */ 483 */
484 static inline int page_is_buddy(struct page *page, struct page *buddy, 484 static inline int page_is_buddy(struct page *page, struct page *buddy,
485 int order) 485 int order)
486 { 486 {
487 if (!pfn_valid_within(page_to_pfn(buddy))) 487 if (!pfn_valid_within(page_to_pfn(buddy)))
488 return 0; 488 return 0;
489 489
490 if (page_zone_id(page) != page_zone_id(buddy)) 490 if (page_zone_id(page) != page_zone_id(buddy))
491 return 0; 491 return 0;
492 492
493 if (page_is_guard(buddy) && page_order(buddy) == order) { 493 if (page_is_guard(buddy) && page_order(buddy) == order) {
494 VM_BUG_ON(page_count(buddy) != 0); 494 VM_BUG_ON(page_count(buddy) != 0);
495 return 1; 495 return 1;
496 } 496 }
497 497
498 if (PageBuddy(buddy) && page_order(buddy) == order) { 498 if (PageBuddy(buddy) && page_order(buddy) == order) {
499 VM_BUG_ON(page_count(buddy) != 0); 499 VM_BUG_ON(page_count(buddy) != 0);
500 return 1; 500 return 1;
501 } 501 }
502 return 0; 502 return 0;
503 } 503 }
504 504
505 /* 505 /*
506 * Freeing function for a buddy system allocator. 506 * Freeing function for a buddy system allocator.
507 * 507 *
508 * The concept of a buddy system is to maintain direct-mapped table 508 * The concept of a buddy system is to maintain direct-mapped table
509 * (containing bit values) for memory blocks of various "orders". 509 * (containing bit values) for memory blocks of various "orders".
510 * The bottom level table contains the map for the smallest allocatable 510 * The bottom level table contains the map for the smallest allocatable
511 * units of memory (here, pages), and each level above it describes 511 * units of memory (here, pages), and each level above it describes
512 * pairs of units from the levels below, hence, "buddies". 512 * pairs of units from the levels below, hence, "buddies".
513 * At a high level, all that happens here is marking the table entry 513 * At a high level, all that happens here is marking the table entry
514 * at the bottom level available, and propagating the changes upward 514 * at the bottom level available, and propagating the changes upward
515 * as necessary, plus some accounting needed to play nicely with other 515 * as necessary, plus some accounting needed to play nicely with other
516 * parts of the VM system. 516 * parts of the VM system.
517 * At each level, we keep a list of pages, which are heads of continuous 517 * At each level, we keep a list of pages, which are heads of continuous
518 * free pages of length of (1 << order) and marked with _mapcount -2. Page's 518 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
519 * order is recorded in page_private(page) field. 519 * order is recorded in page_private(page) field.
520 * So when we are allocating or freeing one, we can derive the state of the 520 * So when we are allocating or freeing one, we can derive the state of the
521 * other. That is, if we allocate a small block, and both were 521 * other. That is, if we allocate a small block, and both were
522 * free, the remainder of the region must be split into blocks. 522 * free, the remainder of the region must be split into blocks.
523 * If a block is freed, and its buddy is also free, then this 523 * If a block is freed, and its buddy is also free, then this
524 * triggers coalescing into a block of larger size. 524 * triggers coalescing into a block of larger size.
525 * 525 *
526 * -- wli 526 * -- wli
527 */ 527 */
528 528
529 static inline void __free_one_page(struct page *page, 529 static inline void __free_one_page(struct page *page,
530 struct zone *zone, unsigned int order, 530 struct zone *zone, unsigned int order,
531 int migratetype) 531 int migratetype)
532 { 532 {
533 unsigned long page_idx; 533 unsigned long page_idx;
534 unsigned long combined_idx; 534 unsigned long combined_idx;
535 unsigned long uninitialized_var(buddy_idx); 535 unsigned long uninitialized_var(buddy_idx);
536 struct page *buddy; 536 struct page *buddy;
537 537
538 if (unlikely(PageCompound(page))) 538 if (unlikely(PageCompound(page)))
539 if (unlikely(destroy_compound_page(page, order))) 539 if (unlikely(destroy_compound_page(page, order)))
540 return; 540 return;
541 541
542 VM_BUG_ON(migratetype == -1); 542 VM_BUG_ON(migratetype == -1);
543 543
544 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 544 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
545 545
546 VM_BUG_ON(page_idx & ((1 << order) - 1)); 546 VM_BUG_ON(page_idx & ((1 << order) - 1));
547 VM_BUG_ON(bad_range(zone, page)); 547 VM_BUG_ON(bad_range(zone, page));
548 548
549 while (order < MAX_ORDER-1) { 549 while (order < MAX_ORDER-1) {
550 buddy_idx = __find_buddy_index(page_idx, order); 550 buddy_idx = __find_buddy_index(page_idx, order);
551 buddy = page + (buddy_idx - page_idx); 551 buddy = page + (buddy_idx - page_idx);
552 if (!page_is_buddy(page, buddy, order)) 552 if (!page_is_buddy(page, buddy, order))
553 break; 553 break;
554 /* 554 /*
555 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, 555 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
556 * merge with it and move up one order. 556 * merge with it and move up one order.
557 */ 557 */
558 if (page_is_guard(buddy)) { 558 if (page_is_guard(buddy)) {
559 clear_page_guard_flag(buddy); 559 clear_page_guard_flag(buddy);
560 set_page_private(page, 0); 560 set_page_private(page, 0);
561 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 561 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
562 } else { 562 } else {
563 list_del(&buddy->lru); 563 list_del(&buddy->lru);
564 zone->free_area[order].nr_free--; 564 zone->free_area[order].nr_free--;
565 rmv_page_order(buddy); 565 rmv_page_order(buddy);
566 } 566 }
567 combined_idx = buddy_idx & page_idx; 567 combined_idx = buddy_idx & page_idx;
568 page = page + (combined_idx - page_idx); 568 page = page + (combined_idx - page_idx);
569 page_idx = combined_idx; 569 page_idx = combined_idx;
570 order++; 570 order++;
571 } 571 }
572 set_page_order(page, order); 572 set_page_order(page, order);
573 573
574 /* 574 /*
575 * If this is not the largest possible page, check if the buddy 575 * If this is not the largest possible page, check if the buddy
576 * of the next-highest order is free. If it is, it's possible 576 * of the next-highest order is free. If it is, it's possible
577 * that pages are being freed that will coalesce soon. In case 577 * that pages are being freed that will coalesce soon. In case
578 * that is happening, add the free page to the tail of the list 578 * that is happening, add the free page to the tail of the list
579 * so it's less likely to be used soon and more likely to be merged 579 * so it's less likely to be used soon and more likely to be merged
580 * as a higher order page 580 * as a higher order page
581 */ 581 */
582 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 582 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
583 struct page *higher_page, *higher_buddy; 583 struct page *higher_page, *higher_buddy;
584 combined_idx = buddy_idx & page_idx; 584 combined_idx = buddy_idx & page_idx;
585 higher_page = page + (combined_idx - page_idx); 585 higher_page = page + (combined_idx - page_idx);
586 buddy_idx = __find_buddy_index(combined_idx, order + 1); 586 buddy_idx = __find_buddy_index(combined_idx, order + 1);
587 higher_buddy = page + (buddy_idx - combined_idx); 587 higher_buddy = page + (buddy_idx - combined_idx);
588 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 588 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
589 list_add_tail(&page->lru, 589 list_add_tail(&page->lru,
590 &zone->free_area[order].free_list[migratetype]); 590 &zone->free_area[order].free_list[migratetype]);
591 goto out; 591 goto out;
592 } 592 }
593 } 593 }
594 594
595 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); 595 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
596 out: 596 out:
597 zone->free_area[order].nr_free++; 597 zone->free_area[order].nr_free++;
598 } 598 }
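As a worked trace of the merge loop above: freeing an order-0 page at index 10 whose buddy 10 ^ 1 = 11 is free merges them into an order-1 block at 10 & 11 = 10; if the order-1 block at 10 ^ 2 = 8 is also free, the next pass merges into an order-2 block at 8 & 10 = 8, and the loop keeps climbing until a buddy is found busy or order MAX_ORDER-1 is reached.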
599 599
600 /* 600 /*
601 * free_page_mlock() -- clean up attempts to free an mlocked() page. 601 * free_page_mlock() -- clean up attempts to free an mlocked() page.
602 * Page should not be on lru, so no need to fix that up. 602 * Page should not be on lru, so no need to fix that up.
603 * free_pages_check() will verify... 603 * free_pages_check() will verify...
604 */ 604 */
605 static inline void free_page_mlock(struct page *page) 605 static inline void free_page_mlock(struct page *page)
606 { 606 {
607 __dec_zone_page_state(page, NR_MLOCK); 607 __dec_zone_page_state(page, NR_MLOCK);
608 __count_vm_event(UNEVICTABLE_MLOCKFREED); 608 __count_vm_event(UNEVICTABLE_MLOCKFREED);
609 } 609 }
610 610
611 static inline int free_pages_check(struct page *page) 611 static inline int free_pages_check(struct page *page)
612 { 612 {
613 if (unlikely(page_mapcount(page) | 613 if (unlikely(page_mapcount(page) |
614 (page->mapping != NULL) | 614 (page->mapping != NULL) |
615 (atomic_read(&page->_count) != 0) | 615 (atomic_read(&page->_count) != 0) |
616 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | 616 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
617 (mem_cgroup_bad_page_check(page)))) { 617 (mem_cgroup_bad_page_check(page)))) {
618 bad_page(page); 618 bad_page(page);
619 return 1; 619 return 1;
620 } 620 }
621 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 621 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
622 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 622 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
623 return 0; 623 return 0;
624 } 624 }
625 625
626 /* 626 /*
627 * Frees a number of pages from the PCP lists 627 * Frees a number of pages from the PCP lists
628 * Assumes all pages on list are in same zone, and of same order. 628 * Assumes all pages on list are in same zone, and of same order.
629 * count is the number of pages to free. 629 * count is the number of pages to free.
630 * 630 *
631 * If the zone was previously in an "all pages pinned" state then look to 631 * If the zone was previously in an "all pages pinned" state then look to
632 * see if this freeing clears that state. 632 * see if this freeing clears that state.
633 * 633 *
634 * And clear the zone's pages_scanned counter, to hold off the "all pages are 634 * And clear the zone's pages_scanned counter, to hold off the "all pages are
635 * pinned" detection logic. 635 * pinned" detection logic.
636 */ 636 */
637 static void free_pcppages_bulk(struct zone *zone, int count, 637 static void free_pcppages_bulk(struct zone *zone, int count,
638 struct per_cpu_pages *pcp) 638 struct per_cpu_pages *pcp)
639 { 639 {
640 int migratetype = 0; 640 int migratetype = 0;
641 int batch_free = 0; 641 int batch_free = 0;
642 int to_free = count; 642 int to_free = count;
643 643
644 spin_lock(&zone->lock); 644 spin_lock(&zone->lock);
645 zone->all_unreclaimable = 0; 645 zone->all_unreclaimable = 0;
646 zone->pages_scanned = 0; 646 zone->pages_scanned = 0;
647 647
648 while (to_free) { 648 while (to_free) {
649 struct page *page; 649 struct page *page;
650 struct list_head *list; 650 struct list_head *list;
651 651
652 /* 652 /*
653 * Remove pages from lists in a round-robin fashion. A 653 * Remove pages from lists in a round-robin fashion. A
654 * batch_free count is maintained that is incremented when an 654 * batch_free count is maintained that is incremented when an
655 * empty list is encountered. This is so more pages are freed 655 * empty list is encountered. This is so more pages are freed
656 * off fuller lists instead of spinning excessively around empty 656 * off fuller lists instead of spinning excessively around empty
657 * lists 657 * lists
658 */ 658 */
659 do { 659 do {
660 batch_free++; 660 batch_free++;
661 if (++migratetype == MIGRATE_PCPTYPES) 661 if (++migratetype == MIGRATE_PCPTYPES)
662 migratetype = 0; 662 migratetype = 0;
663 list = &pcp->lists[migratetype]; 663 list = &pcp->lists[migratetype];
664 } while (list_empty(list)); 664 } while (list_empty(list));
665 665
666 /* This is the only non-empty list. Free them all. */ 666 /* This is the only non-empty list. Free them all. */
667 if (batch_free == MIGRATE_PCPTYPES) 667 if (batch_free == MIGRATE_PCPTYPES)
668 batch_free = to_free; 668 batch_free = to_free;
669 669
670 do { 670 do {
671 page = list_entry(list->prev, struct page, lru); 671 page = list_entry(list->prev, struct page, lru);
672 /* must delete as __free_one_page list manipulates */ 672 /* must delete as __free_one_page list manipulates */
673 list_del(&page->lru); 673 list_del(&page->lru);
674 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 674 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
675 __free_one_page(page, zone, 0, page_private(page)); 675 __free_one_page(page, zone, 0, page_private(page));
676 trace_mm_page_pcpu_drain(page, 0, page_private(page)); 676 trace_mm_page_pcpu_drain(page, 0, page_private(page));
677 } while (--to_free && --batch_free && !list_empty(list)); 677 } while (--to_free && --batch_free && !list_empty(list));
678 } 678 }
679 __mod_zone_page_state(zone, NR_FREE_PAGES, count); 679 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
680 spin_unlock(&zone->lock); 680 spin_unlock(&zone->lock);
681 } 681 }
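As a rough aside, the round-robin selection in free_pcppages_bulk() above can be sketched in plain userspace C. The counters below stand in for the three pcp free lists, and the names (pcp_counts, drain_round_robin) are invented for the illustration; only the batching logic is mirrored, not the real list handling or locking.

#include <stdio.h>

#define MIGRATE_PCPTYPES 3      /* unmovable, reclaimable, movable */

/*
 * Drain 'count' pages, rotating across the per-migratetype counters the
 * way free_pcppages_bulk() rotates across pcp->lists[].  The caller must
 * not ask for more pages than the counters hold in total.
 */
static void drain_round_robin(int pcp_counts[MIGRATE_PCPTYPES], int count)
{
        int migratetype = 0;
        int batch_free = 0;

        while (count) {
                /*
                 * Advance to the next non-empty list, growing the batch
                 * size for every empty list that had to be skipped.
                 */
                do {
                        batch_free++;
                        if (++migratetype == MIGRATE_PCPTYPES)
                                migratetype = 0;
                } while (pcp_counts[migratetype] == 0);

                /* Only one list left non-empty: take everything from it. */
                if (batch_free == MIGRATE_PCPTYPES)
                        batch_free = count;

                do {
                        pcp_counts[migratetype]--;
                        printf("freed one page from list %d\n", migratetype);
                } while (--count && --batch_free && pcp_counts[migratetype]);
        }
}

int main(void)
{
        int pcp_counts[MIGRATE_PCPTYPES] = { 5, 0, 2 };

        drain_round_robin(pcp_counts, 6);
        return 0;
}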
682 682
683 static void free_one_page(struct zone *zone, struct page *page, int order, 683 static void free_one_page(struct zone *zone, struct page *page, int order,
684 int migratetype) 684 int migratetype)
685 { 685 {
686 spin_lock(&zone->lock); 686 spin_lock(&zone->lock);
687 zone->all_unreclaimable = 0; 687 zone->all_unreclaimable = 0;
688 zone->pages_scanned = 0; 688 zone->pages_scanned = 0;
689 689
690 __free_one_page(page, zone, order, migratetype); 690 __free_one_page(page, zone, order, migratetype);
691 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 691 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
692 spin_unlock(&zone->lock); 692 spin_unlock(&zone->lock);
693 } 693 }
694 694
695 static bool free_pages_prepare(struct page *page, unsigned int order) 695 static bool free_pages_prepare(struct page *page, unsigned int order)
696 { 696 {
697 int i; 697 int i;
698 int bad = 0; 698 int bad = 0;
699 699
700 trace_mm_page_free(page, order); 700 trace_mm_page_free(page, order);
701 kmemcheck_free_shadow(page, order); 701 kmemcheck_free_shadow(page, order);
702 702
703 if (PageAnon(page)) 703 if (PageAnon(page))
704 page->mapping = NULL; 704 page->mapping = NULL;
705 for (i = 0; i < (1 << order); i++) 705 for (i = 0; i < (1 << order); i++)
706 bad += free_pages_check(page + i); 706 bad += free_pages_check(page + i);
707 if (bad) 707 if (bad)
708 return false; 708 return false;
709 709
710 if (!PageHighMem(page)) { 710 if (!PageHighMem(page)) {
711 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 711 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
712 debug_check_no_obj_freed(page_address(page), 712 debug_check_no_obj_freed(page_address(page),
713 PAGE_SIZE << order); 713 PAGE_SIZE << order);
714 } 714 }
715 arch_free_page(page, order); 715 arch_free_page(page, order);
716 kernel_map_pages(page, 1 << order, 0); 716 kernel_map_pages(page, 1 << order, 0);
717 717
718 return true; 718 return true;
719 } 719 }
720 720
721 static void __free_pages_ok(struct page *page, unsigned int order) 721 static void __free_pages_ok(struct page *page, unsigned int order)
722 { 722 {
723 unsigned long flags; 723 unsigned long flags;
724 int wasMlocked = __TestClearPageMlocked(page); 724 int wasMlocked = __TestClearPageMlocked(page);
725 725
726 if (!free_pages_prepare(page, order)) 726 if (!free_pages_prepare(page, order))
727 return; 727 return;
728 728
729 local_irq_save(flags); 729 local_irq_save(flags);
730 if (unlikely(wasMlocked)) 730 if (unlikely(wasMlocked))
731 free_page_mlock(page); 731 free_page_mlock(page);
732 __count_vm_events(PGFREE, 1 << order); 732 __count_vm_events(PGFREE, 1 << order);
733 free_one_page(page_zone(page), page, order, 733 free_one_page(page_zone(page), page, order,
734 get_pageblock_migratetype(page)); 734 get_pageblock_migratetype(page));
735 local_irq_restore(flags); 735 local_irq_restore(flags);
736 } 736 }
737 737
738 void __meminit __free_pages_bootmem(struct page *page, unsigned int order) 738 void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
739 { 739 {
740 unsigned int nr_pages = 1 << order; 740 unsigned int nr_pages = 1 << order;
741 unsigned int loop; 741 unsigned int loop;
742 742
743 prefetchw(page); 743 prefetchw(page);
744 for (loop = 0; loop < nr_pages; loop++) { 744 for (loop = 0; loop < nr_pages; loop++) {
745 struct page *p = &page[loop]; 745 struct page *p = &page[loop];
746 746
747 if (loop + 1 < nr_pages) 747 if (loop + 1 < nr_pages)
748 prefetchw(p + 1); 748 prefetchw(p + 1);
749 __ClearPageReserved(p); 749 __ClearPageReserved(p);
750 set_page_count(p, 0); 750 set_page_count(p, 0);
751 } 751 }
752 752
753 set_page_refcounted(page); 753 set_page_refcounted(page);
754 __free_pages(page, order); 754 __free_pages(page, order);
755 } 755 }
756 756
757 #ifdef CONFIG_CMA 757 #ifdef CONFIG_CMA
758 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ 758 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
759 void __init init_cma_reserved_pageblock(struct page *page) 759 void __init init_cma_reserved_pageblock(struct page *page)
760 { 760 {
761 unsigned i = pageblock_nr_pages; 761 unsigned i = pageblock_nr_pages;
762 struct page *p = page; 762 struct page *p = page;
763 763
764 do { 764 do {
765 __ClearPageReserved(p); 765 __ClearPageReserved(p);
766 set_page_count(p, 0); 766 set_page_count(p, 0);
767 } while (++p, --i); 767 } while (++p, --i);
768 768
769 set_page_refcounted(page); 769 set_page_refcounted(page);
770 set_pageblock_migratetype(page, MIGRATE_CMA); 770 set_pageblock_migratetype(page, MIGRATE_CMA);
771 __free_pages(page, pageblock_order); 771 __free_pages(page, pageblock_order);
772 totalram_pages += pageblock_nr_pages; 772 totalram_pages += pageblock_nr_pages;
773 } 773 }
774 #endif 774 #endif
775 775
776 /* 776 /*
777 * The order of subdivision here is critical for the IO subsystem. 777 * The order of subdivision here is critical for the IO subsystem.
778 * Please do not alter this order without good reasons and regression 778 * Please do not alter this order without good reasons and regression
779 * testing. Specifically, as large blocks of memory are subdivided, 779 * testing. Specifically, as large blocks of memory are subdivided,
780 * the order in which smaller blocks are delivered depends on the order 780 * the order in which smaller blocks are delivered depends on the order
781 * they're subdivided in this function. This is the primary factor 781 * they're subdivided in this function. This is the primary factor
782 * influencing the order in which pages are delivered to the IO 782 * influencing the order in which pages are delivered to the IO
783 * subsystem according to empirical testing, and this is also justified 783 * subsystem according to empirical testing, and this is also justified
784 * by considering the behavior of a buddy system containing a single 784 * by considering the behavior of a buddy system containing a single
785 * large block of memory acted on by a series of small allocations. 785 * large block of memory acted on by a series of small allocations.
786 * This behavior is a critical factor in sglist merging's success. 786 * This behavior is a critical factor in sglist merging's success.
787 * 787 *
788 * -- wli 788 * -- wli
789 */ 789 */
790 static inline void expand(struct zone *zone, struct page *page, 790 static inline void expand(struct zone *zone, struct page *page,
791 int low, int high, struct free_area *area, 791 int low, int high, struct free_area *area,
792 int migratetype) 792 int migratetype)
793 { 793 {
794 unsigned long size = 1 << high; 794 unsigned long size = 1 << high;
795 795
796 while (high > low) { 796 while (high > low) {
797 area--; 797 area--;
798 high--; 798 high--;
799 size >>= 1; 799 size >>= 1;
800 VM_BUG_ON(bad_range(zone, &page[size])); 800 VM_BUG_ON(bad_range(zone, &page[size]));
801 801
802 #ifdef CONFIG_DEBUG_PAGEALLOC 802 #ifdef CONFIG_DEBUG_PAGEALLOC
803 if (high < debug_guardpage_minorder()) { 803 if (high < debug_guardpage_minorder()) {
804 /* 804 /*
805 * Mark as guard pages (or page) so they can be 805 * Mark as guard pages (or page) so they can be
806 * merged back into the allocator when the buddy is freed. 806 * merged back into the allocator when the buddy is freed.
807 * Corresponding page table entries will not be touched; 807 * Corresponding page table entries will not be touched;
808 * the pages will stay not present in the virtual address space. 808 * the pages will stay not present in the virtual address space.
809 */ 809 */
810 INIT_LIST_HEAD(&page[size].lru); 810 INIT_LIST_HEAD(&page[size].lru);
811 set_page_guard_flag(&page[size]); 811 set_page_guard_flag(&page[size]);
812 set_page_private(&page[size], high); 812 set_page_private(&page[size], high);
813 /* Guard pages are not available for any usage */ 813 /* Guard pages are not available for any usage */
814 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); 814 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
815 continue; 815 continue;
816 } 816 }
817 #endif 817 #endif
818 list_add(&page[size].lru, &area->free_list[migratetype]); 818 list_add(&page[size].lru, &area->free_list[migratetype]);
819 area->nr_free++; 819 area->nr_free++;
820 set_page_order(&page[size], high); 820 set_page_order(&page[size], high);
821 } 821 }
822 } 822 }
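To see what expand() above actually hands back, here is a small standalone sketch (illustrative only; show_expand() is a made-up helper) that prints the buddy halves returned to the free lists when an order-'high' block is trimmed down to an order-'low' allocation.

#include <stdio.h>

/*
 * Print which buddy halves expand() would put back on the free lists when
 * a block of order 'high' is trimmed down to order 'low'.  Offsets are in
 * pages, relative to the start of the original block.
 */
static void show_expand(unsigned int low, unsigned int high)
{
        unsigned long size = 1UL << high;

        while (high > low) {
                high--;
                size >>= 1;
                printf("free half at page offset %lu, order %u (%lu pages)\n",
                       size, high, size);
        }
        printf("caller keeps pages [0..%lu), order %u\n", 1UL << low, low);
}

int main(void)
{
        /* e.g. an order-0 allocation satisfied from an order-3 block */
        show_expand(0, 3);
        return 0;
}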
823 823
824 /* 824 /*
825 * This page is about to be returned from the page allocator 825 * This page is about to be returned from the page allocator
826 */ 826 */
827 static inline int check_new_page(struct page *page) 827 static inline int check_new_page(struct page *page)
828 { 828 {
829 if (unlikely(page_mapcount(page) | 829 if (unlikely(page_mapcount(page) |
830 (page->mapping != NULL) | 830 (page->mapping != NULL) |
831 (atomic_read(&page->_count) != 0) | 831 (atomic_read(&page->_count) != 0) |
832 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 832 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
833 (mem_cgroup_bad_page_check(page)))) { 833 (mem_cgroup_bad_page_check(page)))) {
834 bad_page(page); 834 bad_page(page);
835 return 1; 835 return 1;
836 } 836 }
837 return 0; 837 return 0;
838 } 838 }
839 839
840 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 840 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
841 { 841 {
842 int i; 842 int i;
843 843
844 for (i = 0; i < (1 << order); i++) { 844 for (i = 0; i < (1 << order); i++) {
845 struct page *p = page + i; 845 struct page *p = page + i;
846 if (unlikely(check_new_page(p))) 846 if (unlikely(check_new_page(p)))
847 return 1; 847 return 1;
848 } 848 }
849 849
850 set_page_private(page, 0); 850 set_page_private(page, 0);
851 set_page_refcounted(page); 851 set_page_refcounted(page);
852 852
853 arch_alloc_page(page, order); 853 arch_alloc_page(page, order);
854 kernel_map_pages(page, 1 << order, 1); 854 kernel_map_pages(page, 1 << order, 1);
855 855
856 if (gfp_flags & __GFP_ZERO) 856 if (gfp_flags & __GFP_ZERO)
857 prep_zero_page(page, order, gfp_flags); 857 prep_zero_page(page, order, gfp_flags);
858 858
859 if (order && (gfp_flags & __GFP_COMP)) 859 if (order && (gfp_flags & __GFP_COMP))
860 prep_compound_page(page, order); 860 prep_compound_page(page, order);
861 861
862 return 0; 862 return 0;
863 } 863 }
864 864
865 /* 865 /*
866 * Go through the free lists for the given migratetype and remove 866 * Go through the free lists for the given migratetype and remove
867 * the smallest available page from the freelists 867 * the smallest available page from the freelists
868 */ 868 */
869 static inline 869 static inline
870 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 870 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
871 int migratetype) 871 int migratetype)
872 { 872 {
873 unsigned int current_order; 873 unsigned int current_order;
874 struct free_area * area; 874 struct free_area * area;
875 struct page *page; 875 struct page *page;
876 876
877 /* Find a page of the appropriate size in the preferred list */ 877 /* Find a page of the appropriate size in the preferred list */
878 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 878 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
879 area = &(zone->free_area[current_order]); 879 area = &(zone->free_area[current_order]);
880 if (list_empty(&area->free_list[migratetype])) 880 if (list_empty(&area->free_list[migratetype]))
881 continue; 881 continue;
882 882
883 page = list_entry(area->free_list[migratetype].next, 883 page = list_entry(area->free_list[migratetype].next,
884 struct page, lru); 884 struct page, lru);
885 list_del(&page->lru); 885 list_del(&page->lru);
886 rmv_page_order(page); 886 rmv_page_order(page);
887 area->nr_free--; 887 area->nr_free--;
888 expand(zone, page, order, current_order, area, migratetype); 888 expand(zone, page, order, current_order, area, migratetype);
889 return page; 889 return page;
890 } 890 }
891 891
892 return NULL; 892 return NULL;
893 } 893 }
894 894
895 895
896 /* 896 /*
897 * This array describes the order lists are fallen back to when 897 * This array describes the order lists are fallen back to when
898 * the free lists for the desirable migrate type are depleted 898 * the free lists for the desirable migrate type are depleted
899 */ 899 */
900 static int fallbacks[MIGRATE_TYPES][4] = { 900 static int fallbacks[MIGRATE_TYPES][4] = {
901 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 901 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
902 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 902 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
903 #ifdef CONFIG_CMA 903 #ifdef CONFIG_CMA
904 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 904 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
905 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 905 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
906 #else 906 #else
907 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 907 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
908 #endif 908 #endif
909 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 909 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
910 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 910 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
911 }; 911 };
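A standalone sketch of how __rmqueue_fallback() (further below) walks this table. The MIGRATE_* values are re-declared locally so the snippet compiles on its own, and CONFIG_CMA is assumed to be disabled, so the non-CMA rows apply; treat the exact enum values as assumptions, not a copy of mmzone.h.

#include <stdio.h>

/* Local re-declaration for illustration only (CMA assumed disabled). */
enum {
        MIGRATE_UNMOVABLE,
        MIGRATE_RECLAIMABLE,
        MIGRATE_MOVABLE,
        MIGRATE_PCPTYPES,       /* number of types on the pcp lists */
        MIGRATE_RESERVE = MIGRATE_PCPTYPES,
        MIGRATE_ISOLATE,
        MIGRATE_TYPES
};

static const char *names[] = {
        "UNMOVABLE", "RECLAIMABLE", "MOVABLE", "RESERVE", "ISOLATE",
};

static int fallbacks[MIGRATE_TYPES][4] = {
        [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
        [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
        [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
        [MIGRATE_RESERVE]     = { MIGRATE_RESERVE },    /* Never used */
        [MIGRATE_ISOLATE]     = { MIGRATE_RESERVE },    /* Never used */
};

int main(void)
{
        int start = MIGRATE_UNMOVABLE;
        int i;

        /*
         * __rmqueue_fallback() tries these lists in order until it finds a
         * free page or hits MIGRATE_RESERVE, which is handled separately.
         */
        for (i = 0; fallbacks[start][i] != MIGRATE_RESERVE; i++)
                printf("fallback %d for %s: %s\n",
                       i, names[start], names[fallbacks[start][i]]);
        return 0;
}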
912 912
913 /* 913 /*
914 * Move the free pages in a range to the free lists of the requested type. 914 * Move the free pages in a range to the free lists of the requested type.
915 * Note that start_page and end_page are not aligned on a pageblock 915 * Note that start_page and end_page are not aligned on a pageblock
916 * boundary. If alignment is required, use move_freepages_block() 916 * boundary. If alignment is required, use move_freepages_block()
917 */ 917 */
918 static int move_freepages(struct zone *zone, 918 static int move_freepages(struct zone *zone,
919 struct page *start_page, struct page *end_page, 919 struct page *start_page, struct page *end_page,
920 int migratetype) 920 int migratetype)
921 { 921 {
922 struct page *page; 922 struct page *page;
923 unsigned long order; 923 unsigned long order;
924 int pages_moved = 0; 924 int pages_moved = 0;
925 925
926 #ifndef CONFIG_HOLES_IN_ZONE 926 #ifndef CONFIG_HOLES_IN_ZONE
927 /* 927 /*
928 * page_zone is not safe to call in this context when 928 * page_zone is not safe to call in this context when
929 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant 929 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
930 * anyway as we check zone boundaries in move_freepages_block(). 930 * anyway as we check zone boundaries in move_freepages_block().
931 * Remove at a later date when no bug reports exist related to 931 * Remove at a later date when no bug reports exist related to
932 * grouping pages by mobility 932 * grouping pages by mobility
933 */ 933 */
934 BUG_ON(page_zone(start_page) != page_zone(end_page)); 934 BUG_ON(page_zone(start_page) != page_zone(end_page));
935 #endif 935 #endif
936 936
937 for (page = start_page; page <= end_page;) { 937 for (page = start_page; page <= end_page;) {
938 /* Make sure we are not inadvertently changing nodes */ 938 /* Make sure we are not inadvertently changing nodes */
939 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); 939 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
940 940
941 if (!pfn_valid_within(page_to_pfn(page))) { 941 if (!pfn_valid_within(page_to_pfn(page))) {
942 page++; 942 page++;
943 continue; 943 continue;
944 } 944 }
945 945
946 if (!PageBuddy(page)) { 946 if (!PageBuddy(page)) {
947 page++; 947 page++;
948 continue; 948 continue;
949 } 949 }
950 950
951 order = page_order(page); 951 order = page_order(page);
952 list_move(&page->lru, 952 list_move(&page->lru,
953 &zone->free_area[order].free_list[migratetype]); 953 &zone->free_area[order].free_list[migratetype]);
954 page += 1 << order; 954 page += 1 << order;
955 pages_moved += 1 << order; 955 pages_moved += 1 << order;
956 } 956 }
957 957
958 return pages_moved; 958 return pages_moved;
959 } 959 }
960 960
961 int move_freepages_block(struct zone *zone, struct page *page, 961 int move_freepages_block(struct zone *zone, struct page *page,
962 int migratetype) 962 int migratetype)
963 { 963 {
964 unsigned long start_pfn, end_pfn; 964 unsigned long start_pfn, end_pfn;
965 struct page *start_page, *end_page; 965 struct page *start_page, *end_page;
966 966
967 start_pfn = page_to_pfn(page); 967 start_pfn = page_to_pfn(page);
968 start_pfn = start_pfn & ~(pageblock_nr_pages-1); 968 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
969 start_page = pfn_to_page(start_pfn); 969 start_page = pfn_to_page(start_pfn);
970 end_page = start_page + pageblock_nr_pages - 1; 970 end_page = start_page + pageblock_nr_pages - 1;
971 end_pfn = start_pfn + pageblock_nr_pages - 1; 971 end_pfn = start_pfn + pageblock_nr_pages - 1;
972 972
973 /* Do not cross zone boundaries */ 973 /* Do not cross zone boundaries */
974 if (start_pfn < zone->zone_start_pfn) 974 if (start_pfn < zone->zone_start_pfn)
975 start_page = page; 975 start_page = page;
976 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) 976 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
977 return 0; 977 return 0;
978 978
979 return move_freepages(zone, start_page, end_page, migratetype); 979 return move_freepages(zone, start_page, end_page, migratetype);
980 } 980 }
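The pageblock rounding in move_freepages_block() above is plain bit arithmetic. The sketch below reproduces it in userspace, assuming order-9 pageblocks (512 pages), which is a common but not universal value.

#include <stdio.h>

/* Assumed pageblock size: order 9, i.e. 512 base pages. */
#define PAGEBLOCK_NR_PAGES 512UL

int main(void)
{
        unsigned long pfn = 262733;     /* arbitrary pfn inside some pageblock */
        unsigned long start_pfn, end_pfn;

        /*
         * Same rounding as move_freepages_block(): clear the low bits to
         * land on the first pfn of the enclosing pageblock.
         */
        start_pfn = pfn & ~(PAGEBLOCK_NR_PAGES - 1);
        end_pfn = start_pfn + PAGEBLOCK_NR_PAGES - 1;

        printf("pfn %lu lives in pageblock [%lu, %lu]\n",
               pfn, start_pfn, end_pfn);
        return 0;
}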
981 981
982 static void change_pageblock_range(struct page *pageblock_page, 982 static void change_pageblock_range(struct page *pageblock_page,
983 int start_order, int migratetype) 983 int start_order, int migratetype)
984 { 984 {
985 int nr_pageblocks = 1 << (start_order - pageblock_order); 985 int nr_pageblocks = 1 << (start_order - pageblock_order);
986 986
987 while (nr_pageblocks--) { 987 while (nr_pageblocks--) {
988 set_pageblock_migratetype(pageblock_page, migratetype); 988 set_pageblock_migratetype(pageblock_page, migratetype);
989 pageblock_page += pageblock_nr_pages; 989 pageblock_page += pageblock_nr_pages;
990 } 990 }
991 } 991 }
992 992
993 /* Remove an element from the buddy allocator from the fallback list */ 993 /* Remove an element from the buddy allocator from the fallback list */
994 static inline struct page * 994 static inline struct page *
995 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 995 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
996 { 996 {
997 struct free_area * area; 997 struct free_area * area;
998 int current_order; 998 int current_order;
999 struct page *page; 999 struct page *page;
1000 int migratetype, i; 1000 int migratetype, i;
1001 1001
1002 /* Find the largest possible block of pages in the other list */ 1002 /* Find the largest possible block of pages in the other list */
1003 for (current_order = MAX_ORDER-1; current_order >= order; 1003 for (current_order = MAX_ORDER-1; current_order >= order;
1004 --current_order) { 1004 --current_order) {
1005 for (i = 0;; i++) { 1005 for (i = 0;; i++) {
1006 migratetype = fallbacks[start_migratetype][i]; 1006 migratetype = fallbacks[start_migratetype][i];
1007 1007
1008 /* MIGRATE_RESERVE handled later if necessary */ 1008 /* MIGRATE_RESERVE handled later if necessary */
1009 if (migratetype == MIGRATE_RESERVE) 1009 if (migratetype == MIGRATE_RESERVE)
1010 break; 1010 break;
1011 1011
1012 area = &(zone->free_area[current_order]); 1012 area = &(zone->free_area[current_order]);
1013 if (list_empty(&area->free_list[migratetype])) 1013 if (list_empty(&area->free_list[migratetype]))
1014 continue; 1014 continue;
1015 1015
1016 page = list_entry(area->free_list[migratetype].next, 1016 page = list_entry(area->free_list[migratetype].next,
1017 struct page, lru); 1017 struct page, lru);
1018 area->nr_free--; 1018 area->nr_free--;
1019 1019
1020 /* 1020 /*
1021 * If breaking a large block of pages, move all free 1021 * If breaking a large block of pages, move all free
1022 * pages to the preferred allocation list. If falling 1022 * pages to the preferred allocation list. If falling
1023 * back for a reclaimable kernel allocation, be more 1023 * back for a reclaimable kernel allocation, be more
1024 * aggressive about taking ownership of free pages 1024 * aggressive about taking ownership of free pages
1025 * 1025 *
1026 * On the other hand, never change migration 1026 * On the other hand, never change migration
1027 * type of MIGRATE_CMA pageblocks nor move CMA 1027 * type of MIGRATE_CMA pageblocks nor move CMA
1028 * pages to different free lists. We don't 1028 * pages to different free lists. We don't
1029 * want unmovable pages to be allocated from 1029 * want unmovable pages to be allocated from
1030 * MIGRATE_CMA areas. 1030 * MIGRATE_CMA areas.
1031 */ 1031 */
1032 if (!is_migrate_cma(migratetype) && 1032 if (!is_migrate_cma(migratetype) &&
1033 (unlikely(current_order >= pageblock_order / 2) || 1033 (unlikely(current_order >= pageblock_order / 2) ||
1034 start_migratetype == MIGRATE_RECLAIMABLE || 1034 start_migratetype == MIGRATE_RECLAIMABLE ||
1035 page_group_by_mobility_disabled)) { 1035 page_group_by_mobility_disabled)) {
1036 int pages; 1036 int pages;
1037 pages = move_freepages_block(zone, page, 1037 pages = move_freepages_block(zone, page,
1038 start_migratetype); 1038 start_migratetype);
1039 1039
1040 /* Claim the whole block if over half of it is free */ 1040 /* Claim the whole block if over half of it is free */
1041 if (pages >= (1 << (pageblock_order-1)) || 1041 if (pages >= (1 << (pageblock_order-1)) ||
1042 page_group_by_mobility_disabled) 1042 page_group_by_mobility_disabled)
1043 set_pageblock_migratetype(page, 1043 set_pageblock_migratetype(page,
1044 start_migratetype); 1044 start_migratetype);
1045 1045
1046 migratetype = start_migratetype; 1046 migratetype = start_migratetype;
1047 } 1047 }
1048 1048
1049 /* Remove the page from the freelists */ 1049 /* Remove the page from the freelists */
1050 list_del(&page->lru); 1050 list_del(&page->lru);
1051 rmv_page_order(page); 1051 rmv_page_order(page);
1052 1052
1053 /* Take ownership for orders >= pageblock_order */ 1053 /* Take ownership for orders >= pageblock_order */
1054 if (current_order >= pageblock_order && 1054 if (current_order >= pageblock_order &&
1055 !is_migrate_cma(migratetype)) 1055 !is_migrate_cma(migratetype))
1056 change_pageblock_range(page, current_order, 1056 change_pageblock_range(page, current_order,
1057 start_migratetype); 1057 start_migratetype);
1058 1058
1059 expand(zone, page, order, current_order, area, 1059 expand(zone, page, order, current_order, area,
1060 is_migrate_cma(migratetype) 1060 is_migrate_cma(migratetype)
1061 ? migratetype : start_migratetype); 1061 ? migratetype : start_migratetype);
1062 1062
1063 trace_mm_page_alloc_extfrag(page, order, current_order, 1063 trace_mm_page_alloc_extfrag(page, order, current_order,
1064 start_migratetype, migratetype); 1064 start_migratetype, migratetype);
1065 1065
1066 return page; 1066 return page;
1067 } 1067 }
1068 } 1068 }
1069 1069
1070 return NULL; 1070 return NULL;
1071 } 1071 }
1072 1072
1073 /* 1073 /*
1074 * Do the hard work of removing an element from the buddy allocator. 1074 * Do the hard work of removing an element from the buddy allocator.
1075 * Call me with the zone->lock already held. 1075 * Call me with the zone->lock already held.
1076 */ 1076 */
1077 static struct page *__rmqueue(struct zone *zone, unsigned int order, 1077 static struct page *__rmqueue(struct zone *zone, unsigned int order,
1078 int migratetype) 1078 int migratetype)
1079 { 1079 {
1080 struct page *page; 1080 struct page *page;
1081 1081
1082 retry_reserve: 1082 retry_reserve:
1083 page = __rmqueue_smallest(zone, order, migratetype); 1083 page = __rmqueue_smallest(zone, order, migratetype);
1084 1084
1085 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1085 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1086 page = __rmqueue_fallback(zone, order, migratetype); 1086 page = __rmqueue_fallback(zone, order, migratetype);
1087 1087
1088 /* 1088 /*
1089 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1089 * Use MIGRATE_RESERVE rather than fail an allocation. goto
1090 * is used because __rmqueue_smallest is an inline function 1090 * is used because __rmqueue_smallest is an inline function
1091 * and we want just one call site 1091 * and we want just one call site
1092 */ 1092 */
1093 if (!page) { 1093 if (!page) {
1094 migratetype = MIGRATE_RESERVE; 1094 migratetype = MIGRATE_RESERVE;
1095 goto retry_reserve; 1095 goto retry_reserve;
1096 } 1096 }
1097 } 1097 }
1098 1098
1099 trace_mm_page_alloc_zone_locked(page, order, migratetype); 1099 trace_mm_page_alloc_zone_locked(page, order, migratetype);
1100 return page; 1100 return page;
1101 } 1101 }
1102 1102
1103 /* 1103 /*
1104 * Obtain a specified number of elements from the buddy allocator, all under 1104 * Obtain a specified number of elements from the buddy allocator, all under
1105 * a single hold of the lock, for efficiency. Add them to the supplied list. 1105 * a single hold of the lock, for efficiency. Add them to the supplied list.
1106 * Returns the number of new pages which were placed at *list. 1106 * Returns the number of new pages which were placed at *list.
1107 */ 1107 */
1108 static int rmqueue_bulk(struct zone *zone, unsigned int order, 1108 static int rmqueue_bulk(struct zone *zone, unsigned int order,
1109 unsigned long count, struct list_head *list, 1109 unsigned long count, struct list_head *list,
1110 int migratetype, int cold) 1110 int migratetype, int cold)
1111 { 1111 {
1112 int mt = migratetype, i; 1112 int mt = migratetype, i;
1113 1113
1114 spin_lock(&zone->lock); 1114 spin_lock(&zone->lock);
1115 for (i = 0; i < count; ++i) { 1115 for (i = 0; i < count; ++i) {
1116 struct page *page = __rmqueue(zone, order, migratetype); 1116 struct page *page = __rmqueue(zone, order, migratetype);
1117 if (unlikely(page == NULL)) 1117 if (unlikely(page == NULL))
1118 break; 1118 break;
1119 1119
1120 /* 1120 /*
1121 * Split buddy pages returned by expand() are received here 1121 * Split buddy pages returned by expand() are received here
1122 * in physical page order. The page is added to the caller's 1122 * in physical page order. The page is added to the caller's
1123 * list and the list head then moves forward. From the caller's 1123 * list and the list head then moves forward. From the caller's
1124 * perspective, the linked list is ordered by page number in 1124 * perspective, the linked list is ordered by page number in
1125 * some conditions. This is useful for IO devices that can 1125 * some conditions. This is useful for IO devices that can
1126 * merge IO requests if the physical pages are ordered 1126 * merge IO requests if the physical pages are ordered
1127 * properly. 1127 * properly.
1128 */ 1128 */
1129 if (likely(cold == 0)) 1129 if (likely(cold == 0))
1130 list_add(&page->lru, list); 1130 list_add(&page->lru, list);
1131 else 1131 else
1132 list_add_tail(&page->lru, list); 1132 list_add_tail(&page->lru, list);
1133 if (IS_ENABLED(CONFIG_CMA)) { 1133 if (IS_ENABLED(CONFIG_CMA)) {
1134 mt = get_pageblock_migratetype(page); 1134 mt = get_pageblock_migratetype(page);
1135 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) 1135 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
1136 mt = migratetype; 1136 mt = migratetype;
1137 } 1137 }
1138 set_page_private(page, mt); 1138 set_page_private(page, mt);
1139 list = &page->lru; 1139 list = &page->lru;
1140 } 1140 }
1141 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1141 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1142 spin_unlock(&zone->lock); 1142 spin_unlock(&zone->lock);
1143 return i; 1143 return i;
1144 } 1144 }
1145 1145
1146 #ifdef CONFIG_NUMA 1146 #ifdef CONFIG_NUMA
1147 /* 1147 /*
1148 * Called from the vmstat counter updater to drain pagesets of this 1148 * Called from the vmstat counter updater to drain pagesets of this
1149 * currently executing processor on remote nodes after they have 1149 * currently executing processor on remote nodes after they have
1150 * expired. 1150 * expired.
1151 * 1151 *
1152 * Note that this function must be called with the thread pinned to 1152 * Note that this function must be called with the thread pinned to
1153 * a single processor. 1153 * a single processor.
1154 */ 1154 */
1155 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1155 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1156 { 1156 {
1157 unsigned long flags; 1157 unsigned long flags;
1158 int to_drain; 1158 int to_drain;
1159 1159
1160 local_irq_save(flags); 1160 local_irq_save(flags);
1161 if (pcp->count >= pcp->batch) 1161 if (pcp->count >= pcp->batch)
1162 to_drain = pcp->batch; 1162 to_drain = pcp->batch;
1163 else 1163 else
1164 to_drain = pcp->count; 1164 to_drain = pcp->count;
1165 if (to_drain > 0) { 1165 if (to_drain > 0) {
1166 free_pcppages_bulk(zone, to_drain, pcp); 1166 free_pcppages_bulk(zone, to_drain, pcp);
1167 pcp->count -= to_drain; 1167 pcp->count -= to_drain;
1168 } 1168 }
1169 local_irq_restore(flags); 1169 local_irq_restore(flags);
1170 } 1170 }
1171 #endif 1171 #endif
1172 1172
1173 /* 1173 /*
1174 * Drain pages of the indicated processor. 1174 * Drain pages of the indicated processor.
1175 * 1175 *
1176 * The processor must either be the current processor and the 1176 * The processor must either be the current processor and the
1177 * thread pinned to the current processor or a processor that 1177 * thread pinned to the current processor or a processor that
1178 * is not online. 1178 * is not online.
1179 */ 1179 */
1180 static void drain_pages(unsigned int cpu) 1180 static void drain_pages(unsigned int cpu)
1181 { 1181 {
1182 unsigned long flags; 1182 unsigned long flags;
1183 struct zone *zone; 1183 struct zone *zone;
1184 1184
1185 for_each_populated_zone(zone) { 1185 for_each_populated_zone(zone) {
1186 struct per_cpu_pageset *pset; 1186 struct per_cpu_pageset *pset;
1187 struct per_cpu_pages *pcp; 1187 struct per_cpu_pages *pcp;
1188 1188
1189 local_irq_save(flags); 1189 local_irq_save(flags);
1190 pset = per_cpu_ptr(zone->pageset, cpu); 1190 pset = per_cpu_ptr(zone->pageset, cpu);
1191 1191
1192 pcp = &pset->pcp; 1192 pcp = &pset->pcp;
1193 if (pcp->count) { 1193 if (pcp->count) {
1194 free_pcppages_bulk(zone, pcp->count, pcp); 1194 free_pcppages_bulk(zone, pcp->count, pcp);
1195 pcp->count = 0; 1195 pcp->count = 0;
1196 } 1196 }
1197 local_irq_restore(flags); 1197 local_irq_restore(flags);
1198 } 1198 }
1199 } 1199 }
1200 1200
1201 /* 1201 /*
1202 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1202 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1203 */ 1203 */
1204 void drain_local_pages(void *arg) 1204 void drain_local_pages(void *arg)
1205 { 1205 {
1206 drain_pages(smp_processor_id()); 1206 drain_pages(smp_processor_id());
1207 } 1207 }
1208 1208
1209 /* 1209 /*
1210 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1210 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1211 * 1211 *
1212 * Note that this code is protected against sending an IPI to an offline 1212 * Note that this code is protected against sending an IPI to an offline
1213 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1213 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1214 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1214 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1215 * nothing keeps CPUs from showing up after we populated the cpumask and 1215 * nothing keeps CPUs from showing up after we populated the cpumask and
1216 * before the call to on_each_cpu_mask(). 1216 * before the call to on_each_cpu_mask().
1217 */ 1217 */
1218 void drain_all_pages(void) 1218 void drain_all_pages(void)
1219 { 1219 {
1220 int cpu; 1220 int cpu;
1221 struct per_cpu_pageset *pcp; 1221 struct per_cpu_pageset *pcp;
1222 struct zone *zone; 1222 struct zone *zone;
1223 1223
1224 /* 1224 /*
1225 * Allocate in the BSS so we won't require allocation in the 1225 * Allocate in the BSS so we won't require allocation in the
1226 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y 1226 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
1227 */ 1227 */
1228 static cpumask_t cpus_with_pcps; 1228 static cpumask_t cpus_with_pcps;
1229 1229
1230 /* 1230 /*
1231 * We don't care about racing with CPU hotplug events, 1231 * We don't care about racing with CPU hotplug events,
1232 * as the offline notification will cause the notified 1232 * as the offline notification will cause the notified
1233 * CPU to drain its own pcps, and on_each_cpu_mask() 1233 * CPU to drain its own pcps, and on_each_cpu_mask()
1234 * disables preemption as part of its processing. 1234 * disables preemption as part of its processing.
1235 */ 1235 */
1236 for_each_online_cpu(cpu) { 1236 for_each_online_cpu(cpu) {
1237 bool has_pcps = false; 1237 bool has_pcps = false;
1238 for_each_populated_zone(zone) { 1238 for_each_populated_zone(zone) {
1239 pcp = per_cpu_ptr(zone->pageset, cpu); 1239 pcp = per_cpu_ptr(zone->pageset, cpu);
1240 if (pcp->pcp.count) { 1240 if (pcp->pcp.count) {
1241 has_pcps = true; 1241 has_pcps = true;
1242 break; 1242 break;
1243 } 1243 }
1244 } 1244 }
1245 if (has_pcps) 1245 if (has_pcps)
1246 cpumask_set_cpu(cpu, &cpus_with_pcps); 1246 cpumask_set_cpu(cpu, &cpus_with_pcps);
1247 else 1247 else
1248 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1248 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1249 } 1249 }
1250 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1250 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
1251 } 1251 }
1252 1252
1253 #ifdef CONFIG_HIBERNATION 1253 #ifdef CONFIG_HIBERNATION
1254 1254
1255 void mark_free_pages(struct zone *zone) 1255 void mark_free_pages(struct zone *zone)
1256 { 1256 {
1257 unsigned long pfn, max_zone_pfn; 1257 unsigned long pfn, max_zone_pfn;
1258 unsigned long flags; 1258 unsigned long flags;
1259 int order, t; 1259 int order, t;
1260 struct list_head *curr; 1260 struct list_head *curr;
1261 1261
1262 if (!zone->spanned_pages) 1262 if (!zone->spanned_pages)
1263 return; 1263 return;
1264 1264
1265 spin_lock_irqsave(&zone->lock, flags); 1265 spin_lock_irqsave(&zone->lock, flags);
1266 1266
1267 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1267 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1268 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1268 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1269 if (pfn_valid(pfn)) { 1269 if (pfn_valid(pfn)) {
1270 struct page *page = pfn_to_page(pfn); 1270 struct page *page = pfn_to_page(pfn);
1271 1271
1272 if (!swsusp_page_is_forbidden(page)) 1272 if (!swsusp_page_is_forbidden(page))
1273 swsusp_unset_page_free(page); 1273 swsusp_unset_page_free(page);
1274 } 1274 }
1275 1275
1276 for_each_migratetype_order(order, t) { 1276 for_each_migratetype_order(order, t) {
1277 list_for_each(curr, &zone->free_area[order].free_list[t]) { 1277 list_for_each(curr, &zone->free_area[order].free_list[t]) {
1278 unsigned long i; 1278 unsigned long i;
1279 1279
1280 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 1280 pfn = page_to_pfn(list_entry(curr, struct page, lru));
1281 for (i = 0; i < (1UL << order); i++) 1281 for (i = 0; i < (1UL << order); i++)
1282 swsusp_set_page_free(pfn_to_page(pfn + i)); 1282 swsusp_set_page_free(pfn_to_page(pfn + i));
1283 } 1283 }
1284 } 1284 }
1285 spin_unlock_irqrestore(&zone->lock, flags); 1285 spin_unlock_irqrestore(&zone->lock, flags);
1286 } 1286 }
1287 #endif /* CONFIG_HIBERNATION */ 1287 #endif /* CONFIG_HIBERNATION */
1288 1288
1289 /* 1289 /*
1290 * Free a 0-order page 1290 * Free a 0-order page
1291 * cold == 1 ? free a cold page : free a hot page 1291 * cold == 1 ? free a cold page : free a hot page
1292 */ 1292 */
1293 void free_hot_cold_page(struct page *page, int cold) 1293 void free_hot_cold_page(struct page *page, int cold)
1294 { 1294 {
1295 struct zone *zone = page_zone(page); 1295 struct zone *zone = page_zone(page);
1296 struct per_cpu_pages *pcp; 1296 struct per_cpu_pages *pcp;
1297 unsigned long flags; 1297 unsigned long flags;
1298 int migratetype; 1298 int migratetype;
1299 int wasMlocked = __TestClearPageMlocked(page); 1299 int wasMlocked = __TestClearPageMlocked(page);
1300 1300
1301 if (!free_pages_prepare(page, 0)) 1301 if (!free_pages_prepare(page, 0))
1302 return; 1302 return;
1303 1303
1304 migratetype = get_pageblock_migratetype(page); 1304 migratetype = get_pageblock_migratetype(page);
1305 set_page_private(page, migratetype); 1305 set_page_private(page, migratetype);
1306 local_irq_save(flags); 1306 local_irq_save(flags);
1307 if (unlikely(wasMlocked)) 1307 if (unlikely(wasMlocked))
1308 free_page_mlock(page); 1308 free_page_mlock(page);
1309 __count_vm_event(PGFREE); 1309 __count_vm_event(PGFREE);
1310 1310
1311 /* 1311 /*
1312 * We only track unmovable, reclaimable and movable on pcp lists. 1312 * We only track unmovable, reclaimable and movable on pcp lists.
1313 * Free ISOLATE pages back to the allocator because they are being 1313 * Free ISOLATE pages back to the allocator because they are being
1314 * offlined but treat RESERVE as movable pages so we can get those 1314 * offlined but treat RESERVE as movable pages so we can get those
1315 * areas back if necessary. Otherwise, we may have to free 1315 * areas back if necessary. Otherwise, we may have to free
1316 * excessively into the page allocator 1316 * excessively into the page allocator
1317 */ 1317 */
1318 if (migratetype >= MIGRATE_PCPTYPES) { 1318 if (migratetype >= MIGRATE_PCPTYPES) {
1319 if (unlikely(migratetype == MIGRATE_ISOLATE)) { 1319 if (unlikely(migratetype == MIGRATE_ISOLATE)) {
1320 free_one_page(zone, page, 0, migratetype); 1320 free_one_page(zone, page, 0, migratetype);
1321 goto out; 1321 goto out;
1322 } 1322 }
1323 migratetype = MIGRATE_MOVABLE; 1323 migratetype = MIGRATE_MOVABLE;
1324 } 1324 }
1325 1325
1326 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1326 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1327 if (cold) 1327 if (cold)
1328 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1328 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1329 else 1329 else
1330 list_add(&page->lru, &pcp->lists[migratetype]); 1330 list_add(&page->lru, &pcp->lists[migratetype]);
1331 pcp->count++; 1331 pcp->count++;
1332 if (pcp->count >= pcp->high) { 1332 if (pcp->count >= pcp->high) {
1333 free_pcppages_bulk(zone, pcp->batch, pcp); 1333 free_pcppages_bulk(zone, pcp->batch, pcp);
1334 pcp->count -= pcp->batch; 1334 pcp->count -= pcp->batch;
1335 } 1335 }
1336 1336
1337 out: 1337 out:
1338 local_irq_restore(flags); 1338 local_irq_restore(flags);
1339 } 1339 }
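The high/batch trimming at the end of free_hot_cold_page() above can be shown with a toy simulation; struct pcp_sim and its numbers are invented for the example and merely stand in for the real per-cpu pageset.

#include <stdio.h>

/*
 * Hypothetical per-cpu list bookkeeping mirroring the trim in
 * free_hot_cold_page(): once the list grows to 'high', a whole 'batch'
 * is handed back to the buddy allocator in one go.
 */
struct pcp_sim {
        int count;
        int high;
        int batch;
};

static void sim_free_one(struct pcp_sim *pcp)
{
        pcp->count++;
        if (pcp->count >= pcp->high) {
                printf("count hit %d: returning a batch of %d pages\n",
                       pcp->count, pcp->batch);
                pcp->count -= pcp->batch;
        }
}

int main(void)
{
        struct pcp_sim pcp = { .count = 0, .high = 6, .batch = 2 };
        int i;

        for (i = 0; i < 10; i++)
                sim_free_one(&pcp);
        printf("final count: %d\n", pcp.count);
        return 0;
}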
1340 1340
1341 /* 1341 /*
1342 * Free a list of 0-order pages 1342 * Free a list of 0-order pages
1343 */ 1343 */
1344 void free_hot_cold_page_list(struct list_head *list, int cold) 1344 void free_hot_cold_page_list(struct list_head *list, int cold)
1345 { 1345 {
1346 struct page *page, *next; 1346 struct page *page, *next;
1347 1347
1348 list_for_each_entry_safe(page, next, list, lru) { 1348 list_for_each_entry_safe(page, next, list, lru) {
1349 trace_mm_page_free_batched(page, cold); 1349 trace_mm_page_free_batched(page, cold);
1350 free_hot_cold_page(page, cold); 1350 free_hot_cold_page(page, cold);
1351 } 1351 }
1352 } 1352 }
1353 1353
1354 /* 1354 /*
1355 * split_page takes a non-compound higher-order page, and splits it into 1355 * split_page takes a non-compound higher-order page, and splits it into
1356 * n (1<<order) sub-pages: page[0..n-1]. 1356 * n (1<<order) sub-pages: page[0..n-1].
1357 * Each sub-page must be freed individually. 1357 * Each sub-page must be freed individually.
1358 * 1358 *
1359 * Note: this is probably too low level an operation for use in drivers. 1359 * Note: this is probably too low level an operation for use in drivers.
1360 * Please consult with lkml before using this in your driver. 1360 * Please consult with lkml before using this in your driver.
1361 */ 1361 */
1362 void split_page(struct page *page, unsigned int order) 1362 void split_page(struct page *page, unsigned int order)
1363 { 1363 {
1364 int i; 1364 int i;
1365 1365
1366 VM_BUG_ON(PageCompound(page)); 1366 VM_BUG_ON(PageCompound(page));
1367 VM_BUG_ON(!page_count(page)); 1367 VM_BUG_ON(!page_count(page));
1368 1368
1369 #ifdef CONFIG_KMEMCHECK 1369 #ifdef CONFIG_KMEMCHECK
1370 /* 1370 /*
1371 * Split shadow pages too, because free(page[0]) would 1371 * Split shadow pages too, because free(page[0]) would
1372 * otherwise free the whole shadow. 1372 * otherwise free the whole shadow.
1373 */ 1373 */
1374 if (kmemcheck_page_is_tracked(page)) 1374 if (kmemcheck_page_is_tracked(page))
1375 split_page(virt_to_page(page[0].shadow), order); 1375 split_page(virt_to_page(page[0].shadow), order);
1376 #endif 1376 #endif
1377 1377
1378 for (i = 1; i < (1 << order); i++) 1378 for (i = 1; i < (1 << order); i++)
1379 set_page_refcounted(page + i); 1379 set_page_refcounted(page + i);
1380 } 1380 }
1381 1381
1382 /* 1382 /*
1383 * Similar to split_page except the page is already free. As this is only 1383 * Similar to split_page except the page is already free. As this is only
1384 * being used for migration, the migratetype of the block also changes. 1384 * being used for migration, the migratetype of the block also changes.
1385 * As this is called with interrupts disabled, the caller is responsible 1385 * As this is called with interrupts disabled, the caller is responsible
1386 * for calling arch_alloc_page() and kernel_map_pages() after interrupts 1386 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
1387 * are enabled. 1387 * are enabled.
1388 * 1388 *
1389 * Note: this is probably too low level an operation for use in drivers. 1389 * Note: this is probably too low level an operation for use in drivers.
1390 * Please consult with lkml before using this in your driver. 1390 * Please consult with lkml before using this in your driver.
1391 */ 1391 */
1392 int split_free_page(struct page *page) 1392 int split_free_page(struct page *page)
1393 { 1393 {
1394 unsigned int order; 1394 unsigned int order;
1395 unsigned long watermark; 1395 unsigned long watermark;
1396 struct zone *zone; 1396 struct zone *zone;
1397 1397
1398 BUG_ON(!PageBuddy(page)); 1398 BUG_ON(!PageBuddy(page));
1399 1399
1400 zone = page_zone(page); 1400 zone = page_zone(page);
1401 order = page_order(page); 1401 order = page_order(page);
1402 1402
1403 /* Obey watermarks as if the page was being allocated */ 1403 /* Obey watermarks as if the page was being allocated */
1404 watermark = low_wmark_pages(zone) + (1 << order); 1404 watermark = low_wmark_pages(zone) + (1 << order);
1405 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1405 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1406 return 0; 1406 return 0;
1407 1407
1408 /* Remove page from free list */ 1408 /* Remove page from free list */
1409 list_del(&page->lru); 1409 list_del(&page->lru);
1410 zone->free_area[order].nr_free--; 1410 zone->free_area[order].nr_free--;
1411 rmv_page_order(page); 1411 rmv_page_order(page);
1412 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); 1412 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
1413 1413
1414 /* Split into individual pages */ 1414 /* Split into individual pages */
1415 set_page_refcounted(page); 1415 set_page_refcounted(page);
1416 split_page(page, order); 1416 split_page(page, order);
1417 1417
1418 if (order >= pageblock_order - 1) { 1418 if (order >= pageblock_order - 1) {
1419 struct page *endpage = page + (1 << order) - 1; 1419 struct page *endpage = page + (1 << order) - 1;
1420 for (; page < endpage; page += pageblock_nr_pages) { 1420 for (; page < endpage; page += pageblock_nr_pages) {
1421 int mt = get_pageblock_migratetype(page); 1421 int mt = get_pageblock_migratetype(page);
1422 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) 1422 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
1423 set_pageblock_migratetype(page, 1423 set_pageblock_migratetype(page,
1424 MIGRATE_MOVABLE); 1424 MIGRATE_MOVABLE);
1425 } 1425 }
1426 } 1426 }
1427 1427
1428 return 1 << order; 1428 return 1 << order;
1429 } 1429 }
1430 1430
1431 /* 1431 /*
1432 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1432 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
1433 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1433 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1434 * or two. 1434 * or two.
1435 */ 1435 */
1436 static inline 1436 static inline
1437 struct page *buffered_rmqueue(struct zone *preferred_zone, 1437 struct page *buffered_rmqueue(struct zone *preferred_zone,
1438 struct zone *zone, int order, gfp_t gfp_flags, 1438 struct zone *zone, int order, gfp_t gfp_flags,
1439 int migratetype) 1439 int migratetype)
1440 { 1440 {
1441 unsigned long flags; 1441 unsigned long flags;
1442 struct page *page; 1442 struct page *page;
1443 int cold = !!(gfp_flags & __GFP_COLD); 1443 int cold = !!(gfp_flags & __GFP_COLD);
1444 1444
1445 again: 1445 again:
1446 if (likely(order == 0)) { 1446 if (likely(order == 0)) {
1447 struct per_cpu_pages *pcp; 1447 struct per_cpu_pages *pcp;
1448 struct list_head *list; 1448 struct list_head *list;
1449 1449
1450 local_irq_save(flags); 1450 local_irq_save(flags);
1451 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1451 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1452 list = &pcp->lists[migratetype]; 1452 list = &pcp->lists[migratetype];
1453 if (list_empty(list)) { 1453 if (list_empty(list)) {
1454 pcp->count += rmqueue_bulk(zone, 0, 1454 pcp->count += rmqueue_bulk(zone, 0,
1455 pcp->batch, list, 1455 pcp->batch, list,
1456 migratetype, cold); 1456 migratetype, cold);
1457 if (unlikely(list_empty(list))) 1457 if (unlikely(list_empty(list)))
1458 goto failed; 1458 goto failed;
1459 } 1459 }
1460 1460
1461 if (cold) 1461 if (cold)
1462 page = list_entry(list->prev, struct page, lru); 1462 page = list_entry(list->prev, struct page, lru);
1463 else 1463 else
1464 page = list_entry(list->next, struct page, lru); 1464 page = list_entry(list->next, struct page, lru);
1465 1465
1466 list_del(&page->lru); 1466 list_del(&page->lru);
1467 pcp->count--; 1467 pcp->count--;
1468 } else { 1468 } else {
1469 if (unlikely(gfp_flags & __GFP_NOFAIL)) { 1469 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1470 /* 1470 /*
1471 * __GFP_NOFAIL is not to be used in new code. 1471 * __GFP_NOFAIL is not to be used in new code.
1472 * 1472 *
1473 * All __GFP_NOFAIL callers should be fixed so that they 1473 * All __GFP_NOFAIL callers should be fixed so that they
1474 * properly detect and handle allocation failures. 1474 * properly detect and handle allocation failures.
1475 * 1475 *
1476 * We most definitely don't want callers attempting to 1476 * We most definitely don't want callers attempting to
1477 * allocate greater than order-1 page units with 1477 * allocate greater than order-1 page units with
1478 * __GFP_NOFAIL. 1478 * __GFP_NOFAIL.
1479 */ 1479 */
1480 WARN_ON_ONCE(order > 1); 1480 WARN_ON_ONCE(order > 1);
1481 } 1481 }
1482 spin_lock_irqsave(&zone->lock, flags); 1482 spin_lock_irqsave(&zone->lock, flags);
1483 page = __rmqueue(zone, order, migratetype); 1483 page = __rmqueue(zone, order, migratetype);
1484 spin_unlock(&zone->lock); 1484 spin_unlock(&zone->lock);
1485 if (!page) 1485 if (!page)
1486 goto failed; 1486 goto failed;
1487 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); 1487 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1488 } 1488 }
1489 1489
1490 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1490 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1491 zone_statistics(preferred_zone, zone, gfp_flags); 1491 zone_statistics(preferred_zone, zone, gfp_flags);
1492 local_irq_restore(flags); 1492 local_irq_restore(flags);
1493 1493
1494 VM_BUG_ON(bad_range(zone, page)); 1494 VM_BUG_ON(bad_range(zone, page));
1495 if (prep_new_page(page, order, gfp_flags)) 1495 if (prep_new_page(page, order, gfp_flags))
1496 goto again; 1496 goto again;
1497 return page; 1497 return page;
1498 1498
1499 failed: 1499 failed:
1500 local_irq_restore(flags); 1500 local_irq_restore(flags);
1501 return NULL; 1501 return NULL;
1502 } 1502 }
1503 1503
1504 /* The ALLOC_WMARK bits are used as an index to zone->watermark */ 1504 /* The ALLOC_WMARK bits are used as an index to zone->watermark */
1505 #define ALLOC_WMARK_MIN WMARK_MIN 1505 #define ALLOC_WMARK_MIN WMARK_MIN
1506 #define ALLOC_WMARK_LOW WMARK_LOW 1506 #define ALLOC_WMARK_LOW WMARK_LOW
1507 #define ALLOC_WMARK_HIGH WMARK_HIGH 1507 #define ALLOC_WMARK_HIGH WMARK_HIGH
1508 #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ 1508 #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1509 1509
1510 /* Mask to get the watermark bits */ 1510 /* Mask to get the watermark bits */
1511 #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) 1511 #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1512 1512
1513 #define ALLOC_HARDER 0x10 /* try to alloc harder */ 1513 #define ALLOC_HARDER 0x10 /* try to alloc harder */
1514 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 1514 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1515 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 1515 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
1516 1516
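A minimal standalone illustration of how these bits are combined and decoded. The WMARK_* values are re-declared here on the assumption that they match enum zone_watermarks in mmzone.h, and the ALLOC_* macros are copied from the definitions above so the snippet compiles on its own.

#include <stdio.h>

enum { WMARK_MIN, WMARK_LOW, WMARK_HIGH };      /* assumed ordering */

#define ALLOC_WMARK_MIN         WMARK_MIN
#define ALLOC_WMARK_LOW         WMARK_LOW
#define ALLOC_WMARK_HIGH        WMARK_HIGH
#define ALLOC_NO_WATERMARKS     0x04
#define ALLOC_WMARK_MASK        (ALLOC_NO_WATERMARKS - 1)
#define ALLOC_HARDER            0x10
#define ALLOC_HIGH              0x20

int main(void)
{
        int alloc_flags = ALLOC_WMARK_LOW | ALLOC_HIGH;

        /* The low bits index zone->watermark[]; the rest are behaviour bits. */
        printf("watermark index: %d\n", alloc_flags & ALLOC_WMARK_MASK);
        printf("alloc harder?    %d\n", !!(alloc_flags & ALLOC_HARDER));
        return 0;
}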
1517 #ifdef CONFIG_FAIL_PAGE_ALLOC 1517 #ifdef CONFIG_FAIL_PAGE_ALLOC
1518 1518
1519 static struct { 1519 static struct {
1520 struct fault_attr attr; 1520 struct fault_attr attr;
1521 1521
1522 u32 ignore_gfp_highmem; 1522 u32 ignore_gfp_highmem;
1523 u32 ignore_gfp_wait; 1523 u32 ignore_gfp_wait;
1524 u32 min_order; 1524 u32 min_order;
1525 } fail_page_alloc = { 1525 } fail_page_alloc = {
1526 .attr = FAULT_ATTR_INITIALIZER, 1526 .attr = FAULT_ATTR_INITIALIZER,
1527 .ignore_gfp_wait = 1, 1527 .ignore_gfp_wait = 1,
1528 .ignore_gfp_highmem = 1, 1528 .ignore_gfp_highmem = 1,
1529 .min_order = 1, 1529 .min_order = 1,
1530 }; 1530 };
1531 1531
1532 static int __init setup_fail_page_alloc(char *str) 1532 static int __init setup_fail_page_alloc(char *str)
1533 { 1533 {
1534 return setup_fault_attr(&fail_page_alloc.attr, str); 1534 return setup_fault_attr(&fail_page_alloc.attr, str);
1535 } 1535 }
1536 __setup("fail_page_alloc=", setup_fail_page_alloc); 1536 __setup("fail_page_alloc=", setup_fail_page_alloc);
1537 1537
1538 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1538 static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1539 { 1539 {
1540 if (order < fail_page_alloc.min_order) 1540 if (order < fail_page_alloc.min_order)
1541 return false; 1541 return false;
1542 if (gfp_mask & __GFP_NOFAIL) 1542 if (gfp_mask & __GFP_NOFAIL)
1543 return false; 1543 return false;
1544 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1544 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1545 return false; 1545 return false;
1546 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1546 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1547 return false; 1547 return false;
1548 1548
1549 return should_fail(&fail_page_alloc.attr, 1 << order); 1549 return should_fail(&fail_page_alloc.attr, 1 << order);
1550 } 1550 }
1551 1551
1552 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 1552 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1553 1553
1554 static int __init fail_page_alloc_debugfs(void) 1554 static int __init fail_page_alloc_debugfs(void)
1555 { 1555 {
1556 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1556 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1557 struct dentry *dir; 1557 struct dentry *dir;
1558 1558
1559 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1559 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1560 &fail_page_alloc.attr); 1560 &fail_page_alloc.attr);
1561 if (IS_ERR(dir)) 1561 if (IS_ERR(dir))
1562 return PTR_ERR(dir); 1562 return PTR_ERR(dir);
1563 1563
1564 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1564 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1565 &fail_page_alloc.ignore_gfp_wait)) 1565 &fail_page_alloc.ignore_gfp_wait))
1566 goto fail; 1566 goto fail;
1567 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, 1567 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1568 &fail_page_alloc.ignore_gfp_highmem)) 1568 &fail_page_alloc.ignore_gfp_highmem))
1569 goto fail; 1569 goto fail;
1570 if (!debugfs_create_u32("min-order", mode, dir, 1570 if (!debugfs_create_u32("min-order", mode, dir,
1571 &fail_page_alloc.min_order)) 1571 &fail_page_alloc.min_order))
1572 goto fail; 1572 goto fail;
1573 1573
1574 return 0; 1574 return 0;
1575 fail: 1575 fail:
1576 debugfs_remove_recursive(dir); 1576 debugfs_remove_recursive(dir);
1577 1577
1578 return -ENOMEM; 1578 return -ENOMEM;
1579 } 1579 }
1580 1580
1581 late_initcall(fail_page_alloc_debugfs); 1581 late_initcall(fail_page_alloc_debugfs);
1582 1582
1583 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 1583 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1584 1584
1585 #else /* CONFIG_FAIL_PAGE_ALLOC */ 1585 #else /* CONFIG_FAIL_PAGE_ALLOC */
1586 1586
1587 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1587 static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1588 { 1588 {
1589 return false; 1589 return false;
1590 } 1590 }
1591 1591
1592 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1592 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1593 1593
1594 /* 1594 /*
1595 * Return true if free pages are above 'mark'. This takes into account the order 1595 * Return true if free pages are above 'mark'. This takes into account the order
1596 * of the allocation. 1596 * of the allocation.
1597 */ 1597 */
1598 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1598 static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1599 int classzone_idx, int alloc_flags, long free_pages) 1599 int classzone_idx, int alloc_flags, long free_pages)
1600 { 1600 {
1601 /* free_pages may go negative - that's OK */ 1601 /* free_pages may go negative - that's OK */
1602 long min = mark; 1602 long min = mark;
1603 long lowmem_reserve = z->lowmem_reserve[classzone_idx]; 1603 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1604 int o; 1604 int o;
1605 1605
1606 free_pages -= (1 << order) - 1; 1606 free_pages -= (1 << order) - 1;
1607 if (alloc_flags & ALLOC_HIGH) 1607 if (alloc_flags & ALLOC_HIGH)
1608 min -= min / 2; 1608 min -= min / 2;
1609 if (alloc_flags & ALLOC_HARDER) 1609 if (alloc_flags & ALLOC_HARDER)
1610 min -= min / 4; 1610 min -= min / 4;
1611 1611
1612 if (free_pages <= min + lowmem_reserve) 1612 if (free_pages <= min + lowmem_reserve)
1613 return false; 1613 return false;
1614 for (o = 0; o < order; o++) { 1614 for (o = 0; o < order; o++) {
1615 /* At the next order, this order's pages become unavailable */ 1615 /* At the next order, this order's pages become unavailable */
1616 free_pages -= z->free_area[o].nr_free << o; 1616 free_pages -= z->free_area[o].nr_free << o;
1617 1617
1618 /* Require fewer higher order pages to be free */ 1618 /* Require fewer higher order pages to be free */
1619 min >>= 1; 1619 min >>= 1;
1620 1620
1621 if (free_pages <= min) 1621 if (free_pages <= min)
1622 return false; 1622 return false;
1623 } 1623 }
1624 return true; 1624 return true;
1625 } 1625 }
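The loop above demands progressively fewer free pages for each order already consumed: the reserve is halved per step while blocks too small to serve the request are subtracted from the free count. A minimal userspace sketch of that check follows; the toy_zone layout, the numbers, and the reduction of ALLOC_HIGH/ALLOC_HARDER to booleans are all invented for illustration.

/*
 * Minimal userspace sketch of the order-aware watermark test above.
 * The zone contents are made up; ALLOC_HIGH and ALLOC_HARDER are
 * reduced to plain booleans.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_MAX_ORDER 11

struct toy_zone {
	long nr_free[TOY_MAX_ORDER];	/* free blocks per order */
	long lowmem_reserve;
};

static bool toy_watermark_ok(const struct toy_zone *z, int order, long mark,
			     long free_pages, bool high, bool harder)
{
	long min = mark;

	free_pages -= (1 << order) - 1;	/* worst-case rounding, as above */
	if (high)
		min -= min / 2;		/* __GFP_HIGH may dip below the mark */
	if (harder)
		min -= min / 4;		/* atomic/realtime callers dip further */

	if (free_pages <= min + z->lowmem_reserve)
		return false;

	for (int o = 0; o < order; o++) {
		/* blocks of this order cannot serve a larger request */
		free_pages -= z->nr_free[o] << o;
		min >>= 1;		/* require fewer higher-order pages */
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	struct toy_zone z = { .nr_free = { 512, 128, 32, 8 }, .lowmem_reserve = 32 };
	long free = 512 + 128 * 2 + 32 * 4 + 8 * 8;	/* 960 pages total */

	printf("order-0 ok: %d\n", toy_watermark_ok(&z, 0, 256, free, false, false));
	printf("order-3 ok: %d\n", toy_watermark_ok(&z, 3, 256, free, false, false));
	return 0;
}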
1626 1626
1627 #ifdef CONFIG_MEMORY_ISOLATION 1627 #ifdef CONFIG_MEMORY_ISOLATION
1628 static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) 1628 static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1629 { 1629 {
1630 if (unlikely(zone->nr_pageblock_isolate)) 1630 if (unlikely(zone->nr_pageblock_isolate))
1631 return zone->nr_pageblock_isolate * pageblock_nr_pages; 1631 return zone->nr_pageblock_isolate * pageblock_nr_pages;
1632 return 0; 1632 return 0;
1633 } 1633 }
1634 #else 1634 #else
1635 static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) 1635 static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1636 { 1636 {
1637 return 0; 1637 return 0;
1638 } 1638 }
1639 #endif 1639 #endif
1640 1640
1641 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1641 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1642 int classzone_idx, int alloc_flags) 1642 int classzone_idx, int alloc_flags)
1643 { 1643 {
1644 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1644 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1645 zone_page_state(z, NR_FREE_PAGES)); 1645 zone_page_state(z, NR_FREE_PAGES));
1646 } 1646 }
1647 1647
1648 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 1648 bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1649 int classzone_idx, int alloc_flags) 1649 int classzone_idx, int alloc_flags)
1650 { 1650 {
1651 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1651 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1652 1652
1653 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1653 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1654 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1654 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1655 1655
1656 /* 1656 /*
1657 * If the zone has MIGRATE_ISOLATE type free pages, we should consider 1657 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
1658 * it. nr_zone_isolate_freepages is never accurate so kswapd might not 1658 * it. nr_zone_isolate_freepages is never accurate so kswapd might not
1659 * sleep although it could do so. But this is more desirable for memory 1659 * sleep although it could do so. But this is more desirable for memory
1660 * hotplug than sleeping which can cause a livelock in the direct 1660 * hotplug than sleeping which can cause a livelock in the direct
1661 * reclaim path. 1661 * reclaim path.
1662 */ 1662 */
1663 free_pages -= nr_zone_isolate_freepages(z); 1663 free_pages -= nr_zone_isolate_freepages(z);
1664 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1664 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1665 free_pages); 1665 free_pages);
1666 } 1666 }
1667 1667
1668 #ifdef CONFIG_NUMA 1668 #ifdef CONFIG_NUMA
1669 /* 1669 /*
1670 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1670 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1671 * skip over zones that are not allowed by the cpuset, or that have 1671 * skip over zones that are not allowed by the cpuset, or that have
1672 * been recently (in last second) found to be nearly full. See further 1672 * been recently (in last second) found to be nearly full. See further
1673 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1673 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1674 * that have to skip over a lot of full or unallowed zones. 1674 * that have to skip over a lot of full or unallowed zones.
1675 * 1675 *
1676 * If the zonelist cache is present in the passed in zonelist, then 1676 * If the zonelist cache is present in the passed in zonelist, then
1677 * returns a pointer to the allowed node mask (either the current 1677 * returns a pointer to the allowed node mask (either the current
1678 * task's mems_allowed, or node_states[N_HIGH_MEMORY].) 1678 * task's mems_allowed, or node_states[N_HIGH_MEMORY].)
1679 * 1679 *
1680 * If the zonelist cache is not available for this zonelist, does 1680 * If the zonelist cache is not available for this zonelist, does
1681 * nothing and returns NULL. 1681 * nothing and returns NULL.
1682 * 1682 *
1683 * If the fullzones BITMAP in the zonelist cache is stale (more than 1683 * If the fullzones BITMAP in the zonelist cache is stale (more than
1684 * a second since last zap'd) then we zap it out (clear its bits.) 1684 * a second since last zap'd) then we zap it out (clear its bits.)
1685 * 1685 *
1686 * We hold off even calling zlc_setup, until after we've checked the 1686 * We hold off even calling zlc_setup, until after we've checked the
1687 * first zone in the zonelist, on the theory that most allocations will 1687 * first zone in the zonelist, on the theory that most allocations will
1688 * be satisfied from that first zone, so best to examine that zone as 1688 * be satisfied from that first zone, so best to examine that zone as
1689 * quickly as we can. 1689 * quickly as we can.
1690 */ 1690 */
1691 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1691 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1692 { 1692 {
1693 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1693 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1694 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1694 nodemask_t *allowednodes; /* zonelist_cache approximation */
1695 1695
1696 zlc = zonelist->zlcache_ptr; 1696 zlc = zonelist->zlcache_ptr;
1697 if (!zlc) 1697 if (!zlc)
1698 return NULL; 1698 return NULL;
1699 1699
1700 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1700 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1701 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1701 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1702 zlc->last_full_zap = jiffies; 1702 zlc->last_full_zap = jiffies;
1703 } 1703 }
1704 1704
1705 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1705 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1706 &cpuset_current_mems_allowed : 1706 &cpuset_current_mems_allowed :
1707 &node_states[N_HIGH_MEMORY]; 1707 &node_states[N_HIGH_MEMORY];
1708 return allowednodes; 1708 return allowednodes;
1709 } 1709 }
1710 1710
1711 /* 1711 /*
1712 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1712 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1713 * if it is worth looking at further for free memory: 1713 * if it is worth looking at further for free memory:
1714 * 1) Check that the zone isn't thought to be full (doesn't have its 1714 * 1) Check that the zone isn't thought to be full (doesn't have its
1715 * bit set in the zonelist_cache fullzones BITMAP). 1715 * bit set in the zonelist_cache fullzones BITMAP).
1716 * 2) Check that the zones node (obtained from the zonelist_cache 1716 * 2) Check that the zones node (obtained from the zonelist_cache
1717 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1717 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1718 * Return true (non-zero) if zone is worth looking at further, or 1718 * Return true (non-zero) if zone is worth looking at further, or
1719 * else return false (zero) if it is not. 1719 * else return false (zero) if it is not.
1720 * 1720 *
1721 * This check -ignores- the distinction between various watermarks, 1721 * This check -ignores- the distinction between various watermarks,
1722 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1722 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1723 * found to be full for any variation of these watermarks, it will 1723 * found to be full for any variation of these watermarks, it will
1724 * be considered full for up to one second by all requests, unless 1724 * be considered full for up to one second by all requests, unless
1725 * we are so low on memory on all allowed nodes that we are forced 1725 * we are so low on memory on all allowed nodes that we are forced
1726 * into the second scan of the zonelist. 1726 * into the second scan of the zonelist.
1727 * 1727 *
1728 * In the second scan we ignore this zonelist cache and exactly 1728 * In the second scan we ignore this zonelist cache and exactly
1729 * apply the watermarks to all zones, even if it is slower to do so. 1729 * apply the watermarks to all zones, even if it is slower to do so.
1730 * We are low on memory in the second scan, and should leave no stone 1730 * We are low on memory in the second scan, and should leave no stone
1731 * unturned looking for a free page. 1731 * unturned looking for a free page.
1732 */ 1732 */
1733 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1733 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1734 nodemask_t *allowednodes) 1734 nodemask_t *allowednodes)
1735 { 1735 {
1736 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1736 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1737 int i; /* index of *z in zonelist zones */ 1737 int i; /* index of *z in zonelist zones */
1738 int n; /* node that zone *z is on */ 1738 int n; /* node that zone *z is on */
1739 1739
1740 zlc = zonelist->zlcache_ptr; 1740 zlc = zonelist->zlcache_ptr;
1741 if (!zlc) 1741 if (!zlc)
1742 return 1; 1742 return 1;
1743 1743
1744 i = z - zonelist->_zonerefs; 1744 i = z - zonelist->_zonerefs;
1745 n = zlc->z_to_n[i]; 1745 n = zlc->z_to_n[i];
1746 1746
1747 /* This zone is worth trying if it is allowed but not full */ 1747 /* This zone is worth trying if it is allowed but not full */
1748 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1748 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1749 } 1749 }
1750 1750
1751 /* 1751 /*
1752 * Given 'z' scanning a zonelist, set the corresponding bit in 1752 * Given 'z' scanning a zonelist, set the corresponding bit in
1753 * zlc->fullzones, so that subsequent attempts to allocate a page 1753 * zlc->fullzones, so that subsequent attempts to allocate a page
1754 * from that zone don't waste time re-examining it. 1754 * from that zone don't waste time re-examining it.
1755 */ 1755 */
1756 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1756 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1757 { 1757 {
1758 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1758 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1759 int i; /* index of *z in zonelist zones */ 1759 int i; /* index of *z in zonelist zones */
1760 1760
1761 zlc = zonelist->zlcache_ptr; 1761 zlc = zonelist->zlcache_ptr;
1762 if (!zlc) 1762 if (!zlc)
1763 return; 1763 return;
1764 1764
1765 i = z - zonelist->_zonerefs; 1765 i = z - zonelist->_zonerefs;
1766 1766
1767 set_bit(i, zlc->fullzones); 1767 set_bit(i, zlc->fullzones);
1768 } 1768 }
1769 1769
1770 /* 1770 /*
1771 * clear all zones full, called after direct reclaim makes progress so that 1771 * clear all zones full, called after direct reclaim makes progress so that
1772 * a zone that was recently full is not skipped over for up to a second 1772 * a zone that was recently full is not skipped over for up to a second
1773 */ 1773 */
1774 static void zlc_clear_zones_full(struct zonelist *zonelist) 1774 static void zlc_clear_zones_full(struct zonelist *zonelist)
1775 { 1775 {
1776 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1776 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1777 1777
1778 zlc = zonelist->zlcache_ptr; 1778 zlc = zonelist->zlcache_ptr;
1779 if (!zlc) 1779 if (!zlc)
1780 return; 1780 return;
1781 1781
1782 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1782 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1783 } 1783 }
1784 1784
1785 #else /* CONFIG_NUMA */ 1785 #else /* CONFIG_NUMA */
1786 1786
1787 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1787 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1788 { 1788 {
1789 return NULL; 1789 return NULL;
1790 } 1790 }
1791 1791
1792 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, 1792 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1793 nodemask_t *allowednodes) 1793 nodemask_t *allowednodes)
1794 { 1794 {
1795 return 1; 1795 return 1;
1796 } 1796 }
1797 1797
1798 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1798 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1799 { 1799 {
1800 } 1800 }
1801 1801
1802 static void zlc_clear_zones_full(struct zonelist *zonelist) 1802 static void zlc_clear_zones_full(struct zonelist *zonelist)
1803 { 1803 {
1804 } 1804 }
1805 #endif /* CONFIG_NUMA */ 1805 #endif /* CONFIG_NUMA */
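The zonelist cache above is just a per-zonelist bitmap of "looked full recently" zones that is wiped roughly once a second. A hedged userspace sketch of that idea is shown below; time() stands in for jiffies, a bool array stands in for the fullzones bitmap, and the allowed-nodes side is omitted, so this is an illustration of the caching policy rather than the kernel data structure.

/*
 * Userspace sketch of the zonelist cache: remember which zones looked
 * full and skip them until the next zap.  time() replaces jiffies and a
 * bool array replaces the real bitmap; names are invented.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#define TOY_MAX_ZONES 8

struct toy_zlc {
	time_t last_full_zap;		/* when the bitmap was last cleared */
	bool full[TOY_MAX_ZONES];	/* zones believed to be full */
};

static void toy_zlc_setup(struct toy_zlc *zlc)
{
	time_t now = time(NULL);

	if (now > zlc->last_full_zap + 1) {	/* stale: clear everything */
		memset(zlc->full, 0, sizeof(zlc->full));
		zlc->last_full_zap = now;
	}
}

static bool toy_zone_worth_trying(const struct toy_zlc *zlc, int zone)
{
	return !zlc->full[zone];	/* allowed-nodes check omitted */
}

static void toy_mark_zone_full(struct toy_zlc *zlc, int zone)
{
	zlc->full[zone] = true;
}

int main(void)
{
	struct toy_zlc zlc = { 0 };

	toy_zlc_setup(&zlc);
	toy_mark_zone_full(&zlc, 0);
	printf("zone 0 worth trying: %d\n", toy_zone_worth_trying(&zlc, 0));
	printf("zone 1 worth trying: %d\n", toy_zone_worth_trying(&zlc, 1));
	return 0;
}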
1806 1806
1807 /* 1807 /*
1808 * get_page_from_freelist goes through the zonelist trying to allocate 1808 * get_page_from_freelist goes through the zonelist trying to allocate
1809 * a page. 1809 * a page.
1810 */ 1810 */
1811 static struct page * 1811 static struct page *
1812 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1812 get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1813 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1813 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1814 struct zone *preferred_zone, int migratetype) 1814 struct zone *preferred_zone, int migratetype)
1815 { 1815 {
1816 struct zoneref *z; 1816 struct zoneref *z;
1817 struct page *page = NULL; 1817 struct page *page = NULL;
1818 int classzone_idx; 1818 int classzone_idx;
1819 struct zone *zone; 1819 struct zone *zone;
1820 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1820 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1821 int zlc_active = 0; /* set if using zonelist_cache */ 1821 int zlc_active = 0; /* set if using zonelist_cache */
1822 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1822 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1823 1823
1824 classzone_idx = zone_idx(preferred_zone); 1824 classzone_idx = zone_idx(preferred_zone);
1825 zonelist_scan: 1825 zonelist_scan:
1826 /* 1826 /*
1827 * Scan zonelist, looking for a zone with enough free. 1827 * Scan zonelist, looking for a zone with enough free.
1828 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1828 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1829 */ 1829 */
1830 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1830 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1831 high_zoneidx, nodemask) { 1831 high_zoneidx, nodemask) {
1832 if (NUMA_BUILD && zlc_active && 1832 if (NUMA_BUILD && zlc_active &&
1833 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1833 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1834 continue; 1834 continue;
1835 if ((alloc_flags & ALLOC_CPUSET) && 1835 if ((alloc_flags & ALLOC_CPUSET) &&
1836 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1836 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1837 continue; 1837 continue;
1838 /* 1838 /*
1839 * When allocating a page cache page for writing, we 1839 * When allocating a page cache page for writing, we
1840 * want to get it from a zone that is within its dirty 1840 * want to get it from a zone that is within its dirty
1841 * limit, such that no single zone holds more than its 1841 * limit, such that no single zone holds more than its
1842 * proportional share of globally allowed dirty pages. 1842 * proportional share of globally allowed dirty pages.
1843 * The dirty limits take into account the zone's 1843 * The dirty limits take into account the zone's
1844 * lowmem reserves and high watermark so that kswapd 1844 * lowmem reserves and high watermark so that kswapd
1845 * should be able to balance it without having to 1845 * should be able to balance it without having to
1846 * write pages from its LRU list. 1846 * write pages from its LRU list.
1847 * 1847 *
1848 * This may look like it could increase pressure on 1848 * This may look like it could increase pressure on
1849 * lower zones by failing allocations in higher zones 1849 * lower zones by failing allocations in higher zones
1850 * before they are full. But the pages that do spill 1850 * before they are full. But the pages that do spill
1851 * over are limited as the lower zones are protected 1851 * over are limited as the lower zones are protected
1852 * by this very same mechanism. It should not become 1852 * by this very same mechanism. It should not become
1853 * a practical burden to them. 1853 * a practical burden to them.
1854 * 1854 *
1855 * XXX: For now, allow allocations to potentially 1855 * XXX: For now, allow allocations to potentially
1856 * exceed the per-zone dirty limit in the slowpath 1856 * exceed the per-zone dirty limit in the slowpath
1857 * (ALLOC_WMARK_LOW unset) before going into reclaim, 1857 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1858 * which is important when on a NUMA setup the allowed 1858 * which is important when on a NUMA setup the allowed
1859 * zones are together not big enough to reach the 1859 * zones are together not big enough to reach the
1860 * global limit. The proper fix for these situations 1860 * global limit. The proper fix for these situations
1861 * will require awareness of zones in the 1861 * will require awareness of zones in the
1862 * dirty-throttling and the flusher threads. 1862 * dirty-throttling and the flusher threads.
1863 */ 1863 */
1864 if ((alloc_flags & ALLOC_WMARK_LOW) && 1864 if ((alloc_flags & ALLOC_WMARK_LOW) &&
1865 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 1865 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1866 goto this_zone_full; 1866 goto this_zone_full;
1867 1867
1868 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1868 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1869 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1869 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1870 unsigned long mark; 1870 unsigned long mark;
1871 int ret; 1871 int ret;
1872 1872
1873 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 1873 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1874 if (zone_watermark_ok(zone, order, mark, 1874 if (zone_watermark_ok(zone, order, mark,
1875 classzone_idx, alloc_flags)) 1875 classzone_idx, alloc_flags))
1876 goto try_this_zone; 1876 goto try_this_zone;
1877 1877
1878 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { 1878 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1879 /* 1879 /*
1880 * we do zlc_setup if there are multiple nodes 1880 * we do zlc_setup if there are multiple nodes
1881 * and before considering the first zone allowed 1881 * and before considering the first zone allowed
1882 * by the cpuset. 1882 * by the cpuset.
1883 */ 1883 */
1884 allowednodes = zlc_setup(zonelist, alloc_flags); 1884 allowednodes = zlc_setup(zonelist, alloc_flags);
1885 zlc_active = 1; 1885 zlc_active = 1;
1886 did_zlc_setup = 1; 1886 did_zlc_setup = 1;
1887 } 1887 }
1888 1888
1889 if (zone_reclaim_mode == 0) 1889 if (zone_reclaim_mode == 0)
1890 goto this_zone_full; 1890 goto this_zone_full;
1891 1891
1892 /* 1892 /*
1893 * As we may have just activated ZLC, check if the first 1893 * As we may have just activated ZLC, check if the first
1894 * eligible zone has failed zone_reclaim recently. 1894 * eligible zone has failed zone_reclaim recently.
1895 */ 1895 */
1896 if (NUMA_BUILD && zlc_active && 1896 if (NUMA_BUILD && zlc_active &&
1897 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1897 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1898 continue; 1898 continue;
1899 1899
1900 ret = zone_reclaim(zone, gfp_mask, order); 1900 ret = zone_reclaim(zone, gfp_mask, order);
1901 switch (ret) { 1901 switch (ret) {
1902 case ZONE_RECLAIM_NOSCAN: 1902 case ZONE_RECLAIM_NOSCAN:
1903 /* did not scan */ 1903 /* did not scan */
1904 continue; 1904 continue;
1905 case ZONE_RECLAIM_FULL: 1905 case ZONE_RECLAIM_FULL:
1906 /* scanned but unreclaimable */ 1906 /* scanned but unreclaimable */
1907 continue; 1907 continue;
1908 default: 1908 default:
1909 /* did we reclaim enough */ 1909 /* did we reclaim enough */
1910 if (!zone_watermark_ok(zone, order, mark, 1910 if (!zone_watermark_ok(zone, order, mark,
1911 classzone_idx, alloc_flags)) 1911 classzone_idx, alloc_flags))
1912 goto this_zone_full; 1912 goto this_zone_full;
1913 } 1913 }
1914 } 1914 }
1915 1915
1916 try_this_zone: 1916 try_this_zone:
1917 page = buffered_rmqueue(preferred_zone, zone, order, 1917 page = buffered_rmqueue(preferred_zone, zone, order,
1918 gfp_mask, migratetype); 1918 gfp_mask, migratetype);
1919 if (page) 1919 if (page)
1920 break; 1920 break;
1921 this_zone_full: 1921 this_zone_full:
1922 if (NUMA_BUILD) 1922 if (NUMA_BUILD)
1923 zlc_mark_zone_full(zonelist, z); 1923 zlc_mark_zone_full(zonelist, z);
1924 } 1924 }
1925 1925
1926 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1926 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1927 /* Disable zlc cache for second zonelist scan */ 1927 /* Disable zlc cache for second zonelist scan */
1928 zlc_active = 0; 1928 zlc_active = 0;
1929 goto zonelist_scan; 1929 goto zonelist_scan;
1930 } 1930 }
1931 return page; 1931 return page;
1932 } 1932 }
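get_page_from_freelist() above is essentially a two-pass scan: the first pass trusts the zonelist cache and skips zones it believes are full, and only if that yields nothing is the cache disabled and the scan repeated exactly. The sketch below keeps only that control flow; the per-zone watermark, cpuset and dirty-limit checks are collapsed into invented toy arrays.

/*
 * Control-flow sketch of the two-pass zonelist scan above.  The per-zone
 * checks are collapsed into a toy has_pages array and the zonelist cache
 * into a cached_full array; both are made up for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_NR_ZONES 3

static bool cached_full[TOY_NR_ZONES] = { false, true, false };
static bool has_pages[TOY_NR_ZONES]   = { false, true, false };

static int toy_get_page_from_freelist(void)
{
	bool zlc_active = true;

zonelist_scan:
	for (int i = 0; i < TOY_NR_ZONES; i++) {
		if (zlc_active && cached_full[i])
			continue;	/* believed full: skip on the first pass */
		if (has_pages[i])
			return i;	/* "allocated" from zone i */
		cached_full[i] = true;	/* remember the failure */
	}
	if (zlc_active) {
		zlc_active = false;	/* second pass: check every zone exactly */
		goto zonelist_scan;
	}
	return -1;			/* nothing available in any zone */
}

int main(void)
{
	/* zone 1 has memory but was cached as full, so only pass two finds it */
	printf("allocated from zone %d\n", toy_get_page_from_freelist());
	return 0;
}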
1933 1933
1934 /* 1934 /*
1935 * Large machines with many possible nodes should not always dump per-node 1935 * Large machines with many possible nodes should not always dump per-node
1936 * meminfo in irq context. 1936 * meminfo in irq context.
1937 */ 1937 */
1938 static inline bool should_suppress_show_mem(void) 1938 static inline bool should_suppress_show_mem(void)
1939 { 1939 {
1940 bool ret = false; 1940 bool ret = false;
1941 1941
1942 #if NODES_SHIFT > 8 1942 #if NODES_SHIFT > 8
1943 ret = in_interrupt(); 1943 ret = in_interrupt();
1944 #endif 1944 #endif
1945 return ret; 1945 return ret;
1946 } 1946 }
1947 1947
1948 static DEFINE_RATELIMIT_STATE(nopage_rs, 1948 static DEFINE_RATELIMIT_STATE(nopage_rs,
1949 DEFAULT_RATELIMIT_INTERVAL, 1949 DEFAULT_RATELIMIT_INTERVAL,
1950 DEFAULT_RATELIMIT_BURST); 1950 DEFAULT_RATELIMIT_BURST);
1951 1951
1952 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 1952 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1953 { 1953 {
1954 unsigned int filter = SHOW_MEM_FILTER_NODES; 1954 unsigned int filter = SHOW_MEM_FILTER_NODES;
1955 1955
1956 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 1956 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
1957 debug_guardpage_minorder() > 0) 1957 debug_guardpage_minorder() > 0)
1958 return; 1958 return;
1959 1959
1960 /* 1960 /*
1961 * This documents exceptions given to allocations in certain 1961 * This documents exceptions given to allocations in certain
1962 * contexts that are allowed to allocate outside current's set 1962 * contexts that are allowed to allocate outside current's set
1963 * of allowed nodes. 1963 * of allowed nodes.
1964 */ 1964 */
1965 if (!(gfp_mask & __GFP_NOMEMALLOC)) 1965 if (!(gfp_mask & __GFP_NOMEMALLOC))
1966 if (test_thread_flag(TIF_MEMDIE) || 1966 if (test_thread_flag(TIF_MEMDIE) ||
1967 (current->flags & (PF_MEMALLOC | PF_EXITING))) 1967 (current->flags & (PF_MEMALLOC | PF_EXITING)))
1968 filter &= ~SHOW_MEM_FILTER_NODES; 1968 filter &= ~SHOW_MEM_FILTER_NODES;
1969 if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) 1969 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
1970 filter &= ~SHOW_MEM_FILTER_NODES; 1970 filter &= ~SHOW_MEM_FILTER_NODES;
1971 1971
1972 if (fmt) { 1972 if (fmt) {
1973 struct va_format vaf; 1973 struct va_format vaf;
1974 va_list args; 1974 va_list args;
1975 1975
1976 va_start(args, fmt); 1976 va_start(args, fmt);
1977 1977
1978 vaf.fmt = fmt; 1978 vaf.fmt = fmt;
1979 vaf.va = &args; 1979 vaf.va = &args;
1980 1980
1981 pr_warn("%pV", &vaf); 1981 pr_warn("%pV", &vaf);
1982 1982
1983 va_end(args); 1983 va_end(args);
1984 } 1984 }
1985 1985
1986 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", 1986 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
1987 current->comm, order, gfp_mask); 1987 current->comm, order, gfp_mask);
1988 1988
1989 dump_stack(); 1989 dump_stack();
1990 if (!should_suppress_show_mem()) 1990 if (!should_suppress_show_mem())
1991 show_mem(filter); 1991 show_mem(filter);
1992 } 1992 }
1993 1993
1994 static inline int 1994 static inline int
1995 should_alloc_retry(gfp_t gfp_mask, unsigned int order, 1995 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1996 unsigned long did_some_progress, 1996 unsigned long did_some_progress,
1997 unsigned long pages_reclaimed) 1997 unsigned long pages_reclaimed)
1998 { 1998 {
1999 /* Do not loop if specifically requested */ 1999 /* Do not loop if specifically requested */
2000 if (gfp_mask & __GFP_NORETRY) 2000 if (gfp_mask & __GFP_NORETRY)
2001 return 0; 2001 return 0;
2002 2002
2003 /* Always retry if specifically requested */ 2003 /* Always retry if specifically requested */
2004 if (gfp_mask & __GFP_NOFAIL) 2004 if (gfp_mask & __GFP_NOFAIL)
2005 return 1; 2005 return 1;
2006 2006
2007 /* 2007 /*
2008 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim 2008 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
2009 * making forward progress without invoking OOM. Suspend also disables 2009 * making forward progress without invoking OOM. Suspend also disables
2010 * storage devices so kswapd will not help. Bail if we are suspending. 2010 * storage devices so kswapd will not help. Bail if we are suspending.
2011 */ 2011 */
2012 if (!did_some_progress && pm_suspended_storage()) 2012 if (!did_some_progress && pm_suspended_storage())
2013 return 0; 2013 return 0;
2014 2014
2015 /* 2015 /*
2016 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 2016 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
2017 * means __GFP_NOFAIL, but that may not be true in other 2017 * means __GFP_NOFAIL, but that may not be true in other
2018 * implementations. 2018 * implementations.
2019 */ 2019 */
2020 if (order <= PAGE_ALLOC_COSTLY_ORDER) 2020 if (order <= PAGE_ALLOC_COSTLY_ORDER)
2021 return 1; 2021 return 1;
2022 2022
2023 /* 2023 /*
2024 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is 2024 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
2025 * specified, then we retry until we no longer reclaim any pages 2025 * specified, then we retry until we no longer reclaim any pages
2026 * (above), or we've reclaimed an order of pages at least as 2026 * (above), or we've reclaimed an order of pages at least as
2027 * large as the allocation's order. In both cases, if the 2027 * large as the allocation's order. In both cases, if the
2028 * allocation still fails, we stop retrying. 2028 * allocation still fails, we stop retrying.
2029 */ 2029 */
2030 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 2030 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
2031 return 1; 2031 return 1;
2032 2032
2033 return 0; 2033 return 0;
2034 } 2034 }
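The comments above encode a small decision table: never retry __GFP_NORETRY, always retry __GFP_NOFAIL, treat low orders as implicitly nofail, and for costly orders keep going under __GFP_REPEAT only until roughly 1 << order pages have been reclaimed in total. A worked example of that last cutoff, with invented reclaim numbers, is sketched here.

/*
 * Worked example of the __GFP_REPEAT cutoff described above: an order-4
 * request (16 pages) stops retrying once at least 1 << 4 pages have been
 * reclaimed in total.  Numbers are invented for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

static bool toy_should_retry_costly_repeat(unsigned int order,
					   unsigned long pages_reclaimed)
{
	return pages_reclaimed < (1UL << order);
}

int main(void)
{
	unsigned long reclaimed = 0;
	unsigned int order = 4;

	while (toy_should_retry_costly_repeat(order, reclaimed)) {
		reclaimed += 5;		/* pretend each pass reclaims 5 pages */
		printf("reclaimed so far: %lu\n", reclaimed);
	}
	printf("stop retrying after %lu pages (>= %u)\n", reclaimed, 1U << order);
	return 0;
}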
2035 2035
2036 static inline struct page * 2036 static inline struct page *
2037 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2037 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2038 struct zonelist *zonelist, enum zone_type high_zoneidx, 2038 struct zonelist *zonelist, enum zone_type high_zoneidx,
2039 nodemask_t *nodemask, struct zone *preferred_zone, 2039 nodemask_t *nodemask, struct zone *preferred_zone,
2040 int migratetype) 2040 int migratetype)
2041 { 2041 {
2042 struct page *page; 2042 struct page *page;
2043 2043
2044 /* Acquire the OOM killer lock for the zones in zonelist */ 2044 /* Acquire the OOM killer lock for the zones in zonelist */
2045 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2045 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
2046 schedule_timeout_uninterruptible(1); 2046 schedule_timeout_uninterruptible(1);
2047 return NULL; 2047 return NULL;
2048 } 2048 }
2049 2049
2050 /* 2050 /*
2051 * Go through the zonelist yet one more time, keep very high watermark 2051 * Go through the zonelist yet one more time, keep very high watermark
2052 * here, this is only to catch a parallel oom killing, we must fail if 2052 * here, this is only to catch a parallel oom killing, we must fail if
2053 * we're still under heavy pressure. 2053 * we're still under heavy pressure.
2054 */ 2054 */
2055 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2055 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2056 order, zonelist, high_zoneidx, 2056 order, zonelist, high_zoneidx,
2057 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2057 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2058 preferred_zone, migratetype); 2058 preferred_zone, migratetype);
2059 if (page) 2059 if (page)
2060 goto out; 2060 goto out;
2061 2061
2062 if (!(gfp_mask & __GFP_NOFAIL)) { 2062 if (!(gfp_mask & __GFP_NOFAIL)) {
2063 /* The OOM killer will not help higher order allocs */ 2063 /* The OOM killer will not help higher order allocs */
2064 if (order > PAGE_ALLOC_COSTLY_ORDER) 2064 if (order > PAGE_ALLOC_COSTLY_ORDER)
2065 goto out; 2065 goto out;
2066 /* The OOM killer does not needlessly kill tasks for lowmem */ 2066 /* The OOM killer does not needlessly kill tasks for lowmem */
2067 if (high_zoneidx < ZONE_NORMAL) 2067 if (high_zoneidx < ZONE_NORMAL)
2068 goto out; 2068 goto out;
2069 /* 2069 /*
2070 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2070 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2071 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2071 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2072 * The caller should handle page allocation failure by itself if 2072 * The caller should handle page allocation failure by itself if
2073 * it specifies __GFP_THISNODE. 2073 * it specifies __GFP_THISNODE.
2074 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. 2074 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2075 */ 2075 */
2076 if (gfp_mask & __GFP_THISNODE) 2076 if (gfp_mask & __GFP_THISNODE)
2077 goto out; 2077 goto out;
2078 } 2078 }
2079 /* Exhausted what can be done so it's blamo time */ 2079 /* Exhausted what can be done so it's blamo time */
2080 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2080 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2081 2081
2082 out: 2082 out:
2083 clear_zonelist_oom(zonelist, gfp_mask); 2083 clear_zonelist_oom(zonelist, gfp_mask);
2084 return page; 2084 return page;
2085 } 2085 }
2086 2086
2087 #ifdef CONFIG_COMPACTION 2087 #ifdef CONFIG_COMPACTION
2088 /* Try memory compaction for high-order allocations before reclaim */ 2088 /* Try memory compaction for high-order allocations before reclaim */
2089 static struct page * 2089 static struct page *
2090 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2090 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2091 struct zonelist *zonelist, enum zone_type high_zoneidx, 2091 struct zonelist *zonelist, enum zone_type high_zoneidx,
2092 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2092 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2093 int migratetype, bool sync_migration, 2093 int migratetype, bool sync_migration,
2094 bool *deferred_compaction, 2094 bool *deferred_compaction,
2095 unsigned long *did_some_progress) 2095 unsigned long *did_some_progress)
2096 { 2096 {
2097 struct page *page; 2097 struct page *page;
2098 2098
2099 if (!order) 2099 if (!order)
2100 return NULL; 2100 return NULL;
2101 2101
2102 if (compaction_deferred(preferred_zone, order)) { 2102 if (compaction_deferred(preferred_zone, order)) {
2103 *deferred_compaction = true; 2103 *deferred_compaction = true;
2104 return NULL; 2104 return NULL;
2105 } 2105 }
2106 2106
2107 current->flags |= PF_MEMALLOC; 2107 current->flags |= PF_MEMALLOC;
2108 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2108 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2109 nodemask, sync_migration); 2109 nodemask, sync_migration);
2110 current->flags &= ~PF_MEMALLOC; 2110 current->flags &= ~PF_MEMALLOC;
2111 if (*did_some_progress != COMPACT_SKIPPED) { 2111 if (*did_some_progress != COMPACT_SKIPPED) {
2112 2112
2113 /* Page migration frees to the PCP lists but we want merging */ 2113 /* Page migration frees to the PCP lists but we want merging */
2114 drain_pages(get_cpu()); 2114 drain_pages(get_cpu());
2115 put_cpu(); 2115 put_cpu();
2116 2116
2117 page = get_page_from_freelist(gfp_mask, nodemask, 2117 page = get_page_from_freelist(gfp_mask, nodemask,
2118 order, zonelist, high_zoneidx, 2118 order, zonelist, high_zoneidx,
2119 alloc_flags, preferred_zone, 2119 alloc_flags, preferred_zone,
2120 migratetype); 2120 migratetype);
2121 if (page) { 2121 if (page) {
2122 preferred_zone->compact_considered = 0; 2122 preferred_zone->compact_considered = 0;
2123 preferred_zone->compact_defer_shift = 0; 2123 preferred_zone->compact_defer_shift = 0;
2124 if (order >= preferred_zone->compact_order_failed) 2124 if (order >= preferred_zone->compact_order_failed)
2125 preferred_zone->compact_order_failed = order + 1; 2125 preferred_zone->compact_order_failed = order + 1;
2126 count_vm_event(COMPACTSUCCESS); 2126 count_vm_event(COMPACTSUCCESS);
2127 return page; 2127 return page;
2128 } 2128 }
2129 2129
2130 /* 2130 /*
2131 * It's bad if compaction run occurs and fails. 2131 * It's bad if compaction run occurs and fails.
2132 * The most likely reason is that pages exist, 2132 * The most likely reason is that pages exist,
2133 * but not enough to satisfy watermarks. 2133 * but not enough to satisfy watermarks.
2134 */ 2134 */
2135 count_vm_event(COMPACTFAIL); 2135 count_vm_event(COMPACTFAIL);
2136 2136
2137 /* 2137 /*
2138 * As async compaction considers a subset of pageblocks, only 2138 * As async compaction considers a subset of pageblocks, only
2139 * defer if the failure was a sync compaction failure. 2139 * defer if the failure was a sync compaction failure.
2140 */ 2140 */
2141 if (sync_migration) 2141 if (sync_migration)
2142 defer_compaction(preferred_zone, order); 2142 defer_compaction(preferred_zone, order);
2143 2143
2144 cond_resched(); 2144 cond_resched();
2145 } 2145 }
2146 2146
2147 return NULL; 2147 return NULL;
2148 } 2148 }
2149 #else 2149 #else
2150 static inline struct page * 2150 static inline struct page *
2151 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2151 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2152 struct zonelist *zonelist, enum zone_type high_zoneidx, 2152 struct zonelist *zonelist, enum zone_type high_zoneidx,
2153 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2153 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2154 int migratetype, bool sync_migration, 2154 int migratetype, bool sync_migration,
2155 bool *deferred_compaction, 2155 bool *deferred_compaction,
2156 unsigned long *did_some_progress) 2156 unsigned long *did_some_progress)
2157 { 2157 {
2158 return NULL; 2158 return NULL;
2159 } 2159 }
2160 #endif /* CONFIG_COMPACTION */ 2160 #endif /* CONFIG_COMPACTION */
2161 2161
2162 /* Perform direct synchronous page reclaim */ 2162 /* Perform direct synchronous page reclaim */
2163 static int 2163 static int
2164 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2164 __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2165 nodemask_t *nodemask) 2165 nodemask_t *nodemask)
2166 { 2166 {
2167 struct reclaim_state reclaim_state; 2167 struct reclaim_state reclaim_state;
2168 int progress; 2168 int progress;
2169 2169
2170 cond_resched(); 2170 cond_resched();
2171 2171
2172 /* We now go into synchronous reclaim */ 2172 /* We now go into synchronous reclaim */
2173 cpuset_memory_pressure_bump(); 2173 cpuset_memory_pressure_bump();
2174 current->flags |= PF_MEMALLOC; 2174 current->flags |= PF_MEMALLOC;
2175 lockdep_set_current_reclaim_state(gfp_mask); 2175 lockdep_set_current_reclaim_state(gfp_mask);
2176 reclaim_state.reclaimed_slab = 0; 2176 reclaim_state.reclaimed_slab = 0;
2177 current->reclaim_state = &reclaim_state; 2177 current->reclaim_state = &reclaim_state;
2178 2178
2179 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2179 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2180 2180
2181 current->reclaim_state = NULL; 2181 current->reclaim_state = NULL;
2182 lockdep_clear_current_reclaim_state(); 2182 lockdep_clear_current_reclaim_state();
2183 current->flags &= ~PF_MEMALLOC; 2183 current->flags &= ~PF_MEMALLOC;
2184 2184
2185 cond_resched(); 2185 cond_resched();
2186 2186
2187 return progress; 2187 return progress;
2188 } 2188 }
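The PF_MEMALLOC set/clear bracket around try_to_free_pages() above is what allows the slowpath's "current->flags & PF_MEMALLOC" test further down to stop reclaim from recursing into itself. A hedged userspace model of that bracket follows; a plain global bool stands in for the task flag and the helpers are invented, so this only shows the recursion guard, not the kernel mechanics.

/*
 * Userspace model of the PF_MEMALLOC bracket above.  A global bool
 * stands in for PF_MEMALLOC in current->flags; the point is how the
 * flag stops reclaim from re-entering reclaim.
 */
#include <stdbool.h>
#include <stdio.h>

static bool toy_pf_memalloc;		/* stand-in for the task flag */

static int toy_allocate(void);

static int toy_reclaim(void)
{
	/* reclaim itself may need memory; the flag keeps that from recursing */
	printf("nested allocation during reclaim: %d\n", toy_allocate());
	return 42;			/* pretend some pages were freed */
}

static int toy_allocate(void)
{
	if (toy_pf_memalloc)
		return 0;		/* mirrors the "goto nopage" recursion check */

	toy_pf_memalloc = true;		/* current->flags |= PF_MEMALLOC */
	int progress = toy_reclaim();
	toy_pf_memalloc = false;	/* current->flags &= ~PF_MEMALLOC */

	return progress > 0;
}

int main(void)
{
	printf("outer allocation made progress: %d\n", toy_allocate());
	return 0;
}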
2189 2189
2190 /* The really slow allocator path where we enter direct reclaim */ 2190 /* The really slow allocator path where we enter direct reclaim */
2191 static inline struct page * 2191 static inline struct page *
2192 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2192 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2193 struct zonelist *zonelist, enum zone_type high_zoneidx, 2193 struct zonelist *zonelist, enum zone_type high_zoneidx,
2194 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2194 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2195 int migratetype, unsigned long *did_some_progress) 2195 int migratetype, unsigned long *did_some_progress)
2196 { 2196 {
2197 struct page *page = NULL; 2197 struct page *page = NULL;
2198 bool drained = false; 2198 bool drained = false;
2199 2199
2200 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2200 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2201 nodemask); 2201 nodemask);
2202 if (unlikely(!(*did_some_progress))) 2202 if (unlikely(!(*did_some_progress)))
2203 return NULL; 2203 return NULL;
2204 2204
2205 /* After successful reclaim, reconsider all zones for allocation */ 2205 /* After successful reclaim, reconsider all zones for allocation */
2206 if (NUMA_BUILD) 2206 if (NUMA_BUILD)
2207 zlc_clear_zones_full(zonelist); 2207 zlc_clear_zones_full(zonelist);
2208 2208
2209 retry: 2209 retry:
2210 page = get_page_from_freelist(gfp_mask, nodemask, order, 2210 page = get_page_from_freelist(gfp_mask, nodemask, order,
2211 zonelist, high_zoneidx, 2211 zonelist, high_zoneidx,
2212 alloc_flags, preferred_zone, 2212 alloc_flags, preferred_zone,
2213 migratetype); 2213 migratetype);
2214 2214
2215 /* 2215 /*
2216 * If an allocation failed after direct reclaim, it could be because 2216 * If an allocation failed after direct reclaim, it could be because
2217 * pages are pinned on the per-cpu lists. Drain them and try again 2217 * pages are pinned on the per-cpu lists. Drain them and try again
2218 */ 2218 */
2219 if (!page && !drained) { 2219 if (!page && !drained) {
2220 drain_all_pages(); 2220 drain_all_pages();
2221 drained = true; 2221 drained = true;
2222 goto retry; 2222 goto retry;
2223 } 2223 }
2224 2224
2225 return page; 2225 return page;
2226 } 2226 }
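The drained flag above buys exactly one extra pass: if the post-reclaim allocation fails, the per-cpu lists are flushed back to the buddy allocator and the freelist is tried once more. A minimal sketch of that one-shot retry pattern is below; the allocation and drain helpers are invented stand-ins.

/*
 * One-shot drain-and-retry pattern from above, as a userspace sketch.
 * toy_try_alloc() fails until toy_drain() returns the "pinned" pages;
 * both helpers are invented for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

static int pcp_pages = 4;	/* pages "stuck" on per-cpu lists */
static int buddy_pages;		/* pages visible to the allocator */

static bool toy_try_alloc(void)
{
	if (buddy_pages > 0) {
		buddy_pages--;
		return true;
	}
	return false;
}

static void toy_drain(void)
{
	buddy_pages += pcp_pages;	/* give pinned pages back */
	pcp_pages = 0;
}

int main(void)
{
	bool drained = false;
	bool ok;

retry:
	ok = toy_try_alloc();
	if (!ok && !drained) {
		toy_drain();
		drained = true;		/* only retry once after draining */
		goto retry;
	}
	printf("allocation %s\n", ok ? "succeeded after drain" : "failed");
	return 0;
}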
2227 2227
2228 /* 2228 /*
2229 * This is called in the allocator slow-path if the allocation request is of 2229 * This is called in the allocator slow-path if the allocation request is of
2230 * sufficient urgency to ignore watermarks and take other desperate measures 2230 * sufficient urgency to ignore watermarks and take other desperate measures
2231 */ 2231 */
2232 static inline struct page * 2232 static inline struct page *
2233 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2233 __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2234 struct zonelist *zonelist, enum zone_type high_zoneidx, 2234 struct zonelist *zonelist, enum zone_type high_zoneidx,
2235 nodemask_t *nodemask, struct zone *preferred_zone, 2235 nodemask_t *nodemask, struct zone *preferred_zone,
2236 int migratetype) 2236 int migratetype)
2237 { 2237 {
2238 struct page *page; 2238 struct page *page;
2239 2239
2240 do { 2240 do {
2241 page = get_page_from_freelist(gfp_mask, nodemask, order, 2241 page = get_page_from_freelist(gfp_mask, nodemask, order,
2242 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2242 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2243 preferred_zone, migratetype); 2243 preferred_zone, migratetype);
2244 2244
2245 if (!page && gfp_mask & __GFP_NOFAIL) 2245 if (!page && gfp_mask & __GFP_NOFAIL)
2246 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2246 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2247 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2247 } while (!page && (gfp_mask & __GFP_NOFAIL));
2248 2248
2249 return page; 2249 return page;
2250 } 2250 }
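For __GFP_NOFAIL callers the function above simply loops on the reserves, backing off briefly when the preferred zone is congested. The sketch below shows only that retry-with-backoff shape; usleep() stands in for wait_iff_congested(..., HZ/50) and the allocation helper is invented.

/*
 * Sketch of the __GFP_NOFAIL retry loop above.  usleep() replaces
 * wait_iff_congested() and the allocation helper is a stub that
 * succeeds on the third attempt.
 */
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static int attempts_left = 3;

static bool toy_alloc_no_watermarks(void)
{
	return --attempts_left == 0;	/* "succeeds" on the third try */
}

int main(void)
{
	bool page;

	do {
		page = toy_alloc_no_watermarks();
		if (!page)
			usleep(20000);	/* roughly HZ/50 worth of backoff */
	} while (!page);

	printf("nofail allocation eventually succeeded\n");
	return 0;
}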
2251 2251
2252 static inline 2252 static inline
2253 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 2253 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
2254 enum zone_type high_zoneidx, 2254 enum zone_type high_zoneidx,
2255 enum zone_type classzone_idx) 2255 enum zone_type classzone_idx)
2256 { 2256 {
2257 struct zoneref *z; 2257 struct zoneref *z;
2258 struct zone *zone; 2258 struct zone *zone;
2259 2259
2260 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2260 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2261 wakeup_kswapd(zone, order, classzone_idx); 2261 wakeup_kswapd(zone, order, classzone_idx);
2262 } 2262 }
2263 2263
2264 static inline int 2264 static inline int
2265 gfp_to_alloc_flags(gfp_t gfp_mask) 2265 gfp_to_alloc_flags(gfp_t gfp_mask)
2266 { 2266 {
2267 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2267 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
2268 const gfp_t wait = gfp_mask & __GFP_WAIT; 2268 const gfp_t wait = gfp_mask & __GFP_WAIT;
2269 2269
2270 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2270 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
2271 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); 2271 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
2272 2272
2273 /* 2273 /*
2274 * The caller may dip into page reserves a bit more if the caller 2274 * The caller may dip into page reserves a bit more if the caller
2275 * cannot run direct reclaim, or if the caller has realtime scheduling 2275 * cannot run direct reclaim, or if the caller has realtime scheduling
2276 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2276 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
2277 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 2277 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
2278 */ 2278 */
2279 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 2279 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
2280 2280
2281 if (!wait) { 2281 if (!wait) {
2282 /* 2282 /*
2283 * Not worth trying to allocate harder for 2283 * Not worth trying to allocate harder for
2284 * __GFP_NOMEMALLOC even if it can't schedule. 2284 * __GFP_NOMEMALLOC even if it can't schedule.
2285 */ 2285 */
2286 if (!(gfp_mask & __GFP_NOMEMALLOC)) 2286 if (!(gfp_mask & __GFP_NOMEMALLOC))
2287 alloc_flags |= ALLOC_HARDER; 2287 alloc_flags |= ALLOC_HARDER;
2288 /* 2288 /*
2289 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 2289 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
2290 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 2290 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
2291 */ 2291 */
2292 alloc_flags &= ~ALLOC_CPUSET; 2292 alloc_flags &= ~ALLOC_CPUSET;
2293 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2293 } else if (unlikely(rt_task(current)) && !in_interrupt())
2294 alloc_flags |= ALLOC_HARDER; 2294 alloc_flags |= ALLOC_HARDER;
2295 2295
2296 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2296 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2297 if (gfp_mask & __GFP_MEMALLOC) 2297 if (gfp_mask & __GFP_MEMALLOC)
2298 alloc_flags |= ALLOC_NO_WATERMARKS; 2298 alloc_flags |= ALLOC_NO_WATERMARKS;
2299 else if (likely(!(gfp_mask & __GFP_NOMEMALLOC)) && !in_interrupt()) 2299 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2300 alloc_flags |= ALLOC_NO_WATERMARKS;
2301 else if (!in_interrupt() &&
2302 ((current->flags & PF_MEMALLOC) ||
2303 unlikely(test_thread_flag(TIF_MEMDIE))))
2300 alloc_flags |= ALLOC_NO_WATERMARKS; 2304 alloc_flags |= ALLOC_NO_WATERMARKS;
2301 } 2305 }
2302 2306
2303 return alloc_flags; 2307 return alloc_flags;
2304 } 2308 }
2305 2309
2306 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) 2310 bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2307 { 2311 {
2308 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2312 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2309 } 2313 }
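The hunk above is the page-allocator side of this patch: a task that is servicing a softirq now qualifies for ALLOC_NO_WATERMARKS when the borrowed task flags carry PF_MEMALLOC, alongside the existing __GFP_MEMALLOC case and the !in_interrupt() PF_MEMALLOC/TIF_MEMDIE case, and gfp_pfmemalloc_allowed() just reports that bit. A hedged userspace model of only this decision is shown below, with the in_interrupt()/in_serving_softirq() tests and the task flags reduced to booleans in an invented context struct.

/*
 * Userspace model of the ALLOC_NO_WATERMARKS decision above.  Context
 * tests and task flags become booleans; this mirrors the logic of the
 * hunk for illustration only.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_ctx {
	bool gfp_nomemalloc;		/* __GFP_NOMEMALLOC */
	bool gfp_memalloc;		/* __GFP_MEMALLOC */
	bool in_interrupt;		/* hard or soft irq context */
	bool in_serving_softirq;
	bool pf_memalloc;		/* (possibly borrowed) PF_MEMALLOC */
	bool tif_memdie;		/* task selected by the OOM killer */
};

static bool toy_no_watermarks(const struct toy_ctx *c)
{
	if (c->gfp_nomemalloc)
		return false;
	if (c->gfp_memalloc)
		return true;
	if (c->in_serving_softirq && c->pf_memalloc)
		return true;		/* new case: softirq with borrowed flag */
	if (!c->in_interrupt && (c->pf_memalloc || c->tif_memdie))
		return true;
	return false;
}

int main(void)
{
	struct toy_ctx softirq = { .in_interrupt = true,
				   .in_serving_softirq = true,
				   .pf_memalloc = true };
	struct toy_ctx hardirq = { .in_interrupt = true,
				   .pf_memalloc = true };

	printf("softirq + PF_MEMALLOC: %d\n", toy_no_watermarks(&softirq));
	printf("hardirq + PF_MEMALLOC: %d\n", toy_no_watermarks(&hardirq));
	return 0;
}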
2310 2314
2311 static inline struct page * 2315 static inline struct page *
2312 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2316 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2313 struct zonelist *zonelist, enum zone_type high_zoneidx, 2317 struct zonelist *zonelist, enum zone_type high_zoneidx,
2314 nodemask_t *nodemask, struct zone *preferred_zone, 2318 nodemask_t *nodemask, struct zone *preferred_zone,
2315 int migratetype) 2319 int migratetype)
2316 { 2320 {
2317 const gfp_t wait = gfp_mask & __GFP_WAIT; 2321 const gfp_t wait = gfp_mask & __GFP_WAIT;
2318 struct page *page = NULL; 2322 struct page *page = NULL;
2319 int alloc_flags; 2323 int alloc_flags;
2320 unsigned long pages_reclaimed = 0; 2324 unsigned long pages_reclaimed = 0;
2321 unsigned long did_some_progress; 2325 unsigned long did_some_progress;
2322 bool sync_migration = false; 2326 bool sync_migration = false;
2323 bool deferred_compaction = false; 2327 bool deferred_compaction = false;
2324 2328
2325 /* 2329 /*
2326 * In the slowpath, we sanity check order to avoid ever trying to 2330 * In the slowpath, we sanity check order to avoid ever trying to
2327 * reclaim >= MAX_ORDER areas which will never succeed. Callers may 2331 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
2328 * be using allocators in order of preference for an area that is 2332 * be using allocators in order of preference for an area that is
2329 * too large. 2333 * too large.
2330 */ 2334 */
2331 if (order >= MAX_ORDER) { 2335 if (order >= MAX_ORDER) {
2332 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); 2336 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
2333 return NULL; 2337 return NULL;
2334 } 2338 }
2335 2339
2336 /* 2340 /*
2337 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2341 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
2338 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2342 * __GFP_NOWARN set) should not cause reclaim since the subsystem
2339 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2343 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
2340 * using a larger set of nodes after it has established that the 2344 * using a larger set of nodes after it has established that the
2341 * allowed per node queues are empty and that nodes are 2345 * allowed per node queues are empty and that nodes are
2342 * over allocated. 2346 * over allocated.
2343 */ 2347 */
2344 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2348 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2345 goto nopage; 2349 goto nopage;
2346 2350
2347 restart: 2351 restart:
2348 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2352 if (!(gfp_mask & __GFP_NO_KSWAPD))
2349 wake_all_kswapd(order, zonelist, high_zoneidx, 2353 wake_all_kswapd(order, zonelist, high_zoneidx,
2350 zone_idx(preferred_zone)); 2354 zone_idx(preferred_zone));
2351 2355
2352 /* 2356 /*
2353 * OK, we're below the kswapd watermark and have kicked background 2357 * OK, we're below the kswapd watermark and have kicked background
2354 * reclaim. Now things get more complex, so set up alloc_flags according 2358 * reclaim. Now things get more complex, so set up alloc_flags according
2355 * to how we want to proceed. 2359 * to how we want to proceed.
2356 */ 2360 */
2357 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2361 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2358 2362
2359 /* 2363 /*
2360 * Find the true preferred zone if the allocation is unconstrained by 2364 * Find the true preferred zone if the allocation is unconstrained by
2361 * cpusets. 2365 * cpusets.
2362 */ 2366 */
2363 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) 2367 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2364 first_zones_zonelist(zonelist, high_zoneidx, NULL, 2368 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2365 &preferred_zone); 2369 &preferred_zone);
2366 2370
2367 rebalance: 2371 rebalance:
2368 /* This is the last chance, in general, before the goto nopage. */ 2372 /* This is the last chance, in general, before the goto nopage. */
2369 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2373 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2370 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2374 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2371 preferred_zone, migratetype); 2375 preferred_zone, migratetype);
2372 if (page) 2376 if (page)
2373 goto got_pg; 2377 goto got_pg;
2374 2378
2375 /* Allocate without watermarks if the context allows */ 2379 /* Allocate without watermarks if the context allows */
2376 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2380 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2377 page = __alloc_pages_high_priority(gfp_mask, order, 2381 page = __alloc_pages_high_priority(gfp_mask, order,
2378 zonelist, high_zoneidx, nodemask, 2382 zonelist, high_zoneidx, nodemask,
2379 preferred_zone, migratetype); 2383 preferred_zone, migratetype);
2380 if (page) 2384 if (page)
2381 goto got_pg; 2385 goto got_pg;
2382 } 2386 }
2383 2387
2384 /* Atomic allocations - we can't balance anything */ 2388 /* Atomic allocations - we can't balance anything */
2385 if (!wait) 2389 if (!wait)
2386 goto nopage; 2390 goto nopage;
2387 2391
2388 /* Avoid recursion of direct reclaim */ 2392 /* Avoid recursion of direct reclaim */
2389 if (current->flags & PF_MEMALLOC) 2393 if (current->flags & PF_MEMALLOC)
2390 goto nopage; 2394 goto nopage;
2391 2395
2392 /* Avoid allocations with no watermarks from looping endlessly */ 2396 /* Avoid allocations with no watermarks from looping endlessly */
2393 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2397 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2394 goto nopage; 2398 goto nopage;
2395 2399
2396 /* 2400 /*
2397 * Try direct compaction. The first pass is asynchronous. Subsequent 2401 * Try direct compaction. The first pass is asynchronous. Subsequent
2398 * attempts after direct reclaim are synchronous 2402 * attempts after direct reclaim are synchronous
2399 */ 2403 */
2400 page = __alloc_pages_direct_compact(gfp_mask, order, 2404 page = __alloc_pages_direct_compact(gfp_mask, order,
2401 zonelist, high_zoneidx, 2405 zonelist, high_zoneidx,
2402 nodemask, 2406 nodemask,
2403 alloc_flags, preferred_zone, 2407 alloc_flags, preferred_zone,
2404 migratetype, sync_migration, 2408 migratetype, sync_migration,
2405 &deferred_compaction, 2409 &deferred_compaction,
2406 &did_some_progress); 2410 &did_some_progress);
2407 if (page) 2411 if (page)
2408 goto got_pg; 2412 goto got_pg;
2409 sync_migration = true; 2413 sync_migration = true;
2410 2414
2411 /* 2415 /*
2412 * If compaction is deferred for high-order allocations, it is because 2416 * If compaction is deferred for high-order allocations, it is because
2413 * sync compaction recently failed. If this is the case and the caller 2417 * sync compaction recently failed. If this is the case and the caller
2414 * has requested the system not be heavily disrupted, fail the 2418 * has requested the system not be heavily disrupted, fail the
2415 * allocation now instead of entering direct reclaim 2419 * allocation now instead of entering direct reclaim
2416 */ 2420 */
2417 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) 2421 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
2418 goto nopage; 2422 goto nopage;
2419 2423
2420 /* Try direct reclaim and then allocating */ 2424 /* Try direct reclaim and then allocating */
2421 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2425 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2422 zonelist, high_zoneidx, 2426 zonelist, high_zoneidx,
2423 nodemask, 2427 nodemask,
2424 alloc_flags, preferred_zone, 2428 alloc_flags, preferred_zone,
2425 migratetype, &did_some_progress); 2429 migratetype, &did_some_progress);
2426 if (page) 2430 if (page)
2427 goto got_pg; 2431 goto got_pg;
2428 2432
2429 /* 2433 /*
2430 * If we failed to make any progress reclaiming, then we are 2434 * If we failed to make any progress reclaiming, then we are
2431 * running out of options and have to consider going OOM 2435 * running out of options and have to consider going OOM
2432 */ 2436 */
2433 if (!did_some_progress) { 2437 if (!did_some_progress) {
2434 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2438 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
2435 if (oom_killer_disabled) 2439 if (oom_killer_disabled)
2436 goto nopage; 2440 goto nopage;
2437 /* Coredumps can quickly deplete all memory reserves */ 2441 /* Coredumps can quickly deplete all memory reserves */
2438 if ((current->flags & PF_DUMPCORE) && 2442 if ((current->flags & PF_DUMPCORE) &&
2439 !(gfp_mask & __GFP_NOFAIL)) 2443 !(gfp_mask & __GFP_NOFAIL))
2440 goto nopage; 2444 goto nopage;
2441 page = __alloc_pages_may_oom(gfp_mask, order, 2445 page = __alloc_pages_may_oom(gfp_mask, order,
2442 zonelist, high_zoneidx, 2446 zonelist, high_zoneidx,
2443 nodemask, preferred_zone, 2447 nodemask, preferred_zone,
2444 migratetype); 2448 migratetype);
2445 if (page) 2449 if (page)
2446 goto got_pg; 2450 goto got_pg;
2447 2451
2448 if (!(gfp_mask & __GFP_NOFAIL)) { 2452 if (!(gfp_mask & __GFP_NOFAIL)) {
2449 /* 2453 /*
2450 * The oom killer is not called for high-order 2454 * The oom killer is not called for high-order
2451 * allocations that may fail, so if no progress 2455 * allocations that may fail, so if no progress
2452 * is being made, there are no other options and 2456 * is being made, there are no other options and
2453 * retrying is unlikely to help. 2457 * retrying is unlikely to help.
2454 */ 2458 */
2455 if (order > PAGE_ALLOC_COSTLY_ORDER) 2459 if (order > PAGE_ALLOC_COSTLY_ORDER)
2456 goto nopage; 2460 goto nopage;
2457 /* 2461 /*
2458 * The oom killer is not called for lowmem 2462 * The oom killer is not called for lowmem
2459 * allocations to prevent needlessly killing 2463 * allocations to prevent needlessly killing
2460 * innocent tasks. 2464 * innocent tasks.
2461 */ 2465 */
2462 if (high_zoneidx < ZONE_NORMAL) 2466 if (high_zoneidx < ZONE_NORMAL)
2463 goto nopage; 2467 goto nopage;
2464 } 2468 }
2465 2469
2466 goto restart; 2470 goto restart;
2467 } 2471 }
2468 } 2472 }
2469 2473
2470 /* Check if we should retry the allocation */ 2474 /* Check if we should retry the allocation */
2471 pages_reclaimed += did_some_progress; 2475 pages_reclaimed += did_some_progress;
2472 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2476 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2473 pages_reclaimed)) { 2477 pages_reclaimed)) {
2474 /* Wait for some write requests to complete then retry */ 2478 /* Wait for some write requests to complete then retry */
2475 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2479 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2476 goto rebalance; 2480 goto rebalance;
2477 } else { 2481 } else {
2478 /* 2482 /*
2479 * High-order allocations do not necessarily loop after 2483 * High-order allocations do not necessarily loop after
2480 * direct reclaim and reclaim/compaction depends on compaction 2484 * direct reclaim and reclaim/compaction depends on compaction
2481 * being called after reclaim so call directly if necessary 2485 * being called after reclaim so call directly if necessary
2482 */ 2486 */
2483 page = __alloc_pages_direct_compact(gfp_mask, order, 2487 page = __alloc_pages_direct_compact(gfp_mask, order,
2484 zonelist, high_zoneidx, 2488 zonelist, high_zoneidx,
2485 nodemask, 2489 nodemask,
2486 alloc_flags, preferred_zone, 2490 alloc_flags, preferred_zone,
2487 migratetype, sync_migration, 2491 migratetype, sync_migration,
2488 &deferred_compaction, 2492 &deferred_compaction,
2489 &did_some_progress); 2493 &did_some_progress);
2490 if (page) 2494 if (page)
2491 goto got_pg; 2495 goto got_pg;
2492 } 2496 }
2493 2497
2494 nopage: 2498 nopage:
2495 warn_alloc_failed(gfp_mask, order, NULL); 2499 warn_alloc_failed(gfp_mask, order, NULL);
2496 return page; 2500 return page;
2497 got_pg: 2501 got_pg:
2498 /* 2502 /*
2499 * page->pfmemalloc is set when the caller had PFMEMALLOC set, has 2503 * page->pfmemalloc is set when the caller had PFMEMALLOC set, has
2500 * been OOM killed or specified __GFP_MEMALLOC. The expectation is 2504 * been OOM killed or specified __GFP_MEMALLOC. The expectation is
2501 * that the caller is taking steps that will free more memory. The 2505 * that the caller is taking steps that will free more memory. The
2502 * caller should avoid the page being used for !PFMEMALLOC purposes. 2506 * caller should avoid the page being used for !PFMEMALLOC purposes.
2503 */ 2507 */
2504 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 2508 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2505 2509
2506 if (kmemcheck_enabled) 2510 if (kmemcheck_enabled)
2507 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2511 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2508 2512
2509 return page; 2513 return page;
2510 } 2514 }
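A hypothetical consumer-side sketch (not part of this patch) of the page->pfmemalloc contract described in the comment above. Both helper names are invented placeholders, and the snippet assumes struct page from the same kernel tree.

/* Hypothetical consumer of page->pfmemalloc; helpers are placeholders */
static void example_consume_page(struct page *page)
{
	if (page->pfmemalloc) {
		/*
		 * The page was allocated from the emergency reserves; use it
		 * only for work that helps free memory, never for ordinary
		 * caching or speculative allocations.
		 */
		treat_as_emergency_only(page);	/* placeholder */
		return;
	}
	use_page_normally(page);		/* placeholder */
}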
2511 2515
2512 /* 2516 /*
2513 * This is the 'heart' of the zoned buddy allocator. 2517 * This is the 'heart' of the zoned buddy allocator.
2514 */ 2518 */
2515 struct page * 2519 struct page *
2516 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2520 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2517 struct zonelist *zonelist, nodemask_t *nodemask) 2521 struct zonelist *zonelist, nodemask_t *nodemask)
2518 { 2522 {
2519 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2523 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2520 struct zone *preferred_zone; 2524 struct zone *preferred_zone;
2521 struct page *page = NULL; 2525 struct page *page = NULL;
2522 int migratetype = allocflags_to_migratetype(gfp_mask); 2526 int migratetype = allocflags_to_migratetype(gfp_mask);
2523 unsigned int cpuset_mems_cookie; 2527 unsigned int cpuset_mems_cookie;
2524 2528
2525 gfp_mask &= gfp_allowed_mask; 2529 gfp_mask &= gfp_allowed_mask;
2526 2530
2527 lockdep_trace_alloc(gfp_mask); 2531 lockdep_trace_alloc(gfp_mask);
2528 2532
2529 might_sleep_if(gfp_mask & __GFP_WAIT); 2533 might_sleep_if(gfp_mask & __GFP_WAIT);
2530 2534
2531 if (should_fail_alloc_page(gfp_mask, order)) 2535 if (should_fail_alloc_page(gfp_mask, order))
2532 return NULL; 2536 return NULL;
2533 2537
2534 /* 2538 /*
2535 * Check the zones suitable for the gfp_mask contain at least one 2539 * Check the zones suitable for the gfp_mask contain at least one
2536 * valid zone. It's possible to have an empty zonelist as a result 2540 * valid zone. It's possible to have an empty zonelist as a result
2537 * of GFP_THISNODE and a memoryless node 2541 * of GFP_THISNODE and a memoryless node
2538 */ 2542 */
2539 if (unlikely(!zonelist->_zonerefs->zone)) 2543 if (unlikely(!zonelist->_zonerefs->zone))
2540 return NULL; 2544 return NULL;
2541 2545
2542 retry_cpuset: 2546 retry_cpuset:
2543 cpuset_mems_cookie = get_mems_allowed(); 2547 cpuset_mems_cookie = get_mems_allowed();
2544 2548
2545 /* The preferred zone is used for statistics later */ 2549 /* The preferred zone is used for statistics later */
2546 first_zones_zonelist(zonelist, high_zoneidx, 2550 first_zones_zonelist(zonelist, high_zoneidx,
2547 nodemask ? : &cpuset_current_mems_allowed, 2551 nodemask ? : &cpuset_current_mems_allowed,
2548 &preferred_zone); 2552 &preferred_zone);
2549 if (!preferred_zone) 2553 if (!preferred_zone)
2550 goto out; 2554 goto out;
2551 2555
2552 /* First allocation attempt */ 2556 /* First allocation attempt */
2553 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2557 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2554 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, 2558 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
2555 preferred_zone, migratetype); 2559 preferred_zone, migratetype);
2556 if (unlikely(!page)) 2560 if (unlikely(!page))
2557 page = __alloc_pages_slowpath(gfp_mask, order, 2561 page = __alloc_pages_slowpath(gfp_mask, order,
2558 zonelist, high_zoneidx, nodemask, 2562 zonelist, high_zoneidx, nodemask,
2559 preferred_zone, migratetype); 2563 preferred_zone, migratetype);
2560 else 2564 else
2561 page->pfmemalloc = false; 2565 page->pfmemalloc = false;
2562 2566
2563 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2567 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2564 2568
2565 out: 2569 out:
2566 /* 2570 /*
2567 * When updating a task's mems_allowed, it is possible to race with 2571 * When updating a task's mems_allowed, it is possible to race with
2568 * parallel threads in such a way that an allocation can fail while 2572 * parallel threads in such a way that an allocation can fail while
2569 * the mask is being updated. If a page allocation is about to fail, 2573 * the mask is being updated. If a page allocation is about to fail,
2570 * check if the cpuset changed during allocation and if so, retry. 2574 * check if the cpuset changed during allocation and if so, retry.
2571 */ 2575 */
2572 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2576 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2573 goto retry_cpuset; 2577 goto retry_cpuset;
2574 2578
2575 return page; 2579 return page;
2576 } 2580 }
2577 EXPORT_SYMBOL(__alloc_pages_nodemask); 2581 EXPORT_SYMBOL(__alloc_pages_nodemask);
2578 2582
2579 /* 2583 /*
2580 * Common helper functions. 2584 * Common helper functions.
2581 */ 2585 */
2582 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 2586 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
2583 { 2587 {
2584 struct page *page; 2588 struct page *page;
2585 2589
2586 /* 2590 /*
2587 * __get_free_pages() returns a 32-bit address, which cannot represent 2591 * __get_free_pages() returns a 32-bit address, which cannot represent
2588 * a highmem page 2592 * a highmem page
2589 */ 2593 */
2590 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 2594 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
2591 2595
2592 page = alloc_pages(gfp_mask, order); 2596 page = alloc_pages(gfp_mask, order);
2593 if (!page) 2597 if (!page)
2594 return 0; 2598 return 0;
2595 return (unsigned long) page_address(page); 2599 return (unsigned long) page_address(page);
2596 } 2600 }
2597 EXPORT_SYMBOL(__get_free_pages); 2601 EXPORT_SYMBOL(__get_free_pages);
2598 2602
2599 unsigned long get_zeroed_page(gfp_t gfp_mask) 2603 unsigned long get_zeroed_page(gfp_t gfp_mask)
2600 { 2604 {
2601 return __get_free_pages(gfp_mask | __GFP_ZERO, 0); 2605 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
2602 } 2606 }
2603 EXPORT_SYMBOL(get_zeroed_page); 2607 EXPORT_SYMBOL(get_zeroed_page);
2604 2608
2605 void __free_pages(struct page *page, unsigned int order) 2609 void __free_pages(struct page *page, unsigned int order)
2606 { 2610 {
2607 if (put_page_testzero(page)) { 2611 if (put_page_testzero(page)) {
2608 if (order == 0) 2612 if (order == 0)
2609 free_hot_cold_page(page, 0); 2613 free_hot_cold_page(page, 0);
2610 else 2614 else
2611 __free_pages_ok(page, order); 2615 __free_pages_ok(page, order);
2612 } 2616 }
2613 } 2617 }
2614 2618
2615 EXPORT_SYMBOL(__free_pages); 2619 EXPORT_SYMBOL(__free_pages);
2616 2620
2617 void free_pages(unsigned long addr, unsigned int order) 2621 void free_pages(unsigned long addr, unsigned int order)
2618 { 2622 {
2619 if (addr != 0) { 2623 if (addr != 0) {
2620 VM_BUG_ON(!virt_addr_valid((void *)addr)); 2624 VM_BUG_ON(!virt_addr_valid((void *)addr));
2621 __free_pages(virt_to_page((void *)addr), order); 2625 __free_pages(virt_to_page((void *)addr), order);
2622 } 2626 }
2623 } 2627 }
2624 2628
2625 EXPORT_SYMBOL(free_pages); 2629 EXPORT_SYMBOL(free_pages);
2626 2630
2627 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) 2631 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2628 { 2632 {
2629 if (addr) { 2633 if (addr) {
2630 unsigned long alloc_end = addr + (PAGE_SIZE << order); 2634 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2631 unsigned long used = addr + PAGE_ALIGN(size); 2635 unsigned long used = addr + PAGE_ALIGN(size);
2632 2636
2633 split_page(virt_to_page((void *)addr), order); 2637 split_page(virt_to_page((void *)addr), order);
2634 while (used < alloc_end) { 2638 while (used < alloc_end) {
2635 free_page(used); 2639 free_page(used);
2636 used += PAGE_SIZE; 2640 used += PAGE_SIZE;
2637 } 2641 }
2638 } 2642 }
2639 return (void *)addr; 2643 return (void *)addr;
2640 } 2644 }
2641 2645
2642 /** 2646 /**
2643 * alloc_pages_exact - allocate an exact number physically-contiguous pages. 2647 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
2644 * @size: the number of bytes to allocate 2648 * @size: the number of bytes to allocate
2645 * @gfp_mask: GFP flags for the allocation 2649 * @gfp_mask: GFP flags for the allocation
2646 * 2650 *
2647 * This function is similar to alloc_pages(), except that it allocates the 2651 * This function is similar to alloc_pages(), except that it allocates the
2648 * minimum number of pages to satisfy the request. alloc_pages() can only 2652 * minimum number of pages to satisfy the request. alloc_pages() can only
2649 * allocate memory in power-of-two pages. 2653 * allocate memory in power-of-two pages.
2650 * 2654 *
2651 * This function is also limited by MAX_ORDER. 2655 * This function is also limited by MAX_ORDER.
2652 * 2656 *
2653 * Memory allocated by this function must be released by free_pages_exact(). 2657 * Memory allocated by this function must be released by free_pages_exact().
2654 */ 2658 */
2655 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) 2659 void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2656 { 2660 {
2657 unsigned int order = get_order(size); 2661 unsigned int order = get_order(size);
2658 unsigned long addr; 2662 unsigned long addr;
2659 2663
2660 addr = __get_free_pages(gfp_mask, order); 2664 addr = __get_free_pages(gfp_mask, order);
2661 return make_alloc_exact(addr, order, size); 2665 return make_alloc_exact(addr, order, size);
2662 } 2666 }
2663 EXPORT_SYMBOL(alloc_pages_exact); 2667 EXPORT_SYMBOL(alloc_pages_exact);
2664 2668
2665 /** 2669 /**
2666 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous 2670 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2667 * pages on a node. 2671 * pages on a node.
2668 * @nid: the preferred node ID where memory should be allocated 2672 * @nid: the preferred node ID where memory should be allocated
2669 * @size: the number of bytes to allocate 2673 * @size: the number of bytes to allocate
2670 * @gfp_mask: GFP flags for the allocation 2674 * @gfp_mask: GFP flags for the allocation
2671 * 2675 *
2672 * Like alloc_pages_exact(), but try to allocate on node nid first before falling 2676 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2673 * back. 2677 * back.
2674 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2678 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2675 * but is not exact. 2679 * but is not exact.
2676 */ 2680 */
2677 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2681 void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2678 { 2682 {
2679 unsigned order = get_order(size); 2683 unsigned order = get_order(size);
2680 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2684 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2681 if (!p) 2685 if (!p)
2682 return NULL; 2686 return NULL;
2683 return make_alloc_exact((unsigned long)page_address(p), order, size); 2687 return make_alloc_exact((unsigned long)page_address(p), order, size);
2684 } 2688 }
2685 EXPORT_SYMBOL(alloc_pages_exact_nid); 2689 EXPORT_SYMBOL(alloc_pages_exact_nid);
2686 2690
2687 /** 2691 /**
2688 * free_pages_exact - release memory allocated via alloc_pages_exact() 2692 * free_pages_exact - release memory allocated via alloc_pages_exact()
2689 * @virt: the value returned by alloc_pages_exact. 2693 * @virt: the value returned by alloc_pages_exact.
2690 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2694 * @size: size of allocation, same value as passed to alloc_pages_exact().
2691 * 2695 *
2692 * Release the memory allocated by a previous call to alloc_pages_exact. 2696 * Release the memory allocated by a previous call to alloc_pages_exact.
2693 */ 2697 */
2694 void free_pages_exact(void *virt, size_t size) 2698 void free_pages_exact(void *virt, size_t size)
2695 { 2699 {
2696 unsigned long addr = (unsigned long)virt; 2700 unsigned long addr = (unsigned long)virt;
2697 unsigned long end = addr + PAGE_ALIGN(size); 2701 unsigned long end = addr + PAGE_ALIGN(size);
2698 2702
2699 while (addr < end) { 2703 while (addr < end) {
2700 free_page(addr); 2704 free_page(addr);
2701 addr += PAGE_SIZE; 2705 addr += PAGE_SIZE;
2702 } 2706 }
2703 } 2707 }
2704 EXPORT_SYMBOL(free_pages_exact); 2708 EXPORT_SYMBOL(free_pages_exact);
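A short usage sketch (not part of the diff) pairing alloc_pages_exact() with free_pages_exact(); the function name and size are made up.

#include <linux/gfp.h>		/* alloc_pages_exact(), free_pages_exact() */
#include <linux/errno.h>

/* Hypothetical caller: ~5.1 pages, which alloc_pages() would round to order-3 */
static int example_exact_alloc(void)
{
	size_t sz = 5 * PAGE_SIZE + 100;
	void *buf = alloc_pages_exact(sz, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;
	/* ... use sz bytes of physically contiguous, page-aligned memory ... */
	free_pages_exact(buf, sz);	/* must pass the same size back */
	return 0;
}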
2705 2709
2706 static unsigned int nr_free_zone_pages(int offset) 2710 static unsigned int nr_free_zone_pages(int offset)
2707 { 2711 {
2708 struct zoneref *z; 2712 struct zoneref *z;
2709 struct zone *zone; 2713 struct zone *zone;
2710 2714
2711 /* Just pick one node, since fallback list is circular */ 2715 /* Just pick one node, since fallback list is circular */
2712 unsigned int sum = 0; 2716 unsigned int sum = 0;
2713 2717
2714 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2718 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2715 2719
2716 for_each_zone_zonelist(zone, z, zonelist, offset) { 2720 for_each_zone_zonelist(zone, z, zonelist, offset) {
2717 unsigned long size = zone->present_pages; 2721 unsigned long size = zone->present_pages;
2718 unsigned long high = high_wmark_pages(zone); 2722 unsigned long high = high_wmark_pages(zone);
2719 if (size > high) 2723 if (size > high)
2720 sum += size - high; 2724 sum += size - high;
2721 } 2725 }
2722 2726
2723 return sum; 2727 return sum;
2724 } 2728 }
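A worked illustration (hypothetical numbers) of the per-zone sum computed above.

/*
 * Hypothetical numbers: a zone with 262144 present pages and a high
 * watermark of 1024 pages contributes 262144 - 1024 = 261120 pages to
 * the sum; a zone at or below its high watermark contributes nothing.
 */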
2725 2729
2726 /* 2730 /*
2727 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 2731 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
2728 */ 2732 */
2729 unsigned int nr_free_buffer_pages(void) 2733 unsigned int nr_free_buffer_pages(void)
2730 { 2734 {
2731 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2735 return nr_free_zone_pages(gfp_zone(GFP_USER));
2732 } 2736 }
2733 EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2737 EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2734 2738
2735 /* 2739 /*
2736 * Amount of free RAM allocatable within all zones 2740 * Amount of free RAM allocatable within all zones
2737 */ 2741 */
2738 unsigned int nr_free_pagecache_pages(void) 2742 unsigned int nr_free_pagecache_pages(void)
2739 { 2743 {
2740 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 2744 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2741 } 2745 }
2742 2746
2743 static inline void show_node(struct zone *zone) 2747 static inline void show_node(struct zone *zone)
2744 { 2748 {
2745 if (NUMA_BUILD) 2749 if (NUMA_BUILD)
2746 printk("Node %d ", zone_to_nid(zone)); 2750 printk("Node %d ", zone_to_nid(zone));
2747 } 2751 }
2748 2752
2749 void si_meminfo(struct sysinfo *val) 2753 void si_meminfo(struct sysinfo *val)
2750 { 2754 {
2751 val->totalram = totalram_pages; 2755 val->totalram = totalram_pages;
2752 val->sharedram = 0; 2756 val->sharedram = 0;
2753 val->freeram = global_page_state(NR_FREE_PAGES); 2757 val->freeram = global_page_state(NR_FREE_PAGES);
2754 val->bufferram = nr_blockdev_pages(); 2758 val->bufferram = nr_blockdev_pages();
2755 val->totalhigh = totalhigh_pages; 2759 val->totalhigh = totalhigh_pages;
2756 val->freehigh = nr_free_highpages(); 2760 val->freehigh = nr_free_highpages();
2757 val->mem_unit = PAGE_SIZE; 2761 val->mem_unit = PAGE_SIZE;
2758 } 2762 }
2759 2763
2760 EXPORT_SYMBOL(si_meminfo); 2764 EXPORT_SYMBOL(si_meminfo);
2761 2765
2762 #ifdef CONFIG_NUMA 2766 #ifdef CONFIG_NUMA
2763 void si_meminfo_node(struct sysinfo *val, int nid) 2767 void si_meminfo_node(struct sysinfo *val, int nid)
2764 { 2768 {
2765 pg_data_t *pgdat = NODE_DATA(nid); 2769 pg_data_t *pgdat = NODE_DATA(nid);
2766 2770
2767 val->totalram = pgdat->node_present_pages; 2771 val->totalram = pgdat->node_present_pages;
2768 val->freeram = node_page_state(nid, NR_FREE_PAGES); 2772 val->freeram = node_page_state(nid, NR_FREE_PAGES);
2769 #ifdef CONFIG_HIGHMEM 2773 #ifdef CONFIG_HIGHMEM
2770 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 2774 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
2771 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 2775 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
2772 NR_FREE_PAGES); 2776 NR_FREE_PAGES);
2773 #else 2777 #else
2774 val->totalhigh = 0; 2778 val->totalhigh = 0;
2775 val->freehigh = 0; 2779 val->freehigh = 0;
2776 #endif 2780 #endif
2777 val->mem_unit = PAGE_SIZE; 2781 val->mem_unit = PAGE_SIZE;
2778 } 2782 }
2779 #endif 2783 #endif
2780 2784
2781 /* 2785 /*
2782 * Determine whether the node should be displayed or not, depending on whether 2786 * Determine whether the node should be displayed or not, depending on whether
2783 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 2787 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
2784 */ 2788 */
2785 bool skip_free_areas_node(unsigned int flags, int nid) 2789 bool skip_free_areas_node(unsigned int flags, int nid)
2786 { 2790 {
2787 bool ret = false; 2791 bool ret = false;
2788 unsigned int cpuset_mems_cookie; 2792 unsigned int cpuset_mems_cookie;
2789 2793
2790 if (!(flags & SHOW_MEM_FILTER_NODES)) 2794 if (!(flags & SHOW_MEM_FILTER_NODES))
2791 goto out; 2795 goto out;
2792 2796
2793 do { 2797 do {
2794 cpuset_mems_cookie = get_mems_allowed(); 2798 cpuset_mems_cookie = get_mems_allowed();
2795 ret = !node_isset(nid, cpuset_current_mems_allowed); 2799 ret = !node_isset(nid, cpuset_current_mems_allowed);
2796 } while (!put_mems_allowed(cpuset_mems_cookie)); 2800 } while (!put_mems_allowed(cpuset_mems_cookie));
2797 out: 2801 out:
2798 return ret; 2802 return ret;
2799 } 2803 }
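A minimal sketch (not part of the diff) of the same get_mems_allowed()/put_mems_allowed() cookie pattern used above, for any reader of cpuset_current_mems_allowed; the helper name is hypothetical.

#include <linux/cpuset.h>	/* get_mems_allowed(), put_mems_allowed() */
#include <linux/nodemask.h>

/* Hypothetical helper using the same retry-on-cookie-change pattern */
static bool example_nid_allowed(int nid)
{
	unsigned int cookie;
	bool allowed;

	do {
		cookie = get_mems_allowed();
		allowed = node_isset(nid, cpuset_current_mems_allowed);
	} while (!put_mems_allowed(cookie));

	return allowed;
}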
2800 2804
2801 #define K(x) ((x) << (PAGE_SHIFT-10)) 2805 #define K(x) ((x) << (PAGE_SHIFT-10))
2802 2806
2803 /* 2807 /*
2804 * Show free area list (used inside shift_scroll-lock stuff) 2808 * Show free area list (used inside shift_scroll-lock stuff)
2805 * We also calculate the percentage fragmentation. We do this by counting the 2809 * We also calculate the percentage fragmentation. We do this by counting the
2806 * memory on each free list with the exception of the first item on the list. 2810 * memory on each free list with the exception of the first item on the list.
2807 * Suppresses nodes that are not allowed by current's cpuset if 2811 * Suppresses nodes that are not allowed by current's cpuset if
2808 * SHOW_MEM_FILTER_NODES is passed. 2812 * SHOW_MEM_FILTER_NODES is passed.
2809 */ 2813 */
2810 void show_free_areas(unsigned int filter) 2814 void show_free_areas(unsigned int filter)
2811 { 2815 {
2812 int cpu; 2816 int cpu;
2813 struct zone *zone; 2817 struct zone *zone;
2814 2818
2815 for_each_populated_zone(zone) { 2819 for_each_populated_zone(zone) {
2816 if (skip_free_areas_node(filter, zone_to_nid(zone))) 2820 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2817 continue; 2821 continue;
2818 show_node(zone); 2822 show_node(zone);
2819 printk("%s per-cpu:\n", zone->name); 2823 printk("%s per-cpu:\n", zone->name);
2820 2824
2821 for_each_online_cpu(cpu) { 2825 for_each_online_cpu(cpu) {
2822 struct per_cpu_pageset *pageset; 2826 struct per_cpu_pageset *pageset;
2823 2827
2824 pageset = per_cpu_ptr(zone->pageset, cpu); 2828 pageset = per_cpu_ptr(zone->pageset, cpu);
2825 2829
2826 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 2830 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
2827 cpu, pageset->pcp.high, 2831 cpu, pageset->pcp.high,
2828 pageset->pcp.batch, pageset->pcp.count); 2832 pageset->pcp.batch, pageset->pcp.count);
2829 } 2833 }
2830 } 2834 }
2831 2835
2832 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 2836 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
2833 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 2837 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
2834 " unevictable:%lu" 2838 " unevictable:%lu"
2835 " dirty:%lu writeback:%lu unstable:%lu\n" 2839 " dirty:%lu writeback:%lu unstable:%lu\n"
2836 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 2840 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2837 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", 2841 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
2838 global_page_state(NR_ACTIVE_ANON), 2842 global_page_state(NR_ACTIVE_ANON),
2839 global_page_state(NR_INACTIVE_ANON), 2843 global_page_state(NR_INACTIVE_ANON),
2840 global_page_state(NR_ISOLATED_ANON), 2844 global_page_state(NR_ISOLATED_ANON),
2841 global_page_state(NR_ACTIVE_FILE), 2845 global_page_state(NR_ACTIVE_FILE),
2842 global_page_state(NR_INACTIVE_FILE), 2846 global_page_state(NR_INACTIVE_FILE),
2843 global_page_state(NR_ISOLATED_FILE), 2847 global_page_state(NR_ISOLATED_FILE),
2844 global_page_state(NR_UNEVICTABLE), 2848 global_page_state(NR_UNEVICTABLE),
2845 global_page_state(NR_FILE_DIRTY), 2849 global_page_state(NR_FILE_DIRTY),
2846 global_page_state(NR_WRITEBACK), 2850 global_page_state(NR_WRITEBACK),
2847 global_page_state(NR_UNSTABLE_NFS), 2851 global_page_state(NR_UNSTABLE_NFS),
2848 global_page_state(NR_FREE_PAGES), 2852 global_page_state(NR_FREE_PAGES),
2849 global_page_state(NR_SLAB_RECLAIMABLE), 2853 global_page_state(NR_SLAB_RECLAIMABLE),
2850 global_page_state(NR_SLAB_UNRECLAIMABLE), 2854 global_page_state(NR_SLAB_UNRECLAIMABLE),
2851 global_page_state(NR_FILE_MAPPED), 2855 global_page_state(NR_FILE_MAPPED),
2852 global_page_state(NR_SHMEM), 2856 global_page_state(NR_SHMEM),
2853 global_page_state(NR_PAGETABLE), 2857 global_page_state(NR_PAGETABLE),
2854 global_page_state(NR_BOUNCE)); 2858 global_page_state(NR_BOUNCE));
2855 2859
2856 for_each_populated_zone(zone) { 2860 for_each_populated_zone(zone) {
2857 int i; 2861 int i;
2858 2862
2859 if (skip_free_areas_node(filter, zone_to_nid(zone))) 2863 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2860 continue; 2864 continue;
2861 show_node(zone); 2865 show_node(zone);
2862 printk("%s" 2866 printk("%s"
2863 " free:%lukB" 2867 " free:%lukB"
2864 " min:%lukB" 2868 " min:%lukB"
2865 " low:%lukB" 2869 " low:%lukB"
2866 " high:%lukB" 2870 " high:%lukB"
2867 " active_anon:%lukB" 2871 " active_anon:%lukB"
2868 " inactive_anon:%lukB" 2872 " inactive_anon:%lukB"
2869 " active_file:%lukB" 2873 " active_file:%lukB"
2870 " inactive_file:%lukB" 2874 " inactive_file:%lukB"
2871 " unevictable:%lukB" 2875 " unevictable:%lukB"
2872 " isolated(anon):%lukB" 2876 " isolated(anon):%lukB"
2873 " isolated(file):%lukB" 2877 " isolated(file):%lukB"
2874 " present:%lukB" 2878 " present:%lukB"
2875 " mlocked:%lukB" 2879 " mlocked:%lukB"
2876 " dirty:%lukB" 2880 " dirty:%lukB"
2877 " writeback:%lukB" 2881 " writeback:%lukB"
2878 " mapped:%lukB" 2882 " mapped:%lukB"
2879 " shmem:%lukB" 2883 " shmem:%lukB"
2880 " slab_reclaimable:%lukB" 2884 " slab_reclaimable:%lukB"
2881 " slab_unreclaimable:%lukB" 2885 " slab_unreclaimable:%lukB"
2882 " kernel_stack:%lukB" 2886 " kernel_stack:%lukB"
2883 " pagetables:%lukB" 2887 " pagetables:%lukB"
2884 " unstable:%lukB" 2888 " unstable:%lukB"
2885 " bounce:%lukB" 2889 " bounce:%lukB"
2886 " writeback_tmp:%lukB" 2890 " writeback_tmp:%lukB"
2887 " pages_scanned:%lu" 2891 " pages_scanned:%lu"
2888 " all_unreclaimable? %s" 2892 " all_unreclaimable? %s"
2889 "\n", 2893 "\n",
2890 zone->name, 2894 zone->name,
2891 K(zone_page_state(zone, NR_FREE_PAGES)), 2895 K(zone_page_state(zone, NR_FREE_PAGES)),
2892 K(min_wmark_pages(zone)), 2896 K(min_wmark_pages(zone)),
2893 K(low_wmark_pages(zone)), 2897 K(low_wmark_pages(zone)),
2894 K(high_wmark_pages(zone)), 2898 K(high_wmark_pages(zone)),
2895 K(zone_page_state(zone, NR_ACTIVE_ANON)), 2899 K(zone_page_state(zone, NR_ACTIVE_ANON)),
2896 K(zone_page_state(zone, NR_INACTIVE_ANON)), 2900 K(zone_page_state(zone, NR_INACTIVE_ANON)),
2897 K(zone_page_state(zone, NR_ACTIVE_FILE)), 2901 K(zone_page_state(zone, NR_ACTIVE_FILE)),
2898 K(zone_page_state(zone, NR_INACTIVE_FILE)), 2902 K(zone_page_state(zone, NR_INACTIVE_FILE)),
2899 K(zone_page_state(zone, NR_UNEVICTABLE)), 2903 K(zone_page_state(zone, NR_UNEVICTABLE)),
2900 K(zone_page_state(zone, NR_ISOLATED_ANON)), 2904 K(zone_page_state(zone, NR_ISOLATED_ANON)),
2901 K(zone_page_state(zone, NR_ISOLATED_FILE)), 2905 K(zone_page_state(zone, NR_ISOLATED_FILE)),
2902 K(zone->present_pages), 2906 K(zone->present_pages),
2903 K(zone_page_state(zone, NR_MLOCK)), 2907 K(zone_page_state(zone, NR_MLOCK)),
2904 K(zone_page_state(zone, NR_FILE_DIRTY)), 2908 K(zone_page_state(zone, NR_FILE_DIRTY)),
2905 K(zone_page_state(zone, NR_WRITEBACK)), 2909 K(zone_page_state(zone, NR_WRITEBACK)),
2906 K(zone_page_state(zone, NR_FILE_MAPPED)), 2910 K(zone_page_state(zone, NR_FILE_MAPPED)),
2907 K(zone_page_state(zone, NR_SHMEM)), 2911 K(zone_page_state(zone, NR_SHMEM)),
2908 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 2912 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
2909 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 2913 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
2910 zone_page_state(zone, NR_KERNEL_STACK) * 2914 zone_page_state(zone, NR_KERNEL_STACK) *
2911 THREAD_SIZE / 1024, 2915 THREAD_SIZE / 1024,
2912 K(zone_page_state(zone, NR_PAGETABLE)), 2916 K(zone_page_state(zone, NR_PAGETABLE)),
2913 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 2917 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
2914 K(zone_page_state(zone, NR_BOUNCE)), 2918 K(zone_page_state(zone, NR_BOUNCE)),
2915 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 2919 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2916 zone->pages_scanned, 2920 zone->pages_scanned,
2917 (zone->all_unreclaimable ? "yes" : "no") 2921 (zone->all_unreclaimable ? "yes" : "no")
2918 ); 2922 );
2919 printk("lowmem_reserve[]:"); 2923 printk("lowmem_reserve[]:");
2920 for (i = 0; i < MAX_NR_ZONES; i++) 2924 for (i = 0; i < MAX_NR_ZONES; i++)
2921 printk(" %lu", zone->lowmem_reserve[i]); 2925 printk(" %lu", zone->lowmem_reserve[i]);
2922 printk("\n"); 2926 printk("\n");
2923 } 2927 }
2924 2928
2925 for_each_populated_zone(zone) { 2929 for_each_populated_zone(zone) {
2926 unsigned long nr[MAX_ORDER], flags, order, total = 0; 2930 unsigned long nr[MAX_ORDER], flags, order, total = 0;
2927 2931
2928 if (skip_free_areas_node(filter, zone_to_nid(zone))) 2932 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2929 continue; 2933 continue;
2930 show_node(zone); 2934 show_node(zone);
2931 printk("%s: ", zone->name); 2935 printk("%s: ", zone->name);
2932 2936
2933 spin_lock_irqsave(&zone->lock, flags); 2937 spin_lock_irqsave(&zone->lock, flags);
2934 for (order = 0; order < MAX_ORDER; order++) { 2938 for (order = 0; order < MAX_ORDER; order++) {
2935 nr[order] = zone->free_area[order].nr_free; 2939 nr[order] = zone->free_area[order].nr_free;
2936 total += nr[order] << order; 2940 total += nr[order] << order;
2937 } 2941 }
2938 spin_unlock_irqrestore(&zone->lock, flags); 2942 spin_unlock_irqrestore(&zone->lock, flags);
2939 for (order = 0; order < MAX_ORDER; order++) 2943 for (order = 0; order < MAX_ORDER; order++)
2940 printk("%lu*%lukB ", nr[order], K(1UL) << order); 2944 printk("%lu*%lukB ", nr[order], K(1UL) << order);
2941 printk("= %lukB\n", K(total)); 2945 printk("= %lukB\n", K(total));
2942 } 2946 }
2943 2947
2944 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); 2948 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
2945 2949
2946 show_swap_cache_info(); 2950 show_swap_cache_info();
2947 } 2951 }
2948 2952
2949 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 2953 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
2950 { 2954 {
2951 zoneref->zone = zone; 2955 zoneref->zone = zone;
2952 zoneref->zone_idx = zone_idx(zone); 2956 zoneref->zone_idx = zone_idx(zone);
2953 } 2957 }
2954 2958
2955 /* 2959 /*
2956 * Builds allocation fallback zone lists. 2960 * Builds allocation fallback zone lists.
2957 * 2961 *
2958 * Add all populated zones of a node to the zonelist. 2962 * Add all populated zones of a node to the zonelist.
2959 */ 2963 */
2960 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 2964 static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
2961 int nr_zones, enum zone_type zone_type) 2965 int nr_zones, enum zone_type zone_type)
2962 { 2966 {
2963 struct zone *zone; 2967 struct zone *zone;
2964 2968
2965 BUG_ON(zone_type >= MAX_NR_ZONES); 2969 BUG_ON(zone_type >= MAX_NR_ZONES);
2966 zone_type++; 2970 zone_type++;
2967 2971
2968 do { 2972 do {
2969 zone_type--; 2973 zone_type--;
2970 zone = pgdat->node_zones + zone_type; 2974 zone = pgdat->node_zones + zone_type;
2971 if (populated_zone(zone)) { 2975 if (populated_zone(zone)) {
2972 zoneref_set_zone(zone, 2976 zoneref_set_zone(zone,
2973 &zonelist->_zonerefs[nr_zones++]); 2977 &zonelist->_zonerefs[nr_zones++]);
2974 check_highest_zone(zone_type); 2978 check_highest_zone(zone_type);
2975 } 2979 }
2976 2980
2977 } while (zone_type); 2981 } while (zone_type);
2978 return nr_zones; 2982 return nr_zones;
2979 } 2983 }
2980 2984
2981 2985
2982 /* 2986 /*
2983 * zonelist_order: 2987 * zonelist_order:
2984 * 0 = automatic detection of better ordering. 2988 * 0 = automatic detection of better ordering.
2985 * 1 = order by ([node] distance, -zonetype) 2989 * 1 = order by ([node] distance, -zonetype)
2986 * 2 = order by (-zonetype, [node] distance) 2990 * 2 = order by (-zonetype, [node] distance)
2987 * 2991 *
2988 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create 2992 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
2989 * the same zonelist. So only NUMA can configure this param. 2993 * the same zonelist. So only NUMA can configure this param.
2990 */ 2994 */
2991 #define ZONELIST_ORDER_DEFAULT 0 2995 #define ZONELIST_ORDER_DEFAULT 0
2992 #define ZONELIST_ORDER_NODE 1 2996 #define ZONELIST_ORDER_NODE 1
2993 #define ZONELIST_ORDER_ZONE 2 2997 #define ZONELIST_ORDER_ZONE 2
2994 2998
2995 /* zonelist order in the kernel. 2999 /* zonelist order in the kernel.
2996 * set_zonelist_order() will set this to NODE or ZONE. 3000 * set_zonelist_order() will set this to NODE or ZONE.
2997 */ 3001 */
2998 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; 3002 static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
2999 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; 3003 static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
3000 3004
3001 3005
3002 #ifdef CONFIG_NUMA 3006 #ifdef CONFIG_NUMA
3003 /* The value user specified ....changed by config */ 3007 /* The value user specified ....changed by config */
3004 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3008 static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3005 /* string for sysctl */ 3009 /* string for sysctl */
3006 #define NUMA_ZONELIST_ORDER_LEN 16 3010 #define NUMA_ZONELIST_ORDER_LEN 16
3007 char numa_zonelist_order[16] = "default"; 3011 char numa_zonelist_order[16] = "default";
3008 3012
3009 /* 3013 /*
3010 * interface for configuring zonelist ordering. 3014 * interface for configuring zonelist ordering.
3011 * command line option "numa_zonelist_order" 3015 * command line option "numa_zonelist_order"
3012 * = "[dD]efault - default, automatic configuration. 3016 * = "[dD]efault - default, automatic configuration.
3013 * = "[nN]ode - order by node locality, then by zone within node 3017 * = "[nN]ode - order by node locality, then by zone within node
3014 * = "[zZ]one - order by zone, then by locality within zone 3018 * = "[zZ]one - order by zone, then by locality within zone
3015 */ 3019 */
3016 3020
3017 static int __parse_numa_zonelist_order(char *s) 3021 static int __parse_numa_zonelist_order(char *s)
3018 { 3022 {
3019 if (*s == 'd' || *s == 'D') { 3023 if (*s == 'd' || *s == 'D') {
3020 user_zonelist_order = ZONELIST_ORDER_DEFAULT; 3024 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
3021 } else if (*s == 'n' || *s == 'N') { 3025 } else if (*s == 'n' || *s == 'N') {
3022 user_zonelist_order = ZONELIST_ORDER_NODE; 3026 user_zonelist_order = ZONELIST_ORDER_NODE;
3023 } else if (*s == 'z' || *s == 'Z') { 3027 } else if (*s == 'z' || *s == 'Z') {
3024 user_zonelist_order = ZONELIST_ORDER_ZONE; 3028 user_zonelist_order = ZONELIST_ORDER_ZONE;
3025 } else { 3029 } else {
3026 printk(KERN_WARNING 3030 printk(KERN_WARNING
3027 "Ignoring invalid numa_zonelist_order value: " 3031 "Ignoring invalid numa_zonelist_order value: "
3028 "%s\n", s); 3032 "%s\n", s);
3029 return -EINVAL; 3033 return -EINVAL;
3030 } 3034 }
3031 return 0; 3035 return 0;
3032 } 3036 }
3033 3037
3034 static __init int setup_numa_zonelist_order(char *s) 3038 static __init int setup_numa_zonelist_order(char *s)
3035 { 3039 {
3036 int ret; 3040 int ret;
3037 3041
3038 if (!s) 3042 if (!s)
3039 return 0; 3043 return 0;
3040 3044
3041 ret = __parse_numa_zonelist_order(s); 3045 ret = __parse_numa_zonelist_order(s);
3042 if (ret == 0) 3046 if (ret == 0)
3043 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); 3047 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
3044 3048
3045 return ret; 3049 return ret;
3046 } 3050 }
3047 early_param("numa_zonelist_order", setup_numa_zonelist_order); 3051 early_param("numa_zonelist_order", setup_numa_zonelist_order);
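Assumed usage of the option documented above, following standard kernel conventions; it is not shown in this diff.

/*
 * Assumed usage (standard kernel conventions):
 *   numa_zonelist_order=zone                        on the kernel command line
 *   echo zone > /proc/sys/vm/numa_zonelist_order    at runtime, handled by the
 *                                                   sysctl handler below
 */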
3048 3052
3049 /* 3053 /*
3050 * sysctl handler for numa_zonelist_order 3054 * sysctl handler for numa_zonelist_order
3051 */ 3055 */
3052 int numa_zonelist_order_handler(ctl_table *table, int write, 3056 int numa_zonelist_order_handler(ctl_table *table, int write,
3053 void __user *buffer, size_t *length, 3057 void __user *buffer, size_t *length,
3054 loff_t *ppos) 3058 loff_t *ppos)
3055 { 3059 {
3056 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 3060 char saved_string[NUMA_ZONELIST_ORDER_LEN];
3057 int ret; 3061 int ret;
3058 static DEFINE_MUTEX(zl_order_mutex); 3062 static DEFINE_MUTEX(zl_order_mutex);
3059 3063
3060 mutex_lock(&zl_order_mutex); 3064 mutex_lock(&zl_order_mutex);
3061 if (write) 3065 if (write)
3062 strcpy(saved_string, (char*)table->data); 3066 strcpy(saved_string, (char*)table->data);
3063 ret = proc_dostring(table, write, buffer, length, ppos); 3067 ret = proc_dostring(table, write, buffer, length, ppos);
3064 if (ret) 3068 if (ret)
3065 goto out; 3069 goto out;
3066 if (write) { 3070 if (write) {
3067 int oldval = user_zonelist_order; 3071 int oldval = user_zonelist_order;
3068 if (__parse_numa_zonelist_order((char*)table->data)) { 3072 if (__parse_numa_zonelist_order((char*)table->data)) {
3069 /* 3073 /*
3070 * bogus value. restore saved string 3074 * bogus value. restore saved string
3071 */ 3075 */
3072 strncpy((char*)table->data, saved_string, 3076 strncpy((char*)table->data, saved_string,
3073 NUMA_ZONELIST_ORDER_LEN); 3077 NUMA_ZONELIST_ORDER_LEN);
3074 user_zonelist_order = oldval; 3078 user_zonelist_order = oldval;
3075 } else if (oldval != user_zonelist_order) { 3079 } else if (oldval != user_zonelist_order) {
3076 mutex_lock(&zonelists_mutex); 3080 mutex_lock(&zonelists_mutex);
3077 build_all_zonelists(NULL, NULL); 3081 build_all_zonelists(NULL, NULL);
3078 mutex_unlock(&zonelists_mutex); 3082 mutex_unlock(&zonelists_mutex);
3079 } 3083 }
3080 } 3084 }
3081 out: 3085 out:
3082 mutex_unlock(&zl_order_mutex); 3086 mutex_unlock(&zl_order_mutex);
3083 return ret; 3087 return ret;
3084 } 3088 }
3085 3089
3086 3090
3087 #define MAX_NODE_LOAD (nr_online_nodes) 3091 #define MAX_NODE_LOAD (nr_online_nodes)
3088 static int node_load[MAX_NUMNODES]; 3092 static int node_load[MAX_NUMNODES];
3089 3093
3090 /** 3094 /**
3091 * find_next_best_node - find the next node that should appear in a given node's fallback list 3095 * find_next_best_node - find the next node that should appear in a given node's fallback list
3092 * @node: node whose fallback list we're appending 3096 * @node: node whose fallback list we're appending
3093 * @used_node_mask: nodemask_t of already used nodes 3097 * @used_node_mask: nodemask_t of already used nodes
3094 * 3098 *
3095 * We use a number of factors to determine which is the next node that should 3099 * We use a number of factors to determine which is the next node that should
3096 * appear on a given node's fallback list. The node should not have appeared 3100 * appear on a given node's fallback list. The node should not have appeared
3097 * already in @node's fallback list, and it should be the next closest node 3101 * already in @node's fallback list, and it should be the next closest node
3098 * according to the distance array (which contains arbitrary distance values 3102 * according to the distance array (which contains arbitrary distance values
3099 * from each node to each node in the system), and should also prefer nodes 3103 * from each node to each node in the system), and should also prefer nodes
3100 * with no CPUs, since presumably they'll have very little allocation pressure 3104 * with no CPUs, since presumably they'll have very little allocation pressure
3101 * on them otherwise. 3105 * on them otherwise.
3102 * It returns -1 if no node is found. 3106 * It returns -1 if no node is found.
3103 */ 3107 */
3104 static int find_next_best_node(int node, nodemask_t *used_node_mask) 3108 static int find_next_best_node(int node, nodemask_t *used_node_mask)
3105 { 3109 {
3106 int n, val; 3110 int n, val;
3107 int min_val = INT_MAX; 3111 int min_val = INT_MAX;
3108 int best_node = -1; 3112 int best_node = -1;
3109 const struct cpumask *tmp = cpumask_of_node(0); 3113 const struct cpumask *tmp = cpumask_of_node(0);
3110 3114
3111 /* Use the local node if we haven't already */ 3115 /* Use the local node if we haven't already */
3112 if (!node_isset(node, *used_node_mask)) { 3116 if (!node_isset(node, *used_node_mask)) {
3113 node_set(node, *used_node_mask); 3117 node_set(node, *used_node_mask);
3114 return node; 3118 return node;
3115 } 3119 }
3116 3120
3117 for_each_node_state(n, N_HIGH_MEMORY) { 3121 for_each_node_state(n, N_HIGH_MEMORY) {
3118 3122
3119 /* Don't want a node to appear more than once */ 3123 /* Don't want a node to appear more than once */
3120 if (node_isset(n, *used_node_mask)) 3124 if (node_isset(n, *used_node_mask))
3121 continue; 3125 continue;
3122 3126
3123 /* Use the distance array to find the distance */ 3127 /* Use the distance array to find the distance */
3124 val = node_distance(node, n); 3128 val = node_distance(node, n);
3125 3129
3126 /* Penalize nodes under us ("prefer the next node") */ 3130 /* Penalize nodes under us ("prefer the next node") */
3127 val += (n < node); 3131 val += (n < node);
3128 3132
3129 /* Give preference to headless and unused nodes */ 3133 /* Give preference to headless and unused nodes */
3130 tmp = cpumask_of_node(n); 3134 tmp = cpumask_of_node(n);
3131 if (!cpumask_empty(tmp)) 3135 if (!cpumask_empty(tmp))
3132 val += PENALTY_FOR_NODE_WITH_CPUS; 3136 val += PENALTY_FOR_NODE_WITH_CPUS;
3133 3137
3134 /* Slight preference for less loaded node */ 3138 /* Slight preference for less loaded node */
3135 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 3139 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
3136 val += node_load[n]; 3140 val += node_load[n];
3137 3141
3138 if (val < min_val) { 3142 if (val < min_val) {
3139 min_val = val; 3143 min_val = val;
3140 best_node = n; 3144 best_node = n;
3141 } 3145 }
3142 } 3146 }
3143 3147
3144 if (best_node >= 0) 3148 if (best_node >= 0)
3145 node_set(best_node, *used_node_mask); 3149 node_set(best_node, *used_node_mask);
3146 3150
3147 return best_node; 3151 return best_node;
3148 } 3152 }
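A worked illustration (hypothetical values, not from the diff) of how the score above is composed for one candidate node n.

/*
 * Hypothetical values: node_distance(node, n) = 20, n > node (no
 * "prefer the next node" penalty), n has CPUs, node_load[n] = 3:
 *
 *   val  = 20                            distance
 *   val += 0                             n >= node
 *   val += PENALTY_FOR_NODE_WITH_CPUS    candidate has CPUs
 *   val *= MAX_NODE_LOAD * MAX_NUMNODES
 *   val += 3                             node_load[n]
 *
 * The candidate with the smallest val becomes the next fallback node.
 */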
3149 3153
3150 3154
3151 /* 3155 /*
3152 * Build zonelists ordered by node and zones within node. 3156 * Build zonelists ordered by node and zones within node.
3153 * This results in maximum locality--normal zone overflows into local 3157 * This results in maximum locality--normal zone overflows into local
3154 * DMA zone, if any--but risks exhausting DMA zone. 3158 * DMA zone, if any--but risks exhausting DMA zone.
3155 */ 3159 */
3156 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 3160 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3157 { 3161 {
3158 int j; 3162 int j;
3159 struct zonelist *zonelist; 3163 struct zonelist *zonelist;
3160 3164
3161 zonelist = &pgdat->node_zonelists[0]; 3165 zonelist = &pgdat->node_zonelists[0];
3162 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3166 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3163 ; 3167 ;
3164 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3168 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3165 MAX_NR_ZONES - 1); 3169 MAX_NR_ZONES - 1);
3166 zonelist->_zonerefs[j].zone = NULL; 3170 zonelist->_zonerefs[j].zone = NULL;
3167 zonelist->_zonerefs[j].zone_idx = 0; 3171 zonelist->_zonerefs[j].zone_idx = 0;
3168 } 3172 }
3169 3173
3170 /* 3174 /*
3171 * Build gfp_thisnode zonelists 3175 * Build gfp_thisnode zonelists
3172 */ 3176 */
3173 static void build_thisnode_zonelists(pg_data_t *pgdat) 3177 static void build_thisnode_zonelists(pg_data_t *pgdat)
3174 { 3178 {
3175 int j; 3179 int j;
3176 struct zonelist *zonelist; 3180 struct zonelist *zonelist;
3177 3181
3178 zonelist = &pgdat->node_zonelists[1]; 3182 zonelist = &pgdat->node_zonelists[1];
3179 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3183 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
3180 zonelist->_zonerefs[j].zone = NULL; 3184 zonelist->_zonerefs[j].zone = NULL;
3181 zonelist->_zonerefs[j].zone_idx = 0; 3185 zonelist->_zonerefs[j].zone_idx = 0;
3182 } 3186 }
3183 3187
3184 /* 3188 /*
3185 * Build zonelists ordered by zone and nodes within zones. 3189 * Build zonelists ordered by zone and nodes within zones.
3186 * This results in conserving DMA zone[s] until all Normal memory is 3190 * This results in conserving DMA zone[s] until all Normal memory is
3187 * exhausted, but results in overflowing to remote node while memory 3191 * exhausted, but results in overflowing to remote node while memory
3188 * may still exist in local DMA zone. 3192 * may still exist in local DMA zone.
3189 */ 3193 */
3190 static int node_order[MAX_NUMNODES]; 3194 static int node_order[MAX_NUMNODES];
3191 3195
3192 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 3196 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3193 { 3197 {
3194 int pos, j, node; 3198 int pos, j, node;
3195 int zone_type; /* needs to be signed */ 3199 int zone_type; /* needs to be signed */
3196 struct zone *z; 3200 struct zone *z;
3197 struct zonelist *zonelist; 3201 struct zonelist *zonelist;
3198 3202
3199 zonelist = &pgdat->node_zonelists[0]; 3203 zonelist = &pgdat->node_zonelists[0];
3200 pos = 0; 3204 pos = 0;
3201 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { 3205 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
3202 for (j = 0; j < nr_nodes; j++) { 3206 for (j = 0; j < nr_nodes; j++) {
3203 node = node_order[j]; 3207 node = node_order[j];
3204 z = &NODE_DATA(node)->node_zones[zone_type]; 3208 z = &NODE_DATA(node)->node_zones[zone_type];
3205 if (populated_zone(z)) { 3209 if (populated_zone(z)) {
3206 zoneref_set_zone(z, 3210 zoneref_set_zone(z,
3207 &zonelist->_zonerefs[pos++]); 3211 &zonelist->_zonerefs[pos++]);
3208 check_highest_zone(zone_type); 3212 check_highest_zone(zone_type);
3209 } 3213 }
3210 } 3214 }
3211 } 3215 }
3212 zonelist->_zonerefs[pos].zone = NULL; 3216 zonelist->_zonerefs[pos].zone = NULL;
3213 zonelist->_zonerefs[pos].zone_idx = 0; 3217 zonelist->_zonerefs[pos].zone_idx = 0;
3214 } 3218 }
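A hypothetical two-node illustration (not from the diff) of the orderings built by the two helpers above, assuming each node has a Normal and a DMA zone.

/*
 * Zonelist as seen from node 0:
 *   node order (build_zonelists_in_node_order):
 *       N0_Normal, N0_DMA, N1_Normal, N1_DMA
 *   zone order (build_zonelists_in_zone_order):
 *       N0_Normal, N1_Normal, N0_DMA, N1_DMA
 * Node order maximises locality but can exhaust the local DMA zone;
 * zone order conserves DMA zones at the cost of going off-node earlier.
 */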
3215 3219
3216 static int default_zonelist_order(void) 3220 static int default_zonelist_order(void)
3217 { 3221 {
3218 int nid, zone_type; 3222 int nid, zone_type;
3219 unsigned long low_kmem_size,total_size; 3223 unsigned long low_kmem_size,total_size;
3220 struct zone *z; 3224 struct zone *z;
3221 int average_size; 3225 int average_size;
3222 /* 3226 /*
3223 * ZONE_DMA and ZONE_DMA32 can be very small area in the system. 3227 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
3224 * If they are really small and used heavily, the system can fall 3228 * If they are really small and used heavily, the system can fall
3225 * into OOM very easily. 3229 * into OOM very easily.
3226 * This function detects ZONE_DMA/DMA32 size and configures zone order. 3230 * This function detects ZONE_DMA/DMA32 size and configures zone order.
3227 */ 3231 */
3228 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ 3232 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
3229 low_kmem_size = 0; 3233 low_kmem_size = 0;
3230 total_size = 0; 3234 total_size = 0;
3231 for_each_online_node(nid) { 3235 for_each_online_node(nid) {
3232 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3236 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3233 z = &NODE_DATA(nid)->node_zones[zone_type]; 3237 z = &NODE_DATA(nid)->node_zones[zone_type];
3234 if (populated_zone(z)) { 3238 if (populated_zone(z)) {
3235 if (zone_type < ZONE_NORMAL) 3239 if (zone_type < ZONE_NORMAL)
3236 low_kmem_size += z->present_pages; 3240 low_kmem_size += z->present_pages;
3237 total_size += z->present_pages; 3241 total_size += z->present_pages;
3238 } else if (zone_type == ZONE_NORMAL) { 3242 } else if (zone_type == ZONE_NORMAL) {
3239 /* 3243 /*
3240 * If any node has only lowmem, then node order 3244 * If any node has only lowmem, then node order
3241 * is preferred to allow kernel allocations 3245 * is preferred to allow kernel allocations
3242 * locally; otherwise, they can easily infringe 3246 * locally; otherwise, they can easily infringe
3243 * on other nodes when there is an abundance of 3247 * on other nodes when there is an abundance of
3244 * lowmem available to allocate from. 3248 * lowmem available to allocate from.
3245 */ 3249 */
3246 return ZONELIST_ORDER_NODE; 3250 return ZONELIST_ORDER_NODE;
3247 } 3251 }
3248 } 3252 }
3249 } 3253 }
3250 if (!low_kmem_size || /* there are no DMA area. */ 3254 if (!low_kmem_size || /* there are no DMA area. */
3251 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ 3255 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3252 return ZONELIST_ORDER_NODE; 3256 return ZONELIST_ORDER_NODE;
3253 /* 3257 /*
3254 * look into each node's config. 3258 * look into each node's config.
3255 * If there is a node whose DMA/DMA32 memory is very big area on 3259 * If there is a node whose DMA/DMA32 memory is very big area on
3256 * local memory, NODE_ORDER may be suitable. 3260 * local memory, NODE_ORDER may be suitable.
3257 */ 3261 */
3258 average_size = total_size / 3262 average_size = total_size /
3259 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); 3263 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
3260 for_each_online_node(nid) { 3264 for_each_online_node(nid) {
3261 low_kmem_size = 0; 3265 low_kmem_size = 0;
3262 total_size = 0; 3266 total_size = 0;
3263 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { 3267 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3264 z = &NODE_DATA(nid)->node_zones[zone_type]; 3268 z = &NODE_DATA(nid)->node_zones[zone_type];
3265 if (populated_zone(z)) { 3269 if (populated_zone(z)) {
3266 if (zone_type < ZONE_NORMAL) 3270 if (zone_type < ZONE_NORMAL)
3267 low_kmem_size += z->present_pages; 3271 low_kmem_size += z->present_pages;
3268 total_size += z->present_pages; 3272 total_size += z->present_pages;
3269 } 3273 }
3270 } 3274 }
3271 if (low_kmem_size && 3275 if (low_kmem_size &&
3272 total_size > average_size && /* ignore small node */ 3276 total_size > average_size && /* ignore small node */
3273 low_kmem_size > total_size * 70/100) 3277 low_kmem_size > total_size * 70/100)
3274 return ZONELIST_ORDER_NODE; 3278 return ZONELIST_ORDER_NODE;
3275 } 3279 }
3276 return ZONELIST_ORDER_ZONE; 3280 return ZONELIST_ORDER_ZONE;
3277 } 3281 }
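A worked example (hypothetical sizes) of the 70% heuristic applied above.

/*
 * Hypothetical node: 1048576 present pages in total, 786432 of them in
 * DMA/DMA32. 786432 > 1048576 * 70/100 = 734003, so if this node is
 * also larger than the per-node average, node ordering is selected to
 * keep kernel allocations local to their node.
 */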
3278 3282
3279 static void set_zonelist_order(void) 3283 static void set_zonelist_order(void)
3280 { 3284 {
3281 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) 3285 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
3282 current_zonelist_order = default_zonelist_order(); 3286 current_zonelist_order = default_zonelist_order();
3283 else 3287 else
3284 current_zonelist_order = user_zonelist_order; 3288 current_zonelist_order = user_zonelist_order;
3285 } 3289 }
3286 3290
3287 static void build_zonelists(pg_data_t *pgdat) 3291 static void build_zonelists(pg_data_t *pgdat)
3288 { 3292 {
3289 int j, node, load; 3293 int j, node, load;
3290 enum zone_type i; 3294 enum zone_type i;
3291 nodemask_t used_mask; 3295 nodemask_t used_mask;
3292 int local_node, prev_node; 3296 int local_node, prev_node;
3293 struct zonelist *zonelist; 3297 struct zonelist *zonelist;
3294 int order = current_zonelist_order; 3298 int order = current_zonelist_order;
3295 3299
3296 /* initialize zonelists */ 3300 /* initialize zonelists */
3297 for (i = 0; i < MAX_ZONELISTS; i++) { 3301 for (i = 0; i < MAX_ZONELISTS; i++) {
3298 zonelist = pgdat->node_zonelists + i; 3302 zonelist = pgdat->node_zonelists + i;
3299 zonelist->_zonerefs[0].zone = NULL; 3303 zonelist->_zonerefs[0].zone = NULL;
3300 zonelist->_zonerefs[0].zone_idx = 0; 3304 zonelist->_zonerefs[0].zone_idx = 0;
3301 } 3305 }
3302 3306
3303 /* NUMA-aware ordering of nodes */ 3307 /* NUMA-aware ordering of nodes */
3304 local_node = pgdat->node_id; 3308 local_node = pgdat->node_id;
3305 load = nr_online_nodes; 3309 load = nr_online_nodes;
3306 prev_node = local_node; 3310 prev_node = local_node;
3307 nodes_clear(used_mask); 3311 nodes_clear(used_mask);
3308 3312
3309 memset(node_order, 0, sizeof(node_order)); 3313 memset(node_order, 0, sizeof(node_order));
3310 j = 0; 3314 j = 0;
3311 3315
3312 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3316 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3313 int distance = node_distance(local_node, node); 3317 int distance = node_distance(local_node, node);
3314 3318
3315 /* 3319 /*
3316 * If another node is sufficiently far away then it is better 3320 * If another node is sufficiently far away then it is better
3317 * to reclaim pages in a zone before going off node. 3321 * to reclaim pages in a zone before going off node.
3318 */ 3322 */
3319 if (distance > RECLAIM_DISTANCE) 3323 if (distance > RECLAIM_DISTANCE)
3320 zone_reclaim_mode = 1; 3324 zone_reclaim_mode = 1;
3321 3325
3322 /* 3326 /*
3323 * We don't want to pressure a particular node. 3327 * We don't want to pressure a particular node.
3324 * So adding penalty to the first node in same 3328 * So adding penalty to the first node in same
3325 * distance group to make it round-robin. 3329 * distance group to make it round-robin.
3326 */ 3330 */
3327 if (distance != node_distance(local_node, prev_node)) 3331 if (distance != node_distance(local_node, prev_node))
3328 node_load[node] = load; 3332 node_load[node] = load;
3329 3333
3330 prev_node = node; 3334 prev_node = node;
3331 load--; 3335 load--;
3332 if (order == ZONELIST_ORDER_NODE) 3336 if (order == ZONELIST_ORDER_NODE)
3333 build_zonelists_in_node_order(pgdat, node); 3337 build_zonelists_in_node_order(pgdat, node);
3334 else 3338 else
3335 node_order[j++] = node; /* remember order */ 3339 node_order[j++] = node; /* remember order */
3336 } 3340 }
3337 3341
3338 if (order == ZONELIST_ORDER_ZONE) { 3342 if (order == ZONELIST_ORDER_ZONE) {
3339 /* calculate node order -- i.e., DMA last! */ 3343 /* calculate node order -- i.e., DMA last! */
3340 build_zonelists_in_zone_order(pgdat, j); 3344 build_zonelists_in_zone_order(pgdat, j);
3341 } 3345 }
3342 3346
3343 build_thisnode_zonelists(pgdat); 3347 build_thisnode_zonelists(pgdat);
3344 } 3348 }
3345 3349
3346 /* Construct the zonelist performance cache - see further mmzone.h */ 3350 /* Construct the zonelist performance cache - see further mmzone.h */
3347 static void build_zonelist_cache(pg_data_t *pgdat) 3351 static void build_zonelist_cache(pg_data_t *pgdat)
3348 { 3352 {
3349 struct zonelist *zonelist; 3353 struct zonelist *zonelist;
3350 struct zonelist_cache *zlc; 3354 struct zonelist_cache *zlc;
3351 struct zoneref *z; 3355 struct zoneref *z;
3352 3356
3353 zonelist = &pgdat->node_zonelists[0]; 3357 zonelist = &pgdat->node_zonelists[0];
3354 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 3358 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
3355 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 3359 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
3356 for (z = zonelist->_zonerefs; z->zone; z++) 3360 for (z = zonelist->_zonerefs; z->zone; z++)
3357 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); 3361 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
3358 } 3362 }
3359 3363
3360 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3364 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3361 /* 3365 /*
3362 * Return node id of node used for "local" allocations. 3366 * Return node id of node used for "local" allocations.
3363 * I.e., first node id of first zone in arg node's generic zonelist. 3367 * I.e., first node id of first zone in arg node's generic zonelist.
3364 * Used for initializing percpu 'numa_mem', which is used primarily 3368 * Used for initializing percpu 'numa_mem', which is used primarily
3365 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. 3369 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
3366 */ 3370 */
3367 int local_memory_node(int node) 3371 int local_memory_node(int node)
3368 { 3372 {
3369 struct zone *zone; 3373 struct zone *zone;
3370 3374
3371 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), 3375 (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
3372 gfp_zone(GFP_KERNEL), 3376 gfp_zone(GFP_KERNEL),
3373 NULL, 3377 NULL,
3374 &zone); 3378 &zone);
3375 return zone->node; 3379 return zone->node;
3376 } 3380 }
3377 #endif 3381 #endif
3378 3382
3379 #else /* CONFIG_NUMA */ 3383 #else /* CONFIG_NUMA */
3380 3384
3381 static void set_zonelist_order(void) 3385 static void set_zonelist_order(void)
3382 { 3386 {
3383 current_zonelist_order = ZONELIST_ORDER_ZONE; 3387 current_zonelist_order = ZONELIST_ORDER_ZONE;
3384 } 3388 }
3385 3389
3386 static void build_zonelists(pg_data_t *pgdat) 3390 static void build_zonelists(pg_data_t *pgdat)
3387 { 3391 {
3388 int node, local_node; 3392 int node, local_node;
3389 enum zone_type j; 3393 enum zone_type j;
3390 struct zonelist *zonelist; 3394 struct zonelist *zonelist;
3391 3395
3392 local_node = pgdat->node_id; 3396 local_node = pgdat->node_id;
3393 3397
3394 zonelist = &pgdat->node_zonelists[0]; 3398 zonelist = &pgdat->node_zonelists[0];
3395 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3399 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
3396 3400
3397 /* 3401 /*
3398 * Now we build the zonelist so that it contains the zones 3402 * Now we build the zonelist so that it contains the zones
3399 * of all the other nodes. 3403 * of all the other nodes.
3400 * We don't want to pressure a particular node, so when 3404 * We don't want to pressure a particular node, so when
3401 * building the zones for node N, we make sure that the 3405 * building the zones for node N, we make sure that the
3402 * zones coming right after the local ones are those from 3406 * zones coming right after the local ones are those from
3403 * node N+1 (modulo N) 3407 * node N+1 (modulo N)
3404 */ 3408 */
3405 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3409 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3406 if (!node_online(node)) 3410 if (!node_online(node))
3407 continue; 3411 continue;
3408 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3412 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3409 MAX_NR_ZONES - 1); 3413 MAX_NR_ZONES - 1);
3410 } 3414 }
3411 for (node = 0; node < local_node; node++) { 3415 for (node = 0; node < local_node; node++) {
3412 if (!node_online(node)) 3416 if (!node_online(node))
3413 continue; 3417 continue;
3414 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3418 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3415 MAX_NR_ZONES - 1); 3419 MAX_NR_ZONES - 1);
3416 } 3420 }
3417 3421
3418 zonelist->_zonerefs[j].zone = NULL; 3422 zonelist->_zonerefs[j].zone = NULL;
3419 zonelist->_zonerefs[j].zone_idx = 0; 3423 zonelist->_zonerefs[j].zone_idx = 0;
3420 } 3424 }
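
As a sanity check on the interleaving in the non-NUMA build_zonelists() above, the standalone userspace sketch below (the node count and the always-true online test are made up for illustration) prints the order in which remote nodes are appended after the local node: N+1, N+2, ..., wrapping around to 0, ..., N-1.

#include <stdio.h>

#define MAX_NUMNODES 8          /* hypothetical node count for the example */

static int node_online(int node)
{
        return 1;               /* pretend every node is online */
}

int main(void)
{
        int local_node = 3;     /* pretend we build the zonelist for node 3 */
        int node;

        printf("zonelist order for node %d:", local_node);
        printf(" %d", local_node);              /* local zones come first */
        for (node = local_node + 1; node < MAX_NUMNODES; node++)
                if (node_online(node))
                        printf(" %d", node);    /* N+1 .. MAX_NUMNODES-1 */
        for (node = 0; node < local_node; node++)
                if (node_online(node))
                        printf(" %d", node);    /* wrap around: 0 .. N-1 */
        printf("\n");                           /* prints: 3 4 5 6 7 0 1 2 */
        return 0;
}
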
3421 3425
3422 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 3426 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3423 static void build_zonelist_cache(pg_data_t *pgdat) 3427 static void build_zonelist_cache(pg_data_t *pgdat)
3424 { 3428 {
3425 pgdat->node_zonelists[0].zlcache_ptr = NULL; 3429 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3426 } 3430 }
3427 3431
3428 #endif /* CONFIG_NUMA */ 3432 #endif /* CONFIG_NUMA */
3429 3433
3430 /* 3434 /*
3431 * Boot pageset table. One per cpu which is going to be used for all 3435 * Boot pageset table. One per cpu which is going to be used for all
3432 * zones and all nodes. The parameters will be set in such a way 3436 * zones and all nodes. The parameters will be set in such a way
3433 * that an item put on a list will immediately be handed over to 3437 * that an item put on a list will immediately be handed over to
3434 * the buddy list. This is safe since pageset manipulation is done 3438 * the buddy list. This is safe since pageset manipulation is done
3435 * with interrupts disabled. 3439 * with interrupts disabled.
3436 * 3440 *
3437 * The boot_pagesets must be kept even after bootup is complete for 3441 * The boot_pagesets must be kept even after bootup is complete for
3438 * unused processors and/or zones. They do play a role for bootstrapping 3442 * unused processors and/or zones. They do play a role for bootstrapping
3439 * hotplugged processors. 3443 * hotplugged processors.
3440 * 3444 *
3441 * zoneinfo_show() and maybe other functions do 3445 * zoneinfo_show() and maybe other functions do
3442 * not check if the processor is online before following the pageset pointer. 3446 * not check if the processor is online before following the pageset pointer.
3443 * Other parts of the kernel may not check if the zone is available. 3447 * Other parts of the kernel may not check if the zone is available.
3444 */ 3448 */
3445 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); 3449 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
3446 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); 3450 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
3447 static void setup_zone_pageset(struct zone *zone); 3451 static void setup_zone_pageset(struct zone *zone);
3448 3452
3449 /* 3453 /*
3450 * Global mutex to protect against size modification of zonelists 3454 * Global mutex to protect against size modification of zonelists
3451 * as well as to serialize pageset setup for the new populated zone. 3455 * as well as to serialize pageset setup for the new populated zone.
3452 */ 3456 */
3453 DEFINE_MUTEX(zonelists_mutex); 3457 DEFINE_MUTEX(zonelists_mutex);
3454 3458
3455 /* the int return value is just for stop_machine() */ 3459 /* the int return value is just for stop_machine() */
3456 static int __build_all_zonelists(void *data) 3460 static int __build_all_zonelists(void *data)
3457 { 3461 {
3458 int nid; 3462 int nid;
3459 int cpu; 3463 int cpu;
3460 pg_data_t *self = data; 3464 pg_data_t *self = data;
3461 3465
3462 #ifdef CONFIG_NUMA 3466 #ifdef CONFIG_NUMA
3463 memset(node_load, 0, sizeof(node_load)); 3467 memset(node_load, 0, sizeof(node_load));
3464 #endif 3468 #endif
3465 3469
3466 if (self && !node_online(self->node_id)) { 3470 if (self && !node_online(self->node_id)) {
3467 build_zonelists(self); 3471 build_zonelists(self);
3468 build_zonelist_cache(self); 3472 build_zonelist_cache(self);
3469 } 3473 }
3470 3474
3471 for_each_online_node(nid) { 3475 for_each_online_node(nid) {
3472 pg_data_t *pgdat = NODE_DATA(nid); 3476 pg_data_t *pgdat = NODE_DATA(nid);
3473 3477
3474 build_zonelists(pgdat); 3478 build_zonelists(pgdat);
3475 build_zonelist_cache(pgdat); 3479 build_zonelist_cache(pgdat);
3476 } 3480 }
3477 3481
3478 /* 3482 /*
3479 * Initialize the boot_pagesets that are going to be used 3483 * Initialize the boot_pagesets that are going to be used
3480 * for bootstrapping processors. The real pagesets for 3484 * for bootstrapping processors. The real pagesets for
3481 * each zone will be allocated later when the per cpu 3485 * each zone will be allocated later when the per cpu
3482 * allocator is available. 3486 * allocator is available.
3483 * 3487 *
3484 * boot_pagesets are used also for bootstrapping offline 3488 * boot_pagesets are used also for bootstrapping offline
3485 * cpus if the system is already booted because the pagesets 3489 * cpus if the system is already booted because the pagesets
3486 * are needed to initialize allocators on a specific cpu too. 3490 * are needed to initialize allocators on a specific cpu too.
3487 * F.e. the percpu allocator needs the page allocator which 3491 * F.e. the percpu allocator needs the page allocator which
3488 * needs the percpu allocator in order to allocate its pagesets 3492 * needs the percpu allocator in order to allocate its pagesets
3489 * (a chicken-egg dilemma). 3493 * (a chicken-egg dilemma).
3490 */ 3494 */
3491 for_each_possible_cpu(cpu) { 3495 for_each_possible_cpu(cpu) {
3492 setup_pageset(&per_cpu(boot_pageset, cpu), 0); 3496 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3493 3497
3494 #ifdef CONFIG_HAVE_MEMORYLESS_NODES 3498 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3495 /* 3499 /*
3496 * We now know the "local memory node" for each node-- 3500 * We now know the "local memory node" for each node--
3497 * i.e., the node of the first zone in the generic zonelist. 3501 * i.e., the node of the first zone in the generic zonelist.
3498 * Set up numa_mem percpu variable for on-line cpus. During 3502 * Set up numa_mem percpu variable for on-line cpus. During
3499 * boot, only the boot cpu should be on-line; we'll init the 3503 * boot, only the boot cpu should be on-line; we'll init the
3500 * secondary cpus' numa_mem as they come on-line. During 3504 * secondary cpus' numa_mem as they come on-line. During
3501 * node/memory hotplug, we'll fixup all on-line cpus. 3505 * node/memory hotplug, we'll fixup all on-line cpus.
3502 */ 3506 */
3503 if (cpu_online(cpu)) 3507 if (cpu_online(cpu))
3504 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); 3508 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3505 #endif 3509 #endif
3506 } 3510 }
3507 3511
3508 return 0; 3512 return 0;
3509 } 3513 }
3510 3514
3511 /* 3515 /*
3512 * Called with zonelists_mutex held always 3516 * Called with zonelists_mutex held always
3513 * unless system_state == SYSTEM_BOOTING. 3517 * unless system_state == SYSTEM_BOOTING.
3514 */ 3518 */
3515 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3519 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3516 { 3520 {
3517 set_zonelist_order(); 3521 set_zonelist_order();
3518 3522
3519 if (system_state == SYSTEM_BOOTING) { 3523 if (system_state == SYSTEM_BOOTING) {
3520 __build_all_zonelists(NULL); 3524 __build_all_zonelists(NULL);
3521 mminit_verify_zonelist(); 3525 mminit_verify_zonelist();
3522 cpuset_init_current_mems_allowed(); 3526 cpuset_init_current_mems_allowed();
3523 } else { 3527 } else {
3524 /* we have to stop all cpus to guarantee there is no user 3528 /* we have to stop all cpus to guarantee there is no user
3525 of zonelist */ 3529 of zonelist */
3526 #ifdef CONFIG_MEMORY_HOTPLUG 3530 #ifdef CONFIG_MEMORY_HOTPLUG
3527 if (zone) 3531 if (zone)
3528 setup_zone_pageset(zone); 3532 setup_zone_pageset(zone);
3529 #endif 3533 #endif
3530 stop_machine(__build_all_zonelists, pgdat, NULL); 3534 stop_machine(__build_all_zonelists, pgdat, NULL);
3531 /* cpuset refresh routine should be here */ 3535 /* cpuset refresh routine should be here */
3532 } 3536 }
3533 vm_total_pages = nr_free_pagecache_pages(); 3537 vm_total_pages = nr_free_pagecache_pages();
3534 /* 3538 /*
3535 * Disable grouping by mobility if the number of pages in the 3539 * Disable grouping by mobility if the number of pages in the
3536 * system is too low to allow the mechanism to work. It would be 3540 * system is too low to allow the mechanism to work. It would be
3537 * more accurate, but expensive to check per-zone. This check is 3541 * more accurate, but expensive to check per-zone. This check is
3538 * made on memory-hotadd so a system can start with mobility 3542 * made on memory-hotadd so a system can start with mobility
3539 * disabled and enable it later 3543 * disabled and enable it later
3540 */ 3544 */
3541 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) 3545 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3542 page_group_by_mobility_disabled = 1; 3546 page_group_by_mobility_disabled = 1;
3543 else 3547 else
3544 page_group_by_mobility_disabled = 0; 3548 page_group_by_mobility_disabled = 0;
3545 3549
3546 printk("Built %i zonelists in %s order, mobility grouping %s. " 3550 printk("Built %i zonelists in %s order, mobility grouping %s. "
3547 "Total pages: %ld\n", 3551 "Total pages: %ld\n",
3548 nr_online_nodes, 3552 nr_online_nodes,
3549 zonelist_order_name[current_zonelist_order], 3553 zonelist_order_name[current_zonelist_order],
3550 page_group_by_mobility_disabled ? "off" : "on", 3554 page_group_by_mobility_disabled ? "off" : "on",
3551 vm_total_pages); 3555 vm_total_pages);
3552 #ifdef CONFIG_NUMA 3556 #ifdef CONFIG_NUMA
3553 printk("Policy zone: %s\n", zone_names[policy_zone]); 3557 printk("Policy zone: %s\n", zone_names[policy_zone]);
3554 #endif 3558 #endif
3555 } 3559 }
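
The mobility-grouping cutoff above (vm_total_pages < pageblock_nr_pages * MIGRATE_TYPES) is small in practice. A rough standalone calculation is sketched below; the 4K page size, order-9 pageblocks and the migrate-type count of five are assumptions for a typical x86-64 build without CMA, not values taken from this file.

#include <stdio.h>

int main(void)
{
        unsigned long page_size = 4096;                 /* assumed */
        unsigned long pageblock_order = 9;              /* assumed: 2MB blocks */
        unsigned long pageblock_nr_pages = 1UL << pageblock_order;
        unsigned long migrate_types = 5;                /* assumed count */

        unsigned long threshold = pageblock_nr_pages * migrate_types;

        printf("grouping by mobility is disabled below %lu pages (%lu KB)\n",
               threshold, threshold * page_size / 1024);
        return 0;
}
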
3556 3560
3557 /* 3561 /*
3558 * Helper functions to size the waitqueue hash table. 3562 * Helper functions to size the waitqueue hash table.
3559 * Essentially these want to choose hash table sizes sufficiently 3563 * Essentially these want to choose hash table sizes sufficiently
3560 * large so that collisions trying to wait on pages are rare. 3564 * large so that collisions trying to wait on pages are rare.
3561 * But in fact, the number of active page waitqueues on typical 3565 * But in fact, the number of active page waitqueues on typical
3562 * systems is ridiculously low, less than 200. So this is even 3566 * systems is ridiculously low, less than 200. So this is even
3563 * conservative, even though it seems large. 3567 * conservative, even though it seems large.
3564 * 3568 *
3565 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 3569 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
3566 * waitqueues, i.e. the size of the waitq table given the number of pages. 3570 * waitqueues, i.e. the size of the waitq table given the number of pages.
3567 */ 3571 */
3568 #define PAGES_PER_WAITQUEUE 256 3572 #define PAGES_PER_WAITQUEUE 256
3569 3573
3570 #ifndef CONFIG_MEMORY_HOTPLUG 3574 #ifndef CONFIG_MEMORY_HOTPLUG
3571 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3575 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3572 { 3576 {
3573 unsigned long size = 1; 3577 unsigned long size = 1;
3574 3578
3575 pages /= PAGES_PER_WAITQUEUE; 3579 pages /= PAGES_PER_WAITQUEUE;
3576 3580
3577 while (size < pages) 3581 while (size < pages)
3578 size <<= 1; 3582 size <<= 1;
3579 3583
3580 /* 3584 /*
3581 * Once we have dozens or even hundreds of threads sleeping 3585 * Once we have dozens or even hundreds of threads sleeping
3582 * on IO we've got bigger problems than wait queue collision. 3586 * on IO we've got bigger problems than wait queue collision.
3583 * Limit the size of the wait table to a reasonable size. 3587 * Limit the size of the wait table to a reasonable size.
3584 */ 3588 */
3585 size = min(size, 4096UL); 3589 size = min(size, 4096UL);
3586 3590
3587 return max(size, 4UL); 3591 return max(size, 4UL);
3588 } 3592 }
3589 #else 3593 #else
3590 /* 3594 /*
3591 * A zone's size might be changed by hot-add, so it is not possible to determine 3595 * A zone's size might be changed by hot-add, so it is not possible to determine
3592 * a suitable size for its wait_table. So we use the maximum size now. 3596 * a suitable size for its wait_table. So we use the maximum size now.
3593 * 3597 *
3594 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 3598 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
3595 * 3599 *
3596 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 3600 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
3597 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 3601 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
3598 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 3602 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
3599 * 3603 *
3600 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 3604 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
3601 * or more by the traditional way. (See above). It equals: 3605 * or more by the traditional way. (See above). It equals:
3602 * 3606 *
3603 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 3607 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
3604 * ia64(16K page size) : = ( 8G + 4M)byte. 3608 * ia64(16K page size) : = ( 8G + 4M)byte.
3605 * powerpc (64K page size) : = (32G +16M)byte. 3609 * powerpc (64K page size) : = (32G +16M)byte.
3606 */ 3610 */
3607 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 3611 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
3608 { 3612 {
3609 return 4096UL; 3613 return 4096UL;
3610 } 3614 }
3611 #endif 3615 #endif
3612 3616
3613 /* 3617 /*
3614 * This is an integer logarithm so that shifts can be used later 3618 * This is an integer logarithm so that shifts can be used later
3615 * to extract the more random high bits from the multiplicative 3619 * to extract the more random high bits from the multiplicative
3616 * hash function before the remainder is taken. 3620 * hash function before the remainder is taken.
3617 */ 3621 */
3618 static inline unsigned long wait_table_bits(unsigned long size) 3622 static inline unsigned long wait_table_bits(unsigned long size)
3619 { 3623 {
3620 return ffz(~size); 3624 return ffz(~size);
3621 } 3625 }
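
Taken together, the two helpers above amount to: round pages / PAGES_PER_WAITQUEUE up to a power of two, clamp the result to [4, 4096], then take its log2. A userspace restatement of the same arithmetic is sketched below; __builtin_ctzl stands in for ffz(~size), which gives the same answer when size is a power of two.

#include <stdio.h>

static unsigned long wait_table_entries(unsigned long pages)
{
        unsigned long size = 1;

        pages /= 256;                   /* PAGES_PER_WAITQUEUE */
        while (size < pages)
                size <<= 1;             /* round up to a power of two */
        if (size > 4096UL)
                size = 4096UL;          /* cap the table size */
        if (size < 4UL)
                size = 4UL;             /* but keep at least 4 entries */
        return size;
}

int main(void)
{
        unsigned long pages = 1048576UL;        /* e.g. a 4GB zone of 4K pages */
        unsigned long size = wait_table_entries(pages);

        /* For a power of two, ffz(~size) is simply the index of its set bit. */
        printf("%lu pages -> %lu entries, %d bits\n",
               pages, size, __builtin_ctzl(size));
        return 0;
}
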
3622 3626
3623 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3627 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3624 3628
3625 /* 3629 /*
3626 * Check if a pageblock contains reserved pages 3630 * Check if a pageblock contains reserved pages
3627 */ 3631 */
3628 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) 3632 static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3629 { 3633 {
3630 unsigned long pfn; 3634 unsigned long pfn;
3631 3635
3632 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3636 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3633 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) 3637 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3634 return 1; 3638 return 1;
3635 } 3639 }
3636 return 0; 3640 return 0;
3637 } 3641 }
3638 3642
3639 /* 3643 /*
3640 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3644 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3641 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3645 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3642 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3646 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
3643 * higher will lead to a bigger reserve which will get freed as contiguous 3647 * higher will lead to a bigger reserve which will get freed as contiguous
3644 * blocks as reclaim kicks in 3648 * blocks as reclaim kicks in
3645 */ 3649 */
3646 static void setup_zone_migrate_reserve(struct zone *zone) 3650 static void setup_zone_migrate_reserve(struct zone *zone)
3647 { 3651 {
3648 unsigned long start_pfn, pfn, end_pfn, block_end_pfn; 3652 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3649 struct page *page; 3653 struct page *page;
3650 unsigned long block_migratetype; 3654 unsigned long block_migratetype;
3651 int reserve; 3655 int reserve;
3652 3656
3653 /* 3657 /*
3654 * Get the start pfn, end pfn and the number of blocks to reserve 3658 * Get the start pfn, end pfn and the number of blocks to reserve
3655 * We have to be careful to be aligned to pageblock_nr_pages to 3659 * We have to be careful to be aligned to pageblock_nr_pages to
3656 * make sure that we always check pfn_valid for the first page in 3660 * make sure that we always check pfn_valid for the first page in
3657 * the block. 3661 * the block.
3658 */ 3662 */
3659 start_pfn = zone->zone_start_pfn; 3663 start_pfn = zone->zone_start_pfn;
3660 end_pfn = start_pfn + zone->spanned_pages; 3664 end_pfn = start_pfn + zone->spanned_pages;
3661 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3665 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3662 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3666 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3663 pageblock_order; 3667 pageblock_order;
3664 3668
3665 /* 3669 /*
3666 * Reserve blocks are generally in place to help high-order atomic 3670 * Reserve blocks are generally in place to help high-order atomic
3667 * allocations that are short-lived. A min_free_kbytes value that 3671 * allocations that are short-lived. A min_free_kbytes value that
3668 * would result in more than 2 reserve blocks for atomic allocations 3672 * would result in more than 2 reserve blocks for atomic allocations
3669 * is assumed to be in place to help anti-fragmentation for the 3673 * is assumed to be in place to help anti-fragmentation for the
3670 * future allocation of hugepages at runtime. 3674 * future allocation of hugepages at runtime.
3671 */ 3675 */
3672 reserve = min(2, reserve); 3676 reserve = min(2, reserve);
3673 3677
3674 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3678 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3675 if (!pfn_valid(pfn)) 3679 if (!pfn_valid(pfn))
3676 continue; 3680 continue;
3677 page = pfn_to_page(pfn); 3681 page = pfn_to_page(pfn);
3678 3682
3679 /* Watch out for overlapping nodes */ 3683 /* Watch out for overlapping nodes */
3680 if (page_to_nid(page) != zone_to_nid(zone)) 3684 if (page_to_nid(page) != zone_to_nid(zone))
3681 continue; 3685 continue;
3682 3686
3683 block_migratetype = get_pageblock_migratetype(page); 3687 block_migratetype = get_pageblock_migratetype(page);
3684 3688
3685 /* Only test what is necessary when the reserves are not met */ 3689 /* Only test what is necessary when the reserves are not met */
3686 if (reserve > 0) { 3690 if (reserve > 0) {
3687 /* 3691 /*
3688 * Blocks with reserved pages will never free, skip 3692 * Blocks with reserved pages will never free, skip
3689 * them. 3693 * them.
3690 */ 3694 */
3691 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); 3695 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3692 if (pageblock_is_reserved(pfn, block_end_pfn)) 3696 if (pageblock_is_reserved(pfn, block_end_pfn))
3693 continue; 3697 continue;
3694 3698
3695 /* If this block is reserved, account for it */ 3699 /* If this block is reserved, account for it */
3696 if (block_migratetype == MIGRATE_RESERVE) { 3700 if (block_migratetype == MIGRATE_RESERVE) {
3697 reserve--; 3701 reserve--;
3698 continue; 3702 continue;
3699 } 3703 }
3700 3704
3701 /* Suitable for reserving if this block is movable */ 3705 /* Suitable for reserving if this block is movable */
3702 if (block_migratetype == MIGRATE_MOVABLE) { 3706 if (block_migratetype == MIGRATE_MOVABLE) {
3703 set_pageblock_migratetype(page, 3707 set_pageblock_migratetype(page,
3704 MIGRATE_RESERVE); 3708 MIGRATE_RESERVE);
3705 move_freepages_block(zone, page, 3709 move_freepages_block(zone, page,
3706 MIGRATE_RESERVE); 3710 MIGRATE_RESERVE);
3707 reserve--; 3711 reserve--;
3708 continue; 3712 continue;
3709 } 3713 }
3710 } 3714 }
3711 3715
3712 /* 3716 /*
3713 * If the reserve is met and this is a previous reserved block, 3717 * If the reserve is met and this is a previous reserved block,
3714 * take it back 3718 * take it back
3715 */ 3719 */
3716 if (block_migratetype == MIGRATE_RESERVE) { 3720 if (block_migratetype == MIGRATE_RESERVE) {
3717 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3721 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3718 move_freepages_block(zone, page, MIGRATE_MOVABLE); 3722 move_freepages_block(zone, page, MIGRATE_MOVABLE);
3719 } 3723 }
3720 } 3724 }
3721 } 3725 }
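
The reserve count chosen above is just the zone's min watermark expressed in whole pageblocks, capped at two. A worked standalone example follows; the 1500-page watermark and the order-9 (512-page) pageblocks are invented for illustration.

#include <stdio.h>

int main(void)
{
        unsigned long min_wmark_pages = 1500;           /* assumed watermark */
        unsigned long pageblock_order = 9;              /* assumed block order */
        unsigned long pageblock_nr_pages = 1UL << pageblock_order;
        unsigned long reserve;

        /* roundup(min_wmark_pages, pageblock_nr_pages) >> pageblock_order */
        reserve = (min_wmark_pages + pageblock_nr_pages - 1) /
                  pageblock_nr_pages;

        if (reserve > 2)
                reserve = 2;    /* never more than two blocks for atomics */

        printf("reserve = %lu pageblock(s)\n", reserve);        /* 2 here */
        return 0;
}
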
3722 3726
3723 /* 3727 /*
3724 * Initially all pages are reserved - free ones are freed 3728 * Initially all pages are reserved - free ones are freed
3725 * up by free_all_bootmem() once the early boot process is 3729 * up by free_all_bootmem() once the early boot process is
3726 * done. Non-atomic initialization, single-pass. 3730 * done. Non-atomic initialization, single-pass.
3727 */ 3731 */
3728 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 3732 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3729 unsigned long start_pfn, enum memmap_context context) 3733 unsigned long start_pfn, enum memmap_context context)
3730 { 3734 {
3731 struct page *page; 3735 struct page *page;
3732 unsigned long end_pfn = start_pfn + size; 3736 unsigned long end_pfn = start_pfn + size;
3733 unsigned long pfn; 3737 unsigned long pfn;
3734 struct zone *z; 3738 struct zone *z;
3735 3739
3736 if (highest_memmap_pfn < end_pfn - 1) 3740 if (highest_memmap_pfn < end_pfn - 1)
3737 highest_memmap_pfn = end_pfn - 1; 3741 highest_memmap_pfn = end_pfn - 1;
3738 3742
3739 z = &NODE_DATA(nid)->node_zones[zone]; 3743 z = &NODE_DATA(nid)->node_zones[zone];
3740 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 3744 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3741 /* 3745 /*
3742 * There can be holes in boot-time mem_map[]s 3746 * There can be holes in boot-time mem_map[]s
3743 * handed to this function. They do not 3747 * handed to this function. They do not
3744 * exist on hotplugged memory. 3748 * exist on hotplugged memory.
3745 */ 3749 */
3746 if (context == MEMMAP_EARLY) { 3750 if (context == MEMMAP_EARLY) {
3747 if (!early_pfn_valid(pfn)) 3751 if (!early_pfn_valid(pfn))
3748 continue; 3752 continue;
3749 if (!early_pfn_in_nid(pfn, nid)) 3753 if (!early_pfn_in_nid(pfn, nid))
3750 continue; 3754 continue;
3751 } 3755 }
3752 page = pfn_to_page(pfn); 3756 page = pfn_to_page(pfn);
3753 set_page_links(page, zone, nid, pfn); 3757 set_page_links(page, zone, nid, pfn);
3754 mminit_verify_page_links(page, zone, nid, pfn); 3758 mminit_verify_page_links(page, zone, nid, pfn);
3755 init_page_count(page); 3759 init_page_count(page);
3756 reset_page_mapcount(page); 3760 reset_page_mapcount(page);
3757 SetPageReserved(page); 3761 SetPageReserved(page);
3758 /* 3762 /*
3759 * Mark the block movable so that blocks are reserved for 3763 * Mark the block movable so that blocks are reserved for
3760 * movable at startup. This will force kernel allocations 3764 * movable at startup. This will force kernel allocations
3761 * to reserve their blocks rather than leaking throughout 3765 * to reserve their blocks rather than leaking throughout
3762 * the address space during boot when many long-lived 3766 * the address space during boot when many long-lived
3763 * kernel allocations are made. Later some blocks near 3767 * kernel allocations are made. Later some blocks near
3764 * the start are marked MIGRATE_RESERVE by 3768 * the start are marked MIGRATE_RESERVE by
3765 * setup_zone_migrate_reserve() 3769 * setup_zone_migrate_reserve()
3766 * 3770 *
3767 * The bitmap is created for the zone's valid pfn range, but the memmap 3771 * The bitmap is created for the zone's valid pfn range, but the memmap
3768 * can be created for invalid pages (for alignment). 3772 * can be created for invalid pages (for alignment).
3769 * Check here so that set_pageblock_migratetype() is not called against a 3773 * Check here so that set_pageblock_migratetype() is not called against a
3770 * pfn out of zone. 3774 * pfn out of zone.
3771 */ 3775 */
3772 if ((z->zone_start_pfn <= pfn) 3776 if ((z->zone_start_pfn <= pfn)
3773 && (pfn < z->zone_start_pfn + z->spanned_pages) 3777 && (pfn < z->zone_start_pfn + z->spanned_pages)
3774 && !(pfn & (pageblock_nr_pages - 1))) 3778 && !(pfn & (pageblock_nr_pages - 1)))
3775 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3779 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3776 3780
3777 INIT_LIST_HEAD(&page->lru); 3781 INIT_LIST_HEAD(&page->lru);
3778 #ifdef WANT_PAGE_VIRTUAL 3782 #ifdef WANT_PAGE_VIRTUAL
3779 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 3783 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
3780 if (!is_highmem_idx(zone)) 3784 if (!is_highmem_idx(zone))
3781 set_page_address(page, __va(pfn << PAGE_SHIFT)); 3785 set_page_address(page, __va(pfn << PAGE_SHIFT));
3782 #endif 3786 #endif
3783 } 3787 }
3784 } 3788 }
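
The migratetype above is set only once per pageblock: because pageblock_nr_pages is a power of two, pfn & (pageblock_nr_pages - 1) is zero exactly at pageblock boundaries. A quick standalone illustration, assuming 512-page blocks:

#include <stdio.h>

int main(void)
{
        unsigned long pageblock_nr_pages = 512; /* assumed order-9 blocks */
        unsigned long pfn;

        for (pfn = 1020; pfn <= 1030; pfn++)
                if (!(pfn & (pageblock_nr_pages - 1)))
                        printf("pfn %lu starts a pageblock\n", pfn);
        return 0;                               /* only pfn 1024 matches */
}
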
3785 3789
3786 static void __meminit zone_init_free_lists(struct zone *zone) 3790 static void __meminit zone_init_free_lists(struct zone *zone)
3787 { 3791 {
3788 int order, t; 3792 int order, t;
3789 for_each_migratetype_order(order, t) { 3793 for_each_migratetype_order(order, t) {
3790 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 3794 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
3791 zone->free_area[order].nr_free = 0; 3795 zone->free_area[order].nr_free = 0;
3792 } 3796 }
3793 } 3797 }
3794 3798
3795 #ifndef __HAVE_ARCH_MEMMAP_INIT 3799 #ifndef __HAVE_ARCH_MEMMAP_INIT
3796 #define memmap_init(size, nid, zone, start_pfn) \ 3800 #define memmap_init(size, nid, zone, start_pfn) \
3797 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 3801 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3798 #endif 3802 #endif
3799 3803
3800 static int __meminit zone_batchsize(struct zone *zone) 3804 static int __meminit zone_batchsize(struct zone *zone)
3801 { 3805 {
3802 #ifdef CONFIG_MMU 3806 #ifdef CONFIG_MMU
3803 int batch; 3807 int batch;
3804 3808
3805 /* 3809 /*
3806 * The per-cpu-pages pools are set to around 1000th of the 3810 * The per-cpu-pages pools are set to around 1000th of the
3807 * size of the zone. But no more than 1/2 of a meg. 3811 * size of the zone. But no more than 1/2 of a meg.
3808 * 3812 *
3809 * OK, so we don't know how big the cache is. So guess. 3813 * OK, so we don't know how big the cache is. So guess.
3810 */ 3814 */
3811 batch = zone->present_pages / 1024; 3815 batch = zone->present_pages / 1024;
3812 if (batch * PAGE_SIZE > 512 * 1024) 3816 if (batch * PAGE_SIZE > 512 * 1024)
3813 batch = (512 * 1024) / PAGE_SIZE; 3817 batch = (512 * 1024) / PAGE_SIZE;
3814 batch /= 4; /* We effectively *= 4 below */ 3818 batch /= 4; /* We effectively *= 4 below */
3815 if (batch < 1) 3819 if (batch < 1)
3816 batch = 1; 3820 batch = 1;
3817 3821
3818 /* 3822 /*
3819 * Clamp the batch to a 2^n - 1 value. Having a power 3823 * Clamp the batch to a 2^n - 1 value. Having a power
3820 * of 2 value was found to be more likely to have 3824 * of 2 value was found to be more likely to have
3821 * suboptimal cache aliasing properties in some cases. 3825 * suboptimal cache aliasing properties in some cases.
3822 * 3826 *
3823 * For example if 2 tasks are alternately allocating 3827 * For example if 2 tasks are alternately allocating
3824 * batches of pages, one task can end up with a lot 3828 * batches of pages, one task can end up with a lot
3825 * of pages of one half of the possible page colors 3829 * of pages of one half of the possible page colors
3826 * and the other with pages of the other colors. 3830 * and the other with pages of the other colors.
3827 */ 3831 */
3828 batch = rounddown_pow_of_two(batch + batch/2) - 1; 3832 batch = rounddown_pow_of_two(batch + batch/2) - 1;
3829 3833
3830 return batch; 3834 return batch;
3831 3835
3832 #else 3836 #else
3833 /* The deferral and batching of frees should be suppressed under NOMMU 3837 /* The deferral and batching of frees should be suppressed under NOMMU
3834 * conditions. 3838 * conditions.
3835 * 3839 *
3836 * The problem is that NOMMU needs to be able to allocate large chunks 3840 * The problem is that NOMMU needs to be able to allocate large chunks
3837 * of contiguous memory as there's no hardware page translation to 3841 * of contiguous memory as there's no hardware page translation to
3838 * assemble apparent contiguous memory from discontiguous pages. 3842 * assemble apparent contiguous memory from discontiguous pages.
3839 * 3843 *
3840 * Queueing large contiguous runs of pages for batching, however, 3844 * Queueing large contiguous runs of pages for batching, however,
3841 * causes the pages to actually be freed in smaller chunks. As there 3845 * causes the pages to actually be freed in smaller chunks. As there
3842 * can be a significant delay between the individual batches being 3846 * can be a significant delay between the individual batches being
3843 * recycled, this leads to the once large chunks of space being 3847 * recycled, this leads to the once large chunks of space being
3844 * fragmented and becoming unavailable for high-order allocations. 3848 * fragmented and becoming unavailable for high-order allocations.
3845 */ 3849 */
3846 return 0; 3850 return 0;
3847 #endif 3851 #endif
3848 } 3852 }
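
The MMU case above boils down to: take roughly 1/1024th of the zone, cap it at 512KB worth of pages, quarter it, then round down to one below a power of two. A standalone restatement of the same arithmetic, with an arbitrarily chosen 2GB zone of 4K pages:

#include <stdio.h>

/* Round down to the nearest power of two (for x > 0). */
static unsigned long rounddown_pow_of_two(unsigned long x)
{
        while (x & (x - 1))
                x &= x - 1;     /* clear low bits until one bit remains */
        return x;
}

int main(void)
{
        unsigned long page_size = 4096;
        unsigned long present_pages = (2UL << 30) / page_size;  /* 2GB zone */
        unsigned long batch;

        batch = present_pages / 1024;                /* ~1/1000th of the zone */
        if (batch * page_size > 512 * 1024)
                batch = (512 * 1024) / page_size;    /* cap at half a meg */
        batch /= 4;                                  /* quartered, see above */
        if (batch < 1)
                batch = 1;

        batch = rounddown_pow_of_two(batch + batch / 2) - 1;

        printf("batch = %lu pages\n", batch);        /* 31 for this example */
        return 0;
}
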
3849 3853
3850 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 3854 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
3851 { 3855 {
3852 struct per_cpu_pages *pcp; 3856 struct per_cpu_pages *pcp;
3853 int migratetype; 3857 int migratetype;
3854 3858
3855 memset(p, 0, sizeof(*p)); 3859 memset(p, 0, sizeof(*p));
3856 3860
3857 pcp = &p->pcp; 3861 pcp = &p->pcp;
3858 pcp->count = 0; 3862 pcp->count = 0;
3859 pcp->high = 6 * batch; 3863 pcp->high = 6 * batch;
3860 pcp->batch = max(1UL, 1 * batch); 3864 pcp->batch = max(1UL, 1 * batch);
3861 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 3865 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
3862 INIT_LIST_HEAD(&pcp->lists[migratetype]); 3866 INIT_LIST_HEAD(&pcp->lists[migratetype]);
3863 } 3867 }
3864 3868
3865 /* 3869 /*
3866 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 3870 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
3867 * to the value high for the pageset p. 3871 * to the value high for the pageset p.
3868 */ 3872 */
3869 3873
3870 static void setup_pagelist_highmark(struct per_cpu_pageset *p, 3874 static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3871 unsigned long high) 3875 unsigned long high)
3872 { 3876 {
3873 struct per_cpu_pages *pcp; 3877 struct per_cpu_pages *pcp;
3874 3878
3875 pcp = &p->pcp; 3879 pcp = &p->pcp;
3876 pcp->high = high; 3880 pcp->high = high;
3877 pcp->batch = max(1UL, high/4); 3881 pcp->batch = max(1UL, high/4);
3878 if ((high/4) > (PAGE_SHIFT * 8)) 3882 if ((high/4) > (PAGE_SHIFT * 8))
3879 pcp->batch = PAGE_SHIFT * 8; 3883 pcp->batch = PAGE_SHIFT * 8;
3880 } 3884 }
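
Combined with setup_pageset() above there are two sizing regimes: by default high = 6 * batch, and when percpu_pagelist_fraction is configured, high = present_pages / fraction with batch = high / 4 capped at PAGE_SHIFT * 8. A standalone sketch of both, using invented zone and batch values:

#include <stdio.h>

int main(void)
{
        unsigned long page_shift = 12;                  /* 4K pages assumed */
        unsigned long zone_present_pages = 262144;      /* hypothetical 1GB zone */
        unsigned long batch = 31;                       /* as if from zone_batchsize() */
        unsigned long percpu_pagelist_fraction = 8;     /* hypothetical sysctl value */
        unsigned long high, frac_batch;

        /* Default sizing, as in setup_pageset(). */
        high = 6 * batch;
        printf("default:    high=%lu batch=%lu\n", high, batch);

        /* Sizing as in setup_pagelist_highmark() when the sysctl is set. */
        high = zone_present_pages / percpu_pagelist_fraction;
        frac_batch = high / 4;
        if (frac_batch < 1)
                frac_batch = 1;
        if (frac_batch > page_shift * 8)
                frac_batch = page_shift * 8;            /* capped at 96 here */
        printf("fraction=8: high=%lu batch=%lu\n", high, frac_batch);
        return 0;
}
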
3881 3885
3882 static void __meminit setup_zone_pageset(struct zone *zone) 3886 static void __meminit setup_zone_pageset(struct zone *zone)
3883 { 3887 {
3884 int cpu; 3888 int cpu;
3885 3889
3886 zone->pageset = alloc_percpu(struct per_cpu_pageset); 3890 zone->pageset = alloc_percpu(struct per_cpu_pageset);
3887 3891
3888 for_each_possible_cpu(cpu) { 3892 for_each_possible_cpu(cpu) {
3889 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 3893 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3890 3894
3891 setup_pageset(pcp, zone_batchsize(zone)); 3895 setup_pageset(pcp, zone_batchsize(zone));
3892 3896
3893 if (percpu_pagelist_fraction) 3897 if (percpu_pagelist_fraction)
3894 setup_pagelist_highmark(pcp, 3898 setup_pagelist_highmark(pcp,
3895 (zone->present_pages / 3899 (zone->present_pages /
3896 percpu_pagelist_fraction)); 3900 percpu_pagelist_fraction));
3897 } 3901 }
3898 } 3902 }
3899 3903
3900 /* 3904 /*
3901 * Allocate per cpu pagesets and initialize them. 3905 * Allocate per cpu pagesets and initialize them.
3902 * Before this call only boot pagesets were available. 3906 * Before this call only boot pagesets were available.
3903 */ 3907 */
3904 void __init setup_per_cpu_pageset(void) 3908 void __init setup_per_cpu_pageset(void)
3905 { 3909 {
3906 struct zone *zone; 3910 struct zone *zone;
3907 3911
3908 for_each_populated_zone(zone) 3912 for_each_populated_zone(zone)
3909 setup_zone_pageset(zone); 3913 setup_zone_pageset(zone);
3910 } 3914 }
3911 3915
3912 static noinline __init_refok 3916 static noinline __init_refok
3913 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 3917 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3914 { 3918 {
3915 int i; 3919 int i;
3916 struct pglist_data *pgdat = zone->zone_pgdat; 3920 struct pglist_data *pgdat = zone->zone_pgdat;
3917 size_t alloc_size; 3921 size_t alloc_size;
3918 3922
3919 /* 3923 /*
3920 * The per-page waitqueue mechanism uses hashed waitqueues 3924 * The per-page waitqueue mechanism uses hashed waitqueues
3921 * per zone. 3925 * per zone.
3922 */ 3926 */
3923 zone->wait_table_hash_nr_entries = 3927 zone->wait_table_hash_nr_entries =
3924 wait_table_hash_nr_entries(zone_size_pages); 3928 wait_table_hash_nr_entries(zone_size_pages);
3925 zone->wait_table_bits = 3929 zone->wait_table_bits =
3926 wait_table_bits(zone->wait_table_hash_nr_entries); 3930 wait_table_bits(zone->wait_table_hash_nr_entries);
3927 alloc_size = zone->wait_table_hash_nr_entries 3931 alloc_size = zone->wait_table_hash_nr_entries
3928 * sizeof(wait_queue_head_t); 3932 * sizeof(wait_queue_head_t);
3929 3933
3930 if (!slab_is_available()) { 3934 if (!slab_is_available()) {
3931 zone->wait_table = (wait_queue_head_t *) 3935 zone->wait_table = (wait_queue_head_t *)
3932 alloc_bootmem_node_nopanic(pgdat, alloc_size); 3936 alloc_bootmem_node_nopanic(pgdat, alloc_size);
3933 } else { 3937 } else {
3934 /* 3938 /*
3935 * This case means that a zone whose size was 0 gets new memory 3939 * This case means that a zone whose size was 0 gets new memory
3936 * via memory hot-add. 3940 * via memory hot-add.
3937 * But it may be the case that a new node was hot-added. In 3941 * But it may be the case that a new node was hot-added. In
3938 * this case vmalloc() will not be able to use this new node's 3942 * this case vmalloc() will not be able to use this new node's
3939 * memory - this wait_table must be initialized to use this new 3943 * memory - this wait_table must be initialized to use this new
3940 * node itself as well. 3944 * node itself as well.
3941 * To use this new node's memory, further consideration will be 3945 * To use this new node's memory, further consideration will be
3942 * necessary. 3946 * necessary.
3943 */ 3947 */
3944 zone->wait_table = vmalloc(alloc_size); 3948 zone->wait_table = vmalloc(alloc_size);
3945 } 3949 }
3946 if (!zone->wait_table) 3950 if (!zone->wait_table)
3947 return -ENOMEM; 3951 return -ENOMEM;
3948 3952
3949 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 3953 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
3950 init_waitqueue_head(zone->wait_table + i); 3954 init_waitqueue_head(zone->wait_table + i);
3951 3955
3952 return 0; 3956 return 0;
3953 } 3957 }
3954 3958
3955 static __meminit void zone_pcp_init(struct zone *zone) 3959 static __meminit void zone_pcp_init(struct zone *zone)
3956 { 3960 {
3957 /* 3961 /*
3958 * per cpu subsystem is not up at this point. The following code 3962 * per cpu subsystem is not up at this point. The following code
3959 * relies on the ability of the linker to provide the 3963 * relies on the ability of the linker to provide the
3960 * offset of a (static) per cpu variable into the per cpu area. 3964 * offset of a (static) per cpu variable into the per cpu area.
3961 */ 3965 */
3962 zone->pageset = &boot_pageset; 3966 zone->pageset = &boot_pageset;
3963 3967
3964 if (zone->present_pages) 3968 if (zone->present_pages)
3965 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 3969 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
3966 zone->name, zone->present_pages, 3970 zone->name, zone->present_pages,
3967 zone_batchsize(zone)); 3971 zone_batchsize(zone));
3968 } 3972 }
3969 3973
3970 int __meminit init_currently_empty_zone(struct zone *zone, 3974 int __meminit init_currently_empty_zone(struct zone *zone,
3971 unsigned long zone_start_pfn, 3975 unsigned long zone_start_pfn,
3972 unsigned long size, 3976 unsigned long size,
3973 enum memmap_context context) 3977 enum memmap_context context)
3974 { 3978 {
3975 struct pglist_data *pgdat = zone->zone_pgdat; 3979 struct pglist_data *pgdat = zone->zone_pgdat;
3976 int ret; 3980 int ret;
3977 ret = zone_wait_table_init(zone, size); 3981 ret = zone_wait_table_init(zone, size);
3978 if (ret) 3982 if (ret)
3979 return ret; 3983 return ret;
3980 pgdat->nr_zones = zone_idx(zone) + 1; 3984 pgdat->nr_zones = zone_idx(zone) + 1;
3981 3985
3982 zone->zone_start_pfn = zone_start_pfn; 3986 zone->zone_start_pfn = zone_start_pfn;
3983 3987
3984 mminit_dprintk(MMINIT_TRACE, "memmap_init", 3988 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3985 "Initialising map node %d zone %lu pfns %lu -> %lu\n", 3989 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
3986 pgdat->node_id, 3990 pgdat->node_id,
3987 (unsigned long)zone_idx(zone), 3991 (unsigned long)zone_idx(zone),
3988 zone_start_pfn, (zone_start_pfn + size)); 3992 zone_start_pfn, (zone_start_pfn + size));
3989 3993
3990 zone_init_free_lists(zone); 3994 zone_init_free_lists(zone);
3991 3995
3992 return 0; 3996 return 0;
3993 } 3997 }
3994 3998
3995 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 3999 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
3996 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 4000 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
3997 /* 4001 /*
3998 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 4002 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
3999 * Architectures may implement their own version but if add_active_range() 4003 * Architectures may implement their own version but if add_active_range()
4000 * was used and there are no special requirements, this is a convenient 4004 * was used and there are no special requirements, this is a convenient
4001 * alternative 4005 * alternative
4002 */ 4006 */
4003 int __meminit __early_pfn_to_nid(unsigned long pfn) 4007 int __meminit __early_pfn_to_nid(unsigned long pfn)
4004 { 4008 {
4005 unsigned long start_pfn, end_pfn; 4009 unsigned long start_pfn, end_pfn;
4006 int i, nid; 4010 int i, nid;
4007 4011
4008 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4012 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4009 if (start_pfn <= pfn && pfn < end_pfn) 4013 if (start_pfn <= pfn && pfn < end_pfn)
4010 return nid; 4014 return nid;
4011 /* This is a memory hole */ 4015 /* This is a memory hole */
4012 return -1; 4016 return -1;
4013 } 4017 }
4014 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4018 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
4015 4019
4016 int __meminit early_pfn_to_nid(unsigned long pfn) 4020 int __meminit early_pfn_to_nid(unsigned long pfn)
4017 { 4021 {
4018 int nid; 4022 int nid;
4019 4023
4020 nid = __early_pfn_to_nid(pfn); 4024 nid = __early_pfn_to_nid(pfn);
4021 if (nid >= 0) 4025 if (nid >= 0)
4022 return nid; 4026 return nid;
4023 /* just returns 0 */ 4027 /* just returns 0 */
4024 return 0; 4028 return 0;
4025 } 4029 }
4026 4030
4027 #ifdef CONFIG_NODES_SPAN_OTHER_NODES 4031 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
4028 bool __meminit early_pfn_in_nid(unsigned long pfn, int node) 4032 bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
4029 { 4033 {
4030 int nid; 4034 int nid;
4031 4035
4032 nid = __early_pfn_to_nid(pfn); 4036 nid = __early_pfn_to_nid(pfn);
4033 if (nid >= 0 && nid != node) 4037 if (nid >= 0 && nid != node)
4034 return false; 4038 return false;
4035 return true; 4039 return true;
4036 } 4040 }
4037 #endif 4041 #endif
4038 4042
4039 /** 4043 /**
4040 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 4044 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
4041 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 4045 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
4042 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 4046 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
4043 * 4047 *
4044 * If an architecture guarantees that all ranges registered with 4048 * If an architecture guarantees that all ranges registered with
4045 * add_active_ranges() contain no holes and may be freed, 4049 * add_active_ranges() contain no holes and may be freed,
4046 * this function may be used instead of calling free_bootmem() manually. 4050 * this function may be used instead of calling free_bootmem() manually.
4047 */ 4051 */
4048 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) 4052 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
4049 { 4053 {
4050 unsigned long start_pfn, end_pfn; 4054 unsigned long start_pfn, end_pfn;
4051 int i, this_nid; 4055 int i, this_nid;
4052 4056
4053 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { 4057 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
4054 start_pfn = min(start_pfn, max_low_pfn); 4058 start_pfn = min(start_pfn, max_low_pfn);
4055 end_pfn = min(end_pfn, max_low_pfn); 4059 end_pfn = min(end_pfn, max_low_pfn);
4056 4060
4057 if (start_pfn < end_pfn) 4061 if (start_pfn < end_pfn)
4058 free_bootmem_node(NODE_DATA(this_nid), 4062 free_bootmem_node(NODE_DATA(this_nid),
4059 PFN_PHYS(start_pfn), 4063 PFN_PHYS(start_pfn),
4060 (end_pfn - start_pfn) << PAGE_SHIFT); 4064 (end_pfn - start_pfn) << PAGE_SHIFT);
4061 } 4065 }
4062 } 4066 }
4063 4067
4064 /** 4068 /**
4065 * sparse_memory_present_with_active_regions - Call memory_present for each active range 4069 * sparse_memory_present_with_active_regions - Call memory_present for each active range
4066 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 4070 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
4067 * 4071 *
4068 * If an architecture guarantees that all ranges registered with 4072 * If an architecture guarantees that all ranges registered with
4069 * add_active_ranges() contain no holes and may be freed, this 4073 * add_active_ranges() contain no holes and may be freed, this
4070 * function may be used instead of calling memory_present() manually. 4074 * function may be used instead of calling memory_present() manually.
4071 */ 4075 */
4072 void __init sparse_memory_present_with_active_regions(int nid) 4076 void __init sparse_memory_present_with_active_regions(int nid)
4073 { 4077 {
4074 unsigned long start_pfn, end_pfn; 4078 unsigned long start_pfn, end_pfn;
4075 int i, this_nid; 4079 int i, this_nid;
4076 4080
4077 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) 4081 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
4078 memory_present(this_nid, start_pfn, end_pfn); 4082 memory_present(this_nid, start_pfn, end_pfn);
4079 } 4083 }
4080 4084
4081 /** 4085 /**
4082 * get_pfn_range_for_nid - Return the start and end page frames for a node 4086 * get_pfn_range_for_nid - Return the start and end page frames for a node
4083 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 4087 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
4084 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 4088 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
4085 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 4089 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
4086 * 4090 *
4087 * It returns the start and end page frame of a node based on information 4091 * It returns the start and end page frame of a node based on information
4088 * provided by an arch calling add_active_range(). If called for a node 4092 * provided by an arch calling add_active_range(). If called for a node
4089 * with no available memory, a warning is printed and the start and end 4093 * with no available memory, a warning is printed and the start and end
4090 * PFNs will be 0. 4094 * PFNs will be 0.
4091 */ 4095 */
4092 void __meminit get_pfn_range_for_nid(unsigned int nid, 4096 void __meminit get_pfn_range_for_nid(unsigned int nid,
4093 unsigned long *start_pfn, unsigned long *end_pfn) 4097 unsigned long *start_pfn, unsigned long *end_pfn)
4094 { 4098 {
4095 unsigned long this_start_pfn, this_end_pfn; 4099 unsigned long this_start_pfn, this_end_pfn;
4096 int i; 4100 int i;
4097 4101
4098 *start_pfn = -1UL; 4102 *start_pfn = -1UL;
4099 *end_pfn = 0; 4103 *end_pfn = 0;
4100 4104
4101 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { 4105 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
4102 *start_pfn = min(*start_pfn, this_start_pfn); 4106 *start_pfn = min(*start_pfn, this_start_pfn);
4103 *end_pfn = max(*end_pfn, this_end_pfn); 4107 *end_pfn = max(*end_pfn, this_end_pfn);
4104 } 4108 }
4105 4109
4106 if (*start_pfn == -1UL) 4110 if (*start_pfn == -1UL)
4107 *start_pfn = 0; 4111 *start_pfn = 0;
4108 } 4112 }
4109 4113
4110 /* 4114 /*
4111 * This finds a zone that can be used for ZONE_MOVABLE pages. The 4115 * This finds a zone that can be used for ZONE_MOVABLE pages. The
4112 * assumption is made that zones within a node are ordered in monotonically 4116 * assumption is made that zones within a node are ordered in monotonically
4113 * increasing memory addresses so that the "highest" populated zone is used 4117 * increasing memory addresses so that the "highest" populated zone is used
4114 */ 4118 */
4115 static void __init find_usable_zone_for_movable(void) 4119 static void __init find_usable_zone_for_movable(void)
4116 { 4120 {
4117 int zone_index; 4121 int zone_index;
4118 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 4122 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
4119 if (zone_index == ZONE_MOVABLE) 4123 if (zone_index == ZONE_MOVABLE)
4120 continue; 4124 continue;
4121 4125
4122 if (arch_zone_highest_possible_pfn[zone_index] > 4126 if (arch_zone_highest_possible_pfn[zone_index] >
4123 arch_zone_lowest_possible_pfn[zone_index]) 4127 arch_zone_lowest_possible_pfn[zone_index])
4124 break; 4128 break;
4125 } 4129 }
4126 4130
4127 VM_BUG_ON(zone_index == -1); 4131 VM_BUG_ON(zone_index == -1);
4128 movable_zone = zone_index; 4132 movable_zone = zone_index;
4129 } 4133 }
4130 4134
4131 /* 4135 /*
4132 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 4136 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
4133 * because it is sized independent of architecture. Unlike the other zones, 4137 * because it is sized independent of architecture. Unlike the other zones,
4134 * the starting point for ZONE_MOVABLE is not fixed. It may be different 4138 * the starting point for ZONE_MOVABLE is not fixed. It may be different
4135 * in each node depending on the size of each node and how evenly kernelcore 4139 * in each node depending on the size of each node and how evenly kernelcore
4136 * is distributed. This helper function adjusts the zone ranges 4140 * is distributed. This helper function adjusts the zone ranges
4137 * provided by the architecture for a given node by using the end of the 4141 * provided by the architecture for a given node by using the end of the
4138 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 4142 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
4139 * zones within a node are in order of monotonically increasing memory addresses 4143 * zones within a node are in order of monotonically increasing memory addresses
4140 */ 4144 */
4141 static void __meminit adjust_zone_range_for_zone_movable(int nid, 4145 static void __meminit adjust_zone_range_for_zone_movable(int nid,
4142 unsigned long zone_type, 4146 unsigned long zone_type,
4143 unsigned long node_start_pfn, 4147 unsigned long node_start_pfn,
4144 unsigned long node_end_pfn, 4148 unsigned long node_end_pfn,
4145 unsigned long *zone_start_pfn, 4149 unsigned long *zone_start_pfn,
4146 unsigned long *zone_end_pfn) 4150 unsigned long *zone_end_pfn)
4147 { 4151 {
4148 /* Only adjust if ZONE_MOVABLE is on this node */ 4152 /* Only adjust if ZONE_MOVABLE is on this node */
4149 if (zone_movable_pfn[nid]) { 4153 if (zone_movable_pfn[nid]) {
4150 /* Size ZONE_MOVABLE */ 4154 /* Size ZONE_MOVABLE */
4151 if (zone_type == ZONE_MOVABLE) { 4155 if (zone_type == ZONE_MOVABLE) {
4152 *zone_start_pfn = zone_movable_pfn[nid]; 4156 *zone_start_pfn = zone_movable_pfn[nid];
4153 *zone_end_pfn = min(node_end_pfn, 4157 *zone_end_pfn = min(node_end_pfn,
4154 arch_zone_highest_possible_pfn[movable_zone]); 4158 arch_zone_highest_possible_pfn[movable_zone]);
4155 4159
4156 /* Adjust for ZONE_MOVABLE starting within this range */ 4160 /* Adjust for ZONE_MOVABLE starting within this range */
4157 } else if (*zone_start_pfn < zone_movable_pfn[nid] && 4161 } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
4158 *zone_end_pfn > zone_movable_pfn[nid]) { 4162 *zone_end_pfn > zone_movable_pfn[nid]) {
4159 *zone_end_pfn = zone_movable_pfn[nid]; 4163 *zone_end_pfn = zone_movable_pfn[nid];
4160 4164
4161 /* Check if this whole range is within ZONE_MOVABLE */ 4165 /* Check if this whole range is within ZONE_MOVABLE */
4162 } else if (*zone_start_pfn >= zone_movable_pfn[nid]) 4166 } else if (*zone_start_pfn >= zone_movable_pfn[nid])
4163 *zone_start_pfn = *zone_end_pfn; 4167 *zone_start_pfn = *zone_end_pfn;
4164 } 4168 }
4165 } 4169 }
4166 4170
4167 /* 4171 /*
4168 * Return the number of pages a zone spans in a node, including holes 4172 * Return the number of pages a zone spans in a node, including holes
4169 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 4173 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
4170 */ 4174 */
4171 static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4175 static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4172 unsigned long zone_type, 4176 unsigned long zone_type,
4173 unsigned long *ignored) 4177 unsigned long *ignored)
4174 { 4178 {
4175 unsigned long node_start_pfn, node_end_pfn; 4179 unsigned long node_start_pfn, node_end_pfn;
4176 unsigned long zone_start_pfn, zone_end_pfn; 4180 unsigned long zone_start_pfn, zone_end_pfn;
4177 4181
4178 /* Get the start and end of the node and zone */ 4182 /* Get the start and end of the node and zone */
4179 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 4183 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4180 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4184 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4181 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4185 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4182 adjust_zone_range_for_zone_movable(nid, zone_type, 4186 adjust_zone_range_for_zone_movable(nid, zone_type,
4183 node_start_pfn, node_end_pfn, 4187 node_start_pfn, node_end_pfn,
4184 &zone_start_pfn, &zone_end_pfn); 4188 &zone_start_pfn, &zone_end_pfn);
4185 4189
4186 /* Check that this node has pages within the zone's required range */ 4190 /* Check that this node has pages within the zone's required range */
4187 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 4191 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
4188 return 0; 4192 return 0;
4189 4193
4190 /* Move the zone boundaries inside the node if necessary */ 4194 /* Move the zone boundaries inside the node if necessary */
4191 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 4195 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
4192 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 4196 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
4193 4197
4194 /* Return the spanned pages */ 4198 /* Return the spanned pages */
4195 return zone_end_pfn - zone_start_pfn; 4199 return zone_end_pfn - zone_start_pfn;
4196 } 4200 }
4197 4201
4198 /* 4202 /*
4199 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 4203 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
4200 * then all holes in the requested range will be accounted for. 4204 * then all holes in the requested range will be accounted for.
4201 */ 4205 */
4202 unsigned long __meminit __absent_pages_in_range(int nid, 4206 unsigned long __meminit __absent_pages_in_range(int nid,
4203 unsigned long range_start_pfn, 4207 unsigned long range_start_pfn,
4204 unsigned long range_end_pfn) 4208 unsigned long range_end_pfn)
4205 { 4209 {
4206 unsigned long nr_absent = range_end_pfn - range_start_pfn; 4210 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4207 unsigned long start_pfn, end_pfn; 4211 unsigned long start_pfn, end_pfn;
4208 int i; 4212 int i;
4209 4213
4210 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4214 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4211 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); 4215 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4212 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); 4216 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4213 nr_absent -= end_pfn - start_pfn; 4217 nr_absent -= end_pfn - start_pfn;
4214 } 4218 }
4215 return nr_absent; 4219 return nr_absent;
4216 } 4220 }
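
The hole accounting above starts from the length of the whole span and subtracts each registered memory range after clamping it to that span; whatever remains is counted as holes. A self-contained illustration with two made-up ranges standing in for the memblock data:

#include <stdio.h>

struct mem_range { unsigned long start_pfn, end_pfn; };

static unsigned long clamp(unsigned long v, unsigned long lo, unsigned long hi)
{
        if (v < lo)
                return lo;
        if (v > hi)
                return hi;
        return v;
}

int main(void)
{
        /* Hypothetical registered ranges, with a hole at 0x400-0x600. */
        struct mem_range ranges[] = {
                { 0x000, 0x400 },       /* 1024 pages present */
                { 0x600, 0x800 },       /*  512 pages present */
        };
        unsigned long range_start_pfn = 0x000, range_end_pfn = 0x800;
        unsigned long nr_absent = range_end_pfn - range_start_pfn;
        unsigned int i;

        for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
                unsigned long s = clamp(ranges[i].start_pfn,
                                        range_start_pfn, range_end_pfn);
                unsigned long e = clamp(ranges[i].end_pfn,
                                        range_start_pfn, range_end_pfn);
                nr_absent -= e - s;     /* subtract the covered part */
        }
        printf("absent pages: %lu\n", nr_absent);       /* 512, the hole */
        return 0;
}
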
4217 4221
4218 /** 4222 /**
4219 * absent_pages_in_range - Return number of page frames in holes within a range 4223 * absent_pages_in_range - Return number of page frames in holes within a range
4220 * @start_pfn: The start PFN to start searching for holes 4224 * @start_pfn: The start PFN to start searching for holes
4221 * @end_pfn: The end PFN to stop searching for holes 4225 * @end_pfn: The end PFN to stop searching for holes
4222 * 4226 *
4223 * It returns the number of page frames in memory holes within a range. 4227 * It returns the number of page frames in memory holes within a range.
4224 */ 4228 */
4225 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 4229 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4226 unsigned long end_pfn) 4230 unsigned long end_pfn)
4227 { 4231 {
4228 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 4232 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
4229 } 4233 }
4230 4234
4231 /* Return the number of page frames in holes in a zone on a node */ 4235 /* Return the number of page frames in holes in a zone on a node */
4232 static unsigned long __meminit zone_absent_pages_in_node(int nid, 4236 static unsigned long __meminit zone_absent_pages_in_node(int nid,
4233 unsigned long zone_type, 4237 unsigned long zone_type,
4234 unsigned long *ignored) 4238 unsigned long *ignored)
4235 { 4239 {
4236 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4240 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4237 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4241 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4238 unsigned long node_start_pfn, node_end_pfn; 4242 unsigned long node_start_pfn, node_end_pfn;
4239 unsigned long zone_start_pfn, zone_end_pfn; 4243 unsigned long zone_start_pfn, zone_end_pfn;
4240 4244
4241 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 4245 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4242 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4246 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4243 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4247 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4244 4248
4245 adjust_zone_range_for_zone_movable(nid, zone_type, 4249 adjust_zone_range_for_zone_movable(nid, zone_type,
4246 node_start_pfn, node_end_pfn, 4250 node_start_pfn, node_end_pfn,
4247 &zone_start_pfn, &zone_end_pfn); 4251 &zone_start_pfn, &zone_end_pfn);
4248 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4252 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4249 } 4253 }
4250 4254
4251 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4255 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4252 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4256 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4253 unsigned long zone_type, 4257 unsigned long zone_type,
4254 unsigned long *zones_size) 4258 unsigned long *zones_size)
4255 { 4259 {
4256 return zones_size[zone_type]; 4260 return zones_size[zone_type];
4257 } 4261 }
4258 4262
4259 static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4263 static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4260 unsigned long zone_type, 4264 unsigned long zone_type,
4261 unsigned long *zholes_size) 4265 unsigned long *zholes_size)
4262 { 4266 {
4263 if (!zholes_size) 4267 if (!zholes_size)
4264 return 0; 4268 return 0;
4265 4269
4266 return zholes_size[zone_type]; 4270 return zholes_size[zone_type];
4267 } 4271 }
4268 4272
4269 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4273 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4270 4274
4271 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4275 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4272 unsigned long *zones_size, unsigned long *zholes_size) 4276 unsigned long *zones_size, unsigned long *zholes_size)
4273 { 4277 {
4274 unsigned long realtotalpages, totalpages = 0; 4278 unsigned long realtotalpages, totalpages = 0;
4275 enum zone_type i; 4279 enum zone_type i;
4276 4280
4277 for (i = 0; i < MAX_NR_ZONES; i++) 4281 for (i = 0; i < MAX_NR_ZONES; i++)
4278 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4282 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4279 zones_size); 4283 zones_size);
4280 pgdat->node_spanned_pages = totalpages; 4284 pgdat->node_spanned_pages = totalpages;
4281 4285
4282 realtotalpages = totalpages; 4286 realtotalpages = totalpages;
4283 for (i = 0; i < MAX_NR_ZONES; i++) 4287 for (i = 0; i < MAX_NR_ZONES; i++)
4284 realtotalpages -= 4288 realtotalpages -=
4285 zone_absent_pages_in_node(pgdat->node_id, i, 4289 zone_absent_pages_in_node(pgdat->node_id, i,
4286 zholes_size); 4290 zholes_size);
4287 pgdat->node_present_pages = realtotalpages; 4291 pgdat->node_present_pages = realtotalpages;
4288 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4292 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4289 realtotalpages); 4293 realtotalpages);
4290 } 4294 }
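To make the spanned versus present distinction computed above concrete, a quick standalone calculation with invented per-zone figures (spanned pages include holes, present pages do not):

#include <stdio.h>

int main(void)
{
        /* Hypothetical per-zone page counts for one node (invented numbers). */
        unsigned long spanned[] = { 4096, 221184, 786432 };  /* DMA, DMA32, Normal */
        unsigned long holes[]   = {  100,   8192,      0 };
        unsigned long total = 0, real = 0;

        for (int i = 0; i < 3; i++) {
                total += spanned[i];                 /* node_spanned_pages */
                real  += spanned[i] - holes[i];      /* node_present_pages */
        }
        printf("On node 0 totalpages: %lu (spanned %lu)\n", real, total);
        return 0;
}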
4291 4295
4292 #ifndef CONFIG_SPARSEMEM 4296 #ifndef CONFIG_SPARSEMEM
4293 /* 4297 /*
4294 * Calculate the size of the zone->blockflags rounded to an unsigned long 4298 * Calculate the size of the zone->blockflags rounded to an unsigned long
4295 * Start by making sure zonesize is a multiple of pageblock_order by rounding 4299 * Start by making sure zonesize is a multiple of pageblock_order by rounding
4296 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, finally 4300 * up. Then use NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
4297 * round what is now in bits to nearest long in bits, then return it in 4301 * round what is now in bits to nearest long in bits, then return it in
4298 * bytes. 4302 * bytes.
4299 */ 4303 */
4300 static unsigned long __init usemap_size(unsigned long zonesize) 4304 static unsigned long __init usemap_size(unsigned long zonesize)
4301 { 4305 {
4302 unsigned long usemapsize; 4306 unsigned long usemapsize;
4303 4307
4304 usemapsize = roundup(zonesize, pageblock_nr_pages); 4308 usemapsize = roundup(zonesize, pageblock_nr_pages);
4305 usemapsize = usemapsize >> pageblock_order; 4309 usemapsize = usemapsize >> pageblock_order;
4306 usemapsize *= NR_PAGEBLOCK_BITS; 4310 usemapsize *= NR_PAGEBLOCK_BITS;
4307 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); 4311 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
4308 4312
4309 return usemapsize / 8; 4313 return usemapsize / 8;
4310 } 4314 }
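To see the arithmetic concretely, here is a small userspace model of usemap_size(); pageblock_order = 9 and NR_PAGEBLOCK_BITS = 4 are assumed values chosen to mirror a common x86 configuration, not read from any particular kernel build:

#include <stdio.h>

#define PAGEBLOCK_ORDER     9                        /* assumed */
#define PAGEBLOCK_NR_PAGES  (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS   4                        /* assumed */

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
        return ((x + to - 1) / to) * to;
}

/* Model of usemap_size(): bits per pageblock, rounded to whole longs, in bytes. */
static unsigned long usemap_size_model(unsigned long zonesize)
{
        unsigned long usemapsize = roundup_ul(zonesize, PAGEBLOCK_NR_PAGES);

        usemapsize >>= PAGEBLOCK_ORDER;              /* number of pageblocks */
        usemapsize *= NR_PAGEBLOCK_BITS;             /* bits needed */
        usemapsize = roundup_ul(usemapsize, 8 * sizeof(unsigned long));
        return usemapsize / 8;                       /* bytes */
}

int main(void)
{
        /* A 1GiB zone of 4KiB pages spans 262144 PFNs: 512 pageblocks,
         * 2048 flag bits, i.e. 256 bytes of pageblock flags. */
        printf("%lu\n", usemap_size_model(262144));
        return 0;
}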
4311 4315
4312 static void __init setup_usemap(struct pglist_data *pgdat, 4316 static void __init setup_usemap(struct pglist_data *pgdat,
4313 struct zone *zone, unsigned long zonesize) 4317 struct zone *zone, unsigned long zonesize)
4314 { 4318 {
4315 unsigned long usemapsize = usemap_size(zonesize); 4319 unsigned long usemapsize = usemap_size(zonesize);
4316 zone->pageblock_flags = NULL; 4320 zone->pageblock_flags = NULL;
4317 if (usemapsize) 4321 if (usemapsize)
4318 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, 4322 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4319 usemapsize); 4323 usemapsize);
4320 } 4324 }
4321 #else 4325 #else
4322 static inline void setup_usemap(struct pglist_data *pgdat, 4326 static inline void setup_usemap(struct pglist_data *pgdat,
4323 struct zone *zone, unsigned long zonesize) {} 4327 struct zone *zone, unsigned long zonesize) {}
4324 #endif /* CONFIG_SPARSEMEM */ 4328 #endif /* CONFIG_SPARSEMEM */
4325 4329
4326 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4330 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4327 4331
4328 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4332 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4329 void __init set_pageblock_order(void) 4333 void __init set_pageblock_order(void)
4330 { 4334 {
4331 unsigned int order; 4335 unsigned int order;
4332 4336
4333 /* Check that pageblock_nr_pages has not already been setup */ 4337 /* Check that pageblock_nr_pages has not already been setup */
4334 if (pageblock_order) 4338 if (pageblock_order)
4335 return; 4339 return;
4336 4340
4337 if (HPAGE_SHIFT > PAGE_SHIFT) 4341 if (HPAGE_SHIFT > PAGE_SHIFT)
4338 order = HUGETLB_PAGE_ORDER; 4342 order = HUGETLB_PAGE_ORDER;
4339 else 4343 else
4340 order = MAX_ORDER - 1; 4344 order = MAX_ORDER - 1;
4341 4345
4342 /* 4346 /*
4343 * Assume the largest contiguous order of interest is a huge page. 4347 * Assume the largest contiguous order of interest is a huge page.
4344 * This value may be variable depending on boot parameters on IA64 and 4348 * This value may be variable depending on boot parameters on IA64 and
4345 * powerpc. 4349 * powerpc.
4346 */ 4350 */
4347 pageblock_order = order; 4351 pageblock_order = order;
4348 } 4352 }
4349 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4353 #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4350 4354
4351 /* 4355 /*
4352 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4356 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4353 * is unused as pageblock_order is set at compile-time. See 4357 * is unused as pageblock_order is set at compile-time. See
4354 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4358 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4355 * the kernel config 4359 * the kernel config
4356 */ 4360 */
4357 void __init set_pageblock_order(void) 4361 void __init set_pageblock_order(void)
4358 { 4362 {
4359 } 4363 }
4360 4364
4361 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4365 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4362 4366
4363 /* 4367 /*
4364 * Set up the zone data structures: 4368 * Set up the zone data structures:
4365 * - mark all pages reserved 4369 * - mark all pages reserved
4366 * - mark all memory queues empty 4370 * - mark all memory queues empty
4367 * - clear the memory bitmaps 4371 * - clear the memory bitmaps
4368 */ 4372 */
4369 static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4373 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4370 unsigned long *zones_size, unsigned long *zholes_size) 4374 unsigned long *zones_size, unsigned long *zholes_size)
4371 { 4375 {
4372 enum zone_type j; 4376 enum zone_type j;
4373 int nid = pgdat->node_id; 4377 int nid = pgdat->node_id;
4374 unsigned long zone_start_pfn = pgdat->node_start_pfn; 4378 unsigned long zone_start_pfn = pgdat->node_start_pfn;
4375 int ret; 4379 int ret;
4376 4380
4377 pgdat_resize_init(pgdat); 4381 pgdat_resize_init(pgdat);
4378 pgdat->nr_zones = 0; 4382 pgdat->nr_zones = 0;
4379 init_waitqueue_head(&pgdat->kswapd_wait); 4383 init_waitqueue_head(&pgdat->kswapd_wait);
4380 pgdat->kswapd_max_order = 0; 4384 pgdat->kswapd_max_order = 0;
4381 pgdat_page_cgroup_init(pgdat); 4385 pgdat_page_cgroup_init(pgdat);
4382 4386
4383 for (j = 0; j < MAX_NR_ZONES; j++) { 4387 for (j = 0; j < MAX_NR_ZONES; j++) {
4384 struct zone *zone = pgdat->node_zones + j; 4388 struct zone *zone = pgdat->node_zones + j;
4385 unsigned long size, realsize, memmap_pages; 4389 unsigned long size, realsize, memmap_pages;
4386 4390
4387 size = zone_spanned_pages_in_node(nid, j, zones_size); 4391 size = zone_spanned_pages_in_node(nid, j, zones_size);
4388 realsize = size - zone_absent_pages_in_node(nid, j, 4392 realsize = size - zone_absent_pages_in_node(nid, j,
4389 zholes_size); 4393 zholes_size);
4390 4394
4391 /* 4395 /*
4392 * Adjust realsize so that it accounts for how much memory 4396 * Adjust realsize so that it accounts for how much memory
4393 * is used by this zone for memmap. This affects the watermark 4397 * is used by this zone for memmap. This affects the watermark
4394 * and per-cpu initialisations 4398 * and per-cpu initialisations
4395 */ 4399 */
4396 memmap_pages = 4400 memmap_pages =
4397 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 4401 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
4398 if (realsize >= memmap_pages) { 4402 if (realsize >= memmap_pages) {
4399 realsize -= memmap_pages; 4403 realsize -= memmap_pages;
4400 if (memmap_pages) 4404 if (memmap_pages)
4401 printk(KERN_DEBUG 4405 printk(KERN_DEBUG
4402 " %s zone: %lu pages used for memmap\n", 4406 " %s zone: %lu pages used for memmap\n",
4403 zone_names[j], memmap_pages); 4407 zone_names[j], memmap_pages);
4404 } else 4408 } else
4405 printk(KERN_WARNING 4409 printk(KERN_WARNING
4406 " %s zone: %lu pages exceeds realsize %lu\n", 4410 " %s zone: %lu pages exceeds realsize %lu\n",
4407 zone_names[j], memmap_pages, realsize); 4411 zone_names[j], memmap_pages, realsize);
4408 4412
4409 /* Account for reserved pages */ 4413 /* Account for reserved pages */
4410 if (j == 0 && realsize > dma_reserve) { 4414 if (j == 0 && realsize > dma_reserve) {
4411 realsize -= dma_reserve; 4415 realsize -= dma_reserve;
4412 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4416 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4413 zone_names[0], dma_reserve); 4417 zone_names[0], dma_reserve);
4414 } 4418 }
4415 4419
4416 if (!is_highmem_idx(j)) 4420 if (!is_highmem_idx(j))
4417 nr_kernel_pages += realsize; 4421 nr_kernel_pages += realsize;
4418 nr_all_pages += realsize; 4422 nr_all_pages += realsize;
4419 4423
4420 zone->spanned_pages = size; 4424 zone->spanned_pages = size;
4421 zone->present_pages = realsize; 4425 zone->present_pages = realsize;
4422 #if defined CONFIG_COMPACTION || defined CONFIG_CMA 4426 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
4423 zone->compact_cached_free_pfn = zone->zone_start_pfn + 4427 zone->compact_cached_free_pfn = zone->zone_start_pfn +
4424 zone->spanned_pages; 4428 zone->spanned_pages;
4425 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); 4429 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
4426 #endif 4430 #endif
4427 #ifdef CONFIG_NUMA 4431 #ifdef CONFIG_NUMA
4428 zone->node = nid; 4432 zone->node = nid;
4429 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4433 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
4430 / 100; 4434 / 100;
4431 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 4435 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
4432 #endif 4436 #endif
4433 zone->name = zone_names[j]; 4437 zone->name = zone_names[j];
4434 spin_lock_init(&zone->lock); 4438 spin_lock_init(&zone->lock);
4435 spin_lock_init(&zone->lru_lock); 4439 spin_lock_init(&zone->lru_lock);
4436 zone_seqlock_init(zone); 4440 zone_seqlock_init(zone);
4437 zone->zone_pgdat = pgdat; 4441 zone->zone_pgdat = pgdat;
4438 4442
4439 zone_pcp_init(zone); 4443 zone_pcp_init(zone);
4440 lruvec_init(&zone->lruvec, zone); 4444 lruvec_init(&zone->lruvec, zone);
4441 zap_zone_vm_stats(zone); 4445 zap_zone_vm_stats(zone);
4442 zone->flags = 0; 4446 zone->flags = 0;
4443 #ifdef CONFIG_MEMORY_ISOLATION 4447 #ifdef CONFIG_MEMORY_ISOLATION
4444 zone->nr_pageblock_isolate = 0; 4448 zone->nr_pageblock_isolate = 0;
4445 #endif 4449 #endif
4446 if (!size) 4450 if (!size)
4447 continue; 4451 continue;
4448 4452
4449 set_pageblock_order(); 4453 set_pageblock_order();
4450 setup_usemap(pgdat, zone, size); 4454 setup_usemap(pgdat, zone, size);
4451 ret = init_currently_empty_zone(zone, zone_start_pfn, 4455 ret = init_currently_empty_zone(zone, zone_start_pfn,
4452 size, MEMMAP_EARLY); 4456 size, MEMMAP_EARLY);
4453 BUG_ON(ret); 4457 BUG_ON(ret);
4454 memmap_init(size, nid, j, zone_start_pfn); 4458 memmap_init(size, nid, j, zone_start_pfn);
4455 zone_start_pfn += size; 4459 zone_start_pfn += size;
4456 } 4460 }
4457 } 4461 }
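The memmap adjustment in free_area_init_core() can be sanity-checked with a back-of-the-envelope calculation; the 64-byte sizeof(struct page) used below is an assumption, since the real size depends on the kernel configuration:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_ALIGN_UL(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long size = 262144;                 /* 1GiB zone in 4KiB pages */
        unsigned long struct_page_size = 64;         /* assumed; config dependent */
        unsigned long memmap_pages =
                PAGE_ALIGN_UL(size * struct_page_size) >> PAGE_SHIFT;

        /* roughly 1.5% of the zone is consumed by its own struct page array */
        printf("%lu pages used for memmap (%.2f%% of the zone)\n",
               memmap_pages, 100.0 * memmap_pages / size);
        return 0;
}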
4458 4462
4459 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) 4463 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4460 { 4464 {
4461 /* Skip empty nodes */ 4465 /* Skip empty nodes */
4462 if (!pgdat->node_spanned_pages) 4466 if (!pgdat->node_spanned_pages)
4463 return; 4467 return;
4464 4468
4465 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4469 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4466 /* ia64 gets its own node_mem_map, before this, without bootmem */ 4470 /* ia64 gets its own node_mem_map, before this, without bootmem */
4467 if (!pgdat->node_mem_map) { 4471 if (!pgdat->node_mem_map) {
4468 unsigned long size, start, end; 4472 unsigned long size, start, end;
4469 struct page *map; 4473 struct page *map;
4470 4474
4471 /* 4475 /*
4472 * The zone's endpoints aren't required to be MAX_ORDER 4476 * The zone's endpoints aren't required to be MAX_ORDER
4473 * aligned but the node_mem_map endpoints must be in order 4477 * aligned but the node_mem_map endpoints must be in order
4474 * for the buddy allocator to function correctly. 4478 * for the buddy allocator to function correctly.
4475 */ 4479 */
4476 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4480 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4477 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 4481 end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
4478 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4482 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4479 size = (end - start) * sizeof(struct page); 4483 size = (end - start) * sizeof(struct page);
4480 map = alloc_remap(pgdat->node_id, size); 4484 map = alloc_remap(pgdat->node_id, size);
4481 if (!map) 4485 if (!map)
4482 map = alloc_bootmem_node_nopanic(pgdat, size); 4486 map = alloc_bootmem_node_nopanic(pgdat, size);
4483 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4487 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4484 } 4488 }
4485 #ifndef CONFIG_NEED_MULTIPLE_NODES 4489 #ifndef CONFIG_NEED_MULTIPLE_NODES
4486 /* 4490 /*
4487 * With no DISCONTIG, the global mem_map is just set as node 0's 4491 * With no DISCONTIG, the global mem_map is just set as node 0's
4488 */ 4492 */
4489 if (pgdat == NODE_DATA(0)) { 4493 if (pgdat == NODE_DATA(0)) {
4490 mem_map = NODE_DATA(0)->node_mem_map; 4494 mem_map = NODE_DATA(0)->node_mem_map;
4491 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4495 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4492 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4496 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4493 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4497 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4494 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4498 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4495 } 4499 }
4496 #endif 4500 #endif
4497 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4501 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
4498 } 4502 }
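A small model of the endpoint widening performed above: the node_mem_map is stretched outwards to MAX_ORDER boundaries even though the node itself need not be aligned. MAX_ORDER_NR_PAGES = 1024 and a 64-byte struct page are assumptions made only for this illustration:

#include <stdio.h>

#define MAX_ORDER_NR_PAGES 1024UL       /* assumed: MAX_ORDER 11 -> 2^10 pages */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        /* Invented node: starts at PFN 0x1234 and spans 300000 pages. */
        unsigned long node_start_pfn = 0x1234, node_spanned_pages = 300000;
        unsigned long struct_page_size = 64;    /* assumed; config dependent */

        unsigned long start = node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
        unsigned long end   = ALIGN_UP(node_start_pfn + node_spanned_pages,
                                       MAX_ORDER_NR_PAGES);
        unsigned long size  = (end - start) * struct_page_size;

        printf("mem_map covers PFNs [%#lx, %#lx): %lu bytes\n", start, end, size);
        return 0;
}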
4499 4503
4500 void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 4504 void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4501 unsigned long node_start_pfn, unsigned long *zholes_size) 4505 unsigned long node_start_pfn, unsigned long *zholes_size)
4502 { 4506 {
4503 pg_data_t *pgdat = NODE_DATA(nid); 4507 pg_data_t *pgdat = NODE_DATA(nid);
4504 4508
4505 pgdat->node_id = nid; 4509 pgdat->node_id = nid;
4506 pgdat->node_start_pfn = node_start_pfn; 4510 pgdat->node_start_pfn = node_start_pfn;
4507 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4511 calculate_node_totalpages(pgdat, zones_size, zholes_size);
4508 4512
4509 alloc_node_mem_map(pgdat); 4513 alloc_node_mem_map(pgdat);
4510 #ifdef CONFIG_FLAT_NODE_MEM_MAP 4514 #ifdef CONFIG_FLAT_NODE_MEM_MAP
4511 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", 4515 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
4512 nid, (unsigned long)pgdat, 4516 nid, (unsigned long)pgdat,
4513 (unsigned long)pgdat->node_mem_map); 4517 (unsigned long)pgdat->node_mem_map);
4514 #endif 4518 #endif
4515 4519
4516 free_area_init_core(pgdat, zones_size, zholes_size); 4520 free_area_init_core(pgdat, zones_size, zholes_size);
4517 } 4521 }
4518 4522
4519 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4523 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4520 4524
4521 #if MAX_NUMNODES > 1 4525 #if MAX_NUMNODES > 1
4522 /* 4526 /*
4523 * Figure out the number of possible node ids. 4527 * Figure out the number of possible node ids.
4524 */ 4528 */
4525 static void __init setup_nr_node_ids(void) 4529 static void __init setup_nr_node_ids(void)
4526 { 4530 {
4527 unsigned int node; 4531 unsigned int node;
4528 unsigned int highest = 0; 4532 unsigned int highest = 0;
4529 4533
4530 for_each_node_mask(node, node_possible_map) 4534 for_each_node_mask(node, node_possible_map)
4531 highest = node; 4535 highest = node;
4532 nr_node_ids = highest + 1; 4536 nr_node_ids = highest + 1;
4533 } 4537 }
4534 #else 4538 #else
4535 static inline void setup_nr_node_ids(void) 4539 static inline void setup_nr_node_ids(void)
4536 { 4540 {
4537 } 4541 }
4538 #endif 4542 #endif
4539 4543
4540 /** 4544 /**
4541 * node_map_pfn_alignment - determine the maximum internode alignment 4545 * node_map_pfn_alignment - determine the maximum internode alignment
4542 * 4546 *
4543 * This function should be called after node map is populated and sorted. 4547 * This function should be called after node map is populated and sorted.
4544 * It calculates the maximum power of two alignment which can distinguish 4548 * It calculates the maximum power of two alignment which can distinguish
4545 * all the nodes. 4549 * all the nodes.
4546 * 4550 *
4547 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value 4551 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4548 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the 4552 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4549 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is 4553 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4550 * shifted, 1GiB is enough and this function will indicate so. 4554 * shifted, 1GiB is enough and this function will indicate so.
4551 * 4555 *
4552 * This is used to test whether pfn -> nid mapping of the chosen memory 4556 * This is used to test whether pfn -> nid mapping of the chosen memory
4553 * model has fine enough granularity to avoid incorrect mapping for the 4557 * model has fine enough granularity to avoid incorrect mapping for the
4554 * populated node map. 4558 * populated node map.
4555 * 4559 *
4556 * Returns the determined alignment in pfn's. 0 if there is no alignment 4560 * Returns the determined alignment in pfn's. 0 if there is no alignment
4557 * requirement (single node). 4561 * requirement (single node).
4558 */ 4562 */
4559 unsigned long __init node_map_pfn_alignment(void) 4563 unsigned long __init node_map_pfn_alignment(void)
4560 { 4564 {
4561 unsigned long accl_mask = 0, last_end = 0; 4565 unsigned long accl_mask = 0, last_end = 0;
4562 unsigned long start, end, mask; 4566 unsigned long start, end, mask;
4563 int last_nid = -1; 4567 int last_nid = -1;
4564 int i, nid; 4568 int i, nid;
4565 4569
4566 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { 4570 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
4567 if (!start || last_nid < 0 || last_nid == nid) { 4571 if (!start || last_nid < 0 || last_nid == nid) {
4568 last_nid = nid; 4572 last_nid = nid;
4569 last_end = end; 4573 last_end = end;
4570 continue; 4574 continue;
4571 } 4575 }
4572 4576
4573 /* 4577 /*
4574 * Start with a mask granular enough to pin-point to the 4578 * Start with a mask granular enough to pin-point to the
4575 * start pfn and tick off bits one-by-one until it becomes 4579 * start pfn and tick off bits one-by-one until it becomes
4576 * too coarse to separate the current node from the last. 4580 * too coarse to separate the current node from the last.
4577 */ 4581 */
4578 mask = ~((1 << __ffs(start)) - 1); 4582 mask = ~((1 << __ffs(start)) - 1);
4579 while (mask && last_end <= (start & (mask << 1))) 4583 while (mask && last_end <= (start & (mask << 1)))
4580 mask <<= 1; 4584 mask <<= 1;
4581 4585
4582 /* accumulate all internode masks */ 4586 /* accumulate all internode masks */
4583 accl_mask |= mask; 4587 accl_mask |= mask;
4584 } 4588 }
4585 4589
4586 /* convert mask to number of pages */ 4590 /* convert mask to number of pages */
4587 return ~accl_mask + 1; 4591 return ~accl_mask + 1;
4588 } 4592 }
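The mask trick is easier to follow with concrete numbers. Below is a userspace model of the same loop; the two memory ranges are invented, and __builtin_ctzl stands in for the kernel's __ffs():

#include <stdio.h>

struct memrange { unsigned long start, end; int nid; };

/* Userspace model of node_map_pfn_alignment(). */
static unsigned long pfn_alignment(const struct memrange *r, int n)
{
        unsigned long accl_mask = 0, last_end = 0;
        int last_nid = -1;

        for (int i = 0; i < n; i++) {
                unsigned long start = r[i].start, mask;

                if (!start || last_nid < 0 || last_nid == r[i].nid) {
                        last_nid = r[i].nid;
                        last_end = r[i].end;
                        continue;
                }

                /* finest mask that pinpoints 'start', coarsened until it can
                 * no longer separate this node from the previous one */
                mask = ~((1UL << __builtin_ctzl(start)) - 1);
                while (mask && last_end <= (start & (mask << 1)))
                        mask <<= 1;

                accl_mask |= mask;
        }
        return ~accl_mask + 1;          /* alignment in pages */
}

int main(void)
{
        /* Two invented 1GiB nodes (4KiB pages) meeting at PFN 0x40000. */
        struct memrange map[] = {
                { 0x00000, 0x40000, 0 },
                { 0x40000, 0x80000, 1 },
        };
        printf("alignment: %#lx pages\n", pfn_alignment(map, 2)); /* 0x40000 */
        return 0;
}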
4589 4593
4590 /* Find the lowest pfn for a node */ 4594 /* Find the lowest pfn for a node */
4591 static unsigned long __init find_min_pfn_for_node(int nid) 4595 static unsigned long __init find_min_pfn_for_node(int nid)
4592 { 4596 {
4593 unsigned long min_pfn = ULONG_MAX; 4597 unsigned long min_pfn = ULONG_MAX;
4594 unsigned long start_pfn; 4598 unsigned long start_pfn;
4595 int i; 4599 int i;
4596 4600
4597 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) 4601 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
4598 min_pfn = min(min_pfn, start_pfn); 4602 min_pfn = min(min_pfn, start_pfn);
4599 4603
4600 if (min_pfn == ULONG_MAX) { 4604 if (min_pfn == ULONG_MAX) {
4601 printk(KERN_WARNING 4605 printk(KERN_WARNING
4602 "Could not find start_pfn for node %d\n", nid); 4606 "Could not find start_pfn for node %d\n", nid);
4603 return 0; 4607 return 0;
4604 } 4608 }
4605 4609
4606 return min_pfn; 4610 return min_pfn;
4607 } 4611 }
4608 4612
4609 /** 4613 /**
4610 * find_min_pfn_with_active_regions - Find the minimum PFN registered 4614 * find_min_pfn_with_active_regions - Find the minimum PFN registered
4611 * 4615 *
4612 * It returns the minimum PFN based on information provided via 4616 * It returns the minimum PFN based on information provided via
4613 * add_active_range(). 4617 * add_active_range().
4614 */ 4618 */
4615 unsigned long __init find_min_pfn_with_active_regions(void) 4619 unsigned long __init find_min_pfn_with_active_regions(void)
4616 { 4620 {
4617 return find_min_pfn_for_node(MAX_NUMNODES); 4621 return find_min_pfn_for_node(MAX_NUMNODES);
4618 } 4622 }
4619 4623
4620 /* 4624 /*
4621 * early_calculate_totalpages() 4625 * early_calculate_totalpages()
4622 * Sum pages in active regions for movable zone. 4626 * Sum pages in active regions for movable zone.
4623 * Populate N_HIGH_MEMORY for calculating usable_nodes. 4627 * Populate N_HIGH_MEMORY for calculating usable_nodes.
4624 */ 4628 */
4625 static unsigned long __init early_calculate_totalpages(void) 4629 static unsigned long __init early_calculate_totalpages(void)
4626 { 4630 {
4627 unsigned long totalpages = 0; 4631 unsigned long totalpages = 0;
4628 unsigned long start_pfn, end_pfn; 4632 unsigned long start_pfn, end_pfn;
4629 int i, nid; 4633 int i, nid;
4630 4634
4631 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { 4635 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
4632 unsigned long pages = end_pfn - start_pfn; 4636 unsigned long pages = end_pfn - start_pfn;
4633 4637
4634 totalpages += pages; 4638 totalpages += pages;
4635 if (pages) 4639 if (pages)
4636 node_set_state(nid, N_HIGH_MEMORY); 4640 node_set_state(nid, N_HIGH_MEMORY);
4637 } 4641 }
4638 return totalpages; 4642 return totalpages;
4639 } 4643 }
4640 4644
4641 /* 4645 /*
4642 * Find the PFN the Movable zone begins in each node. Kernel memory 4646 * Find the PFN the Movable zone begins in each node. Kernel memory
4643 * is spread evenly between nodes as long as the nodes have enough 4647 * is spread evenly between nodes as long as the nodes have enough
4644 * memory. When they don't, some nodes will have more kernelcore than 4648 * memory. When they don't, some nodes will have more kernelcore than
4645 * others 4649 * others
4646 */ 4650 */
4647 static void __init find_zone_movable_pfns_for_nodes(void) 4651 static void __init find_zone_movable_pfns_for_nodes(void)
4648 { 4652 {
4649 int i, nid; 4653 int i, nid;
4650 unsigned long usable_startpfn; 4654 unsigned long usable_startpfn;
4651 unsigned long kernelcore_node, kernelcore_remaining; 4655 unsigned long kernelcore_node, kernelcore_remaining;
4652 /* save the state before borrow the nodemask */ 4656 /* save the state before borrow the nodemask */
4653 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; 4657 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
4654 unsigned long totalpages = early_calculate_totalpages(); 4658 unsigned long totalpages = early_calculate_totalpages();
4655 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 4659 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
4656 4660
4657 /* 4661 /*
4658 * If movablecore was specified, calculate what size of 4662 * If movablecore was specified, calculate what size of
4659 * kernelcore that corresponds so that memory usable for 4663 * kernelcore that corresponds so that memory usable for
4660 * any allocation type is evenly spread. If both kernelcore 4664 * any allocation type is evenly spread. If both kernelcore
4661 * and movablecore are specified, then the value of kernelcore 4665 * and movablecore are specified, then the value of kernelcore
4662 * will be used for required_kernelcore if it's greater than 4666 * will be used for required_kernelcore if it's greater than
4663 * what movablecore would have allowed. 4667 * what movablecore would have allowed.
4664 */ 4668 */
4665 if (required_movablecore) { 4669 if (required_movablecore) {
4666 unsigned long corepages; 4670 unsigned long corepages;
4667 4671
4668 /* 4672 /*
4669 * Round-up so that ZONE_MOVABLE is at least as large as what 4673 * Round-up so that ZONE_MOVABLE is at least as large as what
4670 * was requested by the user 4674 * was requested by the user
4671 */ 4675 */
4672 required_movablecore = 4676 required_movablecore =
4673 roundup(required_movablecore, MAX_ORDER_NR_PAGES); 4677 roundup(required_movablecore, MAX_ORDER_NR_PAGES);
4674 corepages = totalpages - required_movablecore; 4678 corepages = totalpages - required_movablecore;
4675 4679
4676 required_kernelcore = max(required_kernelcore, corepages); 4680 required_kernelcore = max(required_kernelcore, corepages);
4677 } 4681 }
4678 4682
4679 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 4683 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
4680 if (!required_kernelcore) 4684 if (!required_kernelcore)
4681 goto out; 4685 goto out;
4682 4686
4683 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 4687 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
4684 find_usable_zone_for_movable(); 4688 find_usable_zone_for_movable();
4685 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 4689 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
4686 4690
4687 restart: 4691 restart:
4688 /* Spread kernelcore memory as evenly as possible throughout nodes */ 4692 /* Spread kernelcore memory as evenly as possible throughout nodes */
4689 kernelcore_node = required_kernelcore / usable_nodes; 4693 kernelcore_node = required_kernelcore / usable_nodes;
4690 for_each_node_state(nid, N_HIGH_MEMORY) { 4694 for_each_node_state(nid, N_HIGH_MEMORY) {
4691 unsigned long start_pfn, end_pfn; 4695 unsigned long start_pfn, end_pfn;
4692 4696
4693 /* 4697 /*
4694 * Recalculate kernelcore_node if the division per node 4698 * Recalculate kernelcore_node if the division per node
4695 * now exceeds what is necessary to satisfy the requested 4699 * now exceeds what is necessary to satisfy the requested
4696 * amount of memory for the kernel 4700 * amount of memory for the kernel
4697 */ 4701 */
4698 if (required_kernelcore < kernelcore_node) 4702 if (required_kernelcore < kernelcore_node)
4699 kernelcore_node = required_kernelcore / usable_nodes; 4703 kernelcore_node = required_kernelcore / usable_nodes;
4700 4704
4701 /* 4705 /*
4702 * As the map is walked, we track how much memory is usable 4706 * As the map is walked, we track how much memory is usable
4703 * by the kernel using kernelcore_remaining. When it is 4707 * by the kernel using kernelcore_remaining. When it is
4704 * 0, the rest of the node is usable by ZONE_MOVABLE 4708 * 0, the rest of the node is usable by ZONE_MOVABLE
4705 */ 4709 */
4706 kernelcore_remaining = kernelcore_node; 4710 kernelcore_remaining = kernelcore_node;
4707 4711
4708 /* Go through each range of PFNs within this node */ 4712 /* Go through each range of PFNs within this node */
4709 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4713 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4710 unsigned long size_pages; 4714 unsigned long size_pages;
4711 4715
4712 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 4716 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
4713 if (start_pfn >= end_pfn) 4717 if (start_pfn >= end_pfn)
4714 continue; 4718 continue;
4715 4719
4716 /* Account for what is only usable for kernelcore */ 4720 /* Account for what is only usable for kernelcore */
4717 if (start_pfn < usable_startpfn) { 4721 if (start_pfn < usable_startpfn) {
4718 unsigned long kernel_pages; 4722 unsigned long kernel_pages;
4719 kernel_pages = min(end_pfn, usable_startpfn) 4723 kernel_pages = min(end_pfn, usable_startpfn)
4720 - start_pfn; 4724 - start_pfn;
4721 4725
4722 kernelcore_remaining -= min(kernel_pages, 4726 kernelcore_remaining -= min(kernel_pages,
4723 kernelcore_remaining); 4727 kernelcore_remaining);
4724 required_kernelcore -= min(kernel_pages, 4728 required_kernelcore -= min(kernel_pages,
4725 required_kernelcore); 4729 required_kernelcore);
4726 4730
4727 /* Continue if range is now fully accounted */ 4731 /* Continue if range is now fully accounted */
4728 if (end_pfn <= usable_startpfn) { 4732 if (end_pfn <= usable_startpfn) {
4729 4733
4730 /* 4734 /*
4731 * Push zone_movable_pfn to the end so 4735 * Push zone_movable_pfn to the end so
4732 * that if we have to rebalance 4736 * that if we have to rebalance
4733 * kernelcore across nodes, we will 4737 * kernelcore across nodes, we will
4734 * not double account here 4738 * not double account here
4735 */ 4739 */
4736 zone_movable_pfn[nid] = end_pfn; 4740 zone_movable_pfn[nid] = end_pfn;
4737 continue; 4741 continue;
4738 } 4742 }
4739 start_pfn = usable_startpfn; 4743 start_pfn = usable_startpfn;
4740 } 4744 }
4741 4745
4742 /* 4746 /*
4743 * The usable PFN range for ZONE_MOVABLE is from 4747 * The usable PFN range for ZONE_MOVABLE is from
4744 * start_pfn->end_pfn. Calculate size_pages as the 4748 * start_pfn->end_pfn. Calculate size_pages as the
4745 * number of pages used as kernelcore 4749 * number of pages used as kernelcore
4746 */ 4750 */
4747 size_pages = end_pfn - start_pfn; 4751 size_pages = end_pfn - start_pfn;
4748 if (size_pages > kernelcore_remaining) 4752 if (size_pages > kernelcore_remaining)
4749 size_pages = kernelcore_remaining; 4753 size_pages = kernelcore_remaining;
4750 zone_movable_pfn[nid] = start_pfn + size_pages; 4754 zone_movable_pfn[nid] = start_pfn + size_pages;
4751 4755
4752 /* 4756 /*
4753 * Some kernelcore has been met, update counts and 4757 * Some kernelcore has been met, update counts and
4754 * break if the kernelcore for this node has been 4758 * break if the kernelcore for this node has been
4755 * satisfied 4759 * satisfied
4756 */ 4760 */
4757 required_kernelcore -= min(required_kernelcore, 4761 required_kernelcore -= min(required_kernelcore,
4758 size_pages); 4762 size_pages);
4759 kernelcore_remaining -= size_pages; 4763 kernelcore_remaining -= size_pages;
4760 if (!kernelcore_remaining) 4764 if (!kernelcore_remaining)
4761 break; 4765 break;
4762 } 4766 }
4763 } 4767 }
4764 4768
4765 /* 4769 /*
4766 * If there is still required_kernelcore, we do another pass with one 4770 * If there is still required_kernelcore, we do another pass with one
4767 * less node in the count. This will push zone_movable_pfn[nid] further 4771 * less node in the count. This will push zone_movable_pfn[nid] further
4768 * along on the nodes that still have memory until kernelcore is 4772 * along on the nodes that still have memory until kernelcore is
4769 * satisfied 4773 * satisfied
4770 */ 4774 */
4771 usable_nodes--; 4775 usable_nodes--;
4772 if (usable_nodes && required_kernelcore > usable_nodes) 4776 if (usable_nodes && required_kernelcore > usable_nodes)
4773 goto restart; 4777 goto restart;
4774 4778
4775 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 4779 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
4776 for (nid = 0; nid < MAX_NUMNODES; nid++) 4780 for (nid = 0; nid < MAX_NUMNODES; nid++)
4777 zone_movable_pfn[nid] = 4781 zone_movable_pfn[nid] =
4778 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 4782 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4779 4783
4780 out: 4784 out:
4781 /* restore the node_state */ 4785 /* restore the node_state */
4782 node_states[N_HIGH_MEMORY] = saved_node_state; 4786 node_states[N_HIGH_MEMORY] = saved_node_state;
4783 } 4787 }
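A rough arithmetic model of the even-spreading loop above (not the kernel code itself): each pass hands every usable node an equal share of the outstanding kernelcore, a node too small to hold its share keeps only what it has, and the shortfall is redistributed over one fewer node on the next pass. All page counts below are invented:

#include <stdio.h>

int main(void)
{
        /* Invented layout: kernelcore=2G (524288 pages of 4KiB) requested
         * across three nodes, one of which has only 65536 pages. */
        unsigned long node_pages[] = { 1048576, 1048576, 65536 };
        unsigned long required_kernelcore = 524288;
        int usable_nodes = 3;

        while (required_kernelcore && usable_nodes) {
                unsigned long share = required_kernelcore / usable_nodes;

                for (int nid = 0; nid < 3; nid++) {
                        unsigned long got = node_pages[nid] < share ?
                                            node_pages[nid] : share;

                        node_pages[nid] -= got;
                        required_kernelcore -= got < required_kernelcore ?
                                               got : required_kernelcore;
                }
                /* mirrors the restart label: retry with one less node */
                usable_nodes--;
                printf("kernelcore still unsatisfied: %lu pages\n",
                       required_kernelcore);
        }
        return 0;
}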
4784 4788
4785 /* Any regular memory on that node ? */ 4789 /* Any regular memory on that node ? */
4786 static void __init check_for_regular_memory(pg_data_t *pgdat) 4790 static void __init check_for_regular_memory(pg_data_t *pgdat)
4787 { 4791 {
4788 #ifdef CONFIG_HIGHMEM 4792 #ifdef CONFIG_HIGHMEM
4789 enum zone_type zone_type; 4793 enum zone_type zone_type;
4790 4794
4791 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { 4795 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
4792 struct zone *zone = &pgdat->node_zones[zone_type]; 4796 struct zone *zone = &pgdat->node_zones[zone_type];
4793 if (zone->present_pages) { 4797 if (zone->present_pages) {
4794 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); 4798 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
4795 break; 4799 break;
4796 } 4800 }
4797 } 4801 }
4798 #endif 4802 #endif
4799 } 4803 }
4800 4804
4801 /** 4805 /**
4802 * free_area_init_nodes - Initialise all pg_data_t and zone data 4806 * free_area_init_nodes - Initialise all pg_data_t and zone data
4803 * @max_zone_pfn: an array of max PFNs for each zone 4807 * @max_zone_pfn: an array of max PFNs for each zone
4804 * 4808 *
4805 * This will call free_area_init_node() for each active node in the system. 4809 * This will call free_area_init_node() for each active node in the system.
4806 * Using the page ranges provided by add_active_range(), the size of each 4810 * Using the page ranges provided by add_active_range(), the size of each
4807 * zone in each node and their holes is calculated. If the maximum PFN 4811 * zone in each node and their holes is calculated. If the maximum PFN
4808 * between two adjacent zones match, it is assumed that the zone is empty. 4812 * between two adjacent zones match, it is assumed that the zone is empty.
4809 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 4813 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
4810 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 4814 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
4811 * starts where the previous one ended. For example, ZONE_DMA32 starts 4815 * starts where the previous one ended. For example, ZONE_DMA32 starts
4812 * at arch_max_dma_pfn. 4816 * at arch_max_dma_pfn.
4813 */ 4817 */
4814 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 4818 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4815 { 4819 {
4816 unsigned long start_pfn, end_pfn; 4820 unsigned long start_pfn, end_pfn;
4817 int i, nid; 4821 int i, nid;
4818 4822
4819 /* Record where the zone boundaries are */ 4823 /* Record where the zone boundaries are */
4820 memset(arch_zone_lowest_possible_pfn, 0, 4824 memset(arch_zone_lowest_possible_pfn, 0,
4821 sizeof(arch_zone_lowest_possible_pfn)); 4825 sizeof(arch_zone_lowest_possible_pfn));
4822 memset(arch_zone_highest_possible_pfn, 0, 4826 memset(arch_zone_highest_possible_pfn, 0,
4823 sizeof(arch_zone_highest_possible_pfn)); 4827 sizeof(arch_zone_highest_possible_pfn));
4824 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 4828 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
4825 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 4829 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
4826 for (i = 1; i < MAX_NR_ZONES; i++) { 4830 for (i = 1; i < MAX_NR_ZONES; i++) {
4827 if (i == ZONE_MOVABLE) 4831 if (i == ZONE_MOVABLE)
4828 continue; 4832 continue;
4829 arch_zone_lowest_possible_pfn[i] = 4833 arch_zone_lowest_possible_pfn[i] =
4830 arch_zone_highest_possible_pfn[i-1]; 4834 arch_zone_highest_possible_pfn[i-1];
4831 arch_zone_highest_possible_pfn[i] = 4835 arch_zone_highest_possible_pfn[i] =
4832 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 4836 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
4833 } 4837 }
4834 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 4838 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
4835 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 4839 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
4836 4840
4837 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 4841 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
4838 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 4842 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
4839 find_zone_movable_pfns_for_nodes(); 4843 find_zone_movable_pfns_for_nodes();
4840 4844
4841 /* Print out the zone ranges */ 4845 /* Print out the zone ranges */
4842 printk("Zone ranges:\n"); 4846 printk("Zone ranges:\n");
4843 for (i = 0; i < MAX_NR_ZONES; i++) { 4847 for (i = 0; i < MAX_NR_ZONES; i++) {
4844 if (i == ZONE_MOVABLE) 4848 if (i == ZONE_MOVABLE)
4845 continue; 4849 continue;
4846 printk(KERN_CONT " %-8s ", zone_names[i]); 4850 printk(KERN_CONT " %-8s ", zone_names[i]);
4847 if (arch_zone_lowest_possible_pfn[i] == 4851 if (arch_zone_lowest_possible_pfn[i] ==
4848 arch_zone_highest_possible_pfn[i]) 4852 arch_zone_highest_possible_pfn[i])
4849 printk(KERN_CONT "empty\n"); 4853 printk(KERN_CONT "empty\n");
4850 else 4854 else
4851 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 4855 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
4852 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 4856 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
4853 (arch_zone_highest_possible_pfn[i] 4857 (arch_zone_highest_possible_pfn[i]
4854 << PAGE_SHIFT) - 1); 4858 << PAGE_SHIFT) - 1);
4855 } 4859 }
4856 4860
4857 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 4861 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
4858 printk("Movable zone start for each node\n"); 4862 printk("Movable zone start for each node\n");
4859 for (i = 0; i < MAX_NUMNODES; i++) { 4863 for (i = 0; i < MAX_NUMNODES; i++) {
4860 if (zone_movable_pfn[i]) 4864 if (zone_movable_pfn[i])
4861 printk(" Node %d: %#010lx\n", i, 4865 printk(" Node %d: %#010lx\n", i,
4862 zone_movable_pfn[i] << PAGE_SHIFT); 4866 zone_movable_pfn[i] << PAGE_SHIFT);
4863 } 4867 }
4864 4868
4865 /* Print out the early_node_map[] */ 4869 /* Print out the early_node_map[] */
4866 printk("Early memory node ranges\n"); 4870 printk("Early memory node ranges\n");
4867 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4871 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4868 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 4872 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
4869 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 4873 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
4870 4874
4871 /* Initialise every node */ 4875 /* Initialise every node */
4872 mminit_verify_pageflags_layout(); 4876 mminit_verify_pageflags_layout();
4873 setup_nr_node_ids(); 4877 setup_nr_node_ids();
4874 for_each_online_node(nid) { 4878 for_each_online_node(nid) {
4875 pg_data_t *pgdat = NODE_DATA(nid); 4879 pg_data_t *pgdat = NODE_DATA(nid);
4876 free_area_init_node(nid, NULL, 4880 free_area_init_node(nid, NULL,
4877 find_min_pfn_for_node(nid), NULL); 4881 find_min_pfn_for_node(nid), NULL);
4878 4882
4879 /* Any memory on that node */ 4883 /* Any memory on that node */
4880 if (pgdat->node_present_pages) 4884 if (pgdat->node_present_pages)
4881 node_set_state(nid, N_HIGH_MEMORY); 4885 node_set_state(nid, N_HIGH_MEMORY);
4882 check_for_regular_memory(pgdat); 4886 check_for_regular_memory(pgdat);
4883 } 4887 }
4884 } 4888 }
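free_area_init_nodes() derives each zone's span purely from the cumulative max_zone_pfn[] array: every zone starts where the previous one ended, and matching endpoints mean an empty zone. A simplified model of that derivation (it ignores the ZONE_MOVABLE carving; the x86_64-style boundaries and the first usable PFN are invented):

#include <stdio.h>

int main(void)
{
        /* Invented max_zone_pfn[]: DMA up to 16MiB, DMA32 up to 4GiB,
         * Normal up to 8GiB (PFNs of 4KiB pages). */
        const char *names[] = { "DMA", "DMA32", "Normal" };
        unsigned long max_zone_pfn[] = { 0x1000, 0x100000, 0x200000 };
        unsigned long lowest[3], highest[3];
        unsigned long min_pfn = 0x10;           /* first usable PFN, invented */

        lowest[0] = min_pfn;
        highest[0] = max_zone_pfn[0];
        for (int i = 1; i < 3; i++) {
                lowest[i] = highest[i - 1];     /* starts where the last ended */
                highest[i] = max_zone_pfn[i] > lowest[i] ?
                             max_zone_pfn[i] : lowest[i];
        }

        for (int i = 0; i < 3; i++)
                printf("%-7s [mem %#010lx-%#010lx]\n", names[i],
                       lowest[i] << 12, (highest[i] << 12) - 1);
        return 0;
}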
4885 4889
4886 static int __init cmdline_parse_core(char *p, unsigned long *core) 4890 static int __init cmdline_parse_core(char *p, unsigned long *core)
4887 { 4891 {
4888 unsigned long long coremem; 4892 unsigned long long coremem;
4889 if (!p) 4893 if (!p)
4890 return -EINVAL; 4894 return -EINVAL;
4891 4895
4892 coremem = memparse(p, &p); 4896 coremem = memparse(p, &p);
4893 *core = coremem >> PAGE_SHIFT; 4897 *core = coremem >> PAGE_SHIFT;
4894 4898
4895 /* Paranoid check that UL is enough for the coremem value */ 4899 /* Paranoid check that UL is enough for the coremem value */
4896 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 4900 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
4897 4901
4898 return 0; 4902 return 0;
4899 } 4903 }
4900 4904
4901 /* 4905 /*
4902 * kernelcore=size sets the amount of memory for use for allocations that 4906 * kernelcore=size sets the amount of memory for use for allocations that
4903 * cannot be reclaimed or migrated. 4907 * cannot be reclaimed or migrated.
4904 */ 4908 */
4905 static int __init cmdline_parse_kernelcore(char *p) 4909 static int __init cmdline_parse_kernelcore(char *p)
4906 { 4910 {
4907 return cmdline_parse_core(p, &required_kernelcore); 4911 return cmdline_parse_core(p, &required_kernelcore);
4908 } 4912 }
4909 4913
4910 /* 4914 /*
4911 * movablecore=size sets the amount of memory for use for allocations that 4915 * movablecore=size sets the amount of memory for use for allocations that
4912 * can be reclaimed or migrated. 4916 * can be reclaimed or migrated.
4913 */ 4917 */
4914 static int __init cmdline_parse_movablecore(char *p) 4918 static int __init cmdline_parse_movablecore(char *p)
4915 { 4919 {
4916 return cmdline_parse_core(p, &required_movablecore); 4920 return cmdline_parse_core(p, &required_movablecore);
4917 } 4921 }
4918 4922
4919 early_param("kernelcore", cmdline_parse_kernelcore); 4923 early_param("kernelcore", cmdline_parse_kernelcore);
4920 early_param("movablecore", cmdline_parse_movablecore); 4924 early_param("movablecore", cmdline_parse_movablecore);
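Both parameters accept the usual size suffixes via memparse() and are then converted to pages by the PAGE_SHIFT shift in cmdline_parse_core(). A userspace stand-in for that parsing (memparse_model is a hypothetical helper written for this sketch, not the kernel function):

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12

/* Stand-in for the kernel's memparse(): number plus optional K/M/G suffix. */
static unsigned long long memparse_model(const char *p)
{
        char *end;
        unsigned long long v = strtoull(p, &end, 0);

        switch (*end) {
        case 'G': case 'g': v <<= 10;   /* fall through */
        case 'M': case 'm': v <<= 10;   /* fall through */
        case 'K': case 'k': v <<= 10;   break;
        default: break;
        }
        return v;
}

int main(void)
{
        /* kernelcore=512M on a 4KiB-page system reserves 131072 pages. */
        printf("%llu pages\n", memparse_model("512M") >> PAGE_SHIFT);
        return 0;
}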
4921 4925
4922 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4926 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4923 4927
4924 /** 4928 /**
4925 * set_dma_reserve - set the specified number of pages reserved in the first zone 4929 * set_dma_reserve - set the specified number of pages reserved in the first zone
4926 * @new_dma_reserve: The number of pages to mark reserved 4930 * @new_dma_reserve: The number of pages to mark reserved
4927 * 4931 *
4928 * The per-cpu batchsize and zone watermarks are determined by present_pages. 4932 * The per-cpu batchsize and zone watermarks are determined by present_pages.
4929 * In the DMA zone, a significant percentage may be consumed by kernel image 4933 * In the DMA zone, a significant percentage may be consumed by kernel image
4930 * and other unfreeable allocations which can skew the watermarks badly. This 4934 * and other unfreeable allocations which can skew the watermarks badly. This
4931 * function may optionally be used to account for unfreeable pages in the 4935 * function may optionally be used to account for unfreeable pages in the
4932 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 4936 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
4933 * smaller per-cpu batchsize. 4937 * smaller per-cpu batchsize.
4934 */ 4938 */
4935 void __init set_dma_reserve(unsigned long new_dma_reserve) 4939 void __init set_dma_reserve(unsigned long new_dma_reserve)
4936 { 4940 {
4937 dma_reserve = new_dma_reserve; 4941 dma_reserve = new_dma_reserve;
4938 } 4942 }
4939 4943
4940 void __init free_area_init(unsigned long *zones_size) 4944 void __init free_area_init(unsigned long *zones_size)
4941 { 4945 {
4942 free_area_init_node(0, zones_size, 4946 free_area_init_node(0, zones_size,
4943 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 4947 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
4944 } 4948 }
4945 4949
4946 static int page_alloc_cpu_notify(struct notifier_block *self, 4950 static int page_alloc_cpu_notify(struct notifier_block *self,
4947 unsigned long action, void *hcpu) 4951 unsigned long action, void *hcpu)
4948 { 4952 {
4949 int cpu = (unsigned long)hcpu; 4953 int cpu = (unsigned long)hcpu;
4950 4954
4951 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 4955 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
4952 lru_add_drain_cpu(cpu); 4956 lru_add_drain_cpu(cpu);
4953 drain_pages(cpu); 4957 drain_pages(cpu);
4954 4958
4955 /* 4959 /*
4956 * Spill the event counters of the dead processor 4960 * Spill the event counters of the dead processor
4957 * into the current processor's event counters. 4961 * into the current processor's event counters.
4958 * This artificially elevates the count of the current 4962 * This artificially elevates the count of the current
4959 * processor. 4963 * processor.
4960 */ 4964 */
4961 vm_events_fold_cpu(cpu); 4965 vm_events_fold_cpu(cpu);
4962 4966
4963 /* 4967 /*
4964 * Zero the differential counters of the dead processor 4968 * Zero the differential counters of the dead processor
4965 * so that the vm statistics are consistent. 4969 * so that the vm statistics are consistent.
4966 * 4970 *
4967 * This is only okay since the processor is dead and cannot 4971 * This is only okay since the processor is dead and cannot
4968 * race with what we are doing. 4972 * race with what we are doing.
4969 */ 4973 */
4970 refresh_cpu_vm_stats(cpu); 4974 refresh_cpu_vm_stats(cpu);
4971 } 4975 }
4972 return NOTIFY_OK; 4976 return NOTIFY_OK;
4973 } 4977 }
4974 4978
4975 void __init page_alloc_init(void) 4979 void __init page_alloc_init(void)
4976 { 4980 {
4977 hotcpu_notifier(page_alloc_cpu_notify, 0); 4981 hotcpu_notifier(page_alloc_cpu_notify, 0);
4978 } 4982 }
4979 4983
4980 /* 4984 /*
4981 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 4985 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
4982 * or min_free_kbytes changes. 4986 * or min_free_kbytes changes.
4983 */ 4987 */
4984 static void calculate_totalreserve_pages(void) 4988 static void calculate_totalreserve_pages(void)
4985 { 4989 {
4986 struct pglist_data *pgdat; 4990 struct pglist_data *pgdat;
4987 unsigned long reserve_pages = 0; 4991 unsigned long reserve_pages = 0;
4988 enum zone_type i, j; 4992 enum zone_type i, j;
4989 4993
4990 for_each_online_pgdat(pgdat) { 4994 for_each_online_pgdat(pgdat) {
4991 for (i = 0; i < MAX_NR_ZONES; i++) { 4995 for (i = 0; i < MAX_NR_ZONES; i++) {
4992 struct zone *zone = pgdat->node_zones + i; 4996 struct zone *zone = pgdat->node_zones + i;
4993 unsigned long max = 0; 4997 unsigned long max = 0;
4994 4998
4995 /* Find valid and maximum lowmem_reserve in the zone */ 4999 /* Find valid and maximum lowmem_reserve in the zone */
4996 for (j = i; j < MAX_NR_ZONES; j++) { 5000 for (j = i; j < MAX_NR_ZONES; j++) {
4997 if (zone->lowmem_reserve[j] > max) 5001 if (zone->lowmem_reserve[j] > max)
4998 max = zone->lowmem_reserve[j]; 5002 max = zone->lowmem_reserve[j];
4999 } 5003 }
5000 5004
5001 /* we treat the high watermark as reserved pages. */ 5005 /* we treat the high watermark as reserved pages. */
5002 max += high_wmark_pages(zone); 5006 max += high_wmark_pages(zone);
5003 5007
5004 if (max > zone->present_pages) 5008 if (max > zone->present_pages)
5005 max = zone->present_pages; 5009 max = zone->present_pages;
5006 reserve_pages += max; 5010 reserve_pages += max;
5007 /* 5011 /*
5008 * Lowmem reserves are not available to 5012 * Lowmem reserves are not available to
5009 * GFP_HIGHUSER page cache allocations and 5013 * GFP_HIGHUSER page cache allocations and
5010 * kswapd tries to balance zones to their high 5014 * kswapd tries to balance zones to their high
5011 * watermark. As a result, neither should be 5015 * watermark. As a result, neither should be
5012 * regarded as dirtyable memory, to prevent a 5016 * regarded as dirtyable memory, to prevent a
5013 * situation where reclaim has to clean pages 5017 * situation where reclaim has to clean pages
5014 * in order to balance the zones. 5018 * in order to balance the zones.
5015 */ 5019 */
5016 zone->dirty_balance_reserve = max; 5020 zone->dirty_balance_reserve = max;
5017 } 5021 }
5018 } 5022 }
5019 dirty_balance_reserve = reserve_pages; 5023 dirty_balance_reserve = reserve_pages;
5020 totalreserve_pages = reserve_pages; 5024 totalreserve_pages = reserve_pages;
5021 } 5025 }
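Per zone, the contribution to totalreserve_pages is the largest lowmem_reserve target plus the high watermark, capped at what the zone actually has. A tiny worked example with invented numbers:

#include <stdio.h>

int main(void)
{
        /* One invented zone: reserve targets held against higher zones,
         * its own high watermark, and its present page count. */
        unsigned long lowmem_reserve[] = { 0, 3840, 3968 };
        unsigned long high_wmark = 1024;
        unsigned long present_pages = 221184;
        unsigned long max = 0;

        for (int j = 0; j < 3; j++)
                if (lowmem_reserve[j] > max)
                        max = lowmem_reserve[j];

        max += high_wmark;              /* high watermark counts as reserved */
        if (max > present_pages)
                max = present_pages;

        printf("reserve contribution from this zone: %lu pages\n", max);
        return 0;
}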
5022 5026
5023 /* 5027 /*
5024 * setup_per_zone_lowmem_reserve - called whenever 5028 * setup_per_zone_lowmem_reserve - called whenever
5025 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 5029 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
5026 * has a correct pages reserved value, so an adequate number of 5030 * has a correct pages reserved value, so an adequate number of
5027 * pages are left in the zone after a successful __alloc_pages(). 5031 * pages are left in the zone after a successful __alloc_pages().
5028 */ 5032 */
5029 static void setup_per_zone_lowmem_reserve(void) 5033 static void setup_per_zone_lowmem_reserve(void)
5030 { 5034 {
5031 struct pglist_data *pgdat; 5035 struct pglist_data *pgdat;
5032 enum zone_type j, idx; 5036 enum zone_type j, idx;
5033 5037
5034 for_each_online_pgdat(pgdat) { 5038 for_each_online_pgdat(pgdat) {
5035 for (j = 0; j < MAX_NR_ZONES; j++) { 5039 for (j = 0; j < MAX_NR_ZONES; j++) {
5036 struct zone *zone = pgdat->node_zones + j; 5040 struct zone *zone = pgdat->node_zones + j;
5037 unsigned long present_pages = zone->present_pages; 5041 unsigned long present_pages = zone->present_pages;
5038 5042
5039 zone->lowmem_reserve[j] = 0; 5043 zone->lowmem_reserve[j] = 0;
5040 5044
5041 idx = j; 5045 idx = j;
5042 while (idx) { 5046 while (idx) {
5043 struct zone *lower_zone; 5047 struct zone *lower_zone;
5044 5048
5045 idx--; 5049 idx--;
5046 5050
5047 if (sysctl_lowmem_reserve_ratio[idx] < 1) 5051 if (sysctl_lowmem_reserve_ratio[idx] < 1)
5048 sysctl_lowmem_reserve_ratio[idx] = 1; 5052 sysctl_lowmem_reserve_ratio[idx] = 1;
5049 5053
5050 lower_zone = pgdat->node_zones + idx; 5054 lower_zone = pgdat->node_zones + idx;
5051 lower_zone->lowmem_reserve[j] = present_pages / 5055 lower_zone->lowmem_reserve[j] = present_pages /
5052 sysctl_lowmem_reserve_ratio[idx]; 5056 sysctl_lowmem_reserve_ratio[idx];
5053 present_pages += lower_zone->present_pages; 5057 present_pages += lower_zone->present_pages;
5054 } 5058 }
5055 } 5059 }
5056 } 5060 }
5057 5061
5058 /* update totalreserve_pages */ 5062 /* update totalreserve_pages */
5059 calculate_totalreserve_pages(); 5063 calculate_totalreserve_pages();
5060 } 5064 }
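A sketch of how the inner loop fills lowmem_reserve[] for a three-zone node: each lower zone reserves the pages of all higher zones up to j, divided by that lower zone's ratio. The ratio values 256/256/32 are assumptions matching the usual sysctl_lowmem_reserve_ratio defaults, and the zone sizes are invented:

#include <stdio.h>

int main(void)
{
        /* Invented node: DMA, DMA32, Normal present pages. */
        unsigned long present[] = { 3996, 487296, 3670016 };
        unsigned long ratio[]   = { 256, 256, 32 };     /* assumed defaults */

        for (int j = 0; j < 3; j++) {
                unsigned long pages = present[j];

                /* walk downwards, accumulating higher-zone pages as we go,
                 * mirroring the idx loop above */
                for (int idx = j - 1; idx >= 0; idx--) {
                        printf("zone %d reserves %lu pages against zone %d allocations\n",
                               idx, pages / ratio[idx], j);
                        pages += present[idx];
                }
        }
        return 0;
}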
5061 5065
5062 static void __setup_per_zone_wmarks(void) 5066 static void __setup_per_zone_wmarks(void)
5063 { 5067 {
5064 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5068 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
5065 unsigned long lowmem_pages = 0; 5069 unsigned long lowmem_pages = 0;
5066 struct zone *zone; 5070 struct zone *zone;
5067 unsigned long flags; 5071 unsigned long flags;
5068 5072
5069 /* Calculate total number of !ZONE_HIGHMEM pages */ 5073 /* Calculate total number of !ZONE_HIGHMEM pages */
5070 for_each_zone(zone) { 5074 for_each_zone(zone) {
5071 if (!is_highmem(zone)) 5075 if (!is_highmem(zone))
5072 lowmem_pages += zone->present_pages; 5076 lowmem_pages += zone->present_pages;
5073 } 5077 }
5074 5078
5075 for_each_zone(zone) { 5079 for_each_zone(zone) {
5076 u64 tmp; 5080 u64 tmp;
5077 5081
5078 spin_lock_irqsave(&zone->lock, flags); 5082 spin_lock_irqsave(&zone->lock, flags);
5079 tmp = (u64)pages_min * zone->present_pages; 5083 tmp = (u64)pages_min * zone->present_pages;
5080 do_div(tmp, lowmem_pages); 5084 do_div(tmp, lowmem_pages);
5081 if (is_highmem(zone)) { 5085 if (is_highmem(zone)) {
5082 /* 5086 /*
5083 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 5087 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
5084 * need highmem pages, so cap pages_min to a small 5088 * need highmem pages, so cap pages_min to a small
5085 * value here. 5089 * value here.
5086 * 5090 *
5087 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5091 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5088 * deltas control async page reclaim, and so should 5092 * deltas control async page reclaim, and so should
5089 * not be capped for highmem. 5093 * not be capped for highmem.
5090 */ 5094 */
5091 int min_pages; 5095 int min_pages;
5092 5096
5093 min_pages = zone->present_pages / 1024; 5097 min_pages = zone->present_pages / 1024;
5094 if (min_pages < SWAP_CLUSTER_MAX) 5098 if (min_pages < SWAP_CLUSTER_MAX)
5095 min_pages = SWAP_CLUSTER_MAX; 5099 min_pages = SWAP_CLUSTER_MAX;
5096 if (min_pages > 128) 5100 if (min_pages > 128)
5097 min_pages = 128; 5101 min_pages = 128;
5098 zone->watermark[WMARK_MIN] = min_pages; 5102 zone->watermark[WMARK_MIN] = min_pages;
5099 } else { 5103 } else {
5100 /* 5104 /*
5101 * If it's a lowmem zone, reserve a number of pages 5105 * If it's a lowmem zone, reserve a number of pages
5102 * proportionate to the zone's size. 5106 * proportionate to the zone's size.
5103 */ 5107 */
5104 zone->watermark[WMARK_MIN] = tmp; 5108 zone->watermark[WMARK_MIN] = tmp;
5105 } 5109 }
5106 5110
5107 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5111 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5108 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5112 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5109 5113
5110 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); 5114 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
5111 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); 5115 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
5112 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); 5116 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
5113 5117
5114 setup_zone_migrate_reserve(zone); 5118 setup_zone_migrate_reserve(zone);
5115 spin_unlock_irqrestore(&zone->lock, flags); 5119 spin_unlock_irqrestore(&zone->lock, flags);
5116 } 5120 }
5117 5121
5118 /* update totalreserve_pages */ 5122 /* update totalreserve_pages */
5119 calculate_totalreserve_pages(); 5123 calculate_totalreserve_pages();
5120 } 5124 }
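For a lowmem zone, the watermark arithmetic above reduces to a proportional share of pages_min plus fixed fractions for the low and high marks (the CMA adjustment is ignored here). A standalone calculation with an invented min_free_kbytes and invented zone sizes:

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        /* Invented setup: min_free_kbytes=65536, one 2GiB lowmem zone out of
         * 4GiB of total lowmem (all counts in 4KiB pages). */
        unsigned long min_free_kbytes = 65536;
        unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
        unsigned long zone_present = 524288, lowmem_pages = 1048576;

        /* each lowmem zone gets a share of pages_min proportional to its size */
        unsigned long long tmp = (unsigned long long)pages_min * zone_present;
        tmp /= lowmem_pages;

        unsigned long wmark_min  = tmp;
        unsigned long wmark_low  = wmark_min + (tmp >> 2);   /* min + 25% */
        unsigned long wmark_high = wmark_min + (tmp >> 1);   /* min + 50% */

        printf("min=%lu low=%lu high=%lu pages\n", wmark_min, wmark_low, wmark_high);
        return 0;
}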
5121 5125
5122 /** 5126 /**
5123 * setup_per_zone_wmarks - called when min_free_kbytes changes 5127 * setup_per_zone_wmarks - called when min_free_kbytes changes
5124 * or when memory is hot-{added|removed} 5128 * or when memory is hot-{added|removed}
5125 * 5129 *
5126 * Ensures that the watermark[min,low,high] values for each zone are set 5130 * Ensures that the watermark[min,low,high] values for each zone are set
5127 * correctly with respect to min_free_kbytes. 5131 * correctly with respect to min_free_kbytes.
5128 */ 5132 */
5129 void setup_per_zone_wmarks(void) 5133 void setup_per_zone_wmarks(void)
5130 { 5134 {
5131 mutex_lock(&zonelists_mutex); 5135 mutex_lock(&zonelists_mutex);
5132 __setup_per_zone_wmarks(); 5136 __setup_per_zone_wmarks();
5133 mutex_unlock(&zonelists_mutex); 5137 mutex_unlock(&zonelists_mutex);
5134 } 5138 }
5135 5139
5136 /* 5140 /*
5137 * The inactive anon list should be small enough that the VM never has to 5141 * The inactive anon list should be small enough that the VM never has to
5138 * do too much work, but large enough that each inactive page has a chance 5142 * do too much work, but large enough that each inactive page has a chance
5139 * to be referenced again before it is swapped out. 5143 * to be referenced again before it is swapped out.
5140 * 5144 *
5141 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to 5145 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
5142 * INACTIVE_ANON pages on this zone's LRU, maintained by the 5146 * INACTIVE_ANON pages on this zone's LRU, maintained by the
5143 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of 5147 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
5144 * the anonymous pages are kept on the inactive list. 5148 * the anonymous pages are kept on the inactive list.
5145 * 5149 *
5146 * total target max 5150 * total target max
5147 * memory ratio inactive anon 5151 * memory ratio inactive anon
5148 * ------------------------------------- 5152 * -------------------------------------
5149 * 10MB 1 5MB 5153 * 10MB 1 5MB
5150 * 100MB 1 50MB 5154 * 100MB 1 50MB
5151 * 1GB 3 250MB 5155 * 1GB 3 250MB
5152 * 10GB 10 0.9GB 5156 * 10GB 10 0.9GB
5153 * 100GB 31 3GB 5157 * 100GB 31 3GB
5154 * 1TB 101 10GB 5158 * 1TB 101 10GB
5155 * 10TB 320 32GB 5159 * 10TB 320 32GB
5156 */ 5160 */
5157 static void __meminit calculate_zone_inactive_ratio(struct zone *zone) 5161 static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5158 { 5162 {
5159 unsigned int gb, ratio; 5163 unsigned int gb, ratio;
5160 5164
5161 /* Zone size in gigabytes */ 5165 /* Zone size in gigabytes */
5162 gb = zone->present_pages >> (30 - PAGE_SHIFT); 5166 gb = zone->present_pages >> (30 - PAGE_SHIFT);
5163 if (gb) 5167 if (gb)
5164 ratio = int_sqrt(10 * gb); 5168 ratio = int_sqrt(10 * gb);
5165 else 5169 else
5166 ratio = 1; 5170 ratio = 1;
5167 5171
5168 zone->inactive_ratio = ratio; 5172 zone->inactive_ratio = ratio;
5169 } 5173 }
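/*
 * Illustrative worked example (a sketch, not part of this file): for a 1TB
 * zone with 4KB pages, present_pages is roughly 1 << 28, so
 *
 *   gb    = (1 << 28) >> (30 - 12) = 1024
 *   ratio = int_sqrt(10 * 1024)    = int_sqrt(10240) = 101
 *
 * which matches the "1TB / 101" row in the table above.
 */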
5170 5174
5171 static void __meminit setup_per_zone_inactive_ratio(void) 5175 static void __meminit setup_per_zone_inactive_ratio(void)
5172 { 5176 {
5173 struct zone *zone; 5177 struct zone *zone;
5174 5178
5175 for_each_zone(zone) 5179 for_each_zone(zone)
5176 calculate_zone_inactive_ratio(zone); 5180 calculate_zone_inactive_ratio(zone);
5177 } 5181 }
5178 5182
5179 /* 5183 /*
5180 * Initialise min_free_kbytes. 5184 * Initialise min_free_kbytes.
5181 * 5185 *
5182 * For small machines we want it small (128k min). For large machines 5186 * For small machines we want it small (128k min). For large machines
5183 * we want it large (64MB max). But it is not linear, because network 5187 * we want it large (64MB max). But it is not linear, because network
5184 * bandwidth does not increase linearly with machine size. We use 5188 * bandwidth does not increase linearly with machine size. We use
5185 * 5189 *
5186 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5190 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5187 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5191 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5188 * 5192 *
5189 * which yields 5193 * which yields
5190 * 5194 *
5191 * 16MB: 512k 5195 * 16MB: 512k
5192 * 32MB: 724k 5196 * 32MB: 724k
5193 * 64MB: 1024k 5197 * 64MB: 1024k
5194 * 128MB: 1448k 5198 * 128MB: 1448k
5195 * 256MB: 2048k 5199 * 256MB: 2048k
5196 * 512MB: 2896k 5200 * 512MB: 2896k
5197 * 1024MB: 4096k 5201 * 1024MB: 4096k
5198 * 2048MB: 5792k 5202 * 2048MB: 5792k
5199 * 4096MB: 8192k 5203 * 4096MB: 8192k
5200 * 8192MB: 11584k 5204 * 8192MB: 11584k
5201 * 16384MB: 16384k 5205 * 16384MB: 16384k
5202 */ 5206 */
5203 int __meminit init_per_zone_wmark_min(void) 5207 int __meminit init_per_zone_wmark_min(void)
5204 { 5208 {
5205 unsigned long lowmem_kbytes; 5209 unsigned long lowmem_kbytes;
5206 5210
5207 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5211 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5208 5212
5209 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5213 min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5210 if (min_free_kbytes < 128) 5214 if (min_free_kbytes < 128)
5211 min_free_kbytes = 128; 5215 min_free_kbytes = 128;
5212 if (min_free_kbytes > 65536) 5216 if (min_free_kbytes > 65536)
5213 min_free_kbytes = 65536; 5217 min_free_kbytes = 65536;
5214 setup_per_zone_wmarks(); 5218 setup_per_zone_wmarks();
5215 refresh_zone_stat_thresholds(); 5219 refresh_zone_stat_thresholds();
5216 setup_per_zone_lowmem_reserve(); 5220 setup_per_zone_lowmem_reserve();
5217 setup_per_zone_inactive_ratio(); 5221 setup_per_zone_inactive_ratio();
5218 return 0; 5222 return 0;
5219 } 5223 }
5220 module_init(init_per_zone_wmark_min) 5224 module_init(init_per_zone_wmark_min)
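/*
 * Illustrative worked example (a sketch, not part of this file): with
 * roughly 1024MB of lowmem, lowmem_kbytes comes to about 1048576, so
 *
 *   min_free_kbytes = int_sqrt(1048576 * 16) = int_sqrt(16777216) = 4096
 *
 * matching the "1024MB: 4096k" row in the table above; the result is then
 * clamped to the [128, 65536] range.
 */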
5221 5225
5222 /* 5226 /*
5223 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5227 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5224 * that we can call two helper functions whenever min_free_kbytes 5228 * that we can call two helper functions whenever min_free_kbytes
5225 * changes. 5229 * changes.
5226 */ 5230 */
5227 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5231 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5228 void __user *buffer, size_t *length, loff_t *ppos) 5232 void __user *buffer, size_t *length, loff_t *ppos)
5229 { 5233 {
5230 proc_dointvec(table, write, buffer, length, ppos); 5234 proc_dointvec(table, write, buffer, length, ppos);
5231 if (write) 5235 if (write)
5232 setup_per_zone_wmarks(); 5236 setup_per_zone_wmarks();
5233 return 0; 5237 return 0;
5234 } 5238 }
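/*
 * Illustrative usage (a sketch, not part of this file): the handler above is
 * what runs when an administrator writes the sysctl, e.g.
 *
 *   echo 65536 > /proc/sys/vm/min_free_kbytes
 *
 * proc_dointvec() stores the new value and, because write != 0,
 * setup_per_zone_wmarks() recomputes every zone's min/low/high watermarks.
 */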
5235 5239
5236 #ifdef CONFIG_NUMA 5240 #ifdef CONFIG_NUMA
5237 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5241 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5238 void __user *buffer, size_t *length, loff_t *ppos) 5242 void __user *buffer, size_t *length, loff_t *ppos)
5239 { 5243 {
5240 struct zone *zone; 5244 struct zone *zone;
5241 int rc; 5245 int rc;
5242 5246
5243 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5247 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5244 if (rc) 5248 if (rc)
5245 return rc; 5249 return rc;
5246 5250
5247 for_each_zone(zone) 5251 for_each_zone(zone)
5248 zone->min_unmapped_pages = (zone->present_pages * 5252 zone->min_unmapped_pages = (zone->present_pages *
5249 sysctl_min_unmapped_ratio) / 100; 5253 sysctl_min_unmapped_ratio) / 100;
5250 return 0; 5254 return 0;
5251 } 5255 }
5252 5256
5253 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5257 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5254 void __user *buffer, size_t *length, loff_t *ppos) 5258 void __user *buffer, size_t *length, loff_t *ppos)
5255 { 5259 {
5256 struct zone *zone; 5260 struct zone *zone;
5257 int rc; 5261 int rc;
5258 5262
5259 rc = proc_dointvec_minmax(table, write, buffer, length, ppos); 5263 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5260 if (rc) 5264 if (rc)
5261 return rc; 5265 return rc;
5262 5266
5263 for_each_zone(zone) 5267 for_each_zone(zone)
5264 zone->min_slab_pages = (zone->present_pages * 5268 zone->min_slab_pages = (zone->present_pages *
5265 sysctl_min_slab_ratio) / 100; 5269 sysctl_min_slab_ratio) / 100;
5266 return 0; 5270 return 0;
5267 } 5271 }
5268 #endif 5272 #endif
5269 5273
5270 /* 5274 /*
5271 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 5275 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
5272 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 5276 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
5273 * whenever sysctl_lowmem_reserve_ratio changes. 5277 * whenever sysctl_lowmem_reserve_ratio changes.
5274 * 5278 *
5275 * The reserve ratio obviously has absolutely no relation to the 5279 * The reserve ratio obviously has absolutely no relation to the
5276 * minimum watermarks. The lowmem reserve ratio only makes sense 5280 * minimum watermarks. The lowmem reserve ratio only makes sense
5277 * as a function of the boot-time zone sizes. 5281 * as a function of the boot-time zone sizes.
5278 */ 5282 */
5279 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5283 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5280 void __user *buffer, size_t *length, loff_t *ppos) 5284 void __user *buffer, size_t *length, loff_t *ppos)
5281 { 5285 {
5282 proc_dointvec_minmax(table, write, buffer, length, ppos); 5286 proc_dointvec_minmax(table, write, buffer, length, ppos);
5283 setup_per_zone_lowmem_reserve(); 5287 setup_per_zone_lowmem_reserve();
5284 return 0; 5288 return 0;
5285 } 5289 }
5286 5290
5287 /* 5291 /*
5288 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5292 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5289 * cpu. It is the fraction of total pages in each zone that a hot per-cpu pagelist 5293 * cpu. It is the fraction of total pages in each zone that a hot per-cpu pagelist
5290 * can have before it gets flushed back to the buddy allocator. 5294 * can have before it gets flushed back to the buddy allocator.
5291 */ 5295 */
5292 5296
5293 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5297 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5294 void __user *buffer, size_t *length, loff_t *ppos) 5298 void __user *buffer, size_t *length, loff_t *ppos)
5295 { 5299 {
5296 struct zone *zone; 5300 struct zone *zone;
5297 unsigned int cpu; 5301 unsigned int cpu;
5298 int ret; 5302 int ret;
5299 5303
5300 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5304 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5301 if (!write || (ret < 0)) 5305 if (!write || (ret < 0))
5302 return ret; 5306 return ret;
5303 for_each_populated_zone(zone) { 5307 for_each_populated_zone(zone) {
5304 for_each_possible_cpu(cpu) { 5308 for_each_possible_cpu(cpu) {
5305 unsigned long high; 5309 unsigned long high;
5306 high = zone->present_pages / percpu_pagelist_fraction; 5310 high = zone->present_pages / percpu_pagelist_fraction;
5307 setup_pagelist_highmark( 5311 setup_pagelist_highmark(
5308 per_cpu_ptr(zone->pageset, cpu), high); 5312 per_cpu_ptr(zone->pageset, cpu), high);
5309 } 5313 }
5310 } 5314 }
5311 return 0; 5315 return 0;
5312 } 5316 }
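/*
 * Illustrative worked example (a sketch, not part of this file): writing 8
 * to /proc/sys/vm/percpu_pagelist_fraction on a zone with 262144 present
 * pages (1GB of 4KB pages) sets each per-cpu pagelist high mark to
 *
 *   high = 262144 / 8 = 32768 pages (128MB)
 *
 * before that pagelist is drained back to the buddy allocator.
 */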
5313 5317
5314 int hashdist = HASHDIST_DEFAULT; 5318 int hashdist = HASHDIST_DEFAULT;
5315 5319
5316 #ifdef CONFIG_NUMA 5320 #ifdef CONFIG_NUMA
5317 static int __init set_hashdist(char *str) 5321 static int __init set_hashdist(char *str)
5318 { 5322 {
5319 if (!str) 5323 if (!str)
5320 return 0; 5324 return 0;
5321 hashdist = simple_strtoul(str, &str, 0); 5325 hashdist = simple_strtoul(str, &str, 0);
5322 return 1; 5326 return 1;
5323 } 5327 }
5324 __setup("hashdist=", set_hashdist); 5328 __setup("hashdist=", set_hashdist);
5325 #endif 5329 #endif
5326 5330
5327 /* 5331 /*
5328 * allocate a large system hash table from bootmem 5332 * allocate a large system hash table from bootmem
5329 * - it is assumed that the hash table must contain an exact power-of-2 5333 * - it is assumed that the hash table must contain an exact power-of-2
5330 * quantity of entries 5334 * quantity of entries
5331 * - limit is the number of hash buckets, not the total allocation size 5335 * - limit is the number of hash buckets, not the total allocation size
5332 */ 5336 */
5333 void *__init alloc_large_system_hash(const char *tablename, 5337 void *__init alloc_large_system_hash(const char *tablename,
5334 unsigned long bucketsize, 5338 unsigned long bucketsize,
5335 unsigned long numentries, 5339 unsigned long numentries,
5336 int scale, 5340 int scale,
5337 int flags, 5341 int flags,
5338 unsigned int *_hash_shift, 5342 unsigned int *_hash_shift,
5339 unsigned int *_hash_mask, 5343 unsigned int *_hash_mask,
5340 unsigned long low_limit, 5344 unsigned long low_limit,
5341 unsigned long high_limit) 5345 unsigned long high_limit)
5342 { 5346 {
5343 unsigned long long max = high_limit; 5347 unsigned long long max = high_limit;
5344 unsigned long log2qty, size; 5348 unsigned long log2qty, size;
5345 void *table = NULL; 5349 void *table = NULL;
5346 5350
5347 /* allow the kernel cmdline to have a say */ 5351 /* allow the kernel cmdline to have a say */
5348 if (!numentries) { 5352 if (!numentries) {
5349 /* round applicable memory size up to nearest megabyte */ 5353 /* round applicable memory size up to nearest megabyte */
5350 numentries = nr_kernel_pages; 5354 numentries = nr_kernel_pages;
5351 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 5355 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
5352 numentries >>= 20 - PAGE_SHIFT; 5356 numentries >>= 20 - PAGE_SHIFT;
5353 numentries <<= 20 - PAGE_SHIFT; 5357 numentries <<= 20 - PAGE_SHIFT;
5354 5358
5355 /* limit to 1 bucket per 2^scale bytes of low memory */ 5359 /* limit to 1 bucket per 2^scale bytes of low memory */
5356 if (scale > PAGE_SHIFT) 5360 if (scale > PAGE_SHIFT)
5357 numentries >>= (scale - PAGE_SHIFT); 5361 numentries >>= (scale - PAGE_SHIFT);
5358 else 5362 else
5359 numentries <<= (PAGE_SHIFT - scale); 5363 numentries <<= (PAGE_SHIFT - scale);
5360 5364
5361 /* Make sure we've got at least a 0-order allocation.. */ 5365 /* Make sure we've got at least a 0-order allocation.. */
5362 if (unlikely(flags & HASH_SMALL)) { 5366 if (unlikely(flags & HASH_SMALL)) {
5363 /* Makes no sense without HASH_EARLY */ 5367 /* Makes no sense without HASH_EARLY */
5364 WARN_ON(!(flags & HASH_EARLY)); 5368 WARN_ON(!(flags & HASH_EARLY));
5365 if (!(numentries >> *_hash_shift)) { 5369 if (!(numentries >> *_hash_shift)) {
5366 numentries = 1UL << *_hash_shift; 5370 numentries = 1UL << *_hash_shift;
5367 BUG_ON(!numentries); 5371 BUG_ON(!numentries);
5368 } 5372 }
5369 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5373 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
5370 numentries = PAGE_SIZE / bucketsize; 5374 numentries = PAGE_SIZE / bucketsize;
5371 } 5375 }
5372 numentries = roundup_pow_of_two(numentries); 5376 numentries = roundup_pow_of_two(numentries);
5373 5377
5374 /* limit allocation size to 1/16 total memory by default */ 5378 /* limit allocation size to 1/16 total memory by default */
5375 if (max == 0) { 5379 if (max == 0) {
5376 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 5380 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
5377 do_div(max, bucketsize); 5381 do_div(max, bucketsize);
5378 } 5382 }
5379 max = min(max, 0x80000000ULL); 5383 max = min(max, 0x80000000ULL);
5380 5384
5381 if (numentries < low_limit) 5385 if (numentries < low_limit)
5382 numentries = low_limit; 5386 numentries = low_limit;
5383 if (numentries > max) 5387 if (numentries > max)
5384 numentries = max; 5388 numentries = max;
5385 5389
5386 log2qty = ilog2(numentries); 5390 log2qty = ilog2(numentries);
5387 5391
5388 do { 5392 do {
5389 size = bucketsize << log2qty; 5393 size = bucketsize << log2qty;
5390 if (flags & HASH_EARLY) 5394 if (flags & HASH_EARLY)
5391 table = alloc_bootmem_nopanic(size); 5395 table = alloc_bootmem_nopanic(size);
5392 else if (hashdist) 5396 else if (hashdist)
5393 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 5397 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
5394 else { 5398 else {
5395 /* 5399 /*
5396 * If bucketsize is not a power of two, we may free 5400 * If bucketsize is not a power of two, we may free
5397 * some pages at the end of the hash table, which 5401 * some pages at the end of the hash table, which
5398 * alloc_pages_exact() does automatically 5402 * alloc_pages_exact() does automatically
5399 */ 5403 */
5400 if (get_order(size) < MAX_ORDER) { 5404 if (get_order(size) < MAX_ORDER) {
5401 table = alloc_pages_exact(size, GFP_ATOMIC); 5405 table = alloc_pages_exact(size, GFP_ATOMIC);
5402 kmemleak_alloc(table, size, 1, GFP_ATOMIC); 5406 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5403 } 5407 }
5404 } 5408 }
5405 } while (!table && size > PAGE_SIZE && --log2qty); 5409 } while (!table && size > PAGE_SIZE && --log2qty);
5406 5410
5407 if (!table) 5411 if (!table)
5408 panic("Failed to allocate %s hash table\n", tablename); 5412 panic("Failed to allocate %s hash table\n", tablename);
5409 5413
5410 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", 5414 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
5411 tablename, 5415 tablename,
5412 (1UL << log2qty), 5416 (1UL << log2qty),
5413 ilog2(size) - PAGE_SHIFT, 5417 ilog2(size) - PAGE_SHIFT,
5414 size); 5418 size);
5415 5419
5416 if (_hash_shift) 5420 if (_hash_shift)
5417 *_hash_shift = log2qty; 5421 *_hash_shift = log2qty;
5418 if (_hash_mask) 5422 if (_hash_mask)
5419 *_hash_mask = (1 << log2qty) - 1; 5423 *_hash_mask = (1 << log2qty) - 1;
5420 5424
5421 return table; 5425 return table;
5422 } 5426 }
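/*
 * Hypothetical caller sketch (illustration only; "example_table",
 * example_shift and example_mask are placeholder names, not kernel
 * symbols). It mirrors how boot-time hash tables are typically sized:
 * numentries = 0 lets the kernel derive the size from nr_kernel_pages,
 * scale = 14 asks for roughly one bucket per 16KB of low memory, and
 * high_limit = 0 keeps the default cap of 1/16 of total memory.
 */
static struct hlist_head *example_table __read_mostly;
static unsigned int example_shift, example_mask;

static void __init example_table_init(void)
{
	example_table = alloc_large_system_hash("Example cache",
						sizeof(struct hlist_head),
						0,	/* numentries: auto */
						14,	/* scale */
						HASH_EARLY,
						&example_shift,
						&example_mask,
						0,	/* low_limit */
						0);	/* high_limit: default */
}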
5423 5427
5424 /* Return a pointer to the bitmap storing bits affecting a block of pages */ 5428 /* Return a pointer to the bitmap storing bits affecting a block of pages */
5425 static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 5429 static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
5426 unsigned long pfn) 5430 unsigned long pfn)
5427 { 5431 {
5428 #ifdef CONFIG_SPARSEMEM 5432 #ifdef CONFIG_SPARSEMEM
5429 return __pfn_to_section(pfn)->pageblock_flags; 5433 return __pfn_to_section(pfn)->pageblock_flags;
5430 #else 5434 #else
5431 return zone->pageblock_flags; 5435 return zone->pageblock_flags;
5432 #endif /* CONFIG_SPARSEMEM */ 5436 #endif /* CONFIG_SPARSEMEM */
5433 } 5437 }
5434 5438
5435 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) 5439 static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5436 { 5440 {
5437 #ifdef CONFIG_SPARSEMEM 5441 #ifdef CONFIG_SPARSEMEM
5438 pfn &= (PAGES_PER_SECTION-1); 5442 pfn &= (PAGES_PER_SECTION-1);
5439 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5443 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5440 #else 5444 #else
5441 pfn = pfn - zone->zone_start_pfn; 5445 pfn = pfn - zone->zone_start_pfn;
5442 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; 5446 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
5443 #endif /* CONFIG_SPARSEMEM */ 5447 #endif /* CONFIG_SPARSEMEM */
5444 } 5448 }
5445 5449
5446 /** 5450 /**
5447 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages 5451 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
5448 * @page: The page within the block of interest 5452 * @page: The page within the block of interest
5449 * @start_bitidx: The first bit of interest to retrieve 5453 * @start_bitidx: The first bit of interest to retrieve
5450 * @end_bitidx: The last bit of interest 5454 * @end_bitidx: The last bit of interest
5451 * returns pageblock_bits flags 5455 * returns pageblock_bits flags
5452 */ 5456 */
5453 unsigned long get_pageblock_flags_group(struct page *page, 5457 unsigned long get_pageblock_flags_group(struct page *page,
5454 int start_bitidx, int end_bitidx) 5458 int start_bitidx, int end_bitidx)
5455 { 5459 {
5456 struct zone *zone; 5460 struct zone *zone;
5457 unsigned long *bitmap; 5461 unsigned long *bitmap;
5458 unsigned long pfn, bitidx; 5462 unsigned long pfn, bitidx;
5459 unsigned long flags = 0; 5463 unsigned long flags = 0;
5460 unsigned long value = 1; 5464 unsigned long value = 1;
5461 5465
5462 zone = page_zone(page); 5466 zone = page_zone(page);
5463 pfn = page_to_pfn(page); 5467 pfn = page_to_pfn(page);
5464 bitmap = get_pageblock_bitmap(zone, pfn); 5468 bitmap = get_pageblock_bitmap(zone, pfn);
5465 bitidx = pfn_to_bitidx(zone, pfn); 5469 bitidx = pfn_to_bitidx(zone, pfn);
5466 5470
5467 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5471 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5468 if (test_bit(bitidx + start_bitidx, bitmap)) 5472 if (test_bit(bitidx + start_bitidx, bitmap))
5469 flags |= value; 5473 flags |= value;
5470 5474
5471 return flags; 5475 return flags;
5472 } 5476 }
5473 5477
5474 /** 5478 /**
5475 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages 5479 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
5476 * @page: The page within the block of interest 5480 * @page: The page within the block of interest
5477 * @start_bitidx: The first bit of interest 5481 * @start_bitidx: The first bit of interest
5478 * @end_bitidx: The last bit of interest 5482 * @end_bitidx: The last bit of interest
5479 * @flags: The flags to set 5483 * @flags: The flags to set
5480 */ 5484 */
5481 void set_pageblock_flags_group(struct page *page, unsigned long flags, 5485 void set_pageblock_flags_group(struct page *page, unsigned long flags,
5482 int start_bitidx, int end_bitidx) 5486 int start_bitidx, int end_bitidx)
5483 { 5487 {
5484 struct zone *zone; 5488 struct zone *zone;
5485 unsigned long *bitmap; 5489 unsigned long *bitmap;
5486 unsigned long pfn, bitidx; 5490 unsigned long pfn, bitidx;
5487 unsigned long value = 1; 5491 unsigned long value = 1;
5488 5492
5489 zone = page_zone(page); 5493 zone = page_zone(page);
5490 pfn = page_to_pfn(page); 5494 pfn = page_to_pfn(page);
5491 bitmap = get_pageblock_bitmap(zone, pfn); 5495 bitmap = get_pageblock_bitmap(zone, pfn);
5492 bitidx = pfn_to_bitidx(zone, pfn); 5496 bitidx = pfn_to_bitidx(zone, pfn);
5493 VM_BUG_ON(pfn < zone->zone_start_pfn); 5497 VM_BUG_ON(pfn < zone->zone_start_pfn);
5494 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); 5498 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
5495 5499
5496 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5500 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5497 if (flags & value) 5501 if (flags & value)
5498 __set_bit(bitidx + start_bitidx, bitmap); 5502 __set_bit(bitidx + start_bitidx, bitmap);
5499 else 5503 else
5500 __clear_bit(bitidx + start_bitidx, bitmap); 5504 __clear_bit(bitidx + start_bitidx, bitmap);
5501 } 5505 }
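/*
 * For context (paraphrased from pageblock-flags.h of this era, not quoted):
 * the migratetype accessors are thin wrappers around the two bit-range
 * helpers above, roughly
 *
 *   #define get_pageblock_migratetype(page) \
 *           get_pageblock_flags_group(page, PB_migrate, PB_migrate_end)
 *
 * and set_pageblock_migratetype() likewise forwards to
 * set_pageblock_flags_group() with the PB_migrate bit range.
 */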
5502 5506
5503 /* 5507 /*
5504 * This function checks whether the pageblock includes unmovable pages or not. 5508 * This function checks whether the pageblock includes unmovable pages or not.
5505 * If @count is not zero, it is okay to include fewer than @count unmovable pages 5509 * If @count is not zero, it is okay to include fewer than @count unmovable pages
5506 * 5510 *
5507 * A PageLRU check without isolation or lru_lock could race so that a 5511 * A PageLRU check without isolation or lru_lock could race so that a
5508 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 5512 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5509 * expect this function to be exact. 5513 * expect this function to be exact.
5510 */ 5514 */
5511 bool has_unmovable_pages(struct zone *zone, struct page *page, int count) 5515 bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
5512 { 5516 {
5513 unsigned long pfn, iter, found; 5517 unsigned long pfn, iter, found;
5514 int mt; 5518 int mt;
5515 5519
5516 /* 5520 /*
5517 * To avoid noisy data, lru_add_drain_all() should be called first. 5521 * To avoid noisy data, lru_add_drain_all() should be called first.
5518 * If the zone is ZONE_MOVABLE, it never contains unmovable pages. 5522 * If the zone is ZONE_MOVABLE, it never contains unmovable pages.
5519 */ 5523 */
5520 if (zone_idx(zone) == ZONE_MOVABLE) 5524 if (zone_idx(zone) == ZONE_MOVABLE)
5521 return false; 5525 return false;
5522 mt = get_pageblock_migratetype(page); 5526 mt = get_pageblock_migratetype(page);
5523 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 5527 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5524 return false; 5528 return false;
5525 5529
5526 pfn = page_to_pfn(page); 5530 pfn = page_to_pfn(page);
5527 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5531 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5528 unsigned long check = pfn + iter; 5532 unsigned long check = pfn + iter;
5529 5533
5530 if (!pfn_valid_within(check)) 5534 if (!pfn_valid_within(check))
5531 continue; 5535 continue;
5532 5536
5533 page = pfn_to_page(check); 5537 page = pfn_to_page(check);
5534 /* 5538 /*
5535 * We can't use page_count without pinning the page 5539 * We can't use page_count without pinning the page
5536 * because another CPU can free the compound page. 5540 * because another CPU can free the compound page.
5537 * This check already skips compound tails of THP 5541 * This check already skips compound tails of THP
5538 * because their page->_count is zero at all times. 5542 * because their page->_count is zero at all times.
5539 */ 5543 */
5540 if (!atomic_read(&page->_count)) { 5544 if (!atomic_read(&page->_count)) {
5541 if (PageBuddy(page)) 5545 if (PageBuddy(page))
5542 iter += (1 << page_order(page)) - 1; 5546 iter += (1 << page_order(page)) - 1;
5543 continue; 5547 continue;
5544 } 5548 }
5545 5549
5546 if (!PageLRU(page)) 5550 if (!PageLRU(page))
5547 found++; 5551 found++;
5548 /* 5552 /*
5549 * If there are RECLAIMABLE pages, we need to check them. 5553 * If there are RECLAIMABLE pages, we need to check them.
5550 * But for now, memory offline itself doesn't call shrink_slab() 5554 * But for now, memory offline itself doesn't call shrink_slab()
5551 * and this still needs to be fixed. 5555 * and this still needs to be fixed.
5552 */ 5556 */
5553 /* 5557 /*
5554 * If the page is not RAM, page_count() should be 0. 5558 * If the page is not RAM, page_count() should be 0.
5555 * We don't need any more checks. This is a _used_ non-movable page. 5559 * We don't need any more checks. This is a _used_ non-movable page.
5556 * 5560 *
5557 * The problematic thing here is PG_reserved pages. PG_reserved 5561 * The problematic thing here is PG_reserved pages. PG_reserved
5558 * is set on both a memory hole page and a _used_ kernel 5562 * is set on both a memory hole page and a _used_ kernel
5559 * page at boot. 5563 * page at boot.
5560 */ 5564 */
5561 if (found > count) 5565 if (found > count)
5562 return true; 5566 return true;
5563 } 5567 }
5564 return false; 5568 return false;
5565 } 5569 }
5566 5570
5567 bool is_pageblock_removable_nolock(struct page *page) 5571 bool is_pageblock_removable_nolock(struct page *page)
5568 { 5572 {
5569 struct zone *zone; 5573 struct zone *zone;
5570 unsigned long pfn; 5574 unsigned long pfn;
5571 5575
5572 /* 5576 /*
5573 * We have to be careful here because we are iterating over memory 5577 * We have to be careful here because we are iterating over memory
5574 * sections which are not zone aware so we might end up outside of 5578 * sections which are not zone aware so we might end up outside of
5575 * the zone but still within the section. 5579 * the zone but still within the section.
5576 * We have to take care about the node as well. If the node is offline 5580 * We have to take care about the node as well. If the node is offline
5577 * its NODE_DATA will be NULL - see page_zone. 5581 * its NODE_DATA will be NULL - see page_zone.
5578 */ 5582 */
5579 if (!node_online(page_to_nid(page))) 5583 if (!node_online(page_to_nid(page)))
5580 return false; 5584 return false;
5581 5585
5582 zone = page_zone(page); 5586 zone = page_zone(page);
5583 pfn = page_to_pfn(page); 5587 pfn = page_to_pfn(page);
5584 if (zone->zone_start_pfn > pfn || 5588 if (zone->zone_start_pfn > pfn ||
5585 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5589 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5586 return false; 5590 return false;
5587 5591
5588 return !has_unmovable_pages(zone, page, 0); 5592 return !has_unmovable_pages(zone, page, 0);
5589 } 5593 }
5590 5594
5591 #ifdef CONFIG_CMA 5595 #ifdef CONFIG_CMA
5592 5596
5593 static unsigned long pfn_max_align_down(unsigned long pfn) 5597 static unsigned long pfn_max_align_down(unsigned long pfn)
5594 { 5598 {
5595 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, 5599 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
5596 pageblock_nr_pages) - 1); 5600 pageblock_nr_pages) - 1);
5597 } 5601 }
5598 5602
5599 static unsigned long pfn_max_align_up(unsigned long pfn) 5603 static unsigned long pfn_max_align_up(unsigned long pfn)
5600 { 5604 {
5601 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, 5605 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
5602 pageblock_nr_pages)); 5606 pageblock_nr_pages));
5603 } 5607 }
5604 5608
5605 static struct page * 5609 static struct page *
5606 __alloc_contig_migrate_alloc(struct page *page, unsigned long private, 5610 __alloc_contig_migrate_alloc(struct page *page, unsigned long private,
5607 int **resultp) 5611 int **resultp)
5608 { 5612 {
5609 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; 5613 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
5610 5614
5611 if (PageHighMem(page)) 5615 if (PageHighMem(page))
5612 gfp_mask |= __GFP_HIGHMEM; 5616 gfp_mask |= __GFP_HIGHMEM;
5613 5617
5614 return alloc_page(gfp_mask); 5618 return alloc_page(gfp_mask);
5615 } 5619 }
5616 5620
5617 /* [start, end) must belong to a single zone. */ 5621 /* [start, end) must belong to a single zone. */
5618 static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) 5622 static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
5619 { 5623 {
5620 /* This function is based on compact_zone() from compaction.c. */ 5624 /* This function is based on compact_zone() from compaction.c. */
5621 5625
5622 unsigned long pfn = start; 5626 unsigned long pfn = start;
5623 unsigned int tries = 0; 5627 unsigned int tries = 0;
5624 int ret = 0; 5628 int ret = 0;
5625 5629
5626 struct compact_control cc = { 5630 struct compact_control cc = {
5627 .nr_migratepages = 0, 5631 .nr_migratepages = 0,
5628 .order = -1, 5632 .order = -1,
5629 .zone = page_zone(pfn_to_page(start)), 5633 .zone = page_zone(pfn_to_page(start)),
5630 .sync = true, 5634 .sync = true,
5631 }; 5635 };
5632 INIT_LIST_HEAD(&cc.migratepages); 5636 INIT_LIST_HEAD(&cc.migratepages);
5633 5637
5634 migrate_prep_local(); 5638 migrate_prep_local();
5635 5639
5636 while (pfn < end || !list_empty(&cc.migratepages)) { 5640 while (pfn < end || !list_empty(&cc.migratepages)) {
5637 if (fatal_signal_pending(current)) { 5641 if (fatal_signal_pending(current)) {
5638 ret = -EINTR; 5642 ret = -EINTR;
5639 break; 5643 break;
5640 } 5644 }
5641 5645
5642 if (list_empty(&cc.migratepages)) { 5646 if (list_empty(&cc.migratepages)) {
5643 cc.nr_migratepages = 0; 5647 cc.nr_migratepages = 0;
5644 pfn = isolate_migratepages_range(cc.zone, &cc, 5648 pfn = isolate_migratepages_range(cc.zone, &cc,
5645 pfn, end); 5649 pfn, end);
5646 if (!pfn) { 5650 if (!pfn) {
5647 ret = -EINTR; 5651 ret = -EINTR;
5648 break; 5652 break;
5649 } 5653 }
5650 tries = 0; 5654 tries = 0;
5651 } else if (++tries == 5) { 5655 } else if (++tries == 5) {
5652 ret = ret < 0 ? ret : -EBUSY; 5656 ret = ret < 0 ? ret : -EBUSY;
5653 break; 5657 break;
5654 } 5658 }
5655 5659
5656 ret = migrate_pages(&cc.migratepages, 5660 ret = migrate_pages(&cc.migratepages,
5657 __alloc_contig_migrate_alloc, 5661 __alloc_contig_migrate_alloc,
5658 0, false, MIGRATE_SYNC); 5662 0, false, MIGRATE_SYNC);
5659 } 5663 }
5660 5664
5661 putback_lru_pages(&cc.migratepages); 5665 putback_lru_pages(&cc.migratepages);
5662 return ret > 0 ? 0 : ret; 5666 return ret > 0 ? 0 : ret;
5663 } 5667 }
5664 5668
5665 /* 5669 /*
5666 * Update the zone's CMA page counter used for watermark level calculation. 5670 * Update the zone's CMA page counter used for watermark level calculation.
5667 */ 5671 */
5668 static inline void __update_cma_watermarks(struct zone *zone, int count) 5672 static inline void __update_cma_watermarks(struct zone *zone, int count)
5669 { 5673 {
5670 unsigned long flags; 5674 unsigned long flags;
5671 spin_lock_irqsave(&zone->lock, flags); 5675 spin_lock_irqsave(&zone->lock, flags);
5672 zone->min_cma_pages += count; 5676 zone->min_cma_pages += count;
5673 spin_unlock_irqrestore(&zone->lock, flags); 5677 spin_unlock_irqrestore(&zone->lock, flags);
5674 setup_per_zone_wmarks(); 5678 setup_per_zone_wmarks();
5675 } 5679 }
5676 5680
5677 /* 5681 /*
5678 * Trigger a memory pressure bump to reclaim some pages in order to be able to 5682 * Trigger a memory pressure bump to reclaim some pages in order to be able to
5679 * allocate 'count' pages in single page units. Does similar work to the 5683 * allocate 'count' pages in single page units. Does similar work to the
5680 * __alloc_pages_slowpath() function. 5684 * __alloc_pages_slowpath() function.
5681 */ 5685 */
5682 static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) 5686 static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5683 { 5687 {
5684 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 5688 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5685 struct zonelist *zonelist = node_zonelist(0, gfp_mask); 5689 struct zonelist *zonelist = node_zonelist(0, gfp_mask);
5686 int did_some_progress = 0; 5690 int did_some_progress = 0;
5687 int order = 1; 5691 int order = 1;
5688 5692
5689 /* 5693 /*
5690 * Increase the watermark levels to force kswapd to do its job 5694 * Increase the watermark levels to force kswapd to do its job
5691 * and stabilise at the new watermark level. 5695 * and stabilise at the new watermark level.
5692 */ 5696 */
5693 __update_cma_watermarks(zone, count); 5697 __update_cma_watermarks(zone, count);
5694 5698
5695 /* Obey watermarks as if the page was being allocated */ 5699 /* Obey watermarks as if the page was being allocated */
5696 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { 5700 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
5697 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); 5701 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
5698 5702
5699 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 5703 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
5700 NULL); 5704 NULL);
5701 if (!did_some_progress) { 5705 if (!did_some_progress) {
5702 /* Exhausted what can be done so it's blamo time */ 5706 /* Exhausted what can be done so it's blamo time */
5703 out_of_memory(zonelist, gfp_mask, order, NULL, false); 5707 out_of_memory(zonelist, gfp_mask, order, NULL, false);
5704 } 5708 }
5705 } 5709 }
5706 5710
5707 /* Restore original watermark levels. */ 5711 /* Restore original watermark levels. */
5708 __update_cma_watermarks(zone, -count); 5712 __update_cma_watermarks(zone, -count);
5709 5713
5710 return count; 5714 return count;
5711 } 5715 }
5712 5716
5713 /** 5717 /**
5714 * alloc_contig_range() -- tries to allocate given range of pages 5718 * alloc_contig_range() -- tries to allocate given range of pages
5715 * @start: start PFN to allocate 5719 * @start: start PFN to allocate
5716 * @end: one-past-the-last PFN to allocate 5720 * @end: one-past-the-last PFN to allocate
5717 * @migratetype: migratetype of the underlying pageblocks (either 5721 * @migratetype: migratetype of the underlying pageblocks (either
5718 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 5722 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
5719 * in range must have the same migratetype and it must 5723 * in range must have the same migratetype and it must
5720 * be either of the two. 5724 * be either of the two.
5721 * 5725 *
5722 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 5726 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
5723 * aligned, however it's the caller's responsibility to guarantee that 5727 * aligned, however it's the caller's responsibility to guarantee that
5724 * we are the only thread that changes migrate type of pageblocks the 5728 * we are the only thread that changes migrate type of pageblocks the
5725 * pages fall in. 5729 * pages fall in.
5726 * 5730 *
5727 * The PFN range must belong to a single zone. 5731 * The PFN range must belong to a single zone.
5728 * 5732 *
5729 * Returns zero on success or negative error code. On success all 5733 * Returns zero on success or negative error code. On success all
5730 * pages which PFN is in [start, end) are allocated for the caller and 5734 * pages which PFN is in [start, end) are allocated for the caller and
5731 * need to be freed with free_contig_range(). 5735 * need to be freed with free_contig_range().
5732 */ 5736 */
5733 int alloc_contig_range(unsigned long start, unsigned long end, 5737 int alloc_contig_range(unsigned long start, unsigned long end,
5734 unsigned migratetype) 5738 unsigned migratetype)
5735 { 5739 {
5736 struct zone *zone = page_zone(pfn_to_page(start)); 5740 struct zone *zone = page_zone(pfn_to_page(start));
5737 unsigned long outer_start, outer_end; 5741 unsigned long outer_start, outer_end;
5738 int ret = 0, order; 5742 int ret = 0, order;
5739 5743
5740 /* 5744 /*
5741 * What we do here is mark all pageblocks in the range as 5745 * What we do here is mark all pageblocks in the range as
5742 * MIGRATE_ISOLATE. Because pageblock and max order pages may 5746 * MIGRATE_ISOLATE. Because pageblock and max order pages may
5743 * have different sizes, and due to the way the page allocator 5747 * have different sizes, and due to the way the page allocator
5744 * works, we align the range to the bigger of the two sizes so 5748 * works, we align the range to the bigger of the two sizes so
5745 * that the page allocator won't try to merge buddies from 5749 * that the page allocator won't try to merge buddies from
5746 * different pageblocks and change MIGRATE_ISOLATE to some 5750 * different pageblocks and change MIGRATE_ISOLATE to some
5747 * other migration type. 5751 * other migration type.
5748 * 5752 *
5749 * Once the pageblocks are marked as MIGRATE_ISOLATE, we 5753 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
5750 * migrate the pages from an unaligned range (i.e. pages that 5754 * migrate the pages from an unaligned range (i.e. pages that
5751 * we are interested in). This will put all the pages in 5755 * we are interested in). This will put all the pages in
5752 * range back to the page allocator as MIGRATE_ISOLATE. 5756 * range back to the page allocator as MIGRATE_ISOLATE.
5753 * 5757 *
5754 * When this is done, we take the pages in range from the page 5758 * When this is done, we take the pages in range from the page
5755 * allocator, removing them from the buddy system. This way the 5759 * allocator, removing them from the buddy system. This way the
5756 * page allocator will never consider using them. 5760 * page allocator will never consider using them.
5757 * 5761 *
5758 * This lets us mark the pageblocks back as 5762 * This lets us mark the pageblocks back as
5759 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the 5763 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
5760 * aligned range but not in the unaligned, original range are 5764 * aligned range but not in the unaligned, original range are
5761 * put back to page allocator so that buddy can use them. 5765 * put back to page allocator so that buddy can use them.
5762 */ 5766 */
5763 5767
5764 ret = start_isolate_page_range(pfn_max_align_down(start), 5768 ret = start_isolate_page_range(pfn_max_align_down(start),
5765 pfn_max_align_up(end), migratetype); 5769 pfn_max_align_up(end), migratetype);
5766 if (ret) 5770 if (ret)
5767 goto done; 5771 goto done;
5768 5772
5769 ret = __alloc_contig_migrate_range(start, end); 5773 ret = __alloc_contig_migrate_range(start, end);
5770 if (ret) 5774 if (ret)
5771 goto done; 5775 goto done;
5772 5776
5773 /* 5777 /*
5774 * Pages from [start, end) are within MAX_ORDER_NR_PAGES 5778 * Pages from [start, end) are within MAX_ORDER_NR_PAGES
5775 * aligned blocks that are marked as MIGRATE_ISOLATE. What's 5779 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
5776 * more, all pages in [start, end) are free in the page allocator. 5780 * more, all pages in [start, end) are free in the page allocator.
5777 * What we are going to do is allocate all pages from 5781 * What we are going to do is allocate all pages from
5778 * [start, end) (that is, remove them from the page allocator). 5782 * [start, end) (that is, remove them from the page allocator).
5779 * 5783 *
5780 * The only problem is that pages at the beginning and at the 5784 * The only problem is that pages at the beginning and at the
5781 * end of the interesting range may not be aligned with pages that 5785 * end of the interesting range may not be aligned with pages that
5782 * the page allocator holds, i.e. they can be part of higher order 5786 * the page allocator holds, i.e. they can be part of higher order
5783 * pages. Because of this, we reserve the bigger range and 5787 * pages. Because of this, we reserve the bigger range and
5784 * once this is done free the pages we are not interested in. 5788 * once this is done free the pages we are not interested in.
5785 * 5789 *
5786 * We don't have to hold zone->lock here because the pages are 5790 * We don't have to hold zone->lock here because the pages are
5787 * isolated and thus won't get removed from the buddy allocator. 5791 * isolated and thus won't get removed from the buddy allocator.
5788 */ 5792 */
5789 5793
5790 lru_add_drain_all(); 5794 lru_add_drain_all();
5791 drain_all_pages(); 5795 drain_all_pages();
5792 5796
5793 order = 0; 5797 order = 0;
5794 outer_start = start; 5798 outer_start = start;
5795 while (!PageBuddy(pfn_to_page(outer_start))) { 5799 while (!PageBuddy(pfn_to_page(outer_start))) {
5796 if (++order >= MAX_ORDER) { 5800 if (++order >= MAX_ORDER) {
5797 ret = -EBUSY; 5801 ret = -EBUSY;
5798 goto done; 5802 goto done;
5799 } 5803 }
5800 outer_start &= ~0UL << order; 5804 outer_start &= ~0UL << order;
5801 } 5805 }
5802 5806
5803 /* Make sure the range is really isolated. */ 5807 /* Make sure the range is really isolated. */
5804 if (test_pages_isolated(outer_start, end)) { 5808 if (test_pages_isolated(outer_start, end)) {
5805 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 5809 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
5806 outer_start, end); 5810 outer_start, end);
5807 ret = -EBUSY; 5811 ret = -EBUSY;
5808 goto done; 5812 goto done;
5809 } 5813 }
5810 5814
5811 /* 5815 /*
5812 * Reclaim enough pages to make sure that contiguous allocation 5816 * Reclaim enough pages to make sure that contiguous allocation
5813 * will not starve the system. 5817 * will not starve the system.
5814 */ 5818 */
5815 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); 5819 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5816 5820
5817 /* Grab isolated pages from freelists. */ 5821 /* Grab isolated pages from freelists. */
5818 outer_end = isolate_freepages_range(outer_start, end); 5822 outer_end = isolate_freepages_range(outer_start, end);
5819 if (!outer_end) { 5823 if (!outer_end) {
5820 ret = -EBUSY; 5824 ret = -EBUSY;
5821 goto done; 5825 goto done;
5822 } 5826 }
5823 5827
5824 /* Free head and tail (if any) */ 5828 /* Free head and tail (if any) */
5825 if (start != outer_start) 5829 if (start != outer_start)
5826 free_contig_range(outer_start, start - outer_start); 5830 free_contig_range(outer_start, start - outer_start);
5827 if (end != outer_end) 5831 if (end != outer_end)
5828 free_contig_range(end, outer_end - end); 5832 free_contig_range(end, outer_end - end);
5829 5833
5830 done: 5834 done:
5831 undo_isolate_page_range(pfn_max_align_down(start), 5835 undo_isolate_page_range(pfn_max_align_down(start),
5832 pfn_max_align_up(end), migratetype); 5836 pfn_max_align_up(end), migratetype);
5833 return ret; 5837 return ret;
5834 } 5838 }
5835 5839
5836 void free_contig_range(unsigned long pfn, unsigned nr_pages) 5840 void free_contig_range(unsigned long pfn, unsigned nr_pages)
5837 { 5841 {
5838 for (; nr_pages--; ++pfn) 5842 for (; nr_pages--; ++pfn)
5839 __free_page(pfn_to_page(pfn)); 5843 __free_page(pfn_to_page(pfn));
5840 } 5844 }
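/*
 * Hypothetical usage sketch (illustration only; the example_* names are
 * placeholders, not kernel symbols): a CMA-style user grabs a contiguous
 * run of pages from an area whose pageblocks were reserved as MIGRATE_CMA
 * at boot, and later releases it with free_contig_range().
 */
static struct page *example_contig_alloc(unsigned long base_pfn,
					 unsigned long count)
{
	if (alloc_contig_range(base_pfn, base_pfn + count, MIGRATE_CMA))
		return NULL;
	return pfn_to_page(base_pfn);
}

static void example_contig_release(struct page *page, unsigned long count)
{
	free_contig_range(page_to_pfn(page), count);
}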
5841 #endif 5845 #endif
5842 5846
5843 #ifdef CONFIG_MEMORY_HOTPLUG 5847 #ifdef CONFIG_MEMORY_HOTPLUG
5844 static int __meminit __zone_pcp_update(void *data) 5848 static int __meminit __zone_pcp_update(void *data)
5845 { 5849 {
5846 struct zone *zone = data; 5850 struct zone *zone = data;
5847 int cpu; 5851 int cpu;
5848 unsigned long batch = zone_batchsize(zone), flags; 5852 unsigned long batch = zone_batchsize(zone), flags;
5849 5853
5850 for_each_possible_cpu(cpu) { 5854 for_each_possible_cpu(cpu) {
5851 struct per_cpu_pageset *pset; 5855 struct per_cpu_pageset *pset;
5852 struct per_cpu_pages *pcp; 5856 struct per_cpu_pages *pcp;
5853 5857
5854 pset = per_cpu_ptr(zone->pageset, cpu); 5858 pset = per_cpu_ptr(zone->pageset, cpu);
5855 pcp = &pset->pcp; 5859 pcp = &pset->pcp;
5856 5860
5857 local_irq_save(flags); 5861 local_irq_save(flags);
5858 if (pcp->count > 0) 5862 if (pcp->count > 0)
5859 free_pcppages_bulk(zone, pcp->count, pcp); 5863 free_pcppages_bulk(zone, pcp->count, pcp);
5860 setup_pageset(pset, batch); 5864 setup_pageset(pset, batch);
5861 local_irq_restore(flags); 5865 local_irq_restore(flags);
5862 } 5866 }
5863 return 0; 5867 return 0;
5864 } 5868 }
5865 5869
5866 void __meminit zone_pcp_update(struct zone *zone) 5870 void __meminit zone_pcp_update(struct zone *zone)
5867 { 5871 {
5868 stop_machine(__zone_pcp_update, zone, NULL); 5872 stop_machine(__zone_pcp_update, zone, NULL);
5869 } 5873 }
5870 #endif 5874 #endif
5871 5875
5872 #ifdef CONFIG_MEMORY_HOTREMOVE 5876 #ifdef CONFIG_MEMORY_HOTREMOVE
5873 void zone_pcp_reset(struct zone *zone) 5877 void zone_pcp_reset(struct zone *zone)
5874 { 5878 {
5875 unsigned long flags; 5879 unsigned long flags;
5876 5880
5877 /* avoid races with drain_pages() */ 5881 /* avoid races with drain_pages() */
5878 local_irq_save(flags); 5882 local_irq_save(flags);
5879 if (zone->pageset != &boot_pageset) { 5883 if (zone->pageset != &boot_pageset) {
5880 free_percpu(zone->pageset); 5884 free_percpu(zone->pageset);
5881 zone->pageset = &boot_pageset; 5885 zone->pageset = &boot_pageset;
5882 } 5886 }
5883 local_irq_restore(flags); 5887 local_irq_restore(flags);
5884 } 5888 }
5885 5889
5886 /* 5890 /*
5887 * All pages in the range must be isolated before calling this. 5891 * All pages in the range must be isolated before calling this.
5888 */ 5892 */
5889 void 5893 void
5890 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 5894 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5891 { 5895 {
5892 struct page *page; 5896 struct page *page;
5893 struct zone *zone; 5897 struct zone *zone;
5894 int order, i; 5898 int order, i;
5895 unsigned long pfn; 5899 unsigned long pfn;
5896 unsigned long flags; 5900 unsigned long flags;
5897 /* find the first valid pfn */ 5901 /* find the first valid pfn */
5898 for (pfn = start_pfn; pfn < end_pfn; pfn++) 5902 for (pfn = start_pfn; pfn < end_pfn; pfn++)
5899 if (pfn_valid(pfn)) 5903 if (pfn_valid(pfn))
5900 break; 5904 break;
5901 if (pfn == end_pfn) 5905 if (pfn == end_pfn)
5902 return; 5906 return;
5903 zone = page_zone(pfn_to_page(pfn)); 5907 zone = page_zone(pfn_to_page(pfn));
5904 spin_lock_irqsave(&zone->lock, flags); 5908 spin_lock_irqsave(&zone->lock, flags);
5905 pfn = start_pfn; 5909 pfn = start_pfn;
5906 while (pfn < end_pfn) { 5910 while (pfn < end_pfn) {
5907 if (!pfn_valid(pfn)) { 5911 if (!pfn_valid(pfn)) {
5908 pfn++; 5912 pfn++;
5909 continue; 5913 continue;
5910 } 5914 }
5911 page = pfn_to_page(pfn); 5915 page = pfn_to_page(pfn);
5912 BUG_ON(page_count(page)); 5916 BUG_ON(page_count(page));
5913 BUG_ON(!PageBuddy(page)); 5917 BUG_ON(!PageBuddy(page));
5914 order = page_order(page); 5918 order = page_order(page);
5915 #ifdef CONFIG_DEBUG_VM 5919 #ifdef CONFIG_DEBUG_VM
5916 printk(KERN_INFO "remove from free list %lx %d %lx\n", 5920 printk(KERN_INFO "remove from free list %lx %d %lx\n",
5917 pfn, 1 << order, end_pfn); 5921 pfn, 1 << order, end_pfn);
5918 #endif 5922 #endif
5919 list_del(&page->lru); 5923 list_del(&page->lru);
5920 rmv_page_order(page); 5924 rmv_page_order(page);
5921 zone->free_area[order].nr_free--; 5925 zone->free_area[order].nr_free--;
5922 __mod_zone_page_state(zone, NR_FREE_PAGES, 5926 __mod_zone_page_state(zone, NR_FREE_PAGES,
5923 - (1UL << order)); 5927 - (1UL << order));
5924 for (i = 0; i < (1 << order); i++) 5928 for (i = 0; i < (1 << order); i++)
5925 SetPageReserved((page+i)); 5929 SetPageReserved((page+i));
5926 pfn += (1 << order); 5930 pfn += (1 << order);
5927 } 5931 }
5928 spin_unlock_irqrestore(&zone->lock, flags); 5932 spin_unlock_irqrestore(&zone->lock, flags);
5929 } 5933 }
5930 #endif 5934 #endif
5931 5935
5932 #ifdef CONFIG_MEMORY_FAILURE 5936 #ifdef CONFIG_MEMORY_FAILURE
5933 bool is_free_buddy_page(struct page *page) 5937 bool is_free_buddy_page(struct page *page)
5934 { 5938 {
5935 struct zone *zone = page_zone(page); 5939 struct zone *zone = page_zone(page);
5936 unsigned long pfn = page_to_pfn(page); 5940 unsigned long pfn = page_to_pfn(page);
5937 unsigned long flags; 5941 unsigned long flags;
5938 int order; 5942 int order;
5939 5943
5940 spin_lock_irqsave(&zone->lock, flags); 5944 spin_lock_irqsave(&zone->lock, flags);
5941 for (order = 0; order < MAX_ORDER; order++) { 5945 for (order = 0; order < MAX_ORDER; order++) {
5942 struct page *page_head = page - (pfn & ((1 << order) - 1)); 5946 struct page *page_head = page - (pfn & ((1 << order) - 1));
5943 5947
5944 if (PageBuddy(page_head) && page_order(page_head) >= order) 5948 if (PageBuddy(page_head) && page_order(page_head) >= order)
5945 break; 5949 break;
5946 } 5950 }
5947 spin_unlock_irqrestore(&zone->lock, flags); 5951 spin_unlock_irqrestore(&zone->lock, flags);
5948 5952
5949 return order < MAX_ORDER; 5953 return order < MAX_ORDER;
5950 } 5954 }
5951 #endif 5955 #endif
5952 5956
5953 static const struct trace_print_flags pageflag_names[] = { 5957 static const struct trace_print_flags pageflag_names[] = {
5954 {1UL << PG_locked, "locked" }, 5958 {1UL << PG_locked, "locked" },
5955 {1UL << PG_error, "error" }, 5959 {1UL << PG_error, "error" },
5956 {1UL << PG_referenced, "referenced" }, 5960 {1UL << PG_referenced, "referenced" },
5957 {1UL << PG_uptodate, "uptodate" }, 5961 {1UL << PG_uptodate, "uptodate" },
5958 {1UL << PG_dirty, "dirty" }, 5962 {1UL << PG_dirty, "dirty" },
5959 {1UL << PG_lru, "lru" }, 5963 {1UL << PG_lru, "lru" },
5960 {1UL << PG_active, "active" }, 5964 {1UL << PG_active, "active" },
5961 {1UL << PG_slab, "slab" }, 5965 {1UL << PG_slab, "slab" },
5962 {1UL << PG_owner_priv_1, "owner_priv_1" }, 5966 {1UL << PG_owner_priv_1, "owner_priv_1" },
5963 {1UL << PG_arch_1, "arch_1" }, 5967 {1UL << PG_arch_1, "arch_1" },
5964 {1UL << PG_reserved, "reserved" }, 5968 {1UL << PG_reserved, "reserved" },
5965 {1UL << PG_private, "private" }, 5969 {1UL << PG_private, "private" },
5966 {1UL << PG_private_2, "private_2" }, 5970 {1UL << PG_private_2, "private_2" },
5967 {1UL << PG_writeback, "writeback" }, 5971 {1UL << PG_writeback, "writeback" },
5968 #ifdef CONFIG_PAGEFLAGS_EXTENDED 5972 #ifdef CONFIG_PAGEFLAGS_EXTENDED
5969 {1UL << PG_head, "head" }, 5973 {1UL << PG_head, "head" },
5970 {1UL << PG_tail, "tail" }, 5974 {1UL << PG_tail, "tail" },
5971 #else 5975 #else
5972 {1UL << PG_compound, "compound" }, 5976 {1UL << PG_compound, "compound" },
5973 #endif 5977 #endif
5974 {1UL << PG_swapcache, "swapcache" }, 5978 {1UL << PG_swapcache, "swapcache" },
5975 {1UL << PG_mappedtodisk, "mappedtodisk" }, 5979 {1UL << PG_mappedtodisk, "mappedtodisk" },
5976 {1UL << PG_reclaim, "reclaim" }, 5980 {1UL << PG_reclaim, "reclaim" },
5977 {1UL << PG_swapbacked, "swapbacked" }, 5981 {1UL << PG_swapbacked, "swapbacked" },
5978 {1UL << PG_unevictable, "unevictable" }, 5982 {1UL << PG_unevictable, "unevictable" },
5979 #ifdef CONFIG_MMU 5983 #ifdef CONFIG_MMU
5980 {1UL << PG_mlocked, "mlocked" }, 5984 {1UL << PG_mlocked, "mlocked" },
5981 #endif 5985 #endif
5982 #ifdef CONFIG_ARCH_USES_PG_UNCACHED 5986 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
5983 {1UL << PG_uncached, "uncached" }, 5987 {1UL << PG_uncached, "uncached" },
5984 #endif 5988 #endif
5985 #ifdef CONFIG_MEMORY_FAILURE 5989 #ifdef CONFIG_MEMORY_FAILURE
5986 {1UL << PG_hwpoison, "hwpoison" }, 5990 {1UL << PG_hwpoison, "hwpoison" },
5987 #endif 5991 #endif
5988 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5992 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5989 {1UL << PG_compound_lock, "compound_lock" }, 5993 {1UL << PG_compound_lock, "compound_lock" },
5990 #endif 5994 #endif
5991 }; 5995 };
5992 5996
5993 static void dump_page_flags(unsigned long flags) 5997 static void dump_page_flags(unsigned long flags)
5994 { 5998 {
5995 const char *delim = ""; 5999 const char *delim = "";
5996 unsigned long mask; 6000 unsigned long mask;
5997 int i; 6001 int i;
5998 6002
5999 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); 6003 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6000 6004
6001 printk(KERN_ALERT "page flags: %#lx(", flags); 6005 printk(KERN_ALERT "page flags: %#lx(", flags);
6002 6006
6003 /* remove zone id */ 6007 /* remove zone id */
6004 flags &= (1UL << NR_PAGEFLAGS) - 1; 6008 flags &= (1UL << NR_PAGEFLAGS) - 1;
6005 6009
6006 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { 6010 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6007 6011
6008 mask = pageflag_names[i].mask; 6012 mask = pageflag_names[i].mask;
6009 if ((flags & mask) != mask) 6013 if ((flags & mask) != mask)
6010 continue; 6014 continue;
6011 6015
6012 flags &= ~mask; 6016 flags &= ~mask;
6013 printk("%s%s", delim, pageflag_names[i].name); 6017 printk("%s%s", delim, pageflag_names[i].name);
6014 delim = "|"; 6018 delim = "|";
6015 } 6019 }
6016 6020
6017 /* check for left over flags */ 6021 /* check for left over flags */
6018 if (flags) 6022 if (flags)
6019 printk("%s%#lx", delim, flags); 6023 printk("%s%#lx", delim, flags);
6020 6024
6021 printk(")\n"); 6025 printk(")\n");
6022 } 6026 }
6023 6027
6024 void dump_page(struct page *page) 6028 void dump_page(struct page *page)
6025 { 6029 {
6026 printk(KERN_ALERT 6030 printk(KERN_ALERT
6027 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6031 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6028 page, atomic_read(&page->_count), page_mapcount(page), 6032 page, atomic_read(&page->_count), page_mapcount(page),
6029 page->mapping, page->index); 6033 page->mapping, page->index);
6030 dump_page_flags(page->flags); 6034 dump_page_flags(page->flags);
6031 mem_cgroup_print_bad_page(page); 6035 mem_cgroup_print_bad_page(page);
6032 } 6036 }
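/*
 * Illustrative output sketch (values are invented for the example) in the
 * shape produced by the two printk formats above, here for an idle
 * PG_reserved boot page:
 *
 *   page:ffffea0000348000 count:1 mapcount:0 mapping:(null) index:0x0
 *   page flags: 0x400(reserved)
 *
 * Any flag bits without an entry in pageflag_names[] would be appended in
 * hex after the named flags.
 */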
6033 6037