Commit 04903664325acb3f199dd8a4b8f1aa437e9fd6b2

Authored by Andrew Morton
Committed by Linus Torvalds
1 parent 93f210dd9e

[PATCH] remove HASH_HIGHMEM

It has no users and it's doubtful that we'll need it again.

Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 2 changed files with 2 additions and 3 deletions

include/linux/bootmem.h
1 /* 1 /*
2 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 2 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
3 */ 3 */
4 #ifndef _LINUX_BOOTMEM_H 4 #ifndef _LINUX_BOOTMEM_H
5 #define _LINUX_BOOTMEM_H 5 #define _LINUX_BOOTMEM_H
6 6
7 #include <linux/mmzone.h> 7 #include <linux/mmzone.h>
8 #include <asm/dma.h> 8 #include <asm/dma.h>
9 9
10 /* 10 /*
11 * simple boot-time physical memory area allocator. 11 * simple boot-time physical memory area allocator.
12 */ 12 */
13 13
14 extern unsigned long max_low_pfn; 14 extern unsigned long max_low_pfn;
15 extern unsigned long min_low_pfn; 15 extern unsigned long min_low_pfn;
16 16
17 /* 17 /*
18 * highest page 18 * highest page
19 */ 19 */
20 extern unsigned long max_pfn; 20 extern unsigned long max_pfn;
21 21
22 #ifdef CONFIG_CRASH_DUMP 22 #ifdef CONFIG_CRASH_DUMP
23 extern unsigned long saved_max_pfn; 23 extern unsigned long saved_max_pfn;
24 #endif 24 #endif
25 25
26 /* 26 /*
27 * node_bootmem_map is a map pointer - the bits represent all physical 27 * node_bootmem_map is a map pointer - the bits represent all physical
28 * memory pages (including holes) on the node. 28 * memory pages (including holes) on the node.
29 */ 29 */
30 typedef struct bootmem_data { 30 typedef struct bootmem_data {
31 unsigned long node_boot_start; 31 unsigned long node_boot_start;
32 unsigned long node_low_pfn; 32 unsigned long node_low_pfn;
33 void *node_bootmem_map; 33 void *node_bootmem_map;
34 unsigned long last_offset; 34 unsigned long last_offset;
35 unsigned long last_pos; 35 unsigned long last_pos;
36 unsigned long last_success; /* Previous allocation point. To speed 36 unsigned long last_success; /* Previous allocation point. To speed
37 * up searching */ 37 * up searching */
38 struct list_head list; 38 struct list_head list;
39 } bootmem_data_t; 39 } bootmem_data_t;
40 40
41 extern unsigned long bootmem_bootmap_pages(unsigned long); 41 extern unsigned long bootmem_bootmap_pages(unsigned long);
42 extern unsigned long init_bootmem(unsigned long addr, unsigned long memend); 42 extern unsigned long init_bootmem(unsigned long addr, unsigned long memend);
43 extern void free_bootmem(unsigned long addr, unsigned long size); 43 extern void free_bootmem(unsigned long addr, unsigned long size);
44 extern void *__alloc_bootmem(unsigned long size, 44 extern void *__alloc_bootmem(unsigned long size,
45 unsigned long align, 45 unsigned long align,
46 unsigned long goal); 46 unsigned long goal);
47 extern void *__alloc_bootmem_nopanic(unsigned long size, 47 extern void *__alloc_bootmem_nopanic(unsigned long size,
48 unsigned long align, 48 unsigned long align,
49 unsigned long goal); 49 unsigned long goal);
50 extern void *__alloc_bootmem_low(unsigned long size, 50 extern void *__alloc_bootmem_low(unsigned long size,
51 unsigned long align, 51 unsigned long align,
52 unsigned long goal); 52 unsigned long goal);
53 extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, 53 extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
54 unsigned long size, 54 unsigned long size,
55 unsigned long align, 55 unsigned long align,
56 unsigned long goal); 56 unsigned long goal);
57 extern void *__alloc_bootmem_core(struct bootmem_data *bdata, 57 extern void *__alloc_bootmem_core(struct bootmem_data *bdata,
58 unsigned long size, 58 unsigned long size,
59 unsigned long align, 59 unsigned long align,
60 unsigned long goal, 60 unsigned long goal,
61 unsigned long limit); 61 unsigned long limit);
62 62
63 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 63 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
64 extern void reserve_bootmem(unsigned long addr, unsigned long size); 64 extern void reserve_bootmem(unsigned long addr, unsigned long size);
65 #define alloc_bootmem(x) \ 65 #define alloc_bootmem(x) \
66 __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) 66 __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
67 #define alloc_bootmem_low(x) \ 67 #define alloc_bootmem_low(x) \
68 __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0) 68 __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
69 #define alloc_bootmem_pages(x) \ 69 #define alloc_bootmem_pages(x) \
70 __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) 70 __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
71 #define alloc_bootmem_low_pages(x) \ 71 #define alloc_bootmem_low_pages(x) \
72 __alloc_bootmem_low(x, PAGE_SIZE, 0) 72 __alloc_bootmem_low(x, PAGE_SIZE, 0)
73 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 73 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
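
As a point of reference for the boot-time allocator API declared above, here is a minimal, hedged sketch of a typical early-boot caller. The table and function names are hypothetical, not part of this commit:

#include <linux/bootmem.h>
#include <linux/init.h>

/* Hypothetical early-boot table; illustrative only, not from this commit. */
static unsigned long *example_table;

void __init example_table_init(unsigned long nentries)
{
	/*
	 * alloc_bootmem() expands to __alloc_bootmem() with SMP_CACHE_BYTES
	 * alignment (see the macro above); the memory it returns stays
	 * allocated after free_all_bootmem() hands the remaining boot memory
	 * over to the buddy allocator.
	 */
	example_table = alloc_bootmem(nentries * sizeof(unsigned long));
}
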
74 74
75 extern unsigned long free_all_bootmem(void); 75 extern unsigned long free_all_bootmem(void);
76 extern unsigned long free_all_bootmem_node(pg_data_t *pgdat); 76 extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
77 extern void *__alloc_bootmem_node(pg_data_t *pgdat, 77 extern void *__alloc_bootmem_node(pg_data_t *pgdat,
78 unsigned long size, 78 unsigned long size,
79 unsigned long align, 79 unsigned long align,
80 unsigned long goal); 80 unsigned long goal);
81 extern unsigned long init_bootmem_node(pg_data_t *pgdat, 81 extern unsigned long init_bootmem_node(pg_data_t *pgdat,
82 unsigned long freepfn, 82 unsigned long freepfn,
83 unsigned long startpfn, 83 unsigned long startpfn,
84 unsigned long endpfn); 84 unsigned long endpfn);
85 extern void reserve_bootmem_node(pg_data_t *pgdat, 85 extern void reserve_bootmem_node(pg_data_t *pgdat,
86 unsigned long physaddr, 86 unsigned long physaddr,
87 unsigned long size); 87 unsigned long size);
88 extern void free_bootmem_node(pg_data_t *pgdat, 88 extern void free_bootmem_node(pg_data_t *pgdat,
89 unsigned long addr, 89 unsigned long addr,
90 unsigned long size); 90 unsigned long size);
91 91
92 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 92 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
93 #define alloc_bootmem_node(pgdat, x) \ 93 #define alloc_bootmem_node(pgdat, x) \
94 __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) 94 __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
95 #define alloc_bootmem_pages_node(pgdat, x) \ 95 #define alloc_bootmem_pages_node(pgdat, x) \
96 __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) 96 __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
97 #define alloc_bootmem_low_pages_node(pgdat, x) \ 97 #define alloc_bootmem_low_pages_node(pgdat, x) \
98 __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0) 98 __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
99 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 99 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
100 100
101 #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP 101 #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
102 extern void *alloc_remap(int nid, unsigned long size); 102 extern void *alloc_remap(int nid, unsigned long size);
103 #else 103 #else
104 static inline void *alloc_remap(int nid, unsigned long size) 104 static inline void *alloc_remap(int nid, unsigned long size)
105 { 105 {
106 return NULL; 106 return NULL;
107 } 107 }
108 #endif /* CONFIG_HAVE_ARCH_ALLOC_REMAP */ 108 #endif /* CONFIG_HAVE_ARCH_ALLOC_REMAP */
109 109
110 extern unsigned long __meminitdata nr_kernel_pages; 110 extern unsigned long __meminitdata nr_kernel_pages;
111 extern unsigned long nr_all_pages; 111 extern unsigned long nr_all_pages;
112 112
113 extern void *alloc_large_system_hash(const char *tablename, 113 extern void *alloc_large_system_hash(const char *tablename,
114 unsigned long bucketsize, 114 unsigned long bucketsize,
115 unsigned long numentries, 115 unsigned long numentries,
116 int scale, 116 int scale,
117 int flags, 117 int flags,
118 unsigned int *_hash_shift, 118 unsigned int *_hash_shift,
119 unsigned int *_hash_mask, 119 unsigned int *_hash_mask,
120 unsigned long limit); 120 unsigned long limit);
121 121
122 -#define HASH_HIGHMEM	0x00000001	/* Consider highmem? */
123 -#define HASH_EARLY	0x00000002	/* Allocating during early boot? */
122 +#define HASH_EARLY	0x00000001	/* Allocating during early boot? */
124 123
125 /* Only NUMA needs hash distribution. 124 /* Only NUMA needs hash distribution.
126 * IA64 is known to have sufficient vmalloc space. 125 * IA64 is known to have sufficient vmalloc space.
127 */ 126 */
128 #if defined(CONFIG_NUMA) && defined(CONFIG_IA64) 127 #if defined(CONFIG_NUMA) && defined(CONFIG_IA64)
129 #define HASHDIST_DEFAULT 1 128 #define HASHDIST_DEFAULT 1
130 #else 129 #else
131 #define HASHDIST_DEFAULT 0 130 #define HASHDIST_DEFAULT 0
132 #endif 131 #endif
133 extern int hashdist; /* Distribute hashes across NUMA nodes? */ 132 extern int hashdist; /* Distribute hashes across NUMA nodes? */
134 133
135 134
136 #endif /* _LINUX_BOOTMEM_H */ 135 #endif /* _LINUX_BOOTMEM_H */
137 136
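
For context on the flag being removed: the HASH_* flags are consumed by alloc_large_system_hash(), declared above, and per the commit message no caller passes HASH_HIGHMEM, so only HASH_EARLY survives. A hedged sketch of a typical caller follows; the names are hypothetical, loosely modeled on the early dcache/inode hash setup rather than taken from this commit:

#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/list.h>

/* Hypothetical hash table; names are illustrative, not from this commit. */
static struct hlist_head *example_hashtable;
static unsigned int example_hash_shift;
static unsigned int example_hash_mask;

void __init example_hash_init(void)
{
	example_hashtable =
		alloc_large_system_hash("Example-cache",
					sizeof(struct hlist_head),
					0,		/* numentries: sized from system RAM */
					13,		/* scale */
					HASH_EARLY,	/* allocating during early boot */
					&example_hash_shift,
					&example_hash_mask,
					0);		/* no upper limit */
}
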
mm/page_alloc.c
1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/bootmem.h> 22 #include <linux/bootmem.h>
23 #include <linux/compiler.h> 23 #include <linux/compiler.h>
24 #include <linux/kernel.h> 24 #include <linux/kernel.h>
25 #include <linux/module.h> 25 #include <linux/module.h>
26 #include <linux/suspend.h> 26 #include <linux/suspend.h>
27 #include <linux/pagevec.h> 27 #include <linux/pagevec.h>
28 #include <linux/blkdev.h> 28 #include <linux/blkdev.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/notifier.h> 30 #include <linux/notifier.h>
31 #include <linux/topology.h> 31 #include <linux/topology.h>
32 #include <linux/sysctl.h> 32 #include <linux/sysctl.h>
33 #include <linux/cpu.h> 33 #include <linux/cpu.h>
34 #include <linux/cpuset.h> 34 #include <linux/cpuset.h>
35 #include <linux/memory_hotplug.h> 35 #include <linux/memory_hotplug.h>
36 #include <linux/nodemask.h> 36 #include <linux/nodemask.h>
37 #include <linux/vmalloc.h> 37 #include <linux/vmalloc.h>
38 #include <linux/mempolicy.h> 38 #include <linux/mempolicy.h>
39 #include <linux/stop_machine.h> 39 #include <linux/stop_machine.h>
40 #include <linux/sort.h> 40 #include <linux/sort.h>
41 #include <linux/pfn.h> 41 #include <linux/pfn.h>
42 #include <linux/backing-dev.h> 42 #include <linux/backing-dev.h>
43 43
44 #include <asm/tlbflush.h> 44 #include <asm/tlbflush.h>
45 #include <asm/div64.h> 45 #include <asm/div64.h>
46 #include "internal.h" 46 #include "internal.h"
47 47
48 /* 48 /*
49 * MCD - HACK: Find somewhere to initialize this EARLY, or make this 49 * MCD - HACK: Find somewhere to initialize this EARLY, or make this
50 * initializer cleaner 50 * initializer cleaner
51 */ 51 */
52 nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; 52 nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
53 EXPORT_SYMBOL(node_online_map); 53 EXPORT_SYMBOL(node_online_map);
54 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; 54 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
55 EXPORT_SYMBOL(node_possible_map); 55 EXPORT_SYMBOL(node_possible_map);
56 unsigned long totalram_pages __read_mostly; 56 unsigned long totalram_pages __read_mostly;
57 unsigned long totalreserve_pages __read_mostly; 57 unsigned long totalreserve_pages __read_mostly;
58 long nr_swap_pages; 58 long nr_swap_pages;
59 int percpu_pagelist_fraction; 59 int percpu_pagelist_fraction;
60 60
61 static void __free_pages_ok(struct page *page, unsigned int order); 61 static void __free_pages_ok(struct page *page, unsigned int order);
62 62
63 /* 63 /*
64 * results with 256, 32 in the lowmem_reserve sysctl: 64 * results with 256, 32 in the lowmem_reserve sysctl:
65 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 65 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
66 * 1G machine -> (16M dma, 784M normal, 224M high) 66 * 1G machine -> (16M dma, 784M normal, 224M high)
67 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 67 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
68 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 68 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
69 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 69 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
70 * 70 *
71 * TBD: should special case ZONE_DMA32 machines here - in those we normally 71 * TBD: should special case ZONE_DMA32 machines here - in those we normally
72 * don't need any ZONE_NORMAL reservation 72 * don't need any ZONE_NORMAL reservation
73 */ 73 */
74 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 74 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
75 256, 75 256,
76 #ifdef CONFIG_ZONE_DMA32 76 #ifdef CONFIG_ZONE_DMA32
77 256, 77 256,
78 #endif 78 #endif
79 #ifdef CONFIG_HIGHMEM 79 #ifdef CONFIG_HIGHMEM
80 32 80 32
81 #endif 81 #endif
82 }; 82 };
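
A small stand-alone illustration of the reserve arithmetic described in the lowmem_reserve comment above, using the 1G-machine example from that comment; this is plain userspace C for the worked numbers only, not kernel code:

#include <stdio.h>

int main(void)
{
	/* Zone sizes from the 1G example above, in MB. */
	unsigned long normal_mb = 784, highmem_mb = 224;
	/* Default ratios from sysctl_lowmem_reserve_ratio: 256 (DMA), 32 (NORMAL). */
	unsigned long dma_ratio = 256, normal_ratio = 32;

	/* A NORMAL allocation leaves 784M/256 of ZONE_DMA reserved. */
	printf("NORMAL  vs DMA:    %lu MB\n", normal_mb / dma_ratio);
	/* A HIGHMEM allocation leaves 224M/32 of ZONE_NORMAL reserved... */
	printf("HIGHMEM vs NORMAL: %lu MB\n", highmem_mb / normal_ratio);
	/* ...and (224M+784M)/256 of ZONE_DMA reserved. */
	printf("HIGHMEM vs DMA:    %lu MB\n", (highmem_mb + normal_mb) / dma_ratio);
	return 0;
}
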
83 83
84 EXPORT_SYMBOL(totalram_pages); 84 EXPORT_SYMBOL(totalram_pages);
85 85
86 static char *zone_names[MAX_NR_ZONES] = { 86 static char *zone_names[MAX_NR_ZONES] = {
87 "DMA", 87 "DMA",
88 #ifdef CONFIG_ZONE_DMA32 88 #ifdef CONFIG_ZONE_DMA32
89 "DMA32", 89 "DMA32",
90 #endif 90 #endif
91 "Normal", 91 "Normal",
92 #ifdef CONFIG_HIGHMEM 92 #ifdef CONFIG_HIGHMEM
93 "HighMem" 93 "HighMem"
94 #endif 94 #endif
95 }; 95 };
96 96
97 int min_free_kbytes = 1024; 97 int min_free_kbytes = 1024;
98 98
99 unsigned long __meminitdata nr_kernel_pages; 99 unsigned long __meminitdata nr_kernel_pages;
100 unsigned long __meminitdata nr_all_pages; 100 unsigned long __meminitdata nr_all_pages;
101 static unsigned long __initdata dma_reserve; 101 static unsigned long __initdata dma_reserve;
102 102
103 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 103 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
104 /* 104 /*
105 * MAX_ACTIVE_REGIONS determines the maxmimum number of distinct 105 * MAX_ACTIVE_REGIONS determines the maxmimum number of distinct
106 * ranges of memory (RAM) that may be registered with add_active_range(). 106 * ranges of memory (RAM) that may be registered with add_active_range().
107 * Ranges passed to add_active_range() will be merged if possible 107 * Ranges passed to add_active_range() will be merged if possible
108 * so the number of times add_active_range() can be called is 108 * so the number of times add_active_range() can be called is
109 * related to the number of nodes and the number of holes 109 * related to the number of nodes and the number of holes
110 */ 110 */
111 #ifdef CONFIG_MAX_ACTIVE_REGIONS 111 #ifdef CONFIG_MAX_ACTIVE_REGIONS
112 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ 112 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
113 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS 113 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
114 #else 114 #else
115 #if MAX_NUMNODES >= 32 115 #if MAX_NUMNODES >= 32
116 /* If there can be many nodes, allow up to 50 holes per node */ 116 /* If there can be many nodes, allow up to 50 holes per node */
117 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) 117 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
118 #else 118 #else
119 /* By default, allow up to 256 distinct regions */ 119 /* By default, allow up to 256 distinct regions */
120 #define MAX_ACTIVE_REGIONS 256 120 #define MAX_ACTIVE_REGIONS 256
121 #endif 121 #endif
122 #endif 122 #endif
123 123
124 struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; 124 struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
125 int __initdata nr_nodemap_entries; 125 int __initdata nr_nodemap_entries;
126 unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 126 unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
127 unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 127 unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
128 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 128 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
129 unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; 129 unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
130 unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; 130 unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
131 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 131 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
132 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 132 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
133 133
134 #ifdef CONFIG_DEBUG_VM 134 #ifdef CONFIG_DEBUG_VM
135 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 135 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
136 { 136 {
137 int ret = 0; 137 int ret = 0;
138 unsigned seq; 138 unsigned seq;
139 unsigned long pfn = page_to_pfn(page); 139 unsigned long pfn = page_to_pfn(page);
140 140
141 do { 141 do {
142 seq = zone_span_seqbegin(zone); 142 seq = zone_span_seqbegin(zone);
143 if (pfn >= zone->zone_start_pfn + zone->spanned_pages) 143 if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
144 ret = 1; 144 ret = 1;
145 else if (pfn < zone->zone_start_pfn) 145 else if (pfn < zone->zone_start_pfn)
146 ret = 1; 146 ret = 1;
147 } while (zone_span_seqretry(zone, seq)); 147 } while (zone_span_seqretry(zone, seq));
148 148
149 return ret; 149 return ret;
150 } 150 }
151 151
152 static int page_is_consistent(struct zone *zone, struct page *page) 152 static int page_is_consistent(struct zone *zone, struct page *page)
153 { 153 {
154 #ifdef CONFIG_HOLES_IN_ZONE 154 #ifdef CONFIG_HOLES_IN_ZONE
155 if (!pfn_valid(page_to_pfn(page))) 155 if (!pfn_valid(page_to_pfn(page)))
156 return 0; 156 return 0;
157 #endif 157 #endif
158 if (zone != page_zone(page)) 158 if (zone != page_zone(page))
159 return 0; 159 return 0;
160 160
161 return 1; 161 return 1;
162 } 162 }
163 /* 163 /*
164 * Temporary debugging check for pages not lying within a given zone. 164 * Temporary debugging check for pages not lying within a given zone.
165 */ 165 */
166 static int bad_range(struct zone *zone, struct page *page) 166 static int bad_range(struct zone *zone, struct page *page)
167 { 167 {
168 if (page_outside_zone_boundaries(zone, page)) 168 if (page_outside_zone_boundaries(zone, page))
169 return 1; 169 return 1;
170 if (!page_is_consistent(zone, page)) 170 if (!page_is_consistent(zone, page))
171 return 1; 171 return 1;
172 172
173 return 0; 173 return 0;
174 } 174 }
175 #else 175 #else
176 static inline int bad_range(struct zone *zone, struct page *page) 176 static inline int bad_range(struct zone *zone, struct page *page)
177 { 177 {
178 return 0; 178 return 0;
179 } 179 }
180 #endif 180 #endif
181 181
182 static void bad_page(struct page *page) 182 static void bad_page(struct page *page)
183 { 183 {
184 printk(KERN_EMERG "Bad page state in process '%s'\n" 184 printk(KERN_EMERG "Bad page state in process '%s'\n"
185 KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" 185 KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
186 KERN_EMERG "Trying to fix it up, but a reboot is needed\n" 186 KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
187 KERN_EMERG "Backtrace:\n", 187 KERN_EMERG "Backtrace:\n",
188 current->comm, page, (int)(2*sizeof(unsigned long)), 188 current->comm, page, (int)(2*sizeof(unsigned long)),
189 (unsigned long)page->flags, page->mapping, 189 (unsigned long)page->flags, page->mapping,
190 page_mapcount(page), page_count(page)); 190 page_mapcount(page), page_count(page));
191 dump_stack(); 191 dump_stack();
192 page->flags &= ~(1 << PG_lru | 192 page->flags &= ~(1 << PG_lru |
193 1 << PG_private | 193 1 << PG_private |
194 1 << PG_locked | 194 1 << PG_locked |
195 1 << PG_active | 195 1 << PG_active |
196 1 << PG_dirty | 196 1 << PG_dirty |
197 1 << PG_reclaim | 197 1 << PG_reclaim |
198 1 << PG_slab | 198 1 << PG_slab |
199 1 << PG_swapcache | 199 1 << PG_swapcache |
200 1 << PG_writeback | 200 1 << PG_writeback |
201 1 << PG_buddy ); 201 1 << PG_buddy );
202 set_page_count(page, 0); 202 set_page_count(page, 0);
203 reset_page_mapcount(page); 203 reset_page_mapcount(page);
204 page->mapping = NULL; 204 page->mapping = NULL;
205 add_taint(TAINT_BAD_PAGE); 205 add_taint(TAINT_BAD_PAGE);
206 } 206 }
207 207
208 /* 208 /*
209 * Higher-order pages are called "compound pages". They are structured thusly: 209 * Higher-order pages are called "compound pages". They are structured thusly:
210 * 210 *
211 * The first PAGE_SIZE page is called the "head page". 211 * The first PAGE_SIZE page is called the "head page".
212 * 212 *
213 * The remaining PAGE_SIZE pages are called "tail pages". 213 * The remaining PAGE_SIZE pages are called "tail pages".
214 * 214 *
215 * All pages have PG_compound set. All pages have their ->private pointing at 215 * All pages have PG_compound set. All pages have their ->private pointing at
216 * the head page (even the head page has this). 216 * the head page (even the head page has this).
217 * 217 *
218 * The first tail page's ->lru.next holds the address of the compound page's 218 * The first tail page's ->lru.next holds the address of the compound page's
219 * put_page() function. Its ->lru.prev holds the order of allocation. 219 * put_page() function. Its ->lru.prev holds the order of allocation.
220 * This usage means that zero-order pages may not be compound. 220 * This usage means that zero-order pages may not be compound.
221 */ 221 */
222 222
223 static void free_compound_page(struct page *page) 223 static void free_compound_page(struct page *page)
224 { 224 {
225 __free_pages_ok(page, (unsigned long)page[1].lru.prev); 225 __free_pages_ok(page, (unsigned long)page[1].lru.prev);
226 } 226 }
227 227
228 static void prep_compound_page(struct page *page, unsigned long order) 228 static void prep_compound_page(struct page *page, unsigned long order)
229 { 229 {
230 int i; 230 int i;
231 int nr_pages = 1 << order; 231 int nr_pages = 1 << order;
232 232
233 set_compound_page_dtor(page, free_compound_page); 233 set_compound_page_dtor(page, free_compound_page);
234 page[1].lru.prev = (void *)order; 234 page[1].lru.prev = (void *)order;
235 for (i = 0; i < nr_pages; i++) { 235 for (i = 0; i < nr_pages; i++) {
236 struct page *p = page + i; 236 struct page *p = page + i;
237 237
238 __SetPageCompound(p); 238 __SetPageCompound(p);
239 set_page_private(p, (unsigned long)page); 239 set_page_private(p, (unsigned long)page);
240 } 240 }
241 } 241 }
242 242
243 static void destroy_compound_page(struct page *page, unsigned long order) 243 static void destroy_compound_page(struct page *page, unsigned long order)
244 { 244 {
245 int i; 245 int i;
246 int nr_pages = 1 << order; 246 int nr_pages = 1 << order;
247 247
248 if (unlikely((unsigned long)page[1].lru.prev != order)) 248 if (unlikely((unsigned long)page[1].lru.prev != order))
249 bad_page(page); 249 bad_page(page);
250 250
251 for (i = 0; i < nr_pages; i++) { 251 for (i = 0; i < nr_pages; i++) {
252 struct page *p = page + i; 252 struct page *p = page + i;
253 253
254 if (unlikely(!PageCompound(p) | 254 if (unlikely(!PageCompound(p) |
255 (page_private(p) != (unsigned long)page))) 255 (page_private(p) != (unsigned long)page)))
256 bad_page(page); 256 bad_page(page);
257 __ClearPageCompound(p); 257 __ClearPageCompound(p);
258 } 258 }
259 } 259 }
260 260
261 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 261 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
262 { 262 {
263 int i; 263 int i;
264 264
265 VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); 265 VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
266 /* 266 /*
267 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 267 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
268 * and __GFP_HIGHMEM from hard or soft interrupt context. 268 * and __GFP_HIGHMEM from hard or soft interrupt context.
269 */ 269 */
270 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 270 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
271 for (i = 0; i < (1 << order); i++) 271 for (i = 0; i < (1 << order); i++)
272 clear_highpage(page + i); 272 clear_highpage(page + i);
273 } 273 }
274 274
275 /* 275 /*
276 * function for dealing with page's order in buddy system. 276 * function for dealing with page's order in buddy system.
277 * zone->lock is already acquired when we use these. 277 * zone->lock is already acquired when we use these.
278 * So, we don't need atomic page->flags operations here. 278 * So, we don't need atomic page->flags operations here.
279 */ 279 */
280 static inline unsigned long page_order(struct page *page) 280 static inline unsigned long page_order(struct page *page)
281 { 281 {
282 return page_private(page); 282 return page_private(page);
283 } 283 }
284 284
285 static inline void set_page_order(struct page *page, int order) 285 static inline void set_page_order(struct page *page, int order)
286 { 286 {
287 set_page_private(page, order); 287 set_page_private(page, order);
288 __SetPageBuddy(page); 288 __SetPageBuddy(page);
289 } 289 }
290 290
291 static inline void rmv_page_order(struct page *page) 291 static inline void rmv_page_order(struct page *page)
292 { 292 {
293 __ClearPageBuddy(page); 293 __ClearPageBuddy(page);
294 set_page_private(page, 0); 294 set_page_private(page, 0);
295 } 295 }
296 296
297 /* 297 /*
298 * Locate the struct page for both the matching buddy in our 298 * Locate the struct page for both the matching buddy in our
299 * pair (buddy1) and the combined O(n+1) page they form (page). 299 * pair (buddy1) and the combined O(n+1) page they form (page).
300 * 300 *
301 * 1) Any buddy B1 will have an order O twin B2 which satisfies 301 * 1) Any buddy B1 will have an order O twin B2 which satisfies
302 * the following equation: 302 * the following equation:
303 * B2 = B1 ^ (1 << O) 303 * B2 = B1 ^ (1 << O)
304 * For example, if the starting buddy (buddy2) is #8 its order 304 * For example, if the starting buddy (buddy2) is #8 its order
305 * 1 buddy is #10: 305 * 1 buddy is #10:
306 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 306 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
307 * 307 *
308 * 2) Any buddy B will have an order O+1 parent P which 308 * 2) Any buddy B will have an order O+1 parent P which
309 * satisfies the following equation: 309 * satisfies the following equation:
310 * P = B & ~(1 << O) 310 * P = B & ~(1 << O)
311 * 311 *
312 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 312 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
313 */ 313 */
314 static inline struct page * 314 static inline struct page *
315 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) 315 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
316 { 316 {
317 unsigned long buddy_idx = page_idx ^ (1 << order); 317 unsigned long buddy_idx = page_idx ^ (1 << order);
318 318
319 return page + (buddy_idx - page_idx); 319 return page + (buddy_idx - page_idx);
320 } 320 }
321 321
322 static inline unsigned long 322 static inline unsigned long
323 __find_combined_index(unsigned long page_idx, unsigned int order) 323 __find_combined_index(unsigned long page_idx, unsigned int order)
324 { 324 {
325 return (page_idx & ~(1 << order)); 325 return (page_idx & ~(1 << order));
326 } 326 }
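
The two helpers above are pure bit arithmetic on page indices; here is a stand-alone check of the worked example from the comment (block #8 at order 1), again plain userspace C rather than kernel code:

#include <stdio.h>

int main(void)
{
	unsigned long page_idx = 8, order = 1;

	/* __page_find_buddy: B2 = B1 ^ (1 << O)  ->  8 ^ 2 = 10 */
	unsigned long buddy_idx = page_idx ^ (1UL << order);
	/* __find_combined_index: P = B & ~(1 << O)  ->  8 & ~2 = 8 */
	unsigned long combined_idx = page_idx & ~(1UL << order);

	printf("buddy_idx=%lu combined_idx=%lu\n", buddy_idx, combined_idx);
	return 0;
}
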
327 327
328 /* 328 /*
329 * This function checks whether a page is free && is the buddy 329 * This function checks whether a page is free && is the buddy
330 * we can do coalesce a page and its buddy if 330 * we can do coalesce a page and its buddy if
331 * (a) the buddy is not in a hole && 331 * (a) the buddy is not in a hole &&
332 * (b) the buddy is in the buddy system && 332 * (b) the buddy is in the buddy system &&
333 * (c) a page and its buddy have the same order && 333 * (c) a page and its buddy have the same order &&
334 * (d) a page and its buddy are in the same zone. 334 * (d) a page and its buddy are in the same zone.
335 * 335 *
336 * For recording whether a page is in the buddy system, we use PG_buddy. 336 * For recording whether a page is in the buddy system, we use PG_buddy.
337 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 337 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
338 * 338 *
339 * For recording page's order, we use page_private(page). 339 * For recording page's order, we use page_private(page).
340 */ 340 */
341 static inline int page_is_buddy(struct page *page, struct page *buddy, 341 static inline int page_is_buddy(struct page *page, struct page *buddy,
342 int order) 342 int order)
343 { 343 {
344 #ifdef CONFIG_HOLES_IN_ZONE 344 #ifdef CONFIG_HOLES_IN_ZONE
345 if (!pfn_valid(page_to_pfn(buddy))) 345 if (!pfn_valid(page_to_pfn(buddy)))
346 return 0; 346 return 0;
347 #endif 347 #endif
348 348
349 if (page_zone_id(page) != page_zone_id(buddy)) 349 if (page_zone_id(page) != page_zone_id(buddy))
350 return 0; 350 return 0;
351 351
352 if (PageBuddy(buddy) && page_order(buddy) == order) { 352 if (PageBuddy(buddy) && page_order(buddy) == order) {
353 BUG_ON(page_count(buddy) != 0); 353 BUG_ON(page_count(buddy) != 0);
354 return 1; 354 return 1;
355 } 355 }
356 return 0; 356 return 0;
357 } 357 }
358 358
359 /* 359 /*
360 * Freeing function for a buddy system allocator. 360 * Freeing function for a buddy system allocator.
361 * 361 *
362 * The concept of a buddy system is to maintain direct-mapped table 362 * The concept of a buddy system is to maintain direct-mapped table
363 * (containing bit values) for memory blocks of various "orders". 363 * (containing bit values) for memory blocks of various "orders".
364 * The bottom level table contains the map for the smallest allocatable 364 * The bottom level table contains the map for the smallest allocatable
365 * units of memory (here, pages), and each level above it describes 365 * units of memory (here, pages), and each level above it describes
366 * pairs of units from the levels below, hence, "buddies". 366 * pairs of units from the levels below, hence, "buddies".
367 * At a high level, all that happens here is marking the table entry 367 * At a high level, all that happens here is marking the table entry
368 * at the bottom level available, and propagating the changes upward 368 * at the bottom level available, and propagating the changes upward
369 * as necessary, plus some accounting needed to play nicely with other 369 * as necessary, plus some accounting needed to play nicely with other
370 * parts of the VM system. 370 * parts of the VM system.
371 * At each level, we keep a list of pages, which are heads of continuous 371 * At each level, we keep a list of pages, which are heads of continuous
372 * free pages of length of (1 << order) and marked with PG_buddy. Page's 372 * free pages of length of (1 << order) and marked with PG_buddy. Page's
373 * order is recorded in page_private(page) field. 373 * order is recorded in page_private(page) field.
374 * So when we are allocating or freeing one, we can derive the state of the 374 * So when we are allocating or freeing one, we can derive the state of the
375 * other. That is, if we allocate a small block, and both were 375 * other. That is, if we allocate a small block, and both were
376 * free, the remainder of the region must be split into blocks. 376 * free, the remainder of the region must be split into blocks.
377 * If a block is freed, and its buddy is also free, then this 377 * If a block is freed, and its buddy is also free, then this
378 * triggers coalescing into a block of larger size. 378 * triggers coalescing into a block of larger size.
379 * 379 *
380 * -- wli 380 * -- wli
381 */ 381 */
382 382
383 static inline void __free_one_page(struct page *page, 383 static inline void __free_one_page(struct page *page,
384 struct zone *zone, unsigned int order) 384 struct zone *zone, unsigned int order)
385 { 385 {
386 unsigned long page_idx; 386 unsigned long page_idx;
387 int order_size = 1 << order; 387 int order_size = 1 << order;
388 388
389 if (unlikely(PageCompound(page))) 389 if (unlikely(PageCompound(page)))
390 destroy_compound_page(page, order); 390 destroy_compound_page(page, order);
391 391
392 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 392 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
393 393
394 VM_BUG_ON(page_idx & (order_size - 1)); 394 VM_BUG_ON(page_idx & (order_size - 1));
395 VM_BUG_ON(bad_range(zone, page)); 395 VM_BUG_ON(bad_range(zone, page));
396 396
397 zone->free_pages += order_size; 397 zone->free_pages += order_size;
398 while (order < MAX_ORDER-1) { 398 while (order < MAX_ORDER-1) {
399 unsigned long combined_idx; 399 unsigned long combined_idx;
400 struct free_area *area; 400 struct free_area *area;
401 struct page *buddy; 401 struct page *buddy;
402 402
403 buddy = __page_find_buddy(page, page_idx, order); 403 buddy = __page_find_buddy(page, page_idx, order);
404 if (!page_is_buddy(page, buddy, order)) 404 if (!page_is_buddy(page, buddy, order))
405 break; /* Move the buddy up one level. */ 405 break; /* Move the buddy up one level. */
406 406
407 list_del(&buddy->lru); 407 list_del(&buddy->lru);
408 area = zone->free_area + order; 408 area = zone->free_area + order;
409 area->nr_free--; 409 area->nr_free--;
410 rmv_page_order(buddy); 410 rmv_page_order(buddy);
411 combined_idx = __find_combined_index(page_idx, order); 411 combined_idx = __find_combined_index(page_idx, order);
412 page = page + (combined_idx - page_idx); 412 page = page + (combined_idx - page_idx);
413 page_idx = combined_idx; 413 page_idx = combined_idx;
414 order++; 414 order++;
415 } 415 }
416 set_page_order(page, order); 416 set_page_order(page, order);
417 list_add(&page->lru, &zone->free_area[order].free_list); 417 list_add(&page->lru, &zone->free_area[order].free_list);
418 zone->free_area[order].nr_free++; 418 zone->free_area[order].nr_free++;
419 } 419 }
420 420
421 static inline int free_pages_check(struct page *page) 421 static inline int free_pages_check(struct page *page)
422 { 422 {
423 if (unlikely(page_mapcount(page) | 423 if (unlikely(page_mapcount(page) |
424 (page->mapping != NULL) | 424 (page->mapping != NULL) |
425 (page_count(page) != 0) | 425 (page_count(page) != 0) |
426 (page->flags & ( 426 (page->flags & (
427 1 << PG_lru | 427 1 << PG_lru |
428 1 << PG_private | 428 1 << PG_private |
429 1 << PG_locked | 429 1 << PG_locked |
430 1 << PG_active | 430 1 << PG_active |
431 1 << PG_reclaim | 431 1 << PG_reclaim |
432 1 << PG_slab | 432 1 << PG_slab |
433 1 << PG_swapcache | 433 1 << PG_swapcache |
434 1 << PG_writeback | 434 1 << PG_writeback |
435 1 << PG_reserved | 435 1 << PG_reserved |
436 1 << PG_buddy )))) 436 1 << PG_buddy ))))
437 bad_page(page); 437 bad_page(page);
438 if (PageDirty(page)) 438 if (PageDirty(page))
439 __ClearPageDirty(page); 439 __ClearPageDirty(page);
440 /* 440 /*
441 * For now, we report if PG_reserved was found set, but do not 441 * For now, we report if PG_reserved was found set, but do not
442 * clear it, and do not free the page. But we shall soon need 442 * clear it, and do not free the page. But we shall soon need
443 * to do more, for when the ZERO_PAGE count wraps negative. 443 * to do more, for when the ZERO_PAGE count wraps negative.
444 */ 444 */
445 return PageReserved(page); 445 return PageReserved(page);
446 } 446 }
447 447
448 /* 448 /*
449 * Frees a list of pages. 449 * Frees a list of pages.
450 * Assumes all pages on list are in same zone, and of same order. 450 * Assumes all pages on list are in same zone, and of same order.
451 * count is the number of pages to free. 451 * count is the number of pages to free.
452 * 452 *
453 * If the zone was previously in an "all pages pinned" state then look to 453 * If the zone was previously in an "all pages pinned" state then look to
454 * see if this freeing clears that state. 454 * see if this freeing clears that state.
455 * 455 *
456 * And clear the zone's pages_scanned counter, to hold off the "all pages are 456 * And clear the zone's pages_scanned counter, to hold off the "all pages are
457 * pinned" detection logic. 457 * pinned" detection logic.
458 */ 458 */
459 static void free_pages_bulk(struct zone *zone, int count, 459 static void free_pages_bulk(struct zone *zone, int count,
460 struct list_head *list, int order) 460 struct list_head *list, int order)
461 { 461 {
462 spin_lock(&zone->lock); 462 spin_lock(&zone->lock);
463 zone->all_unreclaimable = 0; 463 zone->all_unreclaimable = 0;
464 zone->pages_scanned = 0; 464 zone->pages_scanned = 0;
465 while (count--) { 465 while (count--) {
466 struct page *page; 466 struct page *page;
467 467
468 VM_BUG_ON(list_empty(list)); 468 VM_BUG_ON(list_empty(list));
469 page = list_entry(list->prev, struct page, lru); 469 page = list_entry(list->prev, struct page, lru);
470 /* have to delete it as __free_one_page list manipulates */ 470 /* have to delete it as __free_one_page list manipulates */
471 list_del(&page->lru); 471 list_del(&page->lru);
472 __free_one_page(page, zone, order); 472 __free_one_page(page, zone, order);
473 } 473 }
474 spin_unlock(&zone->lock); 474 spin_unlock(&zone->lock);
475 } 475 }
476 476
477 static void free_one_page(struct zone *zone, struct page *page, int order) 477 static void free_one_page(struct zone *zone, struct page *page, int order)
478 { 478 {
479 spin_lock(&zone->lock); 479 spin_lock(&zone->lock);
480 zone->all_unreclaimable = 0; 480 zone->all_unreclaimable = 0;
481 zone->pages_scanned = 0; 481 zone->pages_scanned = 0;
482 __free_one_page(page, zone, order); 482 __free_one_page(page, zone, order);
483 spin_unlock(&zone->lock); 483 spin_unlock(&zone->lock);
484 } 484 }
485 485
486 static void __free_pages_ok(struct page *page, unsigned int order) 486 static void __free_pages_ok(struct page *page, unsigned int order)
487 { 487 {
488 unsigned long flags; 488 unsigned long flags;
489 int i; 489 int i;
490 int reserved = 0; 490 int reserved = 0;
491 491
492 for (i = 0 ; i < (1 << order) ; ++i) 492 for (i = 0 ; i < (1 << order) ; ++i)
493 reserved += free_pages_check(page + i); 493 reserved += free_pages_check(page + i);
494 if (reserved) 494 if (reserved)
495 return; 495 return;
496 496
497 if (!PageHighMem(page)) 497 if (!PageHighMem(page))
498 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 498 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
499 arch_free_page(page, order); 499 arch_free_page(page, order);
500 kernel_map_pages(page, 1 << order, 0); 500 kernel_map_pages(page, 1 << order, 0);
501 501
502 local_irq_save(flags); 502 local_irq_save(flags);
503 __count_vm_events(PGFREE, 1 << order); 503 __count_vm_events(PGFREE, 1 << order);
504 free_one_page(page_zone(page), page, order); 504 free_one_page(page_zone(page), page, order);
505 local_irq_restore(flags); 505 local_irq_restore(flags);
506 } 506 }
507 507
508 /* 508 /*
509 * permit the bootmem allocator to evade page validation on high-order frees 509 * permit the bootmem allocator to evade page validation on high-order frees
510 */ 510 */
511 void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) 511 void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
512 { 512 {
513 if (order == 0) { 513 if (order == 0) {
514 __ClearPageReserved(page); 514 __ClearPageReserved(page);
515 set_page_count(page, 0); 515 set_page_count(page, 0);
516 set_page_refcounted(page); 516 set_page_refcounted(page);
517 __free_page(page); 517 __free_page(page);
518 } else { 518 } else {
519 int loop; 519 int loop;
520 520
521 prefetchw(page); 521 prefetchw(page);
522 for (loop = 0; loop < BITS_PER_LONG; loop++) { 522 for (loop = 0; loop < BITS_PER_LONG; loop++) {
523 struct page *p = &page[loop]; 523 struct page *p = &page[loop];
524 524
525 if (loop + 1 < BITS_PER_LONG) 525 if (loop + 1 < BITS_PER_LONG)
526 prefetchw(p + 1); 526 prefetchw(p + 1);
527 __ClearPageReserved(p); 527 __ClearPageReserved(p);
528 set_page_count(p, 0); 528 set_page_count(p, 0);
529 } 529 }
530 530
531 set_page_refcounted(page); 531 set_page_refcounted(page);
532 __free_pages(page, order); 532 __free_pages(page, order);
533 } 533 }
534 } 534 }
535 535
536 536
537 /* 537 /*
538 * The order of subdivision here is critical for the IO subsystem. 538 * The order of subdivision here is critical for the IO subsystem.
539 * Please do not alter this order without good reasons and regression 539 * Please do not alter this order without good reasons and regression
540 * testing. Specifically, as large blocks of memory are subdivided, 540 * testing. Specifically, as large blocks of memory are subdivided,
541 * the order in which smaller blocks are delivered depends on the order 541 * the order in which smaller blocks are delivered depends on the order
542 * they're subdivided in this function. This is the primary factor 542 * they're subdivided in this function. This is the primary factor
543 * influencing the order in which pages are delivered to the IO 543 * influencing the order in which pages are delivered to the IO
544 * subsystem according to empirical testing, and this is also justified 544 * subsystem according to empirical testing, and this is also justified
545 * by considering the behavior of a buddy system containing a single 545 * by considering the behavior of a buddy system containing a single
546 * large block of memory acted on by a series of small allocations. 546 * large block of memory acted on by a series of small allocations.
547 * This behavior is a critical factor in sglist merging's success. 547 * This behavior is a critical factor in sglist merging's success.
548 * 548 *
549 * -- wli 549 * -- wli
550 */ 550 */
551 static inline void expand(struct zone *zone, struct page *page, 551 static inline void expand(struct zone *zone, struct page *page,
552 int low, int high, struct free_area *area) 552 int low, int high, struct free_area *area)
553 { 553 {
554 unsigned long size = 1 << high; 554 unsigned long size = 1 << high;
555 555
556 while (high > low) { 556 while (high > low) {
557 area--; 557 area--;
558 high--; 558 high--;
559 size >>= 1; 559 size >>= 1;
560 VM_BUG_ON(bad_range(zone, &page[size])); 560 VM_BUG_ON(bad_range(zone, &page[size]));
561 list_add(&page[size].lru, &area->free_list); 561 list_add(&page[size].lru, &area->free_list);
562 area->nr_free++; 562 area->nr_free++;
563 set_page_order(&page[size], high); 563 set_page_order(&page[size], high);
564 } 564 }
565 } 565 }
566 566
567 /* 567 /*
568 * This page is about to be returned from the page allocator 568 * This page is about to be returned from the page allocator
569 */ 569 */
570 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 570 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
571 { 571 {
572 if (unlikely(page_mapcount(page) | 572 if (unlikely(page_mapcount(page) |
573 (page->mapping != NULL) | 573 (page->mapping != NULL) |
574 (page_count(page) != 0) | 574 (page_count(page) != 0) |
575 (page->flags & ( 575 (page->flags & (
576 1 << PG_lru | 576 1 << PG_lru |
577 1 << PG_private | 577 1 << PG_private |
578 1 << PG_locked | 578 1 << PG_locked |
579 1 << PG_active | 579 1 << PG_active |
580 1 << PG_dirty | 580 1 << PG_dirty |
581 1 << PG_reclaim | 581 1 << PG_reclaim |
582 1 << PG_slab | 582 1 << PG_slab |
583 1 << PG_swapcache | 583 1 << PG_swapcache |
584 1 << PG_writeback | 584 1 << PG_writeback |
585 1 << PG_reserved | 585 1 << PG_reserved |
586 1 << PG_buddy )))) 586 1 << PG_buddy ))))
587 bad_page(page); 587 bad_page(page);
588 588
589 /* 589 /*
590 * For now, we report if PG_reserved was found set, but do not 590 * For now, we report if PG_reserved was found set, but do not
591 * clear it, and do not allocate the page: as a safety net. 591 * clear it, and do not allocate the page: as a safety net.
592 */ 592 */
593 if (PageReserved(page)) 593 if (PageReserved(page))
594 return 1; 594 return 1;
595 595
596 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 596 page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
597 1 << PG_referenced | 1 << PG_arch_1 | 597 1 << PG_referenced | 1 << PG_arch_1 |
598 1 << PG_checked | 1 << PG_mappedtodisk); 598 1 << PG_checked | 1 << PG_mappedtodisk);
599 set_page_private(page, 0); 599 set_page_private(page, 0);
600 set_page_refcounted(page); 600 set_page_refcounted(page);
601 601
602 arch_alloc_page(page, order); 602 arch_alloc_page(page, order);
603 kernel_map_pages(page, 1 << order, 1); 603 kernel_map_pages(page, 1 << order, 1);
604 604
605 if (gfp_flags & __GFP_ZERO) 605 if (gfp_flags & __GFP_ZERO)
606 prep_zero_page(page, order, gfp_flags); 606 prep_zero_page(page, order, gfp_flags);
607 607
608 if (order && (gfp_flags & __GFP_COMP)) 608 if (order && (gfp_flags & __GFP_COMP))
609 prep_compound_page(page, order); 609 prep_compound_page(page, order);
610 610
611 return 0; 611 return 0;
612 } 612 }
613 613
614 /* 614 /*
615 * Do the hard work of removing an element from the buddy allocator. 615 * Do the hard work of removing an element from the buddy allocator.
616 * Call me with the zone->lock already held. 616 * Call me with the zone->lock already held.
617 */ 617 */
618 static struct page *__rmqueue(struct zone *zone, unsigned int order) 618 static struct page *__rmqueue(struct zone *zone, unsigned int order)
619 { 619 {
620 struct free_area * area; 620 struct free_area * area;
621 unsigned int current_order; 621 unsigned int current_order;
622 struct page *page; 622 struct page *page;
623 623
624 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 624 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
625 area = zone->free_area + current_order; 625 area = zone->free_area + current_order;
626 if (list_empty(&area->free_list)) 626 if (list_empty(&area->free_list))
627 continue; 627 continue;
628 628
629 page = list_entry(area->free_list.next, struct page, lru); 629 page = list_entry(area->free_list.next, struct page, lru);
630 list_del(&page->lru); 630 list_del(&page->lru);
631 rmv_page_order(page); 631 rmv_page_order(page);
632 area->nr_free--; 632 area->nr_free--;
633 zone->free_pages -= 1UL << order; 633 zone->free_pages -= 1UL << order;
634 expand(zone, page, order, current_order, area); 634 expand(zone, page, order, current_order, area);
635 return page; 635 return page;
636 } 636 }
637 637
638 return NULL; 638 return NULL;
639 } 639 }
640 640
641 /* 641 /*
642 * Obtain a specified number of elements from the buddy allocator, all under 642 * Obtain a specified number of elements from the buddy allocator, all under
643 * a single hold of the lock, for efficiency. Add them to the supplied list. 643 * a single hold of the lock, for efficiency. Add them to the supplied list.
644 * Returns the number of new pages which were placed at *list. 644 * Returns the number of new pages which were placed at *list.
645 */ 645 */
646 static int rmqueue_bulk(struct zone *zone, unsigned int order, 646 static int rmqueue_bulk(struct zone *zone, unsigned int order,
647 unsigned long count, struct list_head *list) 647 unsigned long count, struct list_head *list)
648 { 648 {
649 int i; 649 int i;
650 650
651 spin_lock(&zone->lock); 651 spin_lock(&zone->lock);
652 for (i = 0; i < count; ++i) { 652 for (i = 0; i < count; ++i) {
653 struct page *page = __rmqueue(zone, order); 653 struct page *page = __rmqueue(zone, order);
654 if (unlikely(page == NULL)) 654 if (unlikely(page == NULL))
655 break; 655 break;
656 list_add_tail(&page->lru, list); 656 list_add_tail(&page->lru, list);
657 } 657 }
658 spin_unlock(&zone->lock); 658 spin_unlock(&zone->lock);
659 return i; 659 return i;
660 } 660 }
661 661
662 #ifdef CONFIG_NUMA 662 #ifdef CONFIG_NUMA
663 /* 663 /*
664 * Called from the slab reaper to drain pagesets on a particular node that 664 * Called from the slab reaper to drain pagesets on a particular node that
665 * belongs to the currently executing processor. 665 * belongs to the currently executing processor.
666 * Note that this function must be called with the thread pinned to 666 * Note that this function must be called with the thread pinned to
667 * a single processor. 667 * a single processor.
668 */ 668 */
669 void drain_node_pages(int nodeid) 669 void drain_node_pages(int nodeid)
670 { 670 {
671 int i; 671 int i;
672 enum zone_type z; 672 enum zone_type z;
673 unsigned long flags; 673 unsigned long flags;
674 674
675 for (z = 0; z < MAX_NR_ZONES; z++) { 675 for (z = 0; z < MAX_NR_ZONES; z++) {
676 struct zone *zone = NODE_DATA(nodeid)->node_zones + z; 676 struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
677 struct per_cpu_pageset *pset; 677 struct per_cpu_pageset *pset;
678 678
679 if (!populated_zone(zone)) 679 if (!populated_zone(zone))
680 continue; 680 continue;
681 681
682 pset = zone_pcp(zone, smp_processor_id()); 682 pset = zone_pcp(zone, smp_processor_id());
683 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 683 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
684 struct per_cpu_pages *pcp; 684 struct per_cpu_pages *pcp;
685 685
686 pcp = &pset->pcp[i]; 686 pcp = &pset->pcp[i];
687 if (pcp->count) { 687 if (pcp->count) {
688 int to_drain; 688 int to_drain;
689 689
690 local_irq_save(flags); 690 local_irq_save(flags);
691 if (pcp->count >= pcp->batch) 691 if (pcp->count >= pcp->batch)
692 to_drain = pcp->batch; 692 to_drain = pcp->batch;
693 else 693 else
694 to_drain = pcp->count; 694 to_drain = pcp->count;
695 free_pages_bulk(zone, to_drain, &pcp->list, 0); 695 free_pages_bulk(zone, to_drain, &pcp->list, 0);
696 pcp->count -= to_drain; 696 pcp->count -= to_drain;
697 local_irq_restore(flags); 697 local_irq_restore(flags);
698 } 698 }
699 } 699 }
700 } 700 }
701 } 701 }
702 #endif 702 #endif
703 703
704 #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 704 #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
705 static void __drain_pages(unsigned int cpu) 705 static void __drain_pages(unsigned int cpu)
706 { 706 {
707 unsigned long flags; 707 unsigned long flags;
708 struct zone *zone; 708 struct zone *zone;
709 int i; 709 int i;
710 710
711 for_each_zone(zone) { 711 for_each_zone(zone) {
712 struct per_cpu_pageset *pset; 712 struct per_cpu_pageset *pset;
713 713
714 pset = zone_pcp(zone, cpu); 714 pset = zone_pcp(zone, cpu);
715 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 715 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
716 struct per_cpu_pages *pcp; 716 struct per_cpu_pages *pcp;
717 717
718 pcp = &pset->pcp[i]; 718 pcp = &pset->pcp[i];
719 local_irq_save(flags); 719 local_irq_save(flags);
720 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 720 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
721 pcp->count = 0; 721 pcp->count = 0;
722 local_irq_restore(flags); 722 local_irq_restore(flags);
723 } 723 }
724 } 724 }
725 } 725 }
726 #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ 726 #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
727 727
728 #ifdef CONFIG_PM 728 #ifdef CONFIG_PM
729 729
730 void mark_free_pages(struct zone *zone) 730 void mark_free_pages(struct zone *zone)
731 { 731 {
732 unsigned long pfn, max_zone_pfn; 732 unsigned long pfn, max_zone_pfn;
733 unsigned long flags; 733 unsigned long flags;
734 int order; 734 int order;
735 struct list_head *curr; 735 struct list_head *curr;
736 736
737 if (!zone->spanned_pages) 737 if (!zone->spanned_pages)
738 return; 738 return;
739 739
740 spin_lock_irqsave(&zone->lock, flags); 740 spin_lock_irqsave(&zone->lock, flags);
741 741
742 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 742 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
743 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 743 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
744 if (pfn_valid(pfn)) { 744 if (pfn_valid(pfn)) {
745 struct page *page = pfn_to_page(pfn); 745 struct page *page = pfn_to_page(pfn);
746 746
747 if (!PageNosave(page)) 747 if (!PageNosave(page))
748 ClearPageNosaveFree(page); 748 ClearPageNosaveFree(page);
749 } 749 }
750 750
751 for (order = MAX_ORDER - 1; order >= 0; --order) 751 for (order = MAX_ORDER - 1; order >= 0; --order)
752 list_for_each(curr, &zone->free_area[order].free_list) { 752 list_for_each(curr, &zone->free_area[order].free_list) {
753 unsigned long i; 753 unsigned long i;
754 754
755 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 755 pfn = page_to_pfn(list_entry(curr, struct page, lru));
756 for (i = 0; i < (1UL << order); i++) 756 for (i = 0; i < (1UL << order); i++)
757 SetPageNosaveFree(pfn_to_page(pfn + i)); 757 SetPageNosaveFree(pfn_to_page(pfn + i));
758 } 758 }
759 759
760 spin_unlock_irqrestore(&zone->lock, flags); 760 spin_unlock_irqrestore(&zone->lock, flags);
761 } 761 }
762 762
763 /* 763 /*
764 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 764 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
765 */ 765 */
766 void drain_local_pages(void) 766 void drain_local_pages(void)
767 { 767 {
768 unsigned long flags; 768 unsigned long flags;
769 769
770 local_irq_save(flags); 770 local_irq_save(flags);
771 __drain_pages(smp_processor_id()); 771 __drain_pages(smp_processor_id());
772 local_irq_restore(flags); 772 local_irq_restore(flags);
773 } 773 }
774 #endif /* CONFIG_PM */ 774 #endif /* CONFIG_PM */
775 775
776 /* 776 /*
777 * Free a 0-order page 777 * Free a 0-order page
778 */ 778 */
779 static void fastcall free_hot_cold_page(struct page *page, int cold) 779 static void fastcall free_hot_cold_page(struct page *page, int cold)
780 { 780 {
781 struct zone *zone = page_zone(page); 781 struct zone *zone = page_zone(page);
782 struct per_cpu_pages *pcp; 782 struct per_cpu_pages *pcp;
783 unsigned long flags; 783 unsigned long flags;
784 784
785 if (PageAnon(page)) 785 if (PageAnon(page))
786 page->mapping = NULL; 786 page->mapping = NULL;
787 if (free_pages_check(page)) 787 if (free_pages_check(page))
788 return; 788 return;
789 789
790 if (!PageHighMem(page)) 790 if (!PageHighMem(page))
791 debug_check_no_locks_freed(page_address(page), PAGE_SIZE); 791 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
792 arch_free_page(page, 0); 792 arch_free_page(page, 0);
793 kernel_map_pages(page, 1, 0); 793 kernel_map_pages(page, 1, 0);
794 794
795 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 795 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
796 local_irq_save(flags); 796 local_irq_save(flags);
797 __count_vm_event(PGFREE); 797 __count_vm_event(PGFREE);
798 list_add(&page->lru, &pcp->list); 798 list_add(&page->lru, &pcp->list);
799 pcp->count++; 799 pcp->count++;
800 if (pcp->count >= pcp->high) { 800 if (pcp->count >= pcp->high) {
801 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 801 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
802 pcp->count -= pcp->batch; 802 pcp->count -= pcp->batch;
803 } 803 }
804 local_irq_restore(flags); 804 local_irq_restore(flags);
805 put_cpu(); 805 put_cpu();
806 } 806 }
807 807
808 void fastcall free_hot_page(struct page *page) 808 void fastcall free_hot_page(struct page *page)
809 { 809 {
810 free_hot_cold_page(page, 0); 810 free_hot_cold_page(page, 0);
811 } 811 }
812 812
813 void fastcall free_cold_page(struct page *page) 813 void fastcall free_cold_page(struct page *page)
814 { 814 {
815 free_hot_cold_page(page, 1); 815 free_hot_cold_page(page, 1);
816 } 816 }
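As a hedged aside (not part of the kernel source shown here): the hot/cold argument simply selects between the two per-cpu lists, pcp[0] and pcp[1]. A minimal sketch, with invented page variables:

/* Sketch only -- the page variables are invented for illustration.
 * A page whose data was just touched goes back cache-hot (pcp[0]); a page
 * that was never read, e.g. from torn-down readahead, goes back cold
 * (pcp[1]), which allocations passing __GFP_COLD will prefer. */
free_hot_page(just_written_page);
free_cold_page(never_touched_page);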
817 817
818 /* 818 /*
819 * split_page takes a non-compound higher-order page, and splits it into 819 * split_page takes a non-compound higher-order page, and splits it into
820 * n (1<<order) sub-pages: page[0..n-1] 820 * n (1<<order) sub-pages: page[0..n-1]
821 * Each sub-page must be freed individually. 821 * Each sub-page must be freed individually.
822 * 822 *
823 * Note: this is probably too low level an operation for use in drivers. 823 * Note: this is probably too low level an operation for use in drivers.
824 * Please consult with lkml before using this in your driver. 824 * Please consult with lkml before using this in your driver.
825 */ 825 */
826 void split_page(struct page *page, unsigned int order) 826 void split_page(struct page *page, unsigned int order)
827 { 827 {
828 int i; 828 int i;
829 829
830 VM_BUG_ON(PageCompound(page)); 830 VM_BUG_ON(PageCompound(page));
831 VM_BUG_ON(!page_count(page)); 831 VM_BUG_ON(!page_count(page));
832 for (i = 1; i < (1 << order); i++) 832 for (i = 1; i < (1 << order); i++)
833 set_page_refcounted(page + i); 833 set_page_refcounted(page + i);
834 } 834 }
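A hedged usage sketch of split_page(); the order-2 allocation and the error handling are invented for illustration:

/* Sketch only: allocate an order-2 block, split it, then free each
 * 0-order sub-page individually, as required by the comment above. */
struct page *page = alloc_pages(GFP_KERNEL, 2);

if (page) {
        int i;

        split_page(page, 2);            /* page[0..3] each get a refcount */
        for (i = 0; i < (1 << 2); i++)
                __free_page(page + i);  /* must be freed one by one */
}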
835 835
836 /* 836 /*
837 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 837 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
838 * we cheat by calling it from here, in the order > 0 path. Saves a branch 838 * we cheat by calling it from here, in the order > 0 path. Saves a branch
839 * or two. 839 * or two.
840 */ 840 */
841 static struct page *buffered_rmqueue(struct zonelist *zonelist, 841 static struct page *buffered_rmqueue(struct zonelist *zonelist,
842 struct zone *zone, int order, gfp_t gfp_flags) 842 struct zone *zone, int order, gfp_t gfp_flags)
843 { 843 {
844 unsigned long flags; 844 unsigned long flags;
845 struct page *page; 845 struct page *page;
846 int cold = !!(gfp_flags & __GFP_COLD); 846 int cold = !!(gfp_flags & __GFP_COLD);
847 int cpu; 847 int cpu;
848 848
849 again: 849 again:
850 cpu = get_cpu(); 850 cpu = get_cpu();
851 if (likely(order == 0)) { 851 if (likely(order == 0)) {
852 struct per_cpu_pages *pcp; 852 struct per_cpu_pages *pcp;
853 853
854 pcp = &zone_pcp(zone, cpu)->pcp[cold]; 854 pcp = &zone_pcp(zone, cpu)->pcp[cold];
855 local_irq_save(flags); 855 local_irq_save(flags);
856 if (!pcp->count) { 856 if (!pcp->count) {
857 pcp->count = rmqueue_bulk(zone, 0, 857 pcp->count = rmqueue_bulk(zone, 0,
858 pcp->batch, &pcp->list); 858 pcp->batch, &pcp->list);
859 if (unlikely(!pcp->count)) 859 if (unlikely(!pcp->count))
860 goto failed; 860 goto failed;
861 } 861 }
862 page = list_entry(pcp->list.next, struct page, lru); 862 page = list_entry(pcp->list.next, struct page, lru);
863 list_del(&page->lru); 863 list_del(&page->lru);
864 pcp->count--; 864 pcp->count--;
865 } else { 865 } else {
866 spin_lock_irqsave(&zone->lock, flags); 866 spin_lock_irqsave(&zone->lock, flags);
867 page = __rmqueue(zone, order); 867 page = __rmqueue(zone, order);
868 spin_unlock(&zone->lock); 868 spin_unlock(&zone->lock);
869 if (!page) 869 if (!page)
870 goto failed; 870 goto failed;
871 } 871 }
872 872
873 __count_zone_vm_events(PGALLOC, zone, 1 << order); 873 __count_zone_vm_events(PGALLOC, zone, 1 << order);
874 zone_statistics(zonelist, zone); 874 zone_statistics(zonelist, zone);
875 local_irq_restore(flags); 875 local_irq_restore(flags);
876 put_cpu(); 876 put_cpu();
877 877
878 VM_BUG_ON(bad_range(zone, page)); 878 VM_BUG_ON(bad_range(zone, page));
879 if (prep_new_page(page, order, gfp_flags)) 879 if (prep_new_page(page, order, gfp_flags))
880 goto again; 880 goto again;
881 return page; 881 return page;
882 882
883 failed: 883 failed:
884 local_irq_restore(flags); 884 local_irq_restore(flags);
885 put_cpu(); 885 put_cpu();
886 return NULL; 886 return NULL;
887 } 887 }
888 888
889 #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 889 #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
890 #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ 890 #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
891 #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ 891 #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
892 #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ 892 #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
893 #define ALLOC_HARDER 0x10 /* try to alloc harder */ 893 #define ALLOC_HARDER 0x10 /* try to alloc harder */
894 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 894 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
895 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 895 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
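For orientation, a short example of how these flags combine in practice; this is only a restatement of the flag-setting logic in __alloc_pages() further down, not new behaviour:

/* Example combinations (see __alloc_pages() below):
 *   first, optimistic pass:        ALLOC_WMARK_LOW | ALLOC_CPUSET
 *   ordinary GFP_KERNEL request:   ALLOC_WMARK_MIN | ALLOC_CPUSET
 *   GFP_ATOMIC (!wait, __GFP_HIGH): ALLOC_WMARK_MIN | ALLOC_HARDER | ALLOC_HIGH
 *   PF_MEMALLOC / TIF_MEMDIE last resort: ALLOC_NO_WATERMARKS            */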
896 896
897 /* 897 /*
898 * Return 1 if free pages are above 'mark'. This takes into account the order 898 * Return 1 if free pages are above 'mark'. This takes into account the order
899 * of the allocation. 899 * of the allocation.
900 */ 900 */
901 int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 901 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
902 int classzone_idx, int alloc_flags) 902 int classzone_idx, int alloc_flags)
903 { 903 {
904 /* free_pages may go negative - that's OK */ 904 /* free_pages may go negative - that's OK */
905 unsigned long min = mark; 905 unsigned long min = mark;
906 long free_pages = z->free_pages - (1 << order) + 1; 906 long free_pages = z->free_pages - (1 << order) + 1;
907 int o; 907 int o;
908 908
909 if (alloc_flags & ALLOC_HIGH) 909 if (alloc_flags & ALLOC_HIGH)
910 min -= min / 2; 910 min -= min / 2;
911 if (alloc_flags & ALLOC_HARDER) 911 if (alloc_flags & ALLOC_HARDER)
912 min -= min / 4; 912 min -= min / 4;
913 913
914 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 914 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
915 return 0; 915 return 0;
916 for (o = 0; o < order; o++) { 916 for (o = 0; o < order; o++) {
917 /* At the next order, this order's pages become unavailable */ 917 /* At the next order, this order's pages become unavailable */
918 free_pages -= z->free_area[o].nr_free << o; 918 free_pages -= z->free_area[o].nr_free << o;
919 919
920 /* Require fewer higher order pages to be free */ 920 /* Require fewer higher order pages to be free */
921 min >>= 1; 921 min >>= 1;
922 922
923 if (free_pages <= min) 923 if (free_pages <= min)
924 return 0; 924 return 0;
925 } 925 }
926 return 1; 926 return 1;
927 } 927 }
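To make the order loop above concrete, a worked example with invented numbers:

/* Worked example (numbers invented): mark = 128, order = 2, no ALLOC_HIGH
 * or ALLOC_HARDER, lowmem_reserve = 0, z->free_pages = 200, of which 100
 * pages are free at order 0 and 20 pages (10 blocks) at order 1.
 *
 *   free_pages = 200 - (1 << 2) + 1 = 197 > 128              -> keep going
 *   o = 0: free_pages -= 100 << 0 -> 97,  min = 64, 97 > 64  -> keep going
 *   o = 1: free_pages -= 10  << 1 -> 77,  min = 32, 77 > 32  -> keep going
 *
 * The loop ends at o == order, so the function returns 1: enough memory
 * remains in blocks of order >= 2 to satisfy the request. */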
928 928
929 #ifdef CONFIG_NUMA 929 #ifdef CONFIG_NUMA
930 /* 930 /*
931 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 931 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
932 * skip over zones that are not allowed by the cpuset, or that have 932 * skip over zones that are not allowed by the cpuset, or that have
933 * been recently (in last second) found to be nearly full. See further 933 * been recently (in last second) found to be nearly full. See further
934 * comments in mmzone.h. Reduces cache footprint of zonelist scans 934 * comments in mmzone.h. Reduces cache footprint of zonelist scans
935 * that have to skip over a lot of full or unallowed zones. 935 * that have to skip over a lot of full or unallowed zones.
936 * 936 *
937 * If the zonelist cache is present in the passed in zonelist, then 937 * If the zonelist cache is present in the passed in zonelist, then
938 * returns a pointer to the allowed node mask (either the current 938 * returns a pointer to the allowed node mask (either the current
939 * task's mems_allowed, or node_online_map.) 939 * task's mems_allowed, or node_online_map.)
940 * 940 *
941 * If the zonelist cache is not available for this zonelist, does 941 * If the zonelist cache is not available for this zonelist, does
942 * nothing and returns NULL. 942 * nothing and returns NULL.
943 * 943 *
944 * If the fullzones BITMAP in the zonelist cache is stale (more than 944 * If the fullzones BITMAP in the zonelist cache is stale (more than
945 * a second since last zap'd) then we zap it out (clear its bits.) 945 * a second since last zap'd) then we zap it out (clear its bits.)
946 * 946 *
947 * We hold off even calling zlc_setup, until after we've checked the 947 * We hold off even calling zlc_setup, until after we've checked the
948 * first zone in the zonelist, on the theory that most allocations will 948 * first zone in the zonelist, on the theory that most allocations will
949 * be satisfied from that first zone, so best to examine that zone as 949 * be satisfied from that first zone, so best to examine that zone as
950 * quickly as we can. 950 * quickly as we can.
951 */ 951 */
952 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 952 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
953 { 953 {
954 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 954 struct zonelist_cache *zlc; /* cached zonelist speedup info */
955 nodemask_t *allowednodes; /* zonelist_cache approximation */ 955 nodemask_t *allowednodes; /* zonelist_cache approximation */
956 956
957 zlc = zonelist->zlcache_ptr; 957 zlc = zonelist->zlcache_ptr;
958 if (!zlc) 958 if (!zlc)
959 return NULL; 959 return NULL;
960 960
961 if (jiffies - zlc->last_full_zap > 1 * HZ) { 961 if (jiffies - zlc->last_full_zap > 1 * HZ) {
962 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 962 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
963 zlc->last_full_zap = jiffies; 963 zlc->last_full_zap = jiffies;
964 } 964 }
965 965
966 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 966 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
967 &cpuset_current_mems_allowed : 967 &cpuset_current_mems_allowed :
968 &node_online_map; 968 &node_online_map;
969 return allowednodes; 969 return allowednodes;
970 } 970 }
971 971
972 /* 972 /*
973 * Given 'z' scanning a zonelist, run a couple of quick checks to see 973 * Given 'z' scanning a zonelist, run a couple of quick checks to see
974 * if it is worth looking at further for free memory: 974 * if it is worth looking at further for free memory:
975 * 1) Check that the zone isn't thought to be full (doesn't have its 975 * 1) Check that the zone isn't thought to be full (doesn't have its
976 * bit set in the zonelist_cache fullzones BITMAP). 976 * bit set in the zonelist_cache fullzones BITMAP).
977 * 2) Check that the zone's node (obtained from the zonelist_cache 977 * 2) Check that the zone's node (obtained from the zonelist_cache
978 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 978 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
979 * Return true (non-zero) if zone is worth looking at further, or 979 * Return true (non-zero) if zone is worth looking at further, or
980 * else return false (zero) if it is not. 980 * else return false (zero) if it is not.
981 * 981 *
982 * This check -ignores- the distinction between various watermarks, 982 * This check -ignores- the distinction between various watermarks,
983 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 983 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
984 * found to be full for any variation of these watermarks, it will 984 * found to be full for any variation of these watermarks, it will
985 * be considered full for up to one second by all requests, unless 985 * be considered full for up to one second by all requests, unless
986 * we are so low on memory on all allowed nodes that we are forced 986 * we are so low on memory on all allowed nodes that we are forced
987 * into the second scan of the zonelist. 987 * into the second scan of the zonelist.
988 * 988 *
989 * In the second scan we ignore this zonelist cache and exactly 989 * In the second scan we ignore this zonelist cache and exactly
990 * apply the watermarks to all zones, even though it is slower to do so. 990 * apply the watermarks to all zones, even though it is slower to do so.
991 * We are low on memory in the second scan, and should leave no stone 991 * We are low on memory in the second scan, and should leave no stone
992 * unturned looking for a free page. 992 * unturned looking for a free page.
993 */ 993 */
994 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, 994 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
995 nodemask_t *allowednodes) 995 nodemask_t *allowednodes)
996 { 996 {
997 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 997 struct zonelist_cache *zlc; /* cached zonelist speedup info */
998 int i; /* index of *z in zonelist zones */ 998 int i; /* index of *z in zonelist zones */
999 int n; /* node that zone *z is on */ 999 int n; /* node that zone *z is on */
1000 1000
1001 zlc = zonelist->zlcache_ptr; 1001 zlc = zonelist->zlcache_ptr;
1002 if (!zlc) 1002 if (!zlc)
1003 return 1; 1003 return 1;
1004 1004
1005 i = z - zonelist->zones; 1005 i = z - zonelist->zones;
1006 n = zlc->z_to_n[i]; 1006 n = zlc->z_to_n[i];
1007 1007
1008 /* This zone is worth trying if it is allowed but not full */ 1008 /* This zone is worth trying if it is allowed but not full */
1009 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1009 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1010 } 1010 }
1011 1011
1012 /* 1012 /*
1013 * Given 'z' scanning a zonelist, set the corresponding bit in 1013 * Given 'z' scanning a zonelist, set the corresponding bit in
1014 * zlc->fullzones, so that subsequent attempts to allocate a page 1014 * zlc->fullzones, so that subsequent attempts to allocate a page
1015 * from that zone don't waste time re-examining it. 1015 * from that zone don't waste time re-examining it.
1016 */ 1016 */
1017 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) 1017 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1018 { 1018 {
1019 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1019 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1020 int i; /* index of *z in zonelist zones */ 1020 int i; /* index of *z in zonelist zones */
1021 1021
1022 zlc = zonelist->zlcache_ptr; 1022 zlc = zonelist->zlcache_ptr;
1023 if (!zlc) 1023 if (!zlc)
1024 return; 1024 return;
1025 1025
1026 i = z - zonelist->zones; 1026 i = z - zonelist->zones;
1027 1027
1028 set_bit(i, zlc->fullzones); 1028 set_bit(i, zlc->fullzones);
1029 } 1029 }
1030 1030
1031 #else /* CONFIG_NUMA */ 1031 #else /* CONFIG_NUMA */
1032 1032
1033 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1033 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1034 { 1034 {
1035 return NULL; 1035 return NULL;
1036 } 1036 }
1037 1037
1038 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, 1038 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1039 nodemask_t *allowednodes) 1039 nodemask_t *allowednodes)
1040 { 1040 {
1041 return 1; 1041 return 1;
1042 } 1042 }
1043 1043
1044 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) 1044 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1045 { 1045 {
1046 } 1046 }
1047 #endif /* CONFIG_NUMA */ 1047 #endif /* CONFIG_NUMA */
1048 1048
1049 /* 1049 /*
1050 * get_page_from_freelist goes through the zonelist trying to allocate 1050 * get_page_from_freelist goes through the zonelist trying to allocate
1051 * a page. 1051 * a page.
1052 */ 1052 */
1053 static struct page * 1053 static struct page *
1054 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, 1054 get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
1055 struct zonelist *zonelist, int alloc_flags) 1055 struct zonelist *zonelist, int alloc_flags)
1056 { 1056 {
1057 struct zone **z; 1057 struct zone **z;
1058 struct page *page = NULL; 1058 struct page *page = NULL;
1059 int classzone_idx = zone_idx(zonelist->zones[0]); 1059 int classzone_idx = zone_idx(zonelist->zones[0]);
1060 struct zone *zone; 1060 struct zone *zone;
1061 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1061 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1062 int zlc_active = 0; /* set if using zonelist_cache */ 1062 int zlc_active = 0; /* set if using zonelist_cache */
1063 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1063 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1064 1064
1065 zonelist_scan: 1065 zonelist_scan:
1066 /* 1066 /*
1067 * Scan zonelist, looking for a zone with enough free. 1067 * Scan zonelist, looking for a zone with enough free.
1068 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1068 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1069 */ 1069 */
1070 z = zonelist->zones; 1070 z = zonelist->zones;
1071 1071
1072 do { 1072 do {
1073 if (NUMA_BUILD && zlc_active && 1073 if (NUMA_BUILD && zlc_active &&
1074 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1074 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1075 continue; 1075 continue;
1076 zone = *z; 1076 zone = *z;
1077 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && 1077 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
1078 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) 1078 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
1079 break; 1079 break;
1080 if ((alloc_flags & ALLOC_CPUSET) && 1080 if ((alloc_flags & ALLOC_CPUSET) &&
1081 !cpuset_zone_allowed(zone, gfp_mask)) 1081 !cpuset_zone_allowed(zone, gfp_mask))
1082 goto try_next_zone; 1082 goto try_next_zone;
1083 1083
1084 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1084 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1085 unsigned long mark; 1085 unsigned long mark;
1086 if (alloc_flags & ALLOC_WMARK_MIN) 1086 if (alloc_flags & ALLOC_WMARK_MIN)
1087 mark = zone->pages_min; 1087 mark = zone->pages_min;
1088 else if (alloc_flags & ALLOC_WMARK_LOW) 1088 else if (alloc_flags & ALLOC_WMARK_LOW)
1089 mark = zone->pages_low; 1089 mark = zone->pages_low;
1090 else 1090 else
1091 mark = zone->pages_high; 1091 mark = zone->pages_high;
1092 if (!zone_watermark_ok(zone, order, mark, 1092 if (!zone_watermark_ok(zone, order, mark,
1093 classzone_idx, alloc_flags)) { 1093 classzone_idx, alloc_flags)) {
1094 if (!zone_reclaim_mode || 1094 if (!zone_reclaim_mode ||
1095 !zone_reclaim(zone, gfp_mask, order)) 1095 !zone_reclaim(zone, gfp_mask, order))
1096 goto this_zone_full; 1096 goto this_zone_full;
1097 } 1097 }
1098 } 1098 }
1099 1099
1100 page = buffered_rmqueue(zonelist, zone, order, gfp_mask); 1100 page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
1101 if (page) 1101 if (page)
1102 break; 1102 break;
1103 this_zone_full: 1103 this_zone_full:
1104 if (NUMA_BUILD) 1104 if (NUMA_BUILD)
1105 zlc_mark_zone_full(zonelist, z); 1105 zlc_mark_zone_full(zonelist, z);
1106 try_next_zone: 1106 try_next_zone:
1107 if (NUMA_BUILD && !did_zlc_setup) { 1107 if (NUMA_BUILD && !did_zlc_setup) {
1108 /* we do zlc_setup after the first zone is tried */ 1108 /* we do zlc_setup after the first zone is tried */
1109 allowednodes = zlc_setup(zonelist, alloc_flags); 1109 allowednodes = zlc_setup(zonelist, alloc_flags);
1110 zlc_active = 1; 1110 zlc_active = 1;
1111 did_zlc_setup = 1; 1111 did_zlc_setup = 1;
1112 } 1112 }
1113 } while (*(++z) != NULL); 1113 } while (*(++z) != NULL);
1114 1114
1115 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1115 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1116 /* Disable zlc cache for second zonelist scan */ 1116 /* Disable zlc cache for second zonelist scan */
1117 zlc_active = 0; 1117 zlc_active = 0;
1118 goto zonelist_scan; 1118 goto zonelist_scan;
1119 } 1119 }
1120 return page; 1120 return page;
1121 } 1121 }
1122 1122
1123 /* 1123 /*
1124 * This is the 'heart' of the zoned buddy allocator. 1124 * This is the 'heart' of the zoned buddy allocator.
1125 */ 1125 */
1126 struct page * fastcall 1126 struct page * fastcall
1127 __alloc_pages(gfp_t gfp_mask, unsigned int order, 1127 __alloc_pages(gfp_t gfp_mask, unsigned int order,
1128 struct zonelist *zonelist) 1128 struct zonelist *zonelist)
1129 { 1129 {
1130 const gfp_t wait = gfp_mask & __GFP_WAIT; 1130 const gfp_t wait = gfp_mask & __GFP_WAIT;
1131 struct zone **z; 1131 struct zone **z;
1132 struct page *page; 1132 struct page *page;
1133 struct reclaim_state reclaim_state; 1133 struct reclaim_state reclaim_state;
1134 struct task_struct *p = current; 1134 struct task_struct *p = current;
1135 int do_retry; 1135 int do_retry;
1136 int alloc_flags; 1136 int alloc_flags;
1137 int did_some_progress; 1137 int did_some_progress;
1138 1138
1139 might_sleep_if(wait); 1139 might_sleep_if(wait);
1140 1140
1141 restart: 1141 restart:
1142 z = zonelist->zones; /* the list of zones suitable for gfp_mask */ 1142 z = zonelist->zones; /* the list of zones suitable for gfp_mask */
1143 1143
1144 if (unlikely(*z == NULL)) { 1144 if (unlikely(*z == NULL)) {
1145 /* Should this ever happen?? */ 1145 /* Should this ever happen?? */
1146 return NULL; 1146 return NULL;
1147 } 1147 }
1148 1148
1149 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1149 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1150 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1150 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
1151 if (page) 1151 if (page)
1152 goto got_pg; 1152 goto got_pg;
1153 1153
1154 /* 1154 /*
1155 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1155 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
1156 * __GFP_NOWARN set) should not cause reclaim since the subsystem 1156 * __GFP_NOWARN set) should not cause reclaim since the subsystem
1157 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 1157 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
1158 * using a larger set of nodes after it has established that the 1158 * using a larger set of nodes after it has established that the
1159 * allowed per node queues are empty and that nodes are 1159 * allowed per node queues are empty and that nodes are
1160 * over allocated. 1160 * over allocated.
1161 */ 1161 */
1162 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1162 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1163 goto nopage; 1163 goto nopage;
1164 1164
1165 for (z = zonelist->zones; *z; z++) 1165 for (z = zonelist->zones; *z; z++)
1166 wakeup_kswapd(*z, order); 1166 wakeup_kswapd(*z, order);
1167 1167
1168 /* 1168 /*
1169 * OK, we're below the kswapd watermark and have kicked background 1169 * OK, we're below the kswapd watermark and have kicked background
1170 * reclaim. Now things get more complex, so set up alloc_flags according 1170 * reclaim. Now things get more complex, so set up alloc_flags according
1171 * to how we want to proceed. 1171 * to how we want to proceed.
1172 * 1172 *
1173 * The caller may dip into page reserves a bit more if the caller 1173 * The caller may dip into page reserves a bit more if the caller
1174 * cannot run direct reclaim, or if the caller has realtime scheduling 1174 * cannot run direct reclaim, or if the caller has realtime scheduling
1175 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 1175 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1176 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 1176 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1177 */ 1177 */
1178 alloc_flags = ALLOC_WMARK_MIN; 1178 alloc_flags = ALLOC_WMARK_MIN;
1179 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) 1179 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1180 alloc_flags |= ALLOC_HARDER; 1180 alloc_flags |= ALLOC_HARDER;
1181 if (gfp_mask & __GFP_HIGH) 1181 if (gfp_mask & __GFP_HIGH)
1182 alloc_flags |= ALLOC_HIGH; 1182 alloc_flags |= ALLOC_HIGH;
1183 if (wait) 1183 if (wait)
1184 alloc_flags |= ALLOC_CPUSET; 1184 alloc_flags |= ALLOC_CPUSET;
1185 1185
1186 /* 1186 /*
1187 * Go through the zonelist again. Let __GFP_HIGH and allocations 1187 * Go through the zonelist again. Let __GFP_HIGH and allocations
1188 * coming from realtime tasks go deeper into reserves. 1188 * coming from realtime tasks go deeper into reserves.
1189 * 1189 *
1190 * This is the last chance, in general, before the goto nopage. 1190 * This is the last chance, in general, before the goto nopage.
1191 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 1191 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1192 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1192 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1193 */ 1193 */
1194 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); 1194 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
1195 if (page) 1195 if (page)
1196 goto got_pg; 1196 goto got_pg;
1197 1197
1198 /* This allocation should allow future memory freeing. */ 1198 /* This allocation should allow future memory freeing. */
1199 1199
1200 rebalance: 1200 rebalance:
1201 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1201 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
1202 && !in_interrupt()) { 1202 && !in_interrupt()) {
1203 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1203 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
1204 nofail_alloc: 1204 nofail_alloc:
1205 /* go through the zonelist yet again, ignoring mins */ 1205 /* go through the zonelist yet again, ignoring mins */
1206 page = get_page_from_freelist(gfp_mask, order, 1206 page = get_page_from_freelist(gfp_mask, order,
1207 zonelist, ALLOC_NO_WATERMARKS); 1207 zonelist, ALLOC_NO_WATERMARKS);
1208 if (page) 1208 if (page)
1209 goto got_pg; 1209 goto got_pg;
1210 if (gfp_mask & __GFP_NOFAIL) { 1210 if (gfp_mask & __GFP_NOFAIL) {
1211 congestion_wait(WRITE, HZ/50); 1211 congestion_wait(WRITE, HZ/50);
1212 goto nofail_alloc; 1212 goto nofail_alloc;
1213 } 1213 }
1214 } 1214 }
1215 goto nopage; 1215 goto nopage;
1216 } 1216 }
1217 1217
1218 /* Atomic allocations - we can't balance anything */ 1218 /* Atomic allocations - we can't balance anything */
1219 if (!wait) 1219 if (!wait)
1220 goto nopage; 1220 goto nopage;
1221 1221
1222 cond_resched(); 1222 cond_resched();
1223 1223
1224 /* We now go into synchronous reclaim */ 1224 /* We now go into synchronous reclaim */
1225 cpuset_memory_pressure_bump(); 1225 cpuset_memory_pressure_bump();
1226 p->flags |= PF_MEMALLOC; 1226 p->flags |= PF_MEMALLOC;
1227 reclaim_state.reclaimed_slab = 0; 1227 reclaim_state.reclaimed_slab = 0;
1228 p->reclaim_state = &reclaim_state; 1228 p->reclaim_state = &reclaim_state;
1229 1229
1230 did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); 1230 did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
1231 1231
1232 p->reclaim_state = NULL; 1232 p->reclaim_state = NULL;
1233 p->flags &= ~PF_MEMALLOC; 1233 p->flags &= ~PF_MEMALLOC;
1234 1234
1235 cond_resched(); 1235 cond_resched();
1236 1236
1237 if (likely(did_some_progress)) { 1237 if (likely(did_some_progress)) {
1238 page = get_page_from_freelist(gfp_mask, order, 1238 page = get_page_from_freelist(gfp_mask, order,
1239 zonelist, alloc_flags); 1239 zonelist, alloc_flags);
1240 if (page) 1240 if (page)
1241 goto got_pg; 1241 goto got_pg;
1242 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 1242 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1243 /* 1243 /*
1244 * Go through the zonelist yet one more time, keep 1244 * Go through the zonelist yet one more time, keep
1245 * very high watermark here, this is only to catch 1245 * very high watermark here, this is only to catch
1246 * a parallel oom killing, we must fail if we're still 1246 * a parallel oom killing, we must fail if we're still
1247 * under heavy pressure. 1247 * under heavy pressure.
1248 */ 1248 */
1249 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1249 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1250 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); 1250 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1251 if (page) 1251 if (page)
1252 goto got_pg; 1252 goto got_pg;
1253 1253
1254 out_of_memory(zonelist, gfp_mask, order); 1254 out_of_memory(zonelist, gfp_mask, order);
1255 goto restart; 1255 goto restart;
1256 } 1256 }
1257 1257
1258 /* 1258 /*
1259 * Don't let big-order allocations loop unless the caller explicitly 1259 * Don't let big-order allocations loop unless the caller explicitly
1260 * requests that. Wait for some write requests to complete then retry. 1260 * requests that. Wait for some write requests to complete then retry.
1261 * 1261 *
1262 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order 1262 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
1263 * <= 3, but that may not be true in other implementations. 1263 * <= 3, but that may not be true in other implementations.
1264 */ 1264 */
1265 do_retry = 0; 1265 do_retry = 0;
1266 if (!(gfp_mask & __GFP_NORETRY)) { 1266 if (!(gfp_mask & __GFP_NORETRY)) {
1267 if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) 1267 if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
1268 do_retry = 1; 1268 do_retry = 1;
1269 if (gfp_mask & __GFP_NOFAIL) 1269 if (gfp_mask & __GFP_NOFAIL)
1270 do_retry = 1; 1270 do_retry = 1;
1271 } 1271 }
1272 if (do_retry) { 1272 if (do_retry) {
1273 congestion_wait(WRITE, HZ/50); 1273 congestion_wait(WRITE, HZ/50);
1274 goto rebalance; 1274 goto rebalance;
1275 } 1275 }
1276 1276
1277 nopage: 1277 nopage:
1278 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 1278 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
1279 printk(KERN_WARNING "%s: page allocation failure." 1279 printk(KERN_WARNING "%s: page allocation failure."
1280 " order:%d, mode:0x%x\n", 1280 " order:%d, mode:0x%x\n",
1281 p->comm, order, gfp_mask); 1281 p->comm, order, gfp_mask);
1282 dump_stack(); 1282 dump_stack();
1283 show_mem(); 1283 show_mem();
1284 } 1284 }
1285 got_pg: 1285 got_pg:
1286 return page; 1286 return page;
1287 } 1287 }
1288 1288
1289 EXPORT_SYMBOL(__alloc_pages); 1289 EXPORT_SYMBOL(__alloc_pages);
1290 1290
1291 /* 1291 /*
1292 * Common helper functions. 1292 * Common helper functions.
1293 */ 1293 */
1294 fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 1294 fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1295 { 1295 {
1296 struct page * page; 1296 struct page * page;
1297 page = alloc_pages(gfp_mask, order); 1297 page = alloc_pages(gfp_mask, order);
1298 if (!page) 1298 if (!page)
1299 return 0; 1299 return 0;
1300 return (unsigned long) page_address(page); 1300 return (unsigned long) page_address(page);
1301 } 1301 }
1302 1302
1303 EXPORT_SYMBOL(__get_free_pages); 1303 EXPORT_SYMBOL(__get_free_pages);
1304 1304
1305 fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) 1305 fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
1306 { 1306 {
1307 struct page * page; 1307 struct page * page;
1308 1308
1309 /* 1309 /*
1310 * get_zeroed_page() returns a 32-bit address, which cannot represent 1310 * get_zeroed_page() returns a 32-bit address, which cannot represent
1311 * a highmem page 1311 * a highmem page
1312 */ 1312 */
1313 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 1313 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1314 1314
1315 page = alloc_pages(gfp_mask | __GFP_ZERO, 0); 1315 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1316 if (page) 1316 if (page)
1317 return (unsigned long) page_address(page); 1317 return (unsigned long) page_address(page);
1318 return 0; 1318 return 0;
1319 } 1319 }
1320 1320
1321 EXPORT_SYMBOL(get_zeroed_page); 1321 EXPORT_SYMBOL(get_zeroed_page);
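A minimal usage sketch of these helpers (the buffer name is invented); free_page()/free_pages() are defined just below:

/* Sketch only: one zeroed lowmem page, later returned to the allocator. */
unsigned long buf = get_zeroed_page(GFP_KERNEL);

if (buf) {
        /* ... use the PAGE_SIZE bytes at (void *)buf ... */
        free_page(buf);         /* equivalent to free_pages(buf, 0) */
}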
1322 1322
1323 void __pagevec_free(struct pagevec *pvec) 1323 void __pagevec_free(struct pagevec *pvec)
1324 { 1324 {
1325 int i = pagevec_count(pvec); 1325 int i = pagevec_count(pvec);
1326 1326
1327 while (--i >= 0) 1327 while (--i >= 0)
1328 free_hot_cold_page(pvec->pages[i], pvec->cold); 1328 free_hot_cold_page(pvec->pages[i], pvec->cold);
1329 } 1329 }
1330 1330
1331 fastcall void __free_pages(struct page *page, unsigned int order) 1331 fastcall void __free_pages(struct page *page, unsigned int order)
1332 { 1332 {
1333 if (put_page_testzero(page)) { 1333 if (put_page_testzero(page)) {
1334 if (order == 0) 1334 if (order == 0)
1335 free_hot_page(page); 1335 free_hot_page(page);
1336 else 1336 else
1337 __free_pages_ok(page, order); 1337 __free_pages_ok(page, order);
1338 } 1338 }
1339 } 1339 }
1340 1340
1341 EXPORT_SYMBOL(__free_pages); 1341 EXPORT_SYMBOL(__free_pages);
1342 1342
1343 fastcall void free_pages(unsigned long addr, unsigned int order) 1343 fastcall void free_pages(unsigned long addr, unsigned int order)
1344 { 1344 {
1345 if (addr != 0) { 1345 if (addr != 0) {
1346 VM_BUG_ON(!virt_addr_valid((void *)addr)); 1346 VM_BUG_ON(!virt_addr_valid((void *)addr));
1347 __free_pages(virt_to_page((void *)addr), order); 1347 __free_pages(virt_to_page((void *)addr), order);
1348 } 1348 }
1349 } 1349 }
1350 1350
1351 EXPORT_SYMBOL(free_pages); 1351 EXPORT_SYMBOL(free_pages);
1352 1352
1353 /* 1353 /*
1354 * Total amount of free (allocatable) RAM: 1354 * Total amount of free (allocatable) RAM:
1355 */ 1355 */
1356 unsigned int nr_free_pages(void) 1356 unsigned int nr_free_pages(void)
1357 { 1357 {
1358 unsigned int sum = 0; 1358 unsigned int sum = 0;
1359 struct zone *zone; 1359 struct zone *zone;
1360 1360
1361 for_each_zone(zone) 1361 for_each_zone(zone)
1362 sum += zone->free_pages; 1362 sum += zone->free_pages;
1363 1363
1364 return sum; 1364 return sum;
1365 } 1365 }
1366 1366
1367 EXPORT_SYMBOL(nr_free_pages); 1367 EXPORT_SYMBOL(nr_free_pages);
1368 1368
1369 #ifdef CONFIG_NUMA 1369 #ifdef CONFIG_NUMA
1370 unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) 1370 unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
1371 { 1371 {
1372 unsigned int sum = 0; 1372 unsigned int sum = 0;
1373 enum zone_type i; 1373 enum zone_type i;
1374 1374
1375 for (i = 0; i < MAX_NR_ZONES; i++) 1375 for (i = 0; i < MAX_NR_ZONES; i++)
1376 sum += pgdat->node_zones[i].free_pages; 1376 sum += pgdat->node_zones[i].free_pages;
1377 1377
1378 return sum; 1378 return sum;
1379 } 1379 }
1380 #endif 1380 #endif
1381 1381
1382 static unsigned int nr_free_zone_pages(int offset) 1382 static unsigned int nr_free_zone_pages(int offset)
1383 { 1383 {
1384 /* Just pick one node, since fallback list is circular */ 1384 /* Just pick one node, since fallback list is circular */
1385 pg_data_t *pgdat = NODE_DATA(numa_node_id()); 1385 pg_data_t *pgdat = NODE_DATA(numa_node_id());
1386 unsigned int sum = 0; 1386 unsigned int sum = 0;
1387 1387
1388 struct zonelist *zonelist = pgdat->node_zonelists + offset; 1388 struct zonelist *zonelist = pgdat->node_zonelists + offset;
1389 struct zone **zonep = zonelist->zones; 1389 struct zone **zonep = zonelist->zones;
1390 struct zone *zone; 1390 struct zone *zone;
1391 1391
1392 for (zone = *zonep++; zone; zone = *zonep++) { 1392 for (zone = *zonep++; zone; zone = *zonep++) {
1393 unsigned long size = zone->present_pages; 1393 unsigned long size = zone->present_pages;
1394 unsigned long high = zone->pages_high; 1394 unsigned long high = zone->pages_high;
1395 if (size > high) 1395 if (size > high)
1396 sum += size - high; 1396 sum += size - high;
1397 } 1397 }
1398 1398
1399 return sum; 1399 return sum;
1400 } 1400 }
1401 1401
1402 /* 1402 /*
1403 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 1403 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
1404 */ 1404 */
1405 unsigned int nr_free_buffer_pages(void) 1405 unsigned int nr_free_buffer_pages(void)
1406 { 1406 {
1407 return nr_free_zone_pages(gfp_zone(GFP_USER)); 1407 return nr_free_zone_pages(gfp_zone(GFP_USER));
1408 } 1408 }
1409 1409
1410 /* 1410 /*
1411 * Amount of free RAM allocatable within all zones 1411 * Amount of free RAM allocatable within all zones
1412 */ 1412 */
1413 unsigned int nr_free_pagecache_pages(void) 1413 unsigned int nr_free_pagecache_pages(void)
1414 { 1414 {
1415 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); 1415 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1416 } 1416 }
1417 1417
1418 static inline void show_node(struct zone *zone) 1418 static inline void show_node(struct zone *zone)
1419 { 1419 {
1420 if (NUMA_BUILD) 1420 if (NUMA_BUILD)
1421 printk("Node %d ", zone_to_nid(zone)); 1421 printk("Node %d ", zone_to_nid(zone));
1422 } 1422 }
1423 1423
1424 void si_meminfo(struct sysinfo *val) 1424 void si_meminfo(struct sysinfo *val)
1425 { 1425 {
1426 val->totalram = totalram_pages; 1426 val->totalram = totalram_pages;
1427 val->sharedram = 0; 1427 val->sharedram = 0;
1428 val->freeram = nr_free_pages(); 1428 val->freeram = nr_free_pages();
1429 val->bufferram = nr_blockdev_pages(); 1429 val->bufferram = nr_blockdev_pages();
1430 val->totalhigh = totalhigh_pages; 1430 val->totalhigh = totalhigh_pages;
1431 val->freehigh = nr_free_highpages(); 1431 val->freehigh = nr_free_highpages();
1432 val->mem_unit = PAGE_SIZE; 1432 val->mem_unit = PAGE_SIZE;
1433 } 1433 }
1434 1434
1435 EXPORT_SYMBOL(si_meminfo); 1435 EXPORT_SYMBOL(si_meminfo);
1436 1436
1437 #ifdef CONFIG_NUMA 1437 #ifdef CONFIG_NUMA
1438 void si_meminfo_node(struct sysinfo *val, int nid) 1438 void si_meminfo_node(struct sysinfo *val, int nid)
1439 { 1439 {
1440 pg_data_t *pgdat = NODE_DATA(nid); 1440 pg_data_t *pgdat = NODE_DATA(nid);
1441 1441
1442 val->totalram = pgdat->node_present_pages; 1442 val->totalram = pgdat->node_present_pages;
1443 val->freeram = nr_free_pages_pgdat(pgdat); 1443 val->freeram = nr_free_pages_pgdat(pgdat);
1444 #ifdef CONFIG_HIGHMEM 1444 #ifdef CONFIG_HIGHMEM
1445 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 1445 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1446 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; 1446 val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1447 #else 1447 #else
1448 val->totalhigh = 0; 1448 val->totalhigh = 0;
1449 val->freehigh = 0; 1449 val->freehigh = 0;
1450 #endif 1450 #endif
1451 val->mem_unit = PAGE_SIZE; 1451 val->mem_unit = PAGE_SIZE;
1452 } 1452 }
1453 #endif 1453 #endif
1454 1454
1455 #define K(x) ((x) << (PAGE_SHIFT-10)) 1455 #define K(x) ((x) << (PAGE_SHIFT-10))
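A quick worked example of K(), assuming 4 kB pages (PAGE_SHIFT == 12):

/* K() converts a page count into kilobytes:
 *   K(256) == 256 << (12 - 10) == 256 * 4 == 1024 kB  (i.e. 1 MB) */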
1456 1456
1457 /* 1457 /*
1458 * Show free area list (used inside shift_scroll-lock stuff) 1458 * Show free area list (used inside shift_scroll-lock stuff)
1459 * We also calculate the percentage fragmentation. We do this by counting the 1459 * We also calculate the percentage fragmentation. We do this by counting the
1460 * memory on each free list with the exception of the first item on the list. 1460 * memory on each free list with the exception of the first item on the list.
1461 */ 1461 */
1462 void show_free_areas(void) 1462 void show_free_areas(void)
1463 { 1463 {
1464 int cpu; 1464 int cpu;
1465 unsigned long active; 1465 unsigned long active;
1466 unsigned long inactive; 1466 unsigned long inactive;
1467 unsigned long free; 1467 unsigned long free;
1468 struct zone *zone; 1468 struct zone *zone;
1469 1469
1470 for_each_zone(zone) { 1470 for_each_zone(zone) {
1471 if (!populated_zone(zone)) 1471 if (!populated_zone(zone))
1472 continue; 1472 continue;
1473 1473
1474 show_node(zone); 1474 show_node(zone);
1475 printk("%s per-cpu:\n", zone->name); 1475 printk("%s per-cpu:\n", zone->name);
1476 1476
1477 for_each_online_cpu(cpu) { 1477 for_each_online_cpu(cpu) {
1478 struct per_cpu_pageset *pageset; 1478 struct per_cpu_pageset *pageset;
1479 1479
1480 pageset = zone_pcp(zone, cpu); 1480 pageset = zone_pcp(zone, cpu);
1481 1481
1482 printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " 1482 printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d "
1483 "Cold: hi:%5d, btch:%4d usd:%4d\n", 1483 "Cold: hi:%5d, btch:%4d usd:%4d\n",
1484 cpu, pageset->pcp[0].high, 1484 cpu, pageset->pcp[0].high,
1485 pageset->pcp[0].batch, pageset->pcp[0].count, 1485 pageset->pcp[0].batch, pageset->pcp[0].count,
1486 pageset->pcp[1].high, pageset->pcp[1].batch, 1486 pageset->pcp[1].high, pageset->pcp[1].batch,
1487 pageset->pcp[1].count); 1487 pageset->pcp[1].count);
1488 } 1488 }
1489 } 1489 }
1490 1490
1491 get_zone_counts(&active, &inactive, &free); 1491 get_zone_counts(&active, &inactive, &free);
1492 1492
1493 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " 1493 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
1494 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", 1494 "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
1495 active, 1495 active,
1496 inactive, 1496 inactive,
1497 global_page_state(NR_FILE_DIRTY), 1497 global_page_state(NR_FILE_DIRTY),
1498 global_page_state(NR_WRITEBACK), 1498 global_page_state(NR_WRITEBACK),
1499 global_page_state(NR_UNSTABLE_NFS), 1499 global_page_state(NR_UNSTABLE_NFS),
1500 nr_free_pages(), 1500 nr_free_pages(),
1501 global_page_state(NR_SLAB_RECLAIMABLE) + 1501 global_page_state(NR_SLAB_RECLAIMABLE) +
1502 global_page_state(NR_SLAB_UNRECLAIMABLE), 1502 global_page_state(NR_SLAB_UNRECLAIMABLE),
1503 global_page_state(NR_FILE_MAPPED), 1503 global_page_state(NR_FILE_MAPPED),
1504 global_page_state(NR_PAGETABLE)); 1504 global_page_state(NR_PAGETABLE));
1505 1505
1506 for_each_zone(zone) { 1506 for_each_zone(zone) {
1507 int i; 1507 int i;
1508 1508
1509 if (!populated_zone(zone)) 1509 if (!populated_zone(zone))
1510 continue; 1510 continue;
1511 1511
1512 show_node(zone); 1512 show_node(zone);
1513 printk("%s" 1513 printk("%s"
1514 " free:%lukB" 1514 " free:%lukB"
1515 " min:%lukB" 1515 " min:%lukB"
1516 " low:%lukB" 1516 " low:%lukB"
1517 " high:%lukB" 1517 " high:%lukB"
1518 " active:%lukB" 1518 " active:%lukB"
1519 " inactive:%lukB" 1519 " inactive:%lukB"
1520 " present:%lukB" 1520 " present:%lukB"
1521 " pages_scanned:%lu" 1521 " pages_scanned:%lu"
1522 " all_unreclaimable? %s" 1522 " all_unreclaimable? %s"
1523 "\n", 1523 "\n",
1524 zone->name, 1524 zone->name,
1525 K(zone->free_pages), 1525 K(zone->free_pages),
1526 K(zone->pages_min), 1526 K(zone->pages_min),
1527 K(zone->pages_low), 1527 K(zone->pages_low),
1528 K(zone->pages_high), 1528 K(zone->pages_high),
1529 K(zone->nr_active), 1529 K(zone->nr_active),
1530 K(zone->nr_inactive), 1530 K(zone->nr_inactive),
1531 K(zone->present_pages), 1531 K(zone->present_pages),
1532 zone->pages_scanned, 1532 zone->pages_scanned,
1533 (zone->all_unreclaimable ? "yes" : "no") 1533 (zone->all_unreclaimable ? "yes" : "no")
1534 ); 1534 );
1535 printk("lowmem_reserve[]:"); 1535 printk("lowmem_reserve[]:");
1536 for (i = 0; i < MAX_NR_ZONES; i++) 1536 for (i = 0; i < MAX_NR_ZONES; i++)
1537 printk(" %lu", zone->lowmem_reserve[i]); 1537 printk(" %lu", zone->lowmem_reserve[i]);
1538 printk("\n"); 1538 printk("\n");
1539 } 1539 }
1540 1540
1541 for_each_zone(zone) { 1541 for_each_zone(zone) {
1542 unsigned long nr[MAX_ORDER], flags, order, total = 0; 1542 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1543 1543
1544 if (!populated_zone(zone)) 1544 if (!populated_zone(zone))
1545 continue; 1545 continue;
1546 1546
1547 show_node(zone); 1547 show_node(zone);
1548 printk("%s: ", zone->name); 1548 printk("%s: ", zone->name);
1549 1549
1550 spin_lock_irqsave(&zone->lock, flags); 1550 spin_lock_irqsave(&zone->lock, flags);
1551 for (order = 0; order < MAX_ORDER; order++) { 1551 for (order = 0; order < MAX_ORDER; order++) {
1552 nr[order] = zone->free_area[order].nr_free; 1552 nr[order] = zone->free_area[order].nr_free;
1553 total += nr[order] << order; 1553 total += nr[order] << order;
1554 } 1554 }
1555 spin_unlock_irqrestore(&zone->lock, flags); 1555 spin_unlock_irqrestore(&zone->lock, flags);
1556 for (order = 0; order < MAX_ORDER; order++) 1556 for (order = 0; order < MAX_ORDER; order++)
1557 printk("%lu*%lukB ", nr[order], K(1UL) << order); 1557 printk("%lu*%lukB ", nr[order], K(1UL) << order);
1558 printk("= %lukB\n", K(total)); 1558 printk("= %lukB\n", K(total));
1559 } 1559 }
1560 1560
1561 show_swap_cache_info(); 1561 show_swap_cache_info();
1562 } 1562 }
1563 1563
1564 /* 1564 /*
1565 * Builds allocation fallback zone lists. 1565 * Builds allocation fallback zone lists.
1566 * 1566 *
1567 * Add all populated zones of a node to the zonelist. 1567 * Add all populated zones of a node to the zonelist.
1568 */ 1568 */
1569 static int __meminit build_zonelists_node(pg_data_t *pgdat, 1569 static int __meminit build_zonelists_node(pg_data_t *pgdat,
1570 struct zonelist *zonelist, int nr_zones, enum zone_type zone_type) 1570 struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
1571 { 1571 {
1572 struct zone *zone; 1572 struct zone *zone;
1573 1573
1574 BUG_ON(zone_type >= MAX_NR_ZONES); 1574 BUG_ON(zone_type >= MAX_NR_ZONES);
1575 zone_type++; 1575 zone_type++;
1576 1576
1577 do { 1577 do {
1578 zone_type--; 1578 zone_type--;
1579 zone = pgdat->node_zones + zone_type; 1579 zone = pgdat->node_zones + zone_type;
1580 if (populated_zone(zone)) { 1580 if (populated_zone(zone)) {
1581 zonelist->zones[nr_zones++] = zone; 1581 zonelist->zones[nr_zones++] = zone;
1582 check_highest_zone(zone_type); 1582 check_highest_zone(zone_type);
1583 } 1583 }
1584 1584
1585 } while (zone_type); 1585 } while (zone_type);
1586 return nr_zones; 1586 return nr_zones;
1587 } 1587 }
1588 1588
1589 #ifdef CONFIG_NUMA 1589 #ifdef CONFIG_NUMA
1590 #define MAX_NODE_LOAD (num_online_nodes()) 1590 #define MAX_NODE_LOAD (num_online_nodes())
1591 static int __meminitdata node_load[MAX_NUMNODES]; 1591 static int __meminitdata node_load[MAX_NUMNODES];
1592 /** 1592 /**
1593 * find_next_best_node - find the next node that should appear in a given node's fallback list 1593 * find_next_best_node - find the next node that should appear in a given node's fallback list
1594 * @node: node whose fallback list we're appending 1594 * @node: node whose fallback list we're appending
1595 * @used_node_mask: nodemask_t of already used nodes 1595 * @used_node_mask: nodemask_t of already used nodes
1596 * 1596 *
1597 * We use a number of factors to determine which is the next node that should 1597 * We use a number of factors to determine which is the next node that should
1598 * appear on a given node's fallback list. The node should not have appeared 1598 * appear on a given node's fallback list. The node should not have appeared
1599 * already in @node's fallback list, and it should be the next closest node 1599 * already in @node's fallback list, and it should be the next closest node
1600 * according to the distance array (which contains arbitrary distance values 1600 * according to the distance array (which contains arbitrary distance values
1601 * from each node to each node in the system), and should also prefer nodes 1601 * from each node to each node in the system), and should also prefer nodes
1602 * with no CPUs, since presumably they'll have very little allocation pressure 1602 * with no CPUs, since presumably they'll have very little allocation pressure
1603 * on them otherwise. 1603 * on them otherwise.
1604 * It returns -1 if no node is found. 1604 * It returns -1 if no node is found.
1605 */ 1605 */
1606 static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) 1606 static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1607 { 1607 {
1608 int n, val; 1608 int n, val;
1609 int min_val = INT_MAX; 1609 int min_val = INT_MAX;
1610 int best_node = -1; 1610 int best_node = -1;
1611 1611
1612 /* Use the local node if we haven't already */ 1612 /* Use the local node if we haven't already */
1613 if (!node_isset(node, *used_node_mask)) { 1613 if (!node_isset(node, *used_node_mask)) {
1614 node_set(node, *used_node_mask); 1614 node_set(node, *used_node_mask);
1615 return node; 1615 return node;
1616 } 1616 }
1617 1617
1618 for_each_online_node(n) { 1618 for_each_online_node(n) {
1619 cpumask_t tmp; 1619 cpumask_t tmp;
1620 1620
1621 /* Don't want a node to appear more than once */ 1621 /* Don't want a node to appear more than once */
1622 if (node_isset(n, *used_node_mask)) 1622 if (node_isset(n, *used_node_mask))
1623 continue; 1623 continue;
1624 1624
1625 /* Use the distance array to find the distance */ 1625 /* Use the distance array to find the distance */
1626 val = node_distance(node, n); 1626 val = node_distance(node, n);
1627 1627
1628 /* Penalize nodes under us ("prefer the next node") */ 1628 /* Penalize nodes under us ("prefer the next node") */
1629 val += (n < node); 1629 val += (n < node);
1630 1630
1631 /* Give preference to headless and unused nodes */ 1631 /* Give preference to headless and unused nodes */
1632 tmp = node_to_cpumask(n); 1632 tmp = node_to_cpumask(n);
1633 if (!cpus_empty(tmp)) 1633 if (!cpus_empty(tmp))
1634 val += PENALTY_FOR_NODE_WITH_CPUS; 1634 val += PENALTY_FOR_NODE_WITH_CPUS;
1635 1635
1636 /* Slight preference for less loaded node */ 1636 /* Slight preference for less loaded node */
1637 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 1637 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
1638 val += node_load[n]; 1638 val += node_load[n];
1639 1639
1640 if (val < min_val) { 1640 if (val < min_val) {
1641 min_val = val; 1641 min_val = val;
1642 best_node = n; 1642 best_node = n;
1643 } 1643 }
1644 } 1644 }
1645 1645
1646 if (best_node >= 0) 1646 if (best_node >= 0)
1647 node_set(best_node, *used_node_mask); 1647 node_set(best_node, *used_node_mask);
1648 1648
1649 return best_node; 1649 return best_node;
1650 } 1650 }
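To illustrate the scoring, a hedged numeric example; the distances, CPU layout and PENALTY_FOR_NODE_WITH_CPUS == 1 are assumptions made for the sketch:

/* Worked example: picking the next fallback for node 0 with candidates
 * 1 and 2, MAX_NUMNODES == 4 and num_online_nodes() == 3, so the scale
 * factor MAX_NODE_LOAD * MAX_NUMNODES == 12; node_load[] is still zero.
 *
 *   node 1 (no CPUs):  val = 20 + 0      -> 20 * 12 + 0 = 240
 *   node 2 (has CPUs): val = 20 + 0 + 1  -> 21 * 12 + 0 = 252
 *
 * Node 1 scores lower, so it is returned first and set in *used_node_mask;
 * the headless node is preferred, as the comment above intends. */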
1651 1651
1652 static void __meminit build_zonelists(pg_data_t *pgdat) 1652 static void __meminit build_zonelists(pg_data_t *pgdat)
1653 { 1653 {
1654 int j, node, local_node; 1654 int j, node, local_node;
1655 enum zone_type i; 1655 enum zone_type i;
1656 int prev_node, load; 1656 int prev_node, load;
1657 struct zonelist *zonelist; 1657 struct zonelist *zonelist;
1658 nodemask_t used_mask; 1658 nodemask_t used_mask;
1659 1659
1660 /* initialize zonelists */ 1660 /* initialize zonelists */
1661 for (i = 0; i < MAX_NR_ZONES; i++) { 1661 for (i = 0; i < MAX_NR_ZONES; i++) {
1662 zonelist = pgdat->node_zonelists + i; 1662 zonelist = pgdat->node_zonelists + i;
1663 zonelist->zones[0] = NULL; 1663 zonelist->zones[0] = NULL;
1664 } 1664 }
1665 1665
1666 /* NUMA-aware ordering of nodes */ 1666 /* NUMA-aware ordering of nodes */
1667 local_node = pgdat->node_id; 1667 local_node = pgdat->node_id;
1668 load = num_online_nodes(); 1668 load = num_online_nodes();
1669 prev_node = local_node; 1669 prev_node = local_node;
1670 nodes_clear(used_mask); 1670 nodes_clear(used_mask);
1671 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 1671 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
1672 int distance = node_distance(local_node, node); 1672 int distance = node_distance(local_node, node);
1673 1673
1674 /* 1674 /*
1675 * If another node is sufficiently far away then it is better 1675 * If another node is sufficiently far away then it is better
1676 * to reclaim pages in a zone before going off node. 1676 * to reclaim pages in a zone before going off node.
1677 */ 1677 */
1678 if (distance > RECLAIM_DISTANCE) 1678 if (distance > RECLAIM_DISTANCE)
1679 zone_reclaim_mode = 1; 1679 zone_reclaim_mode = 1;
1680 1680
1681 /* 1681 /*
1682 * We don't want to pressure a particular node. 1682 * We don't want to pressure a particular node.
1683 * So we add a penalty to the first node in the same 1683 * So we add a penalty to the first node in the same
1684 * distance group, to make the selection round-robin. 1684 * distance group, to make the selection round-robin.
1685 */ 1685 */
1686 1686
1687 if (distance != node_distance(local_node, prev_node)) 1687 if (distance != node_distance(local_node, prev_node))
1688 node_load[node] += load; 1688 node_load[node] += load;
1689 prev_node = node; 1689 prev_node = node;
1690 load--; 1690 load--;
1691 for (i = 0; i < MAX_NR_ZONES; i++) { 1691 for (i = 0; i < MAX_NR_ZONES; i++) {
1692 zonelist = pgdat->node_zonelists + i; 1692 zonelist = pgdat->node_zonelists + i;
1693 for (j = 0; zonelist->zones[j] != NULL; j++); 1693 for (j = 0; zonelist->zones[j] != NULL; j++);
1694 1694
1695 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 1695 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1696 zonelist->zones[j] = NULL; 1696 zonelist->zones[j] = NULL;
1697 } 1697 }
1698 } 1698 }
1699 } 1699 }
1700 1700
1701 /* Construct the zonelist performance cache - see further mmzone.h */ 1701 /* Construct the zonelist performance cache - see further mmzone.h */
1702 static void __meminit build_zonelist_cache(pg_data_t *pgdat) 1702 static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1703 { 1703 {
1704 int i; 1704 int i;
1705 1705
1706 for (i = 0; i < MAX_NR_ZONES; i++) { 1706 for (i = 0; i < MAX_NR_ZONES; i++) {
1707 struct zonelist *zonelist; 1707 struct zonelist *zonelist;
1708 struct zonelist_cache *zlc; 1708 struct zonelist_cache *zlc;
1709 struct zone **z; 1709 struct zone **z;
1710 1710
1711 zonelist = pgdat->node_zonelists + i; 1711 zonelist = pgdat->node_zonelists + i;
1712 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 1712 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
1713 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1713 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1714 for (z = zonelist->zones; *z; z++) 1714 for (z = zonelist->zones; *z; z++)
1715 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); 1715 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
1716 } 1716 }
1717 } 1717 }
1718 1718
1719 #else /* CONFIG_NUMA */ 1719 #else /* CONFIG_NUMA */
1720 1720
1721 static void __meminit build_zonelists(pg_data_t *pgdat) 1721 static void __meminit build_zonelists(pg_data_t *pgdat)
1722 { 1722 {
1723 int node, local_node; 1723 int node, local_node;
1724 enum zone_type i,j; 1724 enum zone_type i,j;
1725 1725
1726 local_node = pgdat->node_id; 1726 local_node = pgdat->node_id;
1727 for (i = 0; i < MAX_NR_ZONES; i++) { 1727 for (i = 0; i < MAX_NR_ZONES; i++) {
1728 struct zonelist *zonelist; 1728 struct zonelist *zonelist;
1729 1729
1730 zonelist = pgdat->node_zonelists + i; 1730 zonelist = pgdat->node_zonelists + i;
1731 1731
1732 j = build_zonelists_node(pgdat, zonelist, 0, i); 1732 j = build_zonelists_node(pgdat, zonelist, 0, i);
1733 /* 1733 /*
1734 * Now we build the zonelist so that it contains the zones 1734 * Now we build the zonelist so that it contains the zones
1735 * of all the other nodes. 1735 * of all the other nodes.
1736 * We don't want to pressure a particular node, so when 1736 * We don't want to pressure a particular node, so when
1737 * building the zones for node N, we make sure that the 1737 * building the zones for node N, we make sure that the
1738 * zones coming right after the local ones are those from 1738 * zones coming right after the local ones are those from
1739 * node N+1 (modulo N) 1739 * node N+1 (modulo N)
1740 */ 1740 */
1741 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 1741 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
1742 if (!node_online(node)) 1742 if (!node_online(node))
1743 continue; 1743 continue;
1744 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 1744 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1745 } 1745 }
1746 for (node = 0; node < local_node; node++) { 1746 for (node = 0; node < local_node; node++) {
1747 if (!node_online(node)) 1747 if (!node_online(node))
1748 continue; 1748 continue;
1749 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 1749 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1750 } 1750 }
1751 1751
1752 zonelist->zones[j] = NULL; 1752 zonelist->zones[j] = NULL;
1753 } 1753 }
1754 } 1754 }
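A small worked example of the ordering described in the comment above (a three-node layout is assumed for illustration):

/* Worked example: nodes 0, 1 and 2 online. The zonelist built for node 1
 * lists zones in the order
 *   node 1, node 2, node 0
 * so fallback pressure is spread round-robin instead of every node
 * overflowing onto node 0 first. */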
1755 1755
1756 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 1756 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
1757 static void __meminit build_zonelist_cache(pg_data_t *pgdat) 1757 static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1758 { 1758 {
1759 int i; 1759 int i;
1760 1760
1761 for (i = 0; i < MAX_NR_ZONES; i++) 1761 for (i = 0; i < MAX_NR_ZONES; i++)
1762 pgdat->node_zonelists[i].zlcache_ptr = NULL; 1762 pgdat->node_zonelists[i].zlcache_ptr = NULL;
1763 } 1763 }
1764 1764
1765 #endif /* CONFIG_NUMA */ 1765 #endif /* CONFIG_NUMA */
1766 1766
1767 /* the int return type exists only to match stop_machine_run()'s callback signature */ 1767 /* the int return type exists only to match stop_machine_run()'s callback signature */
1768 static int __meminit __build_all_zonelists(void *dummy) 1768 static int __meminit __build_all_zonelists(void *dummy)
1769 { 1769 {
1770 int nid; 1770 int nid;
1771 1771
1772 for_each_online_node(nid) { 1772 for_each_online_node(nid) {
1773 build_zonelists(NODE_DATA(nid)); 1773 build_zonelists(NODE_DATA(nid));
1774 build_zonelist_cache(NODE_DATA(nid)); 1774 build_zonelist_cache(NODE_DATA(nid));
1775 } 1775 }
1776 return 0; 1776 return 0;
1777 } 1777 }
1778 1778
1779 void __meminit build_all_zonelists(void) 1779 void __meminit build_all_zonelists(void)
1780 { 1780 {
1781 if (system_state == SYSTEM_BOOTING) { 1781 if (system_state == SYSTEM_BOOTING) {
1782 __build_all_zonelists(NULL); 1782 __build_all_zonelists(NULL);
1783 cpuset_init_current_mems_allowed(); 1783 cpuset_init_current_mems_allowed();
1784 } else { 1784 } else {
1785 /* we have to stop all cpus to guarantee there is no user 1785 /* we have to stop all cpus to guarantee there is no user
1786 of zonelist */ 1786 of zonelist */
1787 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); 1787 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
1788 /* cpuset refresh routine should be here */ 1788 /* cpuset refresh routine should be here */
1789 } 1789 }
1790 vm_total_pages = nr_free_pagecache_pages(); 1790 vm_total_pages = nr_free_pagecache_pages();
1791 printk("Built %i zonelists. Total pages: %ld\n", 1791 printk("Built %i zonelists. Total pages: %ld\n",
1792 num_online_nodes(), vm_total_pages); 1792 num_online_nodes(), vm_total_pages);
1793 } 1793 }
1794 1794
1795 /* 1795 /*
1796 * Helper functions to size the waitqueue hash table. 1796 * Helper functions to size the waitqueue hash table.
1797 * Essentially these want to choose hash table sizes sufficiently 1797 * Essentially these want to choose hash table sizes sufficiently
1798 * large so that collisions trying to wait on pages are rare. 1798 * large so that collisions trying to wait on pages are rare.
1799 * But in fact, the number of active page waitqueues on typical 1799 * But in fact, the number of active page waitqueues on typical
1800 * systems is ridiculously low, less than 200. So this is even 1800 * systems is ridiculously low, less than 200. So this is even
1801 * conservative, even though it seems large. 1801 * conservative, even though it seems large.
1802 * 1802 *
1803 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 1803 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
1804 * waitqueues, i.e. the size of the waitq table given the number of pages. 1804 * waitqueues, i.e. the size of the waitq table given the number of pages.
1805 */ 1805 */
1806 #define PAGES_PER_WAITQUEUE 256 1806 #define PAGES_PER_WAITQUEUE 256
1807 1807
1808 #ifndef CONFIG_MEMORY_HOTPLUG 1808 #ifndef CONFIG_MEMORY_HOTPLUG
1809 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 1809 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1810 { 1810 {
1811 unsigned long size = 1; 1811 unsigned long size = 1;
1812 1812
1813 pages /= PAGES_PER_WAITQUEUE; 1813 pages /= PAGES_PER_WAITQUEUE;
1814 1814
1815 while (size < pages) 1815 while (size < pages)
1816 size <<= 1; 1816 size <<= 1;
1817 1817
1818 /* 1818 /*
1819 * Once we have dozens or even hundreds of threads sleeping 1819 * Once we have dozens or even hundreds of threads sleeping
1820 * on IO we've got bigger problems than wait queue collision. 1820 * on IO we've got bigger problems than wait queue collision.
1821 * Limit the size of the wait table to a reasonable size. 1821 * Limit the size of the wait table to a reasonable size.
1822 */ 1822 */
1823 size = min(size, 4096UL); 1823 size = min(size, 4096UL);
1824 1824
1825 return max(size, 4UL); 1825 return max(size, 4UL);
1826 } 1826 }
1827 #else 1827 #else
1828 /* 1828 /*
1829 * A zone's size might be changed by hot-add, so it is not possible to determine 1829 * A zone's size might be changed by hot-add, so it is not possible to determine
1830 * a suitable size for its wait_table. So we use the maximum size now. 1830 * a suitable size for its wait_table. So we use the maximum size now.
1831 * 1831 *
1832 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 1832 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
1833 * 1833 *
1834 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 1834 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
1835 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 1835 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
1836 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 1836 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
1837 * 1837 *
1838 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 1838 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
1839 * or more by the traditional way. (See above). It equals: 1839 * or more by the traditional way. (See above). It equals:
1840 * 1840 *
1841 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 1841 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
1842 * ia64(16K page size) : = ( 8G + 4M)byte. 1842 * ia64(16K page size) : = ( 8G + 4M)byte.
1843 * powerpc (64K page size) : = (32G +16M)byte. 1843 * powerpc (64K page size) : = (32G +16M)byte.
1844 */ 1844 */
1845 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 1845 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1846 { 1846 {
1847 return 4096UL; 1847 return 4096UL;
1848 } 1848 }
1849 #endif 1849 #endif
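
As a rough check of the non-hotplug sizing above: a one-million-page zone divided by PAGES_PER_WAITQUEUE gives 4096, which is already a power of two and also happens to be the clamp value, while small zones bottom out at 4 entries. A userspace sketch of the same arithmetic (the example page counts are arbitrary):

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

/* power-of-two table size for a zone of the given page count */
static unsigned long wait_table_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;
	if (size > 4096UL)
		size = 4096UL;
	return size < 4UL ? 4UL : size;
}

int main(void)
{
	/* 512 pages -> 4 (floor), 262144 pages (1GB of 4K pages) -> 1024, 1M pages -> 4096 */
	printf("%lu %lu %lu\n", wait_table_entries(512),
	       wait_table_entries(262144), wait_table_entries(1UL << 20));
	return 0;
}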
1850 1850
1851 /* 1851 /*
1852 * This is an integer logarithm so that shifts can be used later 1852 * This is an integer logarithm so that shifts can be used later
1853 * to extract the more random high bits from the multiplicative 1853 * to extract the more random high bits from the multiplicative
1854 * hash function before the remainder is taken. 1854 * hash function before the remainder is taken.
1855 */ 1855 */
1856 static inline unsigned long wait_table_bits(unsigned long size) 1856 static inline unsigned long wait_table_bits(unsigned long size)
1857 { 1857 {
1858 return ffz(~size); 1858 return ffz(~size);
1859 } 1859 }
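
Because the table size is always a power of two, ffz(~size) is simply log2(size): for a 4096-entry table the first zero bit of ~size is bit 12, so 12 is returned and the hash can later be reduced with a shift. A userspace sketch using a plain loop in place of the kernel's ffz() helper:

#include <stdio.h>

/* log2 of a power-of-two size, mirroring what ffz(~size) computes */
static unsigned long table_bits(unsigned long size)
{
	unsigned long bits = 0;

	while (size > 1) {
		size >>= 1;
		bits++;
	}
	return bits;
}

int main(void)
{
	printf("%lu %lu\n", table_bits(4096UL), table_bits(4UL));	/* prints: 12 2 */
	return 0;
}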
1860 1860
1861 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 1861 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1862 1862
1863 /* 1863 /*
1864 * Initially all pages are reserved - free ones are freed 1864 * Initially all pages are reserved - free ones are freed
1865 * up by free_all_bootmem() once the early boot process is 1865 * up by free_all_bootmem() once the early boot process is
1866 * done. Non-atomic initialization, single-pass. 1866 * done. Non-atomic initialization, single-pass.
1867 */ 1867 */
1868 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1868 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1869 unsigned long start_pfn) 1869 unsigned long start_pfn)
1870 { 1870 {
1871 struct page *page; 1871 struct page *page;
1872 unsigned long end_pfn = start_pfn + size; 1872 unsigned long end_pfn = start_pfn + size;
1873 unsigned long pfn; 1873 unsigned long pfn;
1874 1874
1875 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 1875 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1876 if (!early_pfn_valid(pfn)) 1876 if (!early_pfn_valid(pfn))
1877 continue; 1877 continue;
1878 if (!early_pfn_in_nid(pfn, nid)) 1878 if (!early_pfn_in_nid(pfn, nid))
1879 continue; 1879 continue;
1880 page = pfn_to_page(pfn); 1880 page = pfn_to_page(pfn);
1881 set_page_links(page, zone, nid, pfn); 1881 set_page_links(page, zone, nid, pfn);
1882 init_page_count(page); 1882 init_page_count(page);
1883 reset_page_mapcount(page); 1883 reset_page_mapcount(page);
1884 SetPageReserved(page); 1884 SetPageReserved(page);
1885 INIT_LIST_HEAD(&page->lru); 1885 INIT_LIST_HEAD(&page->lru);
1886 #ifdef WANT_PAGE_VIRTUAL 1886 #ifdef WANT_PAGE_VIRTUAL
1887 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 1887 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1888 if (!is_highmem_idx(zone)) 1888 if (!is_highmem_idx(zone))
1889 set_page_address(page, __va(pfn << PAGE_SHIFT)); 1889 set_page_address(page, __va(pfn << PAGE_SHIFT));
1890 #endif 1890 #endif
1891 } 1891 }
1892 } 1892 }
1893 1893
1894 void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, 1894 void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1895 unsigned long size) 1895 unsigned long size)
1896 { 1896 {
1897 int order; 1897 int order;
1898 for (order = 0; order < MAX_ORDER ; order++) { 1898 for (order = 0; order < MAX_ORDER ; order++) {
1899 INIT_LIST_HEAD(&zone->free_area[order].free_list); 1899 INIT_LIST_HEAD(&zone->free_area[order].free_list);
1900 zone->free_area[order].nr_free = 0; 1900 zone->free_area[order].nr_free = 0;
1901 } 1901 }
1902 } 1902 }
1903 1903
1904 #ifndef __HAVE_ARCH_MEMMAP_INIT 1904 #ifndef __HAVE_ARCH_MEMMAP_INIT
1905 #define memmap_init(size, nid, zone, start_pfn) \ 1905 #define memmap_init(size, nid, zone, start_pfn) \
1906 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1906 memmap_init_zone((size), (nid), (zone), (start_pfn))
1907 #endif 1907 #endif
1908 1908
1909 static int __cpuinit zone_batchsize(struct zone *zone) 1909 static int __cpuinit zone_batchsize(struct zone *zone)
1910 { 1910 {
1911 int batch; 1911 int batch;
1912 1912
1913 /* 1913 /*
1914 * The per-cpu-pages pools are set to around 1000th of the 1914 * The per-cpu-pages pools are set to around 1000th of the
1915 * size of the zone. But no more than 1/2 of a meg. 1915 * size of the zone. But no more than 1/2 of a meg.
1916 * 1916 *
1917 * OK, so we don't know how big the cache is. So guess. 1917 * OK, so we don't know how big the cache is. So guess.
1918 */ 1918 */
1919 batch = zone->present_pages / 1024; 1919 batch = zone->present_pages / 1024;
1920 if (batch * PAGE_SIZE > 512 * 1024) 1920 if (batch * PAGE_SIZE > 512 * 1024)
1921 batch = (512 * 1024) / PAGE_SIZE; 1921 batch = (512 * 1024) / PAGE_SIZE;
1922 batch /= 4; /* We effectively *= 4 below */ 1922 batch /= 4; /* We effectively *= 4 below */
1923 if (batch < 1) 1923 if (batch < 1)
1924 batch = 1; 1924 batch = 1;
1925 1925
1926 /* 1926 /*
1927 * Clamp the batch to a 2^n - 1 value. Having a power 1927 * Clamp the batch to a 2^n - 1 value. Having a power
1928 * of 2 value was found to be more likely to have 1928 * of 2 value was found to be more likely to have
1929 * suboptimal cache aliasing properties in some cases. 1929 * suboptimal cache aliasing properties in some cases.
1930 * 1930 *
1931 * For example if 2 tasks are alternately allocating 1931 * For example if 2 tasks are alternately allocating
1932 * batches of pages, one task can end up with a lot 1932 * batches of pages, one task can end up with a lot
1933 * of pages of one half of the possible page colors 1933 * of pages of one half of the possible page colors
1934 * and the other with pages of the other colors. 1934 * and the other with pages of the other colors.
1935 */ 1935 */
1936 batch = (1 << (fls(batch + batch/2)-1)) - 1; 1936 batch = (1 << (fls(batch + batch/2)-1)) - 1;
1937 1937
1938 return batch; 1938 return batch;
1939 } 1939 }
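
Plugging a hypothetical 1 GB zone of 4 KB pages (262144 present pages) into the code above: 262144/1024 = 256, which exceeds the 512 KB cap and drops to 128, then to 32 after the division by four, and the final rounding yields 31, one less than a power of two. A userspace sketch of the same steps; the page size and zone size are example values, and fls() is re-implemented with a loop:

#include <stdio.h>

#define PAGE_SIZE 4096UL	/* example page size */

/* highest set bit, 1-based, like the kernel's fls() */
static int fls_long(unsigned long x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

static int batchsize(unsigned long present_pages)
{
	int batch = present_pages / 1024;

	if (batch * PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / PAGE_SIZE;
	batch /= 4;
	if (batch < 1)
		batch = 1;
	/* clamp to 2^n - 1, as in zone_batchsize() */
	return (1 << (fls_long(batch + batch / 2) - 1)) - 1;
}

int main(void)
{
	printf("%d\n", batchsize(262144));	/* prints: 31 */
	return 0;
}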
1940 1940
1941 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 1941 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1942 { 1942 {
1943 struct per_cpu_pages *pcp; 1943 struct per_cpu_pages *pcp;
1944 1944
1945 memset(p, 0, sizeof(*p)); 1945 memset(p, 0, sizeof(*p));
1946 1946
1947 pcp = &p->pcp[0]; /* hot */ 1947 pcp = &p->pcp[0]; /* hot */
1948 pcp->count = 0; 1948 pcp->count = 0;
1949 pcp->high = 6 * batch; 1949 pcp->high = 6 * batch;
1950 pcp->batch = max(1UL, 1 * batch); 1950 pcp->batch = max(1UL, 1 * batch);
1951 INIT_LIST_HEAD(&pcp->list); 1951 INIT_LIST_HEAD(&pcp->list);
1952 1952
1953 pcp = &p->pcp[1]; /* cold*/ 1953 pcp = &p->pcp[1]; /* cold*/
1954 pcp->count = 0; 1954 pcp->count = 0;
1955 pcp->high = 2 * batch; 1955 pcp->high = 2 * batch;
1956 pcp->batch = max(1UL, batch/2); 1956 pcp->batch = max(1UL, batch/2);
1957 INIT_LIST_HEAD(&pcp->list); 1957 INIT_LIST_HEAD(&pcp->list);
1958 } 1958 }
1959 1959
1960 /* 1960 /*
1961 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 1961 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
1962 * to the value high for the pageset p. 1962 * to the value high for the pageset p.
1963 */ 1963 */
1964 1964
1965 static void setup_pagelist_highmark(struct per_cpu_pageset *p, 1965 static void setup_pagelist_highmark(struct per_cpu_pageset *p,
1966 unsigned long high) 1966 unsigned long high)
1967 { 1967 {
1968 struct per_cpu_pages *pcp; 1968 struct per_cpu_pages *pcp;
1969 1969
1970 pcp = &p->pcp[0]; /* hot list */ 1970 pcp = &p->pcp[0]; /* hot list */
1971 pcp->high = high; 1971 pcp->high = high;
1972 pcp->batch = max(1UL, high/4); 1972 pcp->batch = max(1UL, high/4);
1973 if ((high/4) > (PAGE_SHIFT * 8)) 1973 if ((high/4) > (PAGE_SHIFT * 8))
1974 pcp->batch = PAGE_SHIFT * 8; 1974 pcp->batch = PAGE_SHIFT * 8;
1975 } 1975 }
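
With 4 KB pages (PAGE_SHIFT = 12) the cap above is 96 pages, so a requested high watermark of 1000 ends up with a batch of 96 rather than high/4 = 250. A userspace sketch of that clamping (the PAGE_SHIFT and high values are examples):

#include <stdio.h>

#define PAGE_SHIFT 12	/* example: 4 KB pages */

int main(void)
{
	unsigned long high = 1000;	/* hypothetical per-cpu high watermark */
	unsigned long batch = high / 4 > 1 ? high / 4 : 1;

	if (high / 4 > PAGE_SHIFT * 8)
		batch = PAGE_SHIFT * 8;
	printf("high=%lu batch=%lu\n", high, batch);	/* prints: high=1000 batch=96 */
	return 0;
}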
1976 1976
1977 1977
1978 #ifdef CONFIG_NUMA 1978 #ifdef CONFIG_NUMA
1979 /* 1979 /*
1980 * Boot pageset table. One per cpu which is going to be used for all 1980 * Boot pageset table. One per cpu which is going to be used for all
1981 * zones and all nodes. The parameters will be set in such a way 1981 * zones and all nodes. The parameters will be set in such a way
1982 * that an item put on a list will immediately be handed over to 1982 * that an item put on a list will immediately be handed over to
1983 * the buddy list. This is safe since pageset manipulation is done 1983 * the buddy list. This is safe since pageset manipulation is done
1984 * with interrupts disabled. 1984 * with interrupts disabled.
1985 * 1985 *
1986 * Some NUMA counter updates may also be caught by the boot pagesets. 1986 * Some NUMA counter updates may also be caught by the boot pagesets.
1987 * 1987 *
1988 * The boot_pagesets must be kept even after bootup is complete for 1988 * The boot_pagesets must be kept even after bootup is complete for
1989 * unused processors and/or zones. They do play a role for bootstrapping 1989 * unused processors and/or zones. They do play a role for bootstrapping
1990 * hotplugged processors. 1990 * hotplugged processors.
1991 * 1991 *
1992 * zoneinfo_show() and maybe other functions do 1992 * zoneinfo_show() and maybe other functions do
1993 * not check if the processor is online before following the pageset pointer. 1993 * not check if the processor is online before following the pageset pointer.
1994 * Other parts of the kernel may not check if the zone is available. 1994 * Other parts of the kernel may not check if the zone is available.
1995 */ 1995 */
1996 static struct per_cpu_pageset boot_pageset[NR_CPUS]; 1996 static struct per_cpu_pageset boot_pageset[NR_CPUS];
1997 1997
1998 /* 1998 /*
1999 * Dynamically allocate memory for the 1999 * Dynamically allocate memory for the
2000 * per cpu pageset array in struct zone. 2000 * per cpu pageset array in struct zone.
2001 */ 2001 */
2002 static int __cpuinit process_zones(int cpu) 2002 static int __cpuinit process_zones(int cpu)
2003 { 2003 {
2004 struct zone *zone, *dzone; 2004 struct zone *zone, *dzone;
2005 2005
2006 for_each_zone(zone) { 2006 for_each_zone(zone) {
2007 2007
2008 if (!populated_zone(zone)) 2008 if (!populated_zone(zone))
2009 continue; 2009 continue;
2010 2010
2011 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 2011 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
2012 GFP_KERNEL, cpu_to_node(cpu)); 2012 GFP_KERNEL, cpu_to_node(cpu));
2013 if (!zone_pcp(zone, cpu)) 2013 if (!zone_pcp(zone, cpu))
2014 goto bad; 2014 goto bad;
2015 2015
2016 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); 2016 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
2017 2017
2018 if (percpu_pagelist_fraction) 2018 if (percpu_pagelist_fraction)
2019 setup_pagelist_highmark(zone_pcp(zone, cpu), 2019 setup_pagelist_highmark(zone_pcp(zone, cpu),
2020 (zone->present_pages / percpu_pagelist_fraction)); 2020 (zone->present_pages / percpu_pagelist_fraction));
2021 } 2021 }
2022 2022
2023 return 0; 2023 return 0;
2024 bad: 2024 bad:
2025 for_each_zone(dzone) { 2025 for_each_zone(dzone) {
2026 if (dzone == zone) 2026 if (dzone == zone)
2027 break; 2027 break;
2028 kfree(zone_pcp(dzone, cpu)); 2028 kfree(zone_pcp(dzone, cpu));
2029 zone_pcp(dzone, cpu) = NULL; 2029 zone_pcp(dzone, cpu) = NULL;
2030 } 2030 }
2031 return -ENOMEM; 2031 return -ENOMEM;
2032 } 2032 }
2033 2033
2034 static inline void free_zone_pagesets(int cpu) 2034 static inline void free_zone_pagesets(int cpu)
2035 { 2035 {
2036 struct zone *zone; 2036 struct zone *zone;
2037 2037
2038 for_each_zone(zone) { 2038 for_each_zone(zone) {
2039 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 2039 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
2040 2040
2041 /* Free per_cpu_pageset if it is slab allocated */ 2041 /* Free per_cpu_pageset if it is slab allocated */
2042 if (pset != &boot_pageset[cpu]) 2042 if (pset != &boot_pageset[cpu])
2043 kfree(pset); 2043 kfree(pset);
2044 zone_pcp(zone, cpu) = NULL; 2044 zone_pcp(zone, cpu) = NULL;
2045 } 2045 }
2046 } 2046 }
2047 2047
2048 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, 2048 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
2049 unsigned long action, 2049 unsigned long action,
2050 void *hcpu) 2050 void *hcpu)
2051 { 2051 {
2052 int cpu = (long)hcpu; 2052 int cpu = (long)hcpu;
2053 int ret = NOTIFY_OK; 2053 int ret = NOTIFY_OK;
2054 2054
2055 switch (action) { 2055 switch (action) {
2056 case CPU_UP_PREPARE: 2056 case CPU_UP_PREPARE:
2057 if (process_zones(cpu)) 2057 if (process_zones(cpu))
2058 ret = NOTIFY_BAD; 2058 ret = NOTIFY_BAD;
2059 break; 2059 break;
2060 case CPU_UP_CANCELED: 2060 case CPU_UP_CANCELED:
2061 case CPU_DEAD: 2061 case CPU_DEAD:
2062 free_zone_pagesets(cpu); 2062 free_zone_pagesets(cpu);
2063 break; 2063 break;
2064 default: 2064 default:
2065 break; 2065 break;
2066 } 2066 }
2067 return ret; 2067 return ret;
2068 } 2068 }
2069 2069
2070 static struct notifier_block __cpuinitdata pageset_notifier = 2070 static struct notifier_block __cpuinitdata pageset_notifier =
2071 { &pageset_cpuup_callback, NULL, 0 }; 2071 { &pageset_cpuup_callback, NULL, 0 };
2072 2072
2073 void __init setup_per_cpu_pageset(void) 2073 void __init setup_per_cpu_pageset(void)
2074 { 2074 {
2075 int err; 2075 int err;
2076 2076
2077 /* Initialize per_cpu_pageset for cpu 0. 2077 /* Initialize per_cpu_pageset for cpu 0.
2078 * A cpuup callback will do this for every cpu 2078 * A cpuup callback will do this for every cpu
2079 * as it comes online 2079 * as it comes online
2080 */ 2080 */
2081 err = process_zones(smp_processor_id()); 2081 err = process_zones(smp_processor_id());
2082 BUG_ON(err); 2082 BUG_ON(err);
2083 register_cpu_notifier(&pageset_notifier); 2083 register_cpu_notifier(&pageset_notifier);
2084 } 2084 }
2085 2085
2086 #endif 2086 #endif
2087 2087
2088 static __meminit 2088 static __meminit
2089 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 2089 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
2090 { 2090 {
2091 int i; 2091 int i;
2092 struct pglist_data *pgdat = zone->zone_pgdat; 2092 struct pglist_data *pgdat = zone->zone_pgdat;
2093 size_t alloc_size; 2093 size_t alloc_size;
2094 2094
2095 /* 2095 /*
2096 * The per-page waitqueue mechanism uses hashed waitqueues 2096 * The per-page waitqueue mechanism uses hashed waitqueues
2097 * per zone. 2097 * per zone.
2098 */ 2098 */
2099 zone->wait_table_hash_nr_entries = 2099 zone->wait_table_hash_nr_entries =
2100 wait_table_hash_nr_entries(zone_size_pages); 2100 wait_table_hash_nr_entries(zone_size_pages);
2101 zone->wait_table_bits = 2101 zone->wait_table_bits =
2102 wait_table_bits(zone->wait_table_hash_nr_entries); 2102 wait_table_bits(zone->wait_table_hash_nr_entries);
2103 alloc_size = zone->wait_table_hash_nr_entries 2103 alloc_size = zone->wait_table_hash_nr_entries
2104 * sizeof(wait_queue_head_t); 2104 * sizeof(wait_queue_head_t);
2105 2105
2106 if (system_state == SYSTEM_BOOTING) { 2106 if (system_state == SYSTEM_BOOTING) {
2107 zone->wait_table = (wait_queue_head_t *) 2107 zone->wait_table = (wait_queue_head_t *)
2108 alloc_bootmem_node(pgdat, alloc_size); 2108 alloc_bootmem_node(pgdat, alloc_size);
2109 } else { 2109 } else {
2110 /* 2110 /*
2111 * This case means that a zone whose size was 0 gets new memory 2111 * This case means that a zone whose size was 0 gets new memory
2112 * via memory hot-add. 2112 * via memory hot-add.
2113 * But it may be the case that a new node was hot-added. In 2113 * But it may be the case that a new node was hot-added. In
2114 * this case vmalloc() will not be able to use this new node's 2114 * this case vmalloc() will not be able to use this new node's
2115 * memory - this wait_table must be initialized to use this new 2115 * memory - this wait_table must be initialized to use this new
2116 * node itself as well. 2116 * node itself as well.
2117 * To use this new node's memory, further consideration will be 2117 * To use this new node's memory, further consideration will be
2118 * necessary. 2118 * necessary.
2119 */ 2119 */
2120 zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); 2120 zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
2121 } 2121 }
2122 if (!zone->wait_table) 2122 if (!zone->wait_table)
2123 return -ENOMEM; 2123 return -ENOMEM;
2124 2124
2125 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 2125 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
2126 init_waitqueue_head(zone->wait_table + i); 2126 init_waitqueue_head(zone->wait_table + i);
2127 2127
2128 return 0; 2128 return 0;
2129 } 2129 }
2130 2130
2131 static __meminit void zone_pcp_init(struct zone *zone) 2131 static __meminit void zone_pcp_init(struct zone *zone)
2132 { 2132 {
2133 int cpu; 2133 int cpu;
2134 unsigned long batch = zone_batchsize(zone); 2134 unsigned long batch = zone_batchsize(zone);
2135 2135
2136 for (cpu = 0; cpu < NR_CPUS; cpu++) { 2136 for (cpu = 0; cpu < NR_CPUS; cpu++) {
2137 #ifdef CONFIG_NUMA 2137 #ifdef CONFIG_NUMA
2138 /* Early boot. Slab allocator not functional yet */ 2138 /* Early boot. Slab allocator not functional yet */
2139 zone_pcp(zone, cpu) = &boot_pageset[cpu]; 2139 zone_pcp(zone, cpu) = &boot_pageset[cpu];
2140 setup_pageset(&boot_pageset[cpu],0); 2140 setup_pageset(&boot_pageset[cpu],0);
2141 #else 2141 #else
2142 setup_pageset(zone_pcp(zone,cpu), batch); 2142 setup_pageset(zone_pcp(zone,cpu), batch);
2143 #endif 2143 #endif
2144 } 2144 }
2145 if (zone->present_pages) 2145 if (zone->present_pages)
2146 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 2146 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
2147 zone->name, zone->present_pages, batch); 2147 zone->name, zone->present_pages, batch);
2148 } 2148 }
2149 2149
2150 __meminit int init_currently_empty_zone(struct zone *zone, 2150 __meminit int init_currently_empty_zone(struct zone *zone,
2151 unsigned long zone_start_pfn, 2151 unsigned long zone_start_pfn,
2152 unsigned long size) 2152 unsigned long size)
2153 { 2153 {
2154 struct pglist_data *pgdat = zone->zone_pgdat; 2154 struct pglist_data *pgdat = zone->zone_pgdat;
2155 int ret; 2155 int ret;
2156 ret = zone_wait_table_init(zone, size); 2156 ret = zone_wait_table_init(zone, size);
2157 if (ret) 2157 if (ret)
2158 return ret; 2158 return ret;
2159 pgdat->nr_zones = zone_idx(zone) + 1; 2159 pgdat->nr_zones = zone_idx(zone) + 1;
2160 2160
2161 zone->zone_start_pfn = zone_start_pfn; 2161 zone->zone_start_pfn = zone_start_pfn;
2162 2162
2163 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 2163 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
2164 2164
2165 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 2165 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
2166 2166
2167 return 0; 2167 return 0;
2168 } 2168 }
2169 2169
2170 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2170 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2171 /* 2171 /*
2172 * Basic iterator support. Return the first range of PFNs for a node 2172 * Basic iterator support. Return the first range of PFNs for a node
2173 * Note: nid == MAX_NUMNODES returns first region regardless of node 2173 * Note: nid == MAX_NUMNODES returns first region regardless of node
2174 */ 2174 */
2175 static int __init first_active_region_index_in_nid(int nid) 2175 static int __init first_active_region_index_in_nid(int nid)
2176 { 2176 {
2177 int i; 2177 int i;
2178 2178
2179 for (i = 0; i < nr_nodemap_entries; i++) 2179 for (i = 0; i < nr_nodemap_entries; i++)
2180 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) 2180 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
2181 return i; 2181 return i;
2182 2182
2183 return -1; 2183 return -1;
2184 } 2184 }
2185 2185
2186 /* 2186 /*
2187 * Basic iterator support. Return the next active range of PFNs for a node 2187 * Basic iterator support. Return the next active range of PFNs for a node
2188 * Note: nid == MAX_NUMNODES returns next region regardless of node 2188 * Note: nid == MAX_NUMNODES returns next region regardless of node
2189 */ 2189 */
2190 static int __init next_active_region_index_in_nid(int index, int nid) 2190 static int __init next_active_region_index_in_nid(int index, int nid)
2191 { 2191 {
2192 for (index = index + 1; index < nr_nodemap_entries; index++) 2192 for (index = index + 1; index < nr_nodemap_entries; index++)
2193 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) 2193 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
2194 return index; 2194 return index;
2195 2195
2196 return -1; 2196 return -1;
2197 } 2197 }
2198 2198
2199 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 2199 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
2200 /* 2200 /*
2201 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 2201 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
2202 * Architectures may implement their own version but if add_active_range() 2202 * Architectures may implement their own version but if add_active_range()
2203 * was used and there are no special requirements, this is a convenient 2203 * was used and there are no special requirements, this is a convenient
2204 * alternative 2204 * alternative
2205 */ 2205 */
2206 int __init early_pfn_to_nid(unsigned long pfn) 2206 int __init early_pfn_to_nid(unsigned long pfn)
2207 { 2207 {
2208 int i; 2208 int i;
2209 2209
2210 for (i = 0; i < nr_nodemap_entries; i++) { 2210 for (i = 0; i < nr_nodemap_entries; i++) {
2211 unsigned long start_pfn = early_node_map[i].start_pfn; 2211 unsigned long start_pfn = early_node_map[i].start_pfn;
2212 unsigned long end_pfn = early_node_map[i].end_pfn; 2212 unsigned long end_pfn = early_node_map[i].end_pfn;
2213 2213
2214 if (start_pfn <= pfn && pfn < end_pfn) 2214 if (start_pfn <= pfn && pfn < end_pfn)
2215 return early_node_map[i].nid; 2215 return early_node_map[i].nid;
2216 } 2216 }
2217 2217
2218 return 0; 2218 return 0;
2219 } 2219 }
2220 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 2220 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
2221 2221
2222 /* Basic iterator support to walk early_node_map[] */ 2222 /* Basic iterator support to walk early_node_map[] */
2223 #define for_each_active_range_index_in_nid(i, nid) \ 2223 #define for_each_active_range_index_in_nid(i, nid) \
2224 for (i = first_active_region_index_in_nid(nid); i != -1; \ 2224 for (i = first_active_region_index_in_nid(nid); i != -1; \
2225 i = next_active_region_index_in_nid(i, nid)) 2225 i = next_active_region_index_in_nid(i, nid))
2226 2226
2227 /** 2227 /**
2228 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 2228 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
2229 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 2229 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
2230 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 2230 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
2231 * 2231 *
2232 * If an architecture guarantees that all ranges registered with 2232 * If an architecture guarantees that all ranges registered with
2233 * add_active_ranges() contain no holes and may be freed, this 2233 * add_active_ranges() contain no holes and may be freed, this
2234 * function may be used instead of calling free_bootmem() manually. 2234 * function may be used instead of calling free_bootmem() manually.
2235 */ 2235 */
2236 void __init free_bootmem_with_active_regions(int nid, 2236 void __init free_bootmem_with_active_regions(int nid,
2237 unsigned long max_low_pfn) 2237 unsigned long max_low_pfn)
2238 { 2238 {
2239 int i; 2239 int i;
2240 2240
2241 for_each_active_range_index_in_nid(i, nid) { 2241 for_each_active_range_index_in_nid(i, nid) {
2242 unsigned long size_pages = 0; 2242 unsigned long size_pages = 0;
2243 unsigned long end_pfn = early_node_map[i].end_pfn; 2243 unsigned long end_pfn = early_node_map[i].end_pfn;
2244 2244
2245 if (early_node_map[i].start_pfn >= max_low_pfn) 2245 if (early_node_map[i].start_pfn >= max_low_pfn)
2246 continue; 2246 continue;
2247 2247
2248 if (end_pfn > max_low_pfn) 2248 if (end_pfn > max_low_pfn)
2249 end_pfn = max_low_pfn; 2249 end_pfn = max_low_pfn;
2250 2250
2251 size_pages = end_pfn - early_node_map[i].start_pfn; 2251 size_pages = end_pfn - early_node_map[i].start_pfn;
2252 free_bootmem_node(NODE_DATA(early_node_map[i].nid), 2252 free_bootmem_node(NODE_DATA(early_node_map[i].nid),
2253 PFN_PHYS(early_node_map[i].start_pfn), 2253 PFN_PHYS(early_node_map[i].start_pfn),
2254 size_pages << PAGE_SHIFT); 2254 size_pages << PAGE_SHIFT);
2255 } 2255 }
2256 } 2256 }
2257 2257
2258 /** 2258 /**
2259 * sparse_memory_present_with_active_regions - Call memory_present for each active range 2259 * sparse_memory_present_with_active_regions - Call memory_present for each active range
2260 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 2260 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
2261 * 2261 *
2262 * If an architecture guarantees that all ranges registered with 2262 * If an architecture guarantees that all ranges registered with
2263 * add_active_ranges() contain no holes and may be freed, this 2263 * add_active_ranges() contain no holes and may be freed, this
2264 * function may be used instead of calling memory_present() manually. 2264 * function may be used instead of calling memory_present() manually.
2265 */ 2265 */
2266 void __init sparse_memory_present_with_active_regions(int nid) 2266 void __init sparse_memory_present_with_active_regions(int nid)
2267 { 2267 {
2268 int i; 2268 int i;
2269 2269
2270 for_each_active_range_index_in_nid(i, nid) 2270 for_each_active_range_index_in_nid(i, nid)
2271 memory_present(early_node_map[i].nid, 2271 memory_present(early_node_map[i].nid,
2272 early_node_map[i].start_pfn, 2272 early_node_map[i].start_pfn,
2273 early_node_map[i].end_pfn); 2273 early_node_map[i].end_pfn);
2274 } 2274 }
2275 2275
2276 /** 2276 /**
2277 * push_node_boundaries - Push node boundaries to at least the requested boundary 2277 * push_node_boundaries - Push node boundaries to at least the requested boundary
2278 * @nid: The nid of the node to push the boundary for 2278 * @nid: The nid of the node to push the boundary for
2279 * @start_pfn: The start pfn of the node 2279 * @start_pfn: The start pfn of the node
2280 * @end_pfn: The end pfn of the node 2280 * @end_pfn: The end pfn of the node
2281 * 2281 *
2282 * In reserve-based hot-add, a mem_map is allocated that remains unused until hotadd 2282 * In reserve-based hot-add, a mem_map is allocated that remains unused until hotadd
2283 * time. Specifically, on x86_64, SRAT will report ranges that can potentially 2283 * time. Specifically, on x86_64, SRAT will report ranges that can potentially
2284 * be hotplugged even though no physical memory exists. This function allows 2284 * be hotplugged even though no physical memory exists. This function allows
2285 * an arch to push out the node boundaries so mem_map is allocated that can 2285 * an arch to push out the node boundaries so mem_map is allocated that can
2286 * be used later. 2286 * be used later.
2287 */ 2287 */
2288 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 2288 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2289 void __init push_node_boundaries(unsigned int nid, 2289 void __init push_node_boundaries(unsigned int nid,
2290 unsigned long start_pfn, unsigned long end_pfn) 2290 unsigned long start_pfn, unsigned long end_pfn)
2291 { 2291 {
2292 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", 2292 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
2293 nid, start_pfn, end_pfn); 2293 nid, start_pfn, end_pfn);
2294 2294
2295 /* Initialise the boundary for this node if necessary */ 2295 /* Initialise the boundary for this node if necessary */
2296 if (node_boundary_end_pfn[nid] == 0) 2296 if (node_boundary_end_pfn[nid] == 0)
2297 node_boundary_start_pfn[nid] = -1UL; 2297 node_boundary_start_pfn[nid] = -1UL;
2298 2298
2299 /* Update the boundaries */ 2299 /* Update the boundaries */
2300 if (node_boundary_start_pfn[nid] > start_pfn) 2300 if (node_boundary_start_pfn[nid] > start_pfn)
2301 node_boundary_start_pfn[nid] = start_pfn; 2301 node_boundary_start_pfn[nid] = start_pfn;
2302 if (node_boundary_end_pfn[nid] < end_pfn) 2302 if (node_boundary_end_pfn[nid] < end_pfn)
2303 node_boundary_end_pfn[nid] = end_pfn; 2303 node_boundary_end_pfn[nid] = end_pfn;
2304 } 2304 }
2305 2305
2306 /* If necessary, push the node boundary out for reserve hotadd */ 2306 /* If necessary, push the node boundary out for reserve hotadd */
2307 static void __init account_node_boundary(unsigned int nid, 2307 static void __init account_node_boundary(unsigned int nid,
2308 unsigned long *start_pfn, unsigned long *end_pfn) 2308 unsigned long *start_pfn, unsigned long *end_pfn)
2309 { 2309 {
2310 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", 2310 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
2311 nid, *start_pfn, *end_pfn); 2311 nid, *start_pfn, *end_pfn);
2312 2312
2313 /* Return if boundary information has not been provided */ 2313 /* Return if boundary information has not been provided */
2314 if (node_boundary_end_pfn[nid] == 0) 2314 if (node_boundary_end_pfn[nid] == 0)
2315 return; 2315 return;
2316 2316
2317 /* Check the boundaries and update if necessary */ 2317 /* Check the boundaries and update if necessary */
2318 if (node_boundary_start_pfn[nid] < *start_pfn) 2318 if (node_boundary_start_pfn[nid] < *start_pfn)
2319 *start_pfn = node_boundary_start_pfn[nid]; 2319 *start_pfn = node_boundary_start_pfn[nid];
2320 if (node_boundary_end_pfn[nid] > *end_pfn) 2320 if (node_boundary_end_pfn[nid] > *end_pfn)
2321 *end_pfn = node_boundary_end_pfn[nid]; 2321 *end_pfn = node_boundary_end_pfn[nid];
2322 } 2322 }
2323 #else 2323 #else
2324 void __init push_node_boundaries(unsigned int nid, 2324 void __init push_node_boundaries(unsigned int nid,
2325 unsigned long start_pfn, unsigned long end_pfn) {} 2325 unsigned long start_pfn, unsigned long end_pfn) {}
2326 2326
2327 static void __init account_node_boundary(unsigned int nid, 2327 static void __init account_node_boundary(unsigned int nid,
2328 unsigned long *start_pfn, unsigned long *end_pfn) {} 2328 unsigned long *start_pfn, unsigned long *end_pfn) {}
2329 #endif 2329 #endif
2330 2330
2331 2331
2332 /** 2332 /**
2333 * get_pfn_range_for_nid - Return the start and end page frames for a node 2333 * get_pfn_range_for_nid - Return the start and end page frames for a node
2334 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 2334 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
2335 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 2335 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
2336 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 2336 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
2337 * 2337 *
2338 * It returns the start and end page frame of a node based on information 2338 * It returns the start and end page frame of a node based on information
2339 * provided by an arch calling add_active_range(). If called for a node 2339 * provided by an arch calling add_active_range(). If called for a node
2340 * with no available memory, a warning is printed and the start and end 2340 * with no available memory, a warning is printed and the start and end
2341 * PFNs will be 0. 2341 * PFNs will be 0.
2342 */ 2342 */
2343 void __init get_pfn_range_for_nid(unsigned int nid, 2343 void __init get_pfn_range_for_nid(unsigned int nid,
2344 unsigned long *start_pfn, unsigned long *end_pfn) 2344 unsigned long *start_pfn, unsigned long *end_pfn)
2345 { 2345 {
2346 int i; 2346 int i;
2347 *start_pfn = -1UL; 2347 *start_pfn = -1UL;
2348 *end_pfn = 0; 2348 *end_pfn = 0;
2349 2349
2350 for_each_active_range_index_in_nid(i, nid) { 2350 for_each_active_range_index_in_nid(i, nid) {
2351 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); 2351 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
2352 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); 2352 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
2353 } 2353 }
2354 2354
2355 if (*start_pfn == -1UL) { 2355 if (*start_pfn == -1UL) {
2356 printk(KERN_WARNING "Node %u active with no memory\n", nid); 2356 printk(KERN_WARNING "Node %u active with no memory\n", nid);
2357 *start_pfn = 0; 2357 *start_pfn = 0;
2358 } 2358 }
2359 2359
2360 /* Push the node boundaries out if requested */ 2360 /* Push the node boundaries out if requested */
2361 account_node_boundary(nid, start_pfn, end_pfn); 2361 account_node_boundary(nid, start_pfn, end_pfn);
2362 } 2362 }
2363 2363
2364 /* 2364 /*
2365 * Return the number of pages a zone spans in a node, including holes 2365 * Return the number of pages a zone spans in a node, including holes
2366 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 2366 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
2367 */ 2367 */
2368 unsigned long __init zone_spanned_pages_in_node(int nid, 2368 unsigned long __init zone_spanned_pages_in_node(int nid,
2369 unsigned long zone_type, 2369 unsigned long zone_type,
2370 unsigned long *ignored) 2370 unsigned long *ignored)
2371 { 2371 {
2372 unsigned long node_start_pfn, node_end_pfn; 2372 unsigned long node_start_pfn, node_end_pfn;
2373 unsigned long zone_start_pfn, zone_end_pfn; 2373 unsigned long zone_start_pfn, zone_end_pfn;
2374 2374
2375 /* Get the start and end of the node and zone */ 2375 /* Get the start and end of the node and zone */
2376 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 2376 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2377 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 2377 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
2378 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 2378 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
2379 2379
2380 /* Check that this node has pages within the zone's required range */ 2380 /* Check that this node has pages within the zone's required range */
2381 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 2381 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
2382 return 0; 2382 return 0;
2383 2383
2384 /* Move the zone boundaries inside the node if necessary */ 2384 /* Move the zone boundaries inside the node if necessary */
2385 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 2385 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
2386 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 2386 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
2387 2387
2388 /* Return the spanned pages */ 2388 /* Return the spanned pages */
2389 return zone_end_pfn - zone_start_pfn; 2389 return zone_end_pfn - zone_start_pfn;
2390 } 2390 }
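
As a concrete case of the clamping above, a node spanning PFNs [1000, 5000) intersected with a zone whose architectural limits are [0, 4096) spans 4096 - 1000 = 3096 pages. A userspace sketch of the same intersection (the PFN ranges are invented for the example):

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

/* pages a zone spans within a node, including holes */
static unsigned long spanned(unsigned long node_start, unsigned long node_end,
			     unsigned long zone_start, unsigned long zone_end)
{
	if (zone_end < node_start || zone_start > node_end)
		return 0;
	zone_end = min_ul(zone_end, node_end);
	zone_start = max_ul(zone_start, node_start);
	return zone_end - zone_start;
}

int main(void)
{
	printf("%lu\n", spanned(1000, 5000, 0, 4096));	/* prints: 3096 */
	return 0;
}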
2391 2391
2392 /* 2392 /*
2393 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 2393 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
2394 * then all holes in the requested range will be accounted for. 2394 * then all holes in the requested range will be accounted for.
2395 */ 2395 */
2396 unsigned long __init __absent_pages_in_range(int nid, 2396 unsigned long __init __absent_pages_in_range(int nid,
2397 unsigned long range_start_pfn, 2397 unsigned long range_start_pfn,
2398 unsigned long range_end_pfn) 2398 unsigned long range_end_pfn)
2399 { 2399 {
2400 int i = 0; 2400 int i = 0;
2401 unsigned long prev_end_pfn = 0, hole_pages = 0; 2401 unsigned long prev_end_pfn = 0, hole_pages = 0;
2402 unsigned long start_pfn; 2402 unsigned long start_pfn;
2403 2403
2404 /* Find the end_pfn of the first active range of pfns in the node */ 2404 /* Find the end_pfn of the first active range of pfns in the node */
2405 i = first_active_region_index_in_nid(nid); 2405 i = first_active_region_index_in_nid(nid);
2406 if (i == -1) 2406 if (i == -1)
2407 return 0; 2407 return 0;
2408 2408
2409 /* Account for ranges before physical memory on this node */ 2409 /* Account for ranges before physical memory on this node */
2410 if (early_node_map[i].start_pfn > range_start_pfn) 2410 if (early_node_map[i].start_pfn > range_start_pfn)
2411 hole_pages = early_node_map[i].start_pfn - range_start_pfn; 2411 hole_pages = early_node_map[i].start_pfn - range_start_pfn;
2412 2412
2413 prev_end_pfn = early_node_map[i].start_pfn; 2413 prev_end_pfn = early_node_map[i].start_pfn;
2414 2414
2415 /* Find all holes for the zone within the node */ 2415 /* Find all holes for the zone within the node */
2416 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { 2416 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
2417 2417
2418 /* No need to continue if prev_end_pfn is outside the zone */ 2418 /* No need to continue if prev_end_pfn is outside the zone */
2419 if (prev_end_pfn >= range_end_pfn) 2419 if (prev_end_pfn >= range_end_pfn)
2420 break; 2420 break;
2421 2421
2422 /* Make sure the end of the zone is not within the hole */ 2422 /* Make sure the end of the zone is not within the hole */
2423 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); 2423 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
2424 prev_end_pfn = max(prev_end_pfn, range_start_pfn); 2424 prev_end_pfn = max(prev_end_pfn, range_start_pfn);
2425 2425
2426 /* Update the hole size count and move on */ 2426 /* Update the hole size count and move on */
2427 if (start_pfn > range_start_pfn) { 2427 if (start_pfn > range_start_pfn) {
2428 BUG_ON(prev_end_pfn > start_pfn); 2428 BUG_ON(prev_end_pfn > start_pfn);
2429 hole_pages += start_pfn - prev_end_pfn; 2429 hole_pages += start_pfn - prev_end_pfn;
2430 } 2430 }
2431 prev_end_pfn = early_node_map[i].end_pfn; 2431 prev_end_pfn = early_node_map[i].end_pfn;
2432 } 2432 }
2433 2433
2434 /* Account for ranges past physical memory on this node */ 2434 /* Account for ranges past physical memory on this node */
2435 if (range_end_pfn > prev_end_pfn) 2435 if (range_end_pfn > prev_end_pfn)
2436 hole_pages += range_end_pfn - 2436 hole_pages += range_end_pfn -
2437 max(range_start_pfn, prev_end_pfn); 2437 max(range_start_pfn, prev_end_pfn);
2438 2438
2439 return hole_pages; 2439 return hole_pages;
2440 } 2440 }
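
For instance, with active memory at PFNs [100, 200) and [300, 400), the range [0, 500) contains 100 + 100 + 100 = 300 absent pages: before, between, and after the active ranges. The userspace sketch below restates the same hole-counting walk in simplified form; it assumes the ranges are already sorted and non-overlapping, and the example ranges are invented.

#include <stdio.h>

struct range { unsigned long start, end; };

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

/* count PFNs in [range_start, range_end) not covered by any active range */
static unsigned long absent_pages(const struct range *map, int n,
				  unsigned long range_start, unsigned long range_end)
{
	unsigned long prev_end = range_start;
	unsigned long holes = 0;
	int i;

	for (i = 0; i < n; i++) {
		unsigned long start = max_ul(map[i].start, range_start);
		unsigned long end = min_ul(map[i].end, range_end);

		if (start >= range_end || end <= range_start)
			continue;
		if (start > prev_end)
			holes += start - prev_end;
		prev_end = max_ul(prev_end, end);
	}
	if (range_end > prev_end)
		holes += range_end - prev_end;
	return holes;
}

int main(void)
{
	/* active memory at [100,200) and [300,400); query [0,500) -> 300 holes */
	struct range map[] = { { 100, 200 }, { 300, 400 } };

	printf("%lu\n", absent_pages(map, 2, 0, 500));
	return 0;
}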
2441 2441
2442 /** 2442 /**
2443 * absent_pages_in_range - Return number of page frames in holes within a range 2443 * absent_pages_in_range - Return number of page frames in holes within a range
2444 * @start_pfn: The start PFN to start searching for holes 2444 * @start_pfn: The start PFN to start searching for holes
2445 * @end_pfn: The end PFN to stop searching for holes 2445 * @end_pfn: The end PFN to stop searching for holes
2446 * 2446 *
2447 * It returns the number of page frames in memory holes within a range. 2447 * It returns the number of page frames in memory holes within a range.
2448 */ 2448 */
2449 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 2449 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
2450 unsigned long end_pfn) 2450 unsigned long end_pfn)
2451 { 2451 {
2452 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 2452 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
2453 } 2453 }
2454 2454
2455 /* Return the number of page frames in holes in a zone on a node */ 2455 /* Return the number of page frames in holes in a zone on a node */
2456 unsigned long __init zone_absent_pages_in_node(int nid, 2456 unsigned long __init zone_absent_pages_in_node(int nid,
2457 unsigned long zone_type, 2457 unsigned long zone_type,
2458 unsigned long *ignored) 2458 unsigned long *ignored)
2459 { 2459 {
2460 unsigned long node_start_pfn, node_end_pfn; 2460 unsigned long node_start_pfn, node_end_pfn;
2461 unsigned long zone_start_pfn, zone_end_pfn; 2461 unsigned long zone_start_pfn, zone_end_pfn;
2462 2462
2463 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 2463 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2464 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], 2464 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
2465 node_start_pfn); 2465 node_start_pfn);
2466 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], 2466 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
2467 node_end_pfn); 2467 node_end_pfn);
2468 2468
2469 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 2469 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
2470 } 2470 }
2471 2471
2472 #else 2472 #else
2473 static inline unsigned long zone_spanned_pages_in_node(int nid, 2473 static inline unsigned long zone_spanned_pages_in_node(int nid,
2474 unsigned long zone_type, 2474 unsigned long zone_type,
2475 unsigned long *zones_size) 2475 unsigned long *zones_size)
2476 { 2476 {
2477 return zones_size[zone_type]; 2477 return zones_size[zone_type];
2478 } 2478 }
2479 2479
2480 static inline unsigned long zone_absent_pages_in_node(int nid, 2480 static inline unsigned long zone_absent_pages_in_node(int nid,
2481 unsigned long zone_type, 2481 unsigned long zone_type,
2482 unsigned long *zholes_size) 2482 unsigned long *zholes_size)
2483 { 2483 {
2484 if (!zholes_size) 2484 if (!zholes_size)
2485 return 0; 2485 return 0;
2486 2486
2487 return zholes_size[zone_type]; 2487 return zholes_size[zone_type];
2488 } 2488 }
2489 2489
2490 #endif 2490 #endif
2491 2491
2492 static void __init calculate_node_totalpages(struct pglist_data *pgdat, 2492 static void __init calculate_node_totalpages(struct pglist_data *pgdat,
2493 unsigned long *zones_size, unsigned long *zholes_size) 2493 unsigned long *zones_size, unsigned long *zholes_size)
2494 { 2494 {
2495 unsigned long realtotalpages, totalpages = 0; 2495 unsigned long realtotalpages, totalpages = 0;
2496 enum zone_type i; 2496 enum zone_type i;
2497 2497
2498 for (i = 0; i < MAX_NR_ZONES; i++) 2498 for (i = 0; i < MAX_NR_ZONES; i++)
2499 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 2499 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
2500 zones_size); 2500 zones_size);
2501 pgdat->node_spanned_pages = totalpages; 2501 pgdat->node_spanned_pages = totalpages;
2502 2502
2503 realtotalpages = totalpages; 2503 realtotalpages = totalpages;
2504 for (i = 0; i < MAX_NR_ZONES; i++) 2504 for (i = 0; i < MAX_NR_ZONES; i++)
2505 realtotalpages -= 2505 realtotalpages -=
2506 zone_absent_pages_in_node(pgdat->node_id, i, 2506 zone_absent_pages_in_node(pgdat->node_id, i,
2507 zholes_size); 2507 zholes_size);
2508 pgdat->node_present_pages = realtotalpages; 2508 pgdat->node_present_pages = realtotalpages;
2509 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 2509 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
2510 realtotalpages); 2510 realtotalpages);
2511 } 2511 }
2512 2512
2513 /* 2513 /*
2514 * Set up the zone data structures: 2514 * Set up the zone data structures:
2515 * - mark all pages reserved 2515 * - mark all pages reserved
2516 * - mark all memory queues empty 2516 * - mark all memory queues empty
2517 * - clear the memory bitmaps 2517 * - clear the memory bitmaps
2518 */ 2518 */
2519 static void __meminit free_area_init_core(struct pglist_data *pgdat, 2519 static void __meminit free_area_init_core(struct pglist_data *pgdat,
2520 unsigned long *zones_size, unsigned long *zholes_size) 2520 unsigned long *zones_size, unsigned long *zholes_size)
2521 { 2521 {
2522 enum zone_type j; 2522 enum zone_type j;
2523 int nid = pgdat->node_id; 2523 int nid = pgdat->node_id;
2524 unsigned long zone_start_pfn = pgdat->node_start_pfn; 2524 unsigned long zone_start_pfn = pgdat->node_start_pfn;
2525 int ret; 2525 int ret;
2526 2526
2527 pgdat_resize_init(pgdat); 2527 pgdat_resize_init(pgdat);
2528 pgdat->nr_zones = 0; 2528 pgdat->nr_zones = 0;
2529 init_waitqueue_head(&pgdat->kswapd_wait); 2529 init_waitqueue_head(&pgdat->kswapd_wait);
2530 pgdat->kswapd_max_order = 0; 2530 pgdat->kswapd_max_order = 0;
2531 2531
2532 for (j = 0; j < MAX_NR_ZONES; j++) { 2532 for (j = 0; j < MAX_NR_ZONES; j++) {
2533 struct zone *zone = pgdat->node_zones + j; 2533 struct zone *zone = pgdat->node_zones + j;
2534 unsigned long size, realsize, memmap_pages; 2534 unsigned long size, realsize, memmap_pages;
2535 2535
2536 size = zone_spanned_pages_in_node(nid, j, zones_size); 2536 size = zone_spanned_pages_in_node(nid, j, zones_size);
2537 realsize = size - zone_absent_pages_in_node(nid, j, 2537 realsize = size - zone_absent_pages_in_node(nid, j,
2538 zholes_size); 2538 zholes_size);
2539 2539
2540 /* 2540 /*
2541 * Adjust realsize so that it accounts for how much memory 2541 * Adjust realsize so that it accounts for how much memory
2542 * is used by this zone for memmap. This affects the watermark 2542 * is used by this zone for memmap. This affects the watermark
2543 * and per-cpu initialisations 2543 * and per-cpu initialisations
2544 */ 2544 */
2545 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; 2545 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
2546 if (realsize >= memmap_pages) { 2546 if (realsize >= memmap_pages) {
2547 realsize -= memmap_pages; 2547 realsize -= memmap_pages;
2548 printk(KERN_DEBUG 2548 printk(KERN_DEBUG
2549 " %s zone: %lu pages used for memmap\n", 2549 " %s zone: %lu pages used for memmap\n",
2550 zone_names[j], memmap_pages); 2550 zone_names[j], memmap_pages);
2551 } else 2551 } else
2552 printk(KERN_WARNING 2552 printk(KERN_WARNING
2553 " %s zone: %lu pages exceeds realsize %lu\n", 2553 " %s zone: %lu pages exceeds realsize %lu\n",
2554 zone_names[j], memmap_pages, realsize); 2554 zone_names[j], memmap_pages, realsize);
2555 2555
2556 /* Account for reserved DMA pages */ 2556 /* Account for reserved DMA pages */
2557 if (j == ZONE_DMA && realsize > dma_reserve) { 2557 if (j == ZONE_DMA && realsize > dma_reserve) {
2558 realsize -= dma_reserve; 2558 realsize -= dma_reserve;
2559 printk(KERN_DEBUG " DMA zone: %lu pages reserved\n", 2559 printk(KERN_DEBUG " DMA zone: %lu pages reserved\n",
2560 dma_reserve); 2560 dma_reserve);
2561 } 2561 }
2562 2562
2563 if (!is_highmem_idx(j)) 2563 if (!is_highmem_idx(j))
2564 nr_kernel_pages += realsize; 2564 nr_kernel_pages += realsize;
2565 nr_all_pages += realsize; 2565 nr_all_pages += realsize;
2566 2566
2567 zone->spanned_pages = size; 2567 zone->spanned_pages = size;
2568 zone->present_pages = realsize; 2568 zone->present_pages = realsize;
2569 #ifdef CONFIG_NUMA 2569 #ifdef CONFIG_NUMA
2570 zone->node = nid; 2570 zone->node = nid;
2571 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 2571 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
2572 / 100; 2572 / 100;
2573 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 2573 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
2574 #endif 2574 #endif
2575 zone->name = zone_names[j]; 2575 zone->name = zone_names[j];
2576 spin_lock_init(&zone->lock); 2576 spin_lock_init(&zone->lock);
2577 spin_lock_init(&zone->lru_lock); 2577 spin_lock_init(&zone->lru_lock);
2578 zone_seqlock_init(zone); 2578 zone_seqlock_init(zone);
2579 zone->zone_pgdat = pgdat; 2579 zone->zone_pgdat = pgdat;
2580 zone->free_pages = 0; 2580 zone->free_pages = 0;
2581 2581
2582 zone->prev_priority = DEF_PRIORITY; 2582 zone->prev_priority = DEF_PRIORITY;
2583 2583
2584 zone_pcp_init(zone); 2584 zone_pcp_init(zone);
2585 INIT_LIST_HEAD(&zone->active_list); 2585 INIT_LIST_HEAD(&zone->active_list);
2586 INIT_LIST_HEAD(&zone->inactive_list); 2586 INIT_LIST_HEAD(&zone->inactive_list);
2587 zone->nr_scan_active = 0; 2587 zone->nr_scan_active = 0;
2588 zone->nr_scan_inactive = 0; 2588 zone->nr_scan_inactive = 0;
2589 zone->nr_active = 0; 2589 zone->nr_active = 0;
2590 zone->nr_inactive = 0; 2590 zone->nr_inactive = 0;
2591 zap_zone_vm_stats(zone); 2591 zap_zone_vm_stats(zone);
2592 atomic_set(&zone->reclaim_in_progress, 0); 2592 atomic_set(&zone->reclaim_in_progress, 0);
2593 if (!size) 2593 if (!size)
2594 continue; 2594 continue;
2595 2595
2596 ret = init_currently_empty_zone(zone, zone_start_pfn, size); 2596 ret = init_currently_empty_zone(zone, zone_start_pfn, size);
2597 BUG_ON(ret); 2597 BUG_ON(ret);
2598 zone_start_pfn += size; 2598 zone_start_pfn += size;
2599 } 2599 }
2600 } 2600 }
2601 2601
2602 static void __init alloc_node_mem_map(struct pglist_data *pgdat) 2602 static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2603 { 2603 {
2604 /* Skip empty nodes */ 2604 /* Skip empty nodes */
2605 if (!pgdat->node_spanned_pages) 2605 if (!pgdat->node_spanned_pages)
2606 return; 2606 return;
2607 2607
2608 #ifdef CONFIG_FLAT_NODE_MEM_MAP 2608 #ifdef CONFIG_FLAT_NODE_MEM_MAP
2609 /* ia64 gets its own node_mem_map, before this, without bootmem */ 2609 /* ia64 gets its own node_mem_map, before this, without bootmem */
2610 if (!pgdat->node_mem_map) { 2610 if (!pgdat->node_mem_map) {
2611 unsigned long size, start, end; 2611 unsigned long size, start, end;
2612 struct page *map; 2612 struct page *map;
2613 2613
2614 /* 2614 /*
2615 * The zone's endpoints aren't required to be MAX_ORDER 2615 * The zone's endpoints aren't required to be MAX_ORDER
2616 * aligned but the node_mem_map endpoints must be in order 2616 * aligned but the node_mem_map endpoints must be in order
2617 * for the buddy allocator to function correctly. 2617 * for the buddy allocator to function correctly.
2618 */ 2618 */
2619 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 2619 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
2620 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 2620 end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
2621 end = ALIGN(end, MAX_ORDER_NR_PAGES); 2621 end = ALIGN(end, MAX_ORDER_NR_PAGES);
2622 size = (end - start) * sizeof(struct page); 2622 size = (end - start) * sizeof(struct page);
2623 map = alloc_remap(pgdat->node_id, size); 2623 map = alloc_remap(pgdat->node_id, size);
2624 if (!map) 2624 if (!map)
2625 map = alloc_bootmem_node(pgdat, size); 2625 map = alloc_bootmem_node(pgdat, size);
2626 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 2626 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
2627 } 2627 }
2628 #ifdef CONFIG_FLATMEM 2628 #ifdef CONFIG_FLATMEM
2629 /* 2629 /*
2630 * With no DISCONTIG, the global mem_map is just set as node 0's 2630 * With no DISCONTIG, the global mem_map is just set as node 0's
2631 */ 2631 */
2632 if (pgdat == NODE_DATA(0)) { 2632 if (pgdat == NODE_DATA(0)) {
2633 mem_map = NODE_DATA(0)->node_mem_map; 2633 mem_map = NODE_DATA(0)->node_mem_map;
2634 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2634 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2635 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 2635 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
2636 mem_map -= pgdat->node_start_pfn; 2636 mem_map -= pgdat->node_start_pfn;
2637 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 2637 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
2638 } 2638 }
2639 #endif 2639 #endif
2640 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2640 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
2641 } 2641 }
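
To put numbers on the alignment above: with MAX_ORDER_NR_PAGES = 1024, a node starting at PFN 1000 and spanning 5000 pages gets its mem_map rounded out to PFNs [0, 6144), so page structs are allocated for 6144 frames even though only [1000, 6000) belongs to the node, and pgdat->node_mem_map is then offset back to the node's first PFN. A userspace sketch of the rounding (the constants are example values):

#include <stdio.h>

#define MAX_ORDER_NR_PAGES 1024UL	/* example value */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long node_start_pfn = 1000, node_spanned_pages = 5000;
	unsigned long start = node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
	unsigned long end = ALIGN_UP(node_start_pfn + node_spanned_pages,
				     MAX_ORDER_NR_PAGES);

	printf("map covers PFNs [%lu, %lu), %lu entries\n",
	       start, end, end - start);	/* prints: [0, 6144), 6144 entries */
	return 0;
}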
2642 2642
2643 void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, 2643 void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2644 unsigned long *zones_size, unsigned long node_start_pfn, 2644 unsigned long *zones_size, unsigned long node_start_pfn,
2645 unsigned long *zholes_size) 2645 unsigned long *zholes_size)
2646 { 2646 {
2647 pgdat->node_id = nid; 2647 pgdat->node_id = nid;
2648 pgdat->node_start_pfn = node_start_pfn; 2648 pgdat->node_start_pfn = node_start_pfn;
2649 calculate_node_totalpages(pgdat, zones_size, zholes_size); 2649 calculate_node_totalpages(pgdat, zones_size, zholes_size);
2650 2650
2651 alloc_node_mem_map(pgdat); 2651 alloc_node_mem_map(pgdat);
2652 2652
2653 free_area_init_core(pgdat, zones_size, zholes_size); 2653 free_area_init_core(pgdat, zones_size, zholes_size);
2654 } 2654 }
2655 2655
2656 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2656 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2657 /** 2657 /**
2658 * add_active_range - Register a range of PFNs backed by physical memory 2658 * add_active_range - Register a range of PFNs backed by physical memory
2659 * @nid: The node ID the range resides on 2659 * @nid: The node ID the range resides on
2660 * @start_pfn: The start PFN of the available physical memory 2660 * @start_pfn: The start PFN of the available physical memory
2661 * @end_pfn: The end PFN of the available physical memory 2661 * @end_pfn: The end PFN of the available physical memory
2662 * 2662 *
2663 * These ranges are stored in an early_node_map[] and later used by 2663 * These ranges are stored in an early_node_map[] and later used by
2664 * free_area_init_nodes() to calculate zone sizes and holes. If the 2664 * free_area_init_nodes() to calculate zone sizes and holes. If the
2665 * range spans a memory hole, it is up to the architecture to ensure 2665 * range spans a memory hole, it is up to the architecture to ensure
2666 * the memory is not freed by the bootmem allocator. If possible 2666 * the memory is not freed by the bootmem allocator. If possible
2667 * the range being registered will be merged with existing ranges. 2667 * the range being registered will be merged with existing ranges.
2668 */ 2668 */
2669 void __init add_active_range(unsigned int nid, unsigned long start_pfn, 2669 void __init add_active_range(unsigned int nid, unsigned long start_pfn,
2670 unsigned long end_pfn) 2670 unsigned long end_pfn)
2671 { 2671 {
2672 int i; 2672 int i;
2673 2673
2674 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " 2674 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
2675 "%d entries of %d used\n", 2675 "%d entries of %d used\n",
2676 nid, start_pfn, end_pfn, 2676 nid, start_pfn, end_pfn,
2677 nr_nodemap_entries, MAX_ACTIVE_REGIONS); 2677 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
2678 2678
2679 /* Merge with existing active regions if possible */ 2679 /* Merge with existing active regions if possible */
2680 for (i = 0; i < nr_nodemap_entries; i++) { 2680 for (i = 0; i < nr_nodemap_entries; i++) {
2681 if (early_node_map[i].nid != nid) 2681 if (early_node_map[i].nid != nid)
2682 continue; 2682 continue;
2683 2683
2684 /* Skip if an existing region covers this new one */ 2684 /* Skip if an existing region covers this new one */
2685 if (start_pfn >= early_node_map[i].start_pfn && 2685 if (start_pfn >= early_node_map[i].start_pfn &&
2686 end_pfn <= early_node_map[i].end_pfn) 2686 end_pfn <= early_node_map[i].end_pfn)
2687 return; 2687 return;
2688 2688
2689 /* Merge forward if suitable */ 2689 /* Merge forward if suitable */
2690 if (start_pfn <= early_node_map[i].end_pfn && 2690 if (start_pfn <= early_node_map[i].end_pfn &&
2691 end_pfn > early_node_map[i].end_pfn) { 2691 end_pfn > early_node_map[i].end_pfn) {
2692 early_node_map[i].end_pfn = end_pfn; 2692 early_node_map[i].end_pfn = end_pfn;
2693 return; 2693 return;
2694 } 2694 }
2695 2695
2696 /* Merge backward if suitable */ 2696 /* Merge backward if suitable */
2697 if (start_pfn < early_node_map[i].end_pfn && 2697 if (start_pfn < early_node_map[i].end_pfn &&
2698 end_pfn >= early_node_map[i].start_pfn) { 2698 end_pfn >= early_node_map[i].start_pfn) {
2699 early_node_map[i].start_pfn = start_pfn; 2699 early_node_map[i].start_pfn = start_pfn;
2700 return; 2700 return;
2701 } 2701 }
2702 } 2702 }
2703 2703
2704 /* Check that early_node_map is large enough */ 2704 /* Check that early_node_map is large enough */
2705 if (i >= MAX_ACTIVE_REGIONS) { 2705 if (i >= MAX_ACTIVE_REGIONS) {
2706 printk(KERN_CRIT "More than %d memory regions, truncating\n", 2706 printk(KERN_CRIT "More than %d memory regions, truncating\n",
2707 MAX_ACTIVE_REGIONS); 2707 MAX_ACTIVE_REGIONS);
2708 return; 2708 return;
2709 } 2709 }
2710 2710
2711 early_node_map[i].nid = nid; 2711 early_node_map[i].nid = nid;
2712 early_node_map[i].start_pfn = start_pfn; 2712 early_node_map[i].start_pfn = start_pfn;
2713 early_node_map[i].end_pfn = end_pfn; 2713 early_node_map[i].end_pfn = end_pfn;
2714 nr_nodemap_entries = i + 1; 2714 nr_nodemap_entries = i + 1;
2715 } 2715 }
2716 2716
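As a usage sketch (the PFN values below are invented), an architecture's early boot code registers each usable RAM range per node and relies on the merge cases above when a later range abuts or overlaps an earlier one:

        add_active_range(0, 0x000, 0x0a0);      /* node 0: 0 - 640K */
        add_active_range(0, 0x100, 0x40000);    /* node 0: 1M - 1G  */
        /* Overlaps the second entry, so it is merged forward: that entry's
         * end_pfn simply becomes 0x48000 and no new slot is consumed.     */
        add_active_range(0, 0x3f000, 0x48000);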
2717 /** 2717 /**
2718 * shrink_active_range - Shrink an existing registered range of PFNs 2718 * shrink_active_range - Shrink an existing registered range of PFNs
2719 * @nid: The node id the range is on that should be shrunk 2719 * @nid: The node id the range is on that should be shrunk
2720 * @old_end_pfn: The old end PFN of the range 2720 * @old_end_pfn: The old end PFN of the range
2721 * @new_end_pfn: The new end PFN of the range 2721 * @new_end_pfn: The new end PFN of the range
2722 * 2722 *
2723 * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node. 2723
2724 * The map is kept at the end of the physical page range that has already been 2724 * The map is kept at the end of the physical page range that has already been
2725 * registered with add_active_range(). This function allows an arch to shrink 2725 * registered with add_active_range(). This function allows an arch to shrink
2726 * an existing registered range. 2726 * an existing registered range.
2727 */ 2727 */
2728 void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, 2728 void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
2729 unsigned long new_end_pfn) 2729 unsigned long new_end_pfn)
2730 { 2730 {
2731 int i; 2731 int i;
2732 2732
2733 /* Find the old active region end and shrink */ 2733 /* Find the old active region end and shrink */
2734 for_each_active_range_index_in_nid(i, nid) 2734 for_each_active_range_index_in_nid(i, nid)
2735 if (early_node_map[i].end_pfn == old_end_pfn) { 2735 if (early_node_map[i].end_pfn == old_end_pfn) {
2736 early_node_map[i].end_pfn = new_end_pfn; 2736 early_node_map[i].end_pfn = new_end_pfn;
2737 break; 2737 break;
2738 } 2738 }
2739 } 2739 }
2740 2740
2741 /** 2741 /**
2742 * remove_all_active_ranges - Remove all currently registered regions 2742 * remove_all_active_ranges - Remove all currently registered regions
2743 * 2743 *
2744 * During discovery, it may be found that a table like SRAT is invalid 2744 * During discovery, it may be found that a table like SRAT is invalid
2745 * and an alternative discovery method must be used. This function removes 2745 * and an alternative discovery method must be used. This function removes
2746 * all currently registered regions. 2746 * all currently registered regions.
2747 */ 2747 */
2748 void __init remove_all_active_ranges(void) 2748 void __init remove_all_active_ranges(void)
2749 { 2749 {
2750 memset(early_node_map, 0, sizeof(early_node_map)); 2750 memset(early_node_map, 0, sizeof(early_node_map));
2751 nr_nodemap_entries = 0; 2751 nr_nodemap_entries = 0;
2752 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 2752 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2753 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); 2753 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
2754 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); 2754 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
2755 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 2755 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
2756 } 2756 }
2757 2757
2758 /* Compare two active node_active_regions */ 2758 /* Compare two active node_active_regions */
2759 static int __init cmp_node_active_region(const void *a, const void *b) 2759 static int __init cmp_node_active_region(const void *a, const void *b)
2760 { 2760 {
2761 struct node_active_region *arange = (struct node_active_region *)a; 2761 struct node_active_region *arange = (struct node_active_region *)a;
2762 struct node_active_region *brange = (struct node_active_region *)b; 2762 struct node_active_region *brange = (struct node_active_region *)b;
2763 2763
2764 /* Done this way to avoid overflows */ 2764 /* Done this way to avoid overflows */
2765 if (arange->start_pfn > brange->start_pfn) 2765 if (arange->start_pfn > brange->start_pfn)
2766 return 1; 2766 return 1;
2767 if (arange->start_pfn < brange->start_pfn) 2767 if (arange->start_pfn < brange->start_pfn)
2768 return -1; 2768 return -1;
2769 2769
2770 return 0; 2770 return 0;
2771 } 2771 }
2772 2772
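The "avoid overflows" comment above refers to the usual shortcut of returning the difference of the two keys from a comparator. A sketch of why that breaks for PFNs, assuming 64-bit unsigned long and 32-bit int (the common LP64 case):

        unsigned long a = 0x100000000UL, b = 0;
        int bogus = a - b;      /* truncates to 0 on LP64, so the sort would
                                 * treat a and b as equal even though a > b */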
2773 /* sort the node_map by start_pfn */ 2773 /* sort the node_map by start_pfn */
2774 static void __init sort_node_map(void) 2774 static void __init sort_node_map(void)
2775 { 2775 {
2776 sort(early_node_map, (size_t)nr_nodemap_entries, 2776 sort(early_node_map, (size_t)nr_nodemap_entries,
2777 sizeof(struct node_active_region), 2777 sizeof(struct node_active_region),
2778 cmp_node_active_region, NULL); 2778 cmp_node_active_region, NULL);
2779 } 2779 }
2780 2780
2781 /* Find the lowest pfn for a node. This depends on a sorted early_node_map */ 2781 /* Find the lowest pfn for a node. This depends on a sorted early_node_map */
2782 unsigned long __init find_min_pfn_for_node(unsigned long nid) 2782 unsigned long __init find_min_pfn_for_node(unsigned long nid)
2783 { 2783 {
2784 int i; 2784 int i;
2785 2785
2786 /* Regions in the early_node_map can be in any order */ 2786 /* Regions in the early_node_map can be in any order */
2787 sort_node_map(); 2787 sort_node_map();
2788 2788
2789 /* Assuming a sorted map, the first range found has the starting pfn */ 2789 /* Assuming a sorted map, the first range found has the starting pfn */
2790 for_each_active_range_index_in_nid(i, nid) 2790 for_each_active_range_index_in_nid(i, nid)
2791 return early_node_map[i].start_pfn; 2791 return early_node_map[i].start_pfn;
2792 2792
2793 printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid); 2793 printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
2794 return 0; 2794 return 0;
2795 } 2795 }
2796 2796
2797 /** 2797 /**
2798 * find_min_pfn_with_active_regions - Find the minimum PFN registered 2798 * find_min_pfn_with_active_regions - Find the minimum PFN registered
2799 * 2799 *
2800 * It returns the minimum PFN based on information provided via 2800 * It returns the minimum PFN based on information provided via
2801 * add_active_range(). 2801 * add_active_range().
2802 */ 2802 */
2803 unsigned long __init find_min_pfn_with_active_regions(void) 2803 unsigned long __init find_min_pfn_with_active_regions(void)
2804 { 2804 {
2805 return find_min_pfn_for_node(MAX_NUMNODES); 2805 return find_min_pfn_for_node(MAX_NUMNODES);
2806 } 2806 }
2807 2807
2808 /** 2808 /**
2809 * find_max_pfn_with_active_regions - Find the maximum PFN registered 2809 * find_max_pfn_with_active_regions - Find the maximum PFN registered
2810 * 2810 *
2811 * It returns the maximum PFN based on information provided via 2811 * It returns the maximum PFN based on information provided via
2812 * add_active_range(). 2812 * add_active_range().
2813 */ 2813 */
2814 unsigned long __init find_max_pfn_with_active_regions(void) 2814 unsigned long __init find_max_pfn_with_active_regions(void)
2815 { 2815 {
2816 int i; 2816 int i;
2817 unsigned long max_pfn = 0; 2817 unsigned long max_pfn = 0;
2818 2818
2819 for (i = 0; i < nr_nodemap_entries; i++) 2819 for (i = 0; i < nr_nodemap_entries; i++)
2820 max_pfn = max(max_pfn, early_node_map[i].end_pfn); 2820 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
2821 2821
2822 return max_pfn; 2822 return max_pfn;
2823 } 2823 }
2824 2824
2825 /** 2825 /**
2826 * free_area_init_nodes - Initialise all pg_data_t and zone data 2826 * free_area_init_nodes - Initialise all pg_data_t and zone data
2827 * @max_zone_pfn: an array of max PFNs for each zone 2827 * @max_zone_pfn: an array of max PFNs for each zone
2828 * 2828 *
2829 * This will call free_area_init_node() for each active node in the system. 2829 * This will call free_area_init_node() for each active node in the system.
2830 * Using the page ranges provided by add_active_range(), the size of each 2830 * Using the page ranges provided by add_active_range(), the size of each
2831 * zone in each node and their holes are calculated. If the maximum PFNs 2831 * zone in each node and their holes are calculated. If the maximum PFNs
2832 * of two adjacent zones match, the higher zone is assumed to be empty. 2832 * of two adjacent zones match, the higher zone is assumed to be empty.
2833 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 2833 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
2834 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 2834 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
2835 * starts where the previous one ended. For example, ZONE_DMA32 starts 2835 * starts where the previous one ended. For example, ZONE_DMA32 starts
2836 * at arch_max_dma_pfn. 2836 * at arch_max_dma_pfn.
2837 */ 2837 */
2838 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 2838 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
2839 { 2839 {
2840 unsigned long nid; 2840 unsigned long nid;
2841 enum zone_type i; 2841 enum zone_type i;
2842 2842
2843 /* Record where the zone boundaries are */ 2843 /* Record where the zone boundaries are */
2844 memset(arch_zone_lowest_possible_pfn, 0, 2844 memset(arch_zone_lowest_possible_pfn, 0,
2845 sizeof(arch_zone_lowest_possible_pfn)); 2845 sizeof(arch_zone_lowest_possible_pfn));
2846 memset(arch_zone_highest_possible_pfn, 0, 2846 memset(arch_zone_highest_possible_pfn, 0,
2847 sizeof(arch_zone_highest_possible_pfn)); 2847 sizeof(arch_zone_highest_possible_pfn));
2848 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 2848 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
2849 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 2849 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
2850 for (i = 1; i < MAX_NR_ZONES; i++) { 2850 for (i = 1; i < MAX_NR_ZONES; i++) {
2851 arch_zone_lowest_possible_pfn[i] = 2851 arch_zone_lowest_possible_pfn[i] =
2852 arch_zone_highest_possible_pfn[i-1]; 2852 arch_zone_highest_possible_pfn[i-1];
2853 arch_zone_highest_possible_pfn[i] = 2853 arch_zone_highest_possible_pfn[i] =
2854 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 2854 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
2855 } 2855 }
2856 2856
2857 /* Print out the zone ranges */ 2857 /* Print out the zone ranges */
2858 printk("Zone PFN ranges:\n"); 2858 printk("Zone PFN ranges:\n");
2859 for (i = 0; i < MAX_NR_ZONES; i++) 2859 for (i = 0; i < MAX_NR_ZONES; i++)
2860 printk(" %-8s %8lu -> %8lu\n", 2860 printk(" %-8s %8lu -> %8lu\n",
2861 zone_names[i], 2861 zone_names[i],
2862 arch_zone_lowest_possible_pfn[i], 2862 arch_zone_lowest_possible_pfn[i],
2863 arch_zone_highest_possible_pfn[i]); 2863 arch_zone_highest_possible_pfn[i]);
2864 2864
2865 /* Print out the early_node_map[] */ 2865 /* Print out the early_node_map[] */
2866 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); 2866 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
2867 for (i = 0; i < nr_nodemap_entries; i++) 2867 for (i = 0; i < nr_nodemap_entries; i++)
2868 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, 2868 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
2869 early_node_map[i].start_pfn, 2869 early_node_map[i].start_pfn,
2870 early_node_map[i].end_pfn); 2870 early_node_map[i].end_pfn);
2871 2871
2872 /* Initialise every node */ 2872 /* Initialise every node */
2873 for_each_online_node(nid) { 2873 for_each_online_node(nid) {
2874 pg_data_t *pgdat = NODE_DATA(nid); 2874 pg_data_t *pgdat = NODE_DATA(nid);
2875 free_area_init_node(nid, pgdat, NULL, 2875 free_area_init_node(nid, pgdat, NULL,
2876 find_min_pfn_for_node(nid), NULL); 2876 find_min_pfn_for_node(nid), NULL);
2877 } 2877 }
2878 } 2878 }
2879 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 2879 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
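For context, this is roughly how an architecture of this era (x86_64's paging_init(), for instance) hands its zone limits in; end_pfn, MAX_DMA_PFN and MAX_DMA32_PFN are that architecture's own symbols and the snippet is only a sketch:

        unsigned long max_zone_pfns[MAX_NR_ZONES];

        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA]    = MAX_DMA_PFN;       /* first 16MB     */
        max_zone_pfns[ZONE_DMA32]  = MAX_DMA32_PFN;     /* first 4GB      */
        max_zone_pfns[ZONE_NORMAL] = end_pfn;           /* everything else */
        free_area_init_nodes(max_zone_pfns);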
2880 2880
2881 /** 2881 /**
2882 * set_dma_reserve - set the specified number of pages reserved in the first zone 2882 * set_dma_reserve - set the specified number of pages reserved in the first zone
2883 * @new_dma_reserve: The number of pages to mark reserved 2883 * @new_dma_reserve: The number of pages to mark reserved
2884 * 2884 *
2885 * The per-cpu batchsize and zone watermarks are determined by present_pages. 2885 * The per-cpu batchsize and zone watermarks are determined by present_pages.
2886 * In the DMA zone, a significant percentage may be consumed by the kernel image 2886 * In the DMA zone, a significant percentage may be consumed by the kernel image
2887 * and other unfreeable allocations which can skew the watermarks badly. This 2887 * and other unfreeable allocations which can skew the watermarks badly. This
2888 * function may optionally be used to account for unfreeable pages in the 2888 * function may optionally be used to account for unfreeable pages in the
2889 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 2889 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
2890 * smaller per-cpu batchsize. 2890 * smaller per-cpu batchsize.
2891 */ 2891 */
2892 void __init set_dma_reserve(unsigned long new_dma_reserve) 2892 void __init set_dma_reserve(unsigned long new_dma_reserve)
2893 { 2893 {
2894 dma_reserve = new_dma_reserve; 2894 dma_reserve = new_dma_reserve;
2895 } 2895 }
2896 2896
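A hypothetical caller, with made-up numbers: if an arch knows that about 2MB of ZONE_DMA is pinned by the kernel image and bootmem bitmaps, it can report those pages so the watermark and per-cpu batch math is based on what is actually freeable:

        /* 2MB of unfreeable ZONE_DMA pages, assuming 4K pages (512 pages). */
        set_dma_reserve(2UL << (20 - PAGE_SHIFT));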
2897 #ifndef CONFIG_NEED_MULTIPLE_NODES 2897 #ifndef CONFIG_NEED_MULTIPLE_NODES
2898 static bootmem_data_t contig_bootmem_data; 2898 static bootmem_data_t contig_bootmem_data;
2899 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 2899 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
2900 2900
2901 EXPORT_SYMBOL(contig_page_data); 2901 EXPORT_SYMBOL(contig_page_data);
2902 #endif 2902 #endif
2903 2903
2904 void __init free_area_init(unsigned long *zones_size) 2904 void __init free_area_init(unsigned long *zones_size)
2905 { 2905 {
2906 free_area_init_node(0, NODE_DATA(0), zones_size, 2906 free_area_init_node(0, NODE_DATA(0), zones_size,
2907 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 2907 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
2908 } 2908 }
2909 2909
2910 #ifdef CONFIG_HOTPLUG_CPU 2910 #ifdef CONFIG_HOTPLUG_CPU
2911 static int page_alloc_cpu_notify(struct notifier_block *self, 2911 static int page_alloc_cpu_notify(struct notifier_block *self,
2912 unsigned long action, void *hcpu) 2912 unsigned long action, void *hcpu)
2913 { 2913 {
2914 int cpu = (unsigned long)hcpu; 2914 int cpu = (unsigned long)hcpu;
2915 2915
2916 if (action == CPU_DEAD) { 2916 if (action == CPU_DEAD) {
2917 local_irq_disable(); 2917 local_irq_disable();
2918 __drain_pages(cpu); 2918 __drain_pages(cpu);
2919 vm_events_fold_cpu(cpu); 2919 vm_events_fold_cpu(cpu);
2920 local_irq_enable(); 2920 local_irq_enable();
2921 refresh_cpu_vm_stats(cpu); 2921 refresh_cpu_vm_stats(cpu);
2922 } 2922 }
2923 return NOTIFY_OK; 2923 return NOTIFY_OK;
2924 } 2924 }
2925 #endif /* CONFIG_HOTPLUG_CPU */ 2925 #endif /* CONFIG_HOTPLUG_CPU */
2926 2926
2927 void __init page_alloc_init(void) 2927 void __init page_alloc_init(void)
2928 { 2928 {
2929 hotcpu_notifier(page_alloc_cpu_notify, 0); 2929 hotcpu_notifier(page_alloc_cpu_notify, 0);
2930 } 2930 }
2931 2931
2932 /* 2932 /*
2933 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio 2933 * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
2934 * or min_free_kbytes changes. 2934 * or min_free_kbytes changes.
2935 */ 2935 */
2936 static void calculate_totalreserve_pages(void) 2936 static void calculate_totalreserve_pages(void)
2937 { 2937 {
2938 struct pglist_data *pgdat; 2938 struct pglist_data *pgdat;
2939 unsigned long reserve_pages = 0; 2939 unsigned long reserve_pages = 0;
2940 enum zone_type i, j; 2940 enum zone_type i, j;
2941 2941
2942 for_each_online_pgdat(pgdat) { 2942 for_each_online_pgdat(pgdat) {
2943 for (i = 0; i < MAX_NR_ZONES; i++) { 2943 for (i = 0; i < MAX_NR_ZONES; i++) {
2944 struct zone *zone = pgdat->node_zones + i; 2944 struct zone *zone = pgdat->node_zones + i;
2945 unsigned long max = 0; 2945 unsigned long max = 0;
2946 2946
2947 /* Find valid and maximum lowmem_reserve in the zone */ 2947 /* Find valid and maximum lowmem_reserve in the zone */
2948 for (j = i; j < MAX_NR_ZONES; j++) { 2948 for (j = i; j < MAX_NR_ZONES; j++) {
2949 if (zone->lowmem_reserve[j] > max) 2949 if (zone->lowmem_reserve[j] > max)
2950 max = zone->lowmem_reserve[j]; 2950 max = zone->lowmem_reserve[j];
2951 } 2951 }
2952 2952
2953 /* we treat pages_high as reserved pages. */ 2953 /* we treat pages_high as reserved pages. */
2954 max += zone->pages_high; 2954 max += zone->pages_high;
2955 2955
2956 if (max > zone->present_pages) 2956 if (max > zone->present_pages)
2957 max = zone->present_pages; 2957 max = zone->present_pages;
2958 reserve_pages += max; 2958 reserve_pages += max;
2959 } 2959 }
2960 } 2960 }
2961 totalreserve_pages = reserve_pages; 2961 totalreserve_pages = reserve_pages;
2962 } 2962 }
2963 2963
2964 /* 2964 /*
2965 * setup_per_zone_lowmem_reserve - called whenever 2965 * setup_per_zone_lowmem_reserve - called whenever
2966 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone 2966 * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
2967 * has a correct lowmem_reserve[] value, so an adequate number of 2967 * has a correct lowmem_reserve[] value, so an adequate number of
2968 * pages are left in the zone after a successful __alloc_pages(). 2968 * pages are left in the zone after a successful __alloc_pages().
2969 */ 2969 */
2970 static void setup_per_zone_lowmem_reserve(void) 2970 static void setup_per_zone_lowmem_reserve(void)
2971 { 2971 {
2972 struct pglist_data *pgdat; 2972 struct pglist_data *pgdat;
2973 enum zone_type j, idx; 2973 enum zone_type j, idx;
2974 2974
2975 for_each_online_pgdat(pgdat) { 2975 for_each_online_pgdat(pgdat) {
2976 for (j = 0; j < MAX_NR_ZONES; j++) { 2976 for (j = 0; j < MAX_NR_ZONES; j++) {
2977 struct zone *zone = pgdat->node_zones + j; 2977 struct zone *zone = pgdat->node_zones + j;
2978 unsigned long present_pages = zone->present_pages; 2978 unsigned long present_pages = zone->present_pages;
2979 2979
2980 zone->lowmem_reserve[j] = 0; 2980 zone->lowmem_reserve[j] = 0;
2981 2981
2982 idx = j; 2982 idx = j;
2983 while (idx) { 2983 while (idx) {
2984 struct zone *lower_zone; 2984 struct zone *lower_zone;
2985 2985
2986 idx--; 2986 idx--;
2987 2987
2988 if (sysctl_lowmem_reserve_ratio[idx] < 1) 2988 if (sysctl_lowmem_reserve_ratio[idx] < 1)
2989 sysctl_lowmem_reserve_ratio[idx] = 1; 2989 sysctl_lowmem_reserve_ratio[idx] = 1;
2990 2990
2991 lower_zone = pgdat->node_zones + idx; 2991 lower_zone = pgdat->node_zones + idx;
2992 lower_zone->lowmem_reserve[j] = present_pages / 2992 lower_zone->lowmem_reserve[j] = present_pages /
2993 sysctl_lowmem_reserve_ratio[idx]; 2993 sysctl_lowmem_reserve_ratio[idx];
2994 present_pages += lower_zone->present_pages; 2994 present_pages += lower_zone->present_pages;
2995 } 2995 }
2996 } 2996 }
2997 } 2997 }
2998 2998
2999 /* update totalreserve_pages */ 2999 /* update totalreserve_pages */
3000 calculate_totalreserve_pages(); 3000 calculate_totalreserve_pages();
3001 } 3001 }
3002 3002
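A worked example of the loop above, using assumed zone sizes and the default reserve ratio of 256 for ZONE_DMA:

        /*
         * ZONE_DMA: 4096 present pages, ZONE_NORMAL: 225280 present pages.
         * When j == ZONE_NORMAL and idx has dropped to ZONE_DMA:
         *
         *     DMA->lowmem_reserve[ZONE_NORMAL] = 225280 / 256 = 880 pages
         *
         * so a ZONE_NORMAL-class allocation may only fall back into ZONE_DMA
         * while DMA keeps roughly 880 pages (~3.4MB with 4K pages) free above
         * its watermark for DMA-only users.
         */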
3003 /** 3003 /**
3004 * setup_per_zone_pages_min - called when min_free_kbytes changes. 3004 * setup_per_zone_pages_min - called when min_free_kbytes changes.
3005 * 3005 *
3006 * Ensures that the pages_{min,low,high} values for each zone are set correctly 3006 * Ensures that the pages_{min,low,high} values for each zone are set correctly
3007 * with respect to min_free_kbytes. 3007 * with respect to min_free_kbytes.
3008 */ 3008 */
3009 void setup_per_zone_pages_min(void) 3009 void setup_per_zone_pages_min(void)
3010 { 3010 {
3011 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 3011 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
3012 unsigned long lowmem_pages = 0; 3012 unsigned long lowmem_pages = 0;
3013 struct zone *zone; 3013 struct zone *zone;
3014 unsigned long flags; 3014 unsigned long flags;
3015 3015
3016 /* Calculate total number of !ZONE_HIGHMEM pages */ 3016 /* Calculate total number of !ZONE_HIGHMEM pages */
3017 for_each_zone(zone) { 3017 for_each_zone(zone) {
3018 if (!is_highmem(zone)) 3018 if (!is_highmem(zone))
3019 lowmem_pages += zone->present_pages; 3019 lowmem_pages += zone->present_pages;
3020 } 3020 }
3021 3021
3022 for_each_zone(zone) { 3022 for_each_zone(zone) {
3023 u64 tmp; 3023 u64 tmp;
3024 3024
3025 spin_lock_irqsave(&zone->lru_lock, flags); 3025 spin_lock_irqsave(&zone->lru_lock, flags);
3026 tmp = (u64)pages_min * zone->present_pages; 3026 tmp = (u64)pages_min * zone->present_pages;
3027 do_div(tmp, lowmem_pages); 3027 do_div(tmp, lowmem_pages);
3028 if (is_highmem(zone)) { 3028 if (is_highmem(zone)) {
3029 /* 3029 /*
3030 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 3030 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
3031 * need highmem pages, so cap pages_min to a small 3031 * need highmem pages, so cap pages_min to a small
3032 * value here. 3032 * value here.
3033 * 3033 *
3034 * The (pages_high-pages_low) and (pages_low-pages_min) 3034 * The (pages_high-pages_low) and (pages_low-pages_min)
3035 * deltas control async page reclaim, and so should 3035 * deltas control async page reclaim, and so should
3036 * not be capped for highmem. 3036 * not be capped for highmem.
3037 */ 3037 */
3038 int min_pages; 3038 int min_pages;
3039 3039
3040 min_pages = zone->present_pages / 1024; 3040 min_pages = zone->present_pages / 1024;
3041 if (min_pages < SWAP_CLUSTER_MAX) 3041 if (min_pages < SWAP_CLUSTER_MAX)
3042 min_pages = SWAP_CLUSTER_MAX; 3042 min_pages = SWAP_CLUSTER_MAX;
3043 if (min_pages > 128) 3043 if (min_pages > 128)
3044 min_pages = 128; 3044 min_pages = 128;
3045 zone->pages_min = min_pages; 3045 zone->pages_min = min_pages;
3046 } else { 3046 } else {
3047 /* 3047 /*
3048 * If it's a lowmem zone, reserve a number of pages 3048 * If it's a lowmem zone, reserve a number of pages
3049 * proportionate to the zone's size. 3049 * proportionate to the zone's size.
3050 */ 3050 */
3051 zone->pages_min = tmp; 3051 zone->pages_min = tmp;
3052 } 3052 }
3053 3053
3054 zone->pages_low = zone->pages_min + (tmp >> 2); 3054 zone->pages_low = zone->pages_min + (tmp >> 2);
3055 zone->pages_high = zone->pages_min + (tmp >> 1); 3055 zone->pages_high = zone->pages_min + (tmp >> 1);
3056 spin_unlock_irqrestore(&zone->lru_lock, flags); 3056 spin_unlock_irqrestore(&zone->lru_lock, flags);
3057 } 3057 }
3058 3058
3059 /* update totalreserve_pages */ 3059 /* update totalreserve_pages */
3060 calculate_totalreserve_pages(); 3060 calculate_totalreserve_pages();
3061 } 3061 }
3062 3062
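To make the watermark arithmetic concrete, a worked example with assumed values (min_free_kbytes = 4096, 4K pages, and a lowmem zone holding half of lowmem_pages):

        /*
         * pages_min (global) = 4096 >> (12 - 10) = 1024 pages
         * tmp for this zone  = 1024 * present / lowmem_pages = 512
         *
         *     zone->pages_min  = 512
         *     zone->pages_low  = 512 + 512/4 = 640
         *     zone->pages_high = 512 + 512/2 = 768
         *
         * A highmem zone ignores tmp for pages_min and clamps it to the
         * [SWAP_CLUSTER_MAX, 128] range instead, but still derives
         * pages_low/pages_high from tmp.
         */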
3063 /* 3063 /*
3064 * Initialise min_free_kbytes. 3064 * Initialise min_free_kbytes.
3065 * 3065 *
3066 * For small machines we want it small (128k min). For large machines 3066 * For small machines we want it small (128k min). For large machines
3067 * we want it large (64MB max). But it is not linear, because network 3067 * we want it large (64MB max). But it is not linear, because network
3068 * bandwidth does not increase linearly with machine size. We use 3068 * bandwidth does not increase linearly with machine size. We use
3069 * 3069 *
3070 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 3070 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
3071 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 3071 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
3072 * 3072 *
3073 * which yields 3073 * which yields
3074 * 3074 *
3075 * 16MB: 512k 3075 * 16MB: 512k
3076 * 32MB: 724k 3076 * 32MB: 724k
3077 * 64MB: 1024k 3077 * 64MB: 1024k
3078 * 128MB: 1448k 3078 * 128MB: 1448k
3079 * 256MB: 2048k 3079 * 256MB: 2048k
3080 * 512MB: 2896k 3080 * 512MB: 2896k
3081 * 1024MB: 4096k 3081 * 1024MB: 4096k
3082 * 2048MB: 5792k 3082 * 2048MB: 5792k
3083 * 4096MB: 8192k 3083 * 4096MB: 8192k
3084 * 8192MB: 11584k 3084 * 8192MB: 11584k
3085 * 16384MB: 16384k 3085 * 16384MB: 16384k
3086 */ 3086 */
3087 static int __init init_per_zone_pages_min(void) 3087 static int __init init_per_zone_pages_min(void)
3088 { 3088 {
3089 unsigned long lowmem_kbytes; 3089 unsigned long lowmem_kbytes;
3090 3090
3091 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 3091 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
3092 3092
3093 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 3093 min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
3094 if (min_free_kbytes < 128) 3094 if (min_free_kbytes < 128)
3095 min_free_kbytes = 128; 3095 min_free_kbytes = 128;
3096 if (min_free_kbytes > 65536) 3096 if (min_free_kbytes > 65536)
3097 min_free_kbytes = 65536; 3097 min_free_kbytes = 65536;
3098 setup_per_zone_pages_min(); 3098 setup_per_zone_pages_min();
3099 setup_per_zone_lowmem_reserve(); 3099 setup_per_zone_lowmem_reserve();
3100 return 0; 3100 return 0;
3101 } 3101 }
3102 module_init(init_per_zone_pages_min) 3102 module_init(init_per_zone_pages_min)
3103 3103
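Two rows of the table above, checked against the int_sqrt() form used by init_per_zone_pages_min() (4K pages, lowmem only, values shown before the 128k/65536k clamp):

        /*
         *   16MB lowmem: lowmem_kbytes = 16384
         *                int_sqrt(16384 * 16)    = int_sqrt(262144)    = 512k
         *   16GB lowmem: lowmem_kbytes = 16777216
         *                int_sqrt(16777216 * 16) = int_sqrt(268435456) = 16384k
         */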
3104 /* 3104 /*
3105 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 3105 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
3106 * that we can call two helper functions whenever min_free_kbytes 3106 * that we can call two helper functions whenever min_free_kbytes
3107 * changes. 3107 * changes.
3108 */ 3108 */
3109 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 3109 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
3110 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3110 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3111 { 3111 {
3112 proc_dointvec(table, write, file, buffer, length, ppos); 3112 proc_dointvec(table, write, file, buffer, length, ppos);
3113 setup_per_zone_pages_min(); 3113 setup_per_zone_pages_min();
3114 return 0; 3114 return 0;
3115 } 3115 }
3116 3116
3117 #ifdef CONFIG_NUMA 3117 #ifdef CONFIG_NUMA
3118 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 3118 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
3119 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3119 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3120 { 3120 {
3121 struct zone *zone; 3121 struct zone *zone;
3122 int rc; 3122 int rc;
3123 3123
3124 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3124 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3125 if (rc) 3125 if (rc)
3126 return rc; 3126 return rc;
3127 3127
3128 for_each_zone(zone) 3128 for_each_zone(zone)
3129 zone->min_unmapped_pages = (zone->present_pages * 3129 zone->min_unmapped_pages = (zone->present_pages *
3130 sysctl_min_unmapped_ratio) / 100; 3130 sysctl_min_unmapped_ratio) / 100;
3131 return 0; 3131 return 0;
3132 } 3132 }
3133 3133
3134 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 3134 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
3135 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3135 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3136 { 3136 {
3137 struct zone *zone; 3137 struct zone *zone;
3138 int rc; 3138 int rc;
3139 3139
3140 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3140 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3141 if (rc) 3141 if (rc)
3142 return rc; 3142 return rc;
3143 3143
3144 for_each_zone(zone) 3144 for_each_zone(zone)
3145 zone->min_slab_pages = (zone->present_pages * 3145 zone->min_slab_pages = (zone->present_pages *
3146 sysctl_min_slab_ratio) / 100; 3146 sysctl_min_slab_ratio) / 100;
3147 return 0; 3147 return 0;
3148 } 3148 }
3149 #endif 3149 #endif
3150 3150
3151 /* 3151 /*
3152 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 3152 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
3153 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 3153 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
3154 * whenever sysctl_lowmem_reserve_ratio changes. 3154 * whenever sysctl_lowmem_reserve_ratio changes.
3155 * 3155 *
3156 * The reserve ratio has no relation to the 3156 * The reserve ratio has no relation to the
3157 * pages_min watermarks. The lowmem reserve ratio is only meaningful 3157 * pages_min watermarks. The lowmem reserve ratio is only meaningful
3158 * as a function of the boot-time zone sizes. 3158 * as a function of the boot-time zone sizes.
3159 */ 3159 */
3160 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 3160 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
3161 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3161 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3162 { 3162 {
3163 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3163 proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3164 setup_per_zone_lowmem_reserve(); 3164 setup_per_zone_lowmem_reserve();
3165 return 0; 3165 return 0;
3166 } 3166 }
3167 3167
3168 /* 3168 /*
3169 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 3169 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
3170 * cpu. It is the fraction of total pages in each zone that a hot per-cpu pagelist 3170 * cpu. It is the fraction of total pages in each zone that a hot per-cpu pagelist
3171 * can hold before it gets flushed back to the buddy allocator. 3171 * can hold before it gets flushed back to the buddy allocator.
3172 */ 3172 */
3173 3173
3174 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 3174 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
3175 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3175 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3176 { 3176 {
3177 struct zone *zone; 3177 struct zone *zone;
3178 unsigned int cpu; 3178 unsigned int cpu;
3179 int ret; 3179 int ret;
3180 3180
3181 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3181 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3182 if (!write || (ret == -EINVAL)) 3182 if (!write || (ret == -EINVAL))
3183 return ret; 3183 return ret;
3184 for_each_zone(zone) { 3184 for_each_zone(zone) {
3185 for_each_online_cpu(cpu) { 3185 for_each_online_cpu(cpu) {
3186 unsigned long high; 3186 unsigned long high;
3187 high = zone->present_pages / percpu_pagelist_fraction; 3187 high = zone->present_pages / percpu_pagelist_fraction;
3188 setup_pagelist_highmark(zone_pcp(zone, cpu), high); 3188 setup_pagelist_highmark(zone_pcp(zone, cpu), high);
3189 } 3189 }
3190 } 3190 }
3191 return 0; 3191 return 0;
3192 } 3192 }
3193 3193
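A worked example of the handler above, with assumed sizes: writing 8 to /proc/sys/vm/percpu_pagelist_fraction on a machine whose ZONE_NORMAL has 262144 present pages (1GB of 4K pages) gives each CPU's hot pagelist for that zone a high mark of:

        /* high = zone->present_pages / percpu_pagelist_fraction
         *      = 262144 / 8 = 32768 pages per CPU, applied through
         *        setup_pagelist_highmark(zone_pcp(zone, cpu), high)  */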
3194 int hashdist = HASHDIST_DEFAULT; 3194 int hashdist = HASHDIST_DEFAULT;
3195 3195
3196 #ifdef CONFIG_NUMA 3196 #ifdef CONFIG_NUMA
3197 static int __init set_hashdist(char *str) 3197 static int __init set_hashdist(char *str)
3198 { 3198 {
3199 if (!str) 3199 if (!str)
3200 return 0; 3200 return 0;
3201 hashdist = simple_strtoul(str, &str, 0); 3201 hashdist = simple_strtoul(str, &str, 0);
3202 return 1; 3202 return 1;
3203 } 3203 }
3204 __setup("hashdist=", set_hashdist); 3204 __setup("hashdist=", set_hashdist);
3205 #endif 3205 #endif
3206 3206
3207 /* 3207 /*
3208 * allocate a large system hash table from bootmem 3208 * allocate a large system hash table from bootmem
3209 * - it is assumed that the hash table must contain an exact power-of-2 3209 * - it is assumed that the hash table must contain an exact power-of-2
3210 * quantity of entries 3210 * quantity of entries
3211 * - limit is the number of hash buckets, not the total allocation size 3211 * - limit is the number of hash buckets, not the total allocation size
3212 */ 3212 */
3213 void *__init alloc_large_system_hash(const char *tablename, 3213 void *__init alloc_large_system_hash(const char *tablename,
3214 unsigned long bucketsize, 3214 unsigned long bucketsize,
3215 unsigned long numentries, 3215 unsigned long numentries,
3216 int scale, 3216 int scale,
3217 int flags, 3217 int flags,
3218 unsigned int *_hash_shift, 3218 unsigned int *_hash_shift,
3219 unsigned int *_hash_mask, 3219 unsigned int *_hash_mask,
3220 unsigned long limit) 3220 unsigned long limit)
3221 { 3221 {
3222 unsigned long long max = limit; 3222 unsigned long long max = limit;
3223 unsigned long log2qty, size; 3223 unsigned long log2qty, size;
3224 void *table = NULL; 3224 void *table = NULL;
3225 3225
3226 /* allow the kernel cmdline to have a say */ 3226 /* allow the kernel cmdline to have a say */
3227 if (!numentries) { 3227 if (!numentries) {
3228 /* round applicable memory size up to nearest megabyte */ 3228 /* round applicable memory size up to nearest megabyte */
3229 numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; 3229 numentries = nr_kernel_pages;
3230 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 3230 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
3231 numentries >>= 20 - PAGE_SHIFT; 3231 numentries >>= 20 - PAGE_SHIFT;
3232 numentries <<= 20 - PAGE_SHIFT; 3232 numentries <<= 20 - PAGE_SHIFT;
3233 3233
3234 /* limit to 1 bucket per 2^scale bytes of low memory */ 3234 /* limit to 1 bucket per 2^scale bytes of low memory */
3235 if (scale > PAGE_SHIFT) 3235 if (scale > PAGE_SHIFT)
3236 numentries >>= (scale - PAGE_SHIFT); 3236 numentries >>= (scale - PAGE_SHIFT);
3237 else 3237 else
3238 numentries <<= (PAGE_SHIFT - scale); 3238 numentries <<= (PAGE_SHIFT - scale);
3239 } 3239 }
3240 numentries = roundup_pow_of_two(numentries); 3240 numentries = roundup_pow_of_two(numentries);
3241 3241
3242 /* limit allocation size to 1/16 total memory by default */ 3242 /* limit allocation size to 1/16 total memory by default */
3243 if (max == 0) { 3243 if (max == 0) {
3244 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 3244 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
3245 do_div(max, bucketsize); 3245 do_div(max, bucketsize);
3246 } 3246 }
3247 3247
3248 if (numentries > max) 3248 if (numentries > max)
3249 numentries = max; 3249 numentries = max;
3250 3250
3251 log2qty = long_log2(numentries); 3251 log2qty = long_log2(numentries);
3252 3252
3253 do { 3253 do {
3254 size = bucketsize << log2qty; 3254 size = bucketsize << log2qty;
3255 if (flags & HASH_EARLY) 3255 if (flags & HASH_EARLY)
3256 table = alloc_bootmem(size); 3256 table = alloc_bootmem(size);
3257 else if (hashdist) 3257 else if (hashdist)
3258 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 3258 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
3259 else { 3259 else {
3260 unsigned long order; 3260 unsigned long order;
3261 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) 3261 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
3262 ; 3262 ;
3263 table = (void*) __get_free_pages(GFP_ATOMIC, order); 3263 table = (void*) __get_free_pages(GFP_ATOMIC, order);
3264 } 3264 }
3265 } while (!table && size > PAGE_SIZE && --log2qty); 3265 } while (!table && size > PAGE_SIZE && --log2qty);
3266 3266
3267 if (!table) 3267 if (!table)
3268 panic("Failed to allocate %s hash table\n", tablename); 3268 panic("Failed to allocate %s hash table\n", tablename);
3269 3269
3270 printk("%s hash table entries: %d (order: %d, %lu bytes)\n", 3270 printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
3271 tablename, 3271 tablename,
3272 (1U << log2qty), 3272 (1U << log2qty),
3273 long_log2(size) - PAGE_SHIFT, 3273 long_log2(size) - PAGE_SHIFT,
3274 size); 3274 size);
3275 3275
3276 if (_hash_shift) 3276 if (_hash_shift)
3277 *_hash_shift = log2qty; 3277 *_hash_shift = log2qty;
3278 if (_hash_mask) 3278 if (_hash_mask)
3279 *_hash_mask = (1 << log2qty) - 1; 3279 *_hash_mask = (1 << log2qty) - 1;
3280 3280
3281 return table; 3281 return table;
3282 } 3282 }
3283 3283
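For scale, this is roughly how fs/dcache.c of this era sizes the dentry hash with alloc_large_system_hash() (shown only as a sketch; dhash_entries, d_hash_shift and d_hash_mask are dcache's own variables). A scale of 13 limits the table to about one bucket per 8KB of low memory, and a limit of 0 falls back to the 1/16-of-memory cap above:

        dentry_hashtable =
                alloc_large_system_hash("Dentry cache",
                                        sizeof(struct hlist_head),
                                        dhash_entries,  /* 0 unless dhash_entries= given */
                                        13,             /* 1 bucket per 2^13 bytes of lowmem */
                                        HASH_EARLY,
                                        &d_hash_shift,
                                        &d_hash_mask,
                                        0);             /* no explicit bucket limit */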
3284 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE 3284 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
3285 struct page *pfn_to_page(unsigned long pfn) 3285 struct page *pfn_to_page(unsigned long pfn)
3286 { 3286 {
3287 return __pfn_to_page(pfn); 3287 return __pfn_to_page(pfn);
3288 } 3288 }
3289 unsigned long page_to_pfn(struct page *page) 3289 unsigned long page_to_pfn(struct page *page)
3290 { 3290 {
3291 return __page_to_pfn(page); 3291 return __page_to_pfn(page);
3292 } 3292 }
3293 EXPORT_SYMBOL(pfn_to_page); 3293 EXPORT_SYMBOL(pfn_to_page);
3294 EXPORT_SYMBOL(page_to_pfn); 3294 EXPORT_SYMBOL(page_to_pfn);
3295 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 3295 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
3296 3296
3297 #if MAX_NUMNODES > 1 3297 #if MAX_NUMNODES > 1
3298 /* 3298 /*
3299 * Find the highest possible node id. 3299 * Find the highest possible node id.
3300 */ 3300 */
3301 int highest_possible_node_id(void) 3301 int highest_possible_node_id(void)
3302 { 3302 {
3303 unsigned int node; 3303 unsigned int node;
3304 unsigned int highest = 0; 3304 unsigned int highest = 0;
3305 3305
3306 for_each_node_mask(node, node_possible_map) 3306 for_each_node_mask(node, node_possible_map)
3307 highest = node; 3307 highest = node;
3308 return highest; 3308 return highest;
3309 } 3309 }
3310 EXPORT_SYMBOL(highest_possible_node_id); 3310 EXPORT_SYMBOL(highest_possible_node_id);
3311 #endif 3311 #endif
3312 3312