Commit 04bab05a95fece32015d897d4058880bbb5c65eb

Authored by Jerome Marchand
Committed by Jiri Slaby
1 parent 788a2f69f9

memcg, vmscan: Fix forced scan of anonymous pages

commit 2ab051e11bfa3cbb7b24177f3d6aaed10a0d743e upstream.

When memory cgroups are enabled, the code in get_scan_count() that
decides to force a scan of anonymous pages compares global values
(free, high_watermark) to a value that is restricted to a memory
cgroup (file). This makes the code over-eager to force an anon scan.

For instance, it will force an anon scan when scanning a memcg that is
mainly populated by anonymous pages, even when there are plenty of file
pages to get rid of in other memcgs, and even when swappiness == 0.
This breaks the user's expectation about swappiness and hurts performance.

This patch makes sure that a forced anon scan only happens when there
are not enough file pages for the whole zone, not just in one random
memcg.

[hannes@cmpxchg.org: cleanups]
Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>

Showing 1 changed file with 15 additions and 8 deletions (inline diff)
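The changed hunk sits in get_scan_count(), further down in the file than the
excerpt shown below. As a rough sketch of what the commit description amounts
to (illustrative only, not the verbatim hunk; the local variable names
zonefile and zonefree are placeholders), the zone-wide file pages plus free
pages are compared against the zone's high watermark before forcing an anon
scan:

        /* Sketch of the described change, under global reclaim only */
        if (global_reclaim(sc)) {
                unsigned long zonefile;         /* placeholder name */
                unsigned long zonefree;         /* placeholder name */

                /* use zone-wide counts, not the memcg-local "file" total */
                zonefree = zone_page_state(zone, NR_FREE_PAGES);
                zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
                           zone_page_state(zone, NR_INACTIVE_FILE);

                if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
                        scan_balance = SCAN_ANON;
                        goto out;
                }
        }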

/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>  /* for try_to_release_page(),
                                        buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>
#include <linux/balloon_compaction.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

struct scan_control {
        /* Incremented by the number of inactive pages that were scanned */
        unsigned long nr_scanned;

        /* Number of pages freed so far during a call to shrink_zones() */
        unsigned long nr_reclaimed;

        /* How many pages shrink_list() should reclaim */
        unsigned long nr_to_reclaim;

        unsigned long hibernation_mode;

        /* This context's GFP mask */
        gfp_t gfp_mask;

        int may_writepage;

        /* Can mapped pages be reclaimed? */
        int may_unmap;

        /* Can pages be swapped as part of reclaim? */
        int may_swap;

        int order;

        /* Scan (total_size >> priority) pages at once */
        int priority;

        /*
         * The memory cgroup that hit its limit and as a result is the
         * primary target of this reclaim invocation.
         */
        struct mem_cgroup *target_mem_cgroup;

        /*
         * Nodemask of nodes allowed by the caller. If NULL, all nodes
         * are scanned.
         */
        nodemask_t *nodemask;
};

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \
        do { \
                if ((_page)->lru.prev != _base) { \
                        struct page *prev; \
                        \
                        prev = lru_to_page(&(_page->lru)); \
                        prefetch(&prev->_field); \
                } \
        } while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field) \
        do { \
                if ((_page)->lru.prev != _base) { \
                        struct page *prev; \
                        \
                        prev = lru_to_page(&(_page->lru)); \
                        prefetchw(&prev->_field); \
                } \
        } while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
unsigned long vm_total_pages;   /* The total number of pages which the VM controls */

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
        return !sc->target_mem_cgroup;
}
#else
static bool global_reclaim(struct scan_control *sc)
{
        return true;
}
#endif

static unsigned long zone_reclaimable_pages(struct zone *zone)
{
        int nr;

        nr = zone_page_state(zone, NR_ACTIVE_FILE) +
             zone_page_state(zone, NR_INACTIVE_FILE);

        if (get_nr_swap_pages() > 0)
                nr += zone_page_state(zone, NR_ACTIVE_ANON) +
                      zone_page_state(zone, NR_INACTIVE_ANON);

        return nr;
}

bool zone_reclaimable(struct zone *zone)
{
        return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
}

static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
        if (!mem_cgroup_disabled())
                return mem_cgroup_get_lru_size(lruvec, lru);

        return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
}

/*
 * Add a shrinker callback to be called from the vm.
 */
int register_shrinker(struct shrinker *shrinker)
{
        size_t size = sizeof(*shrinker->nr_deferred);

        /*
         * If we only have one possible node in the system anyway, save
         * ourselves the trouble and disable NUMA aware behavior. This way we
         * will save memory and some small loop time later.
         */
        if (nr_node_ids == 1)
                shrinker->flags &= ~SHRINKER_NUMA_AWARE;

        if (shrinker->flags & SHRINKER_NUMA_AWARE)
                size *= nr_node_ids;

        shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
        if (!shrinker->nr_deferred)
                return -ENOMEM;

        down_write(&shrinker_rwsem);
        list_add_tail(&shrinker->list, &shrinker_list);
        up_write(&shrinker_rwsem);
        return 0;
}
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove one
 */
void unregister_shrinker(struct shrinker *shrinker)
{
        down_write(&shrinker_rwsem);
        list_del(&shrinker->list);
        up_write(&shrinker_rwsem);
        kfree(shrinker->nr_deferred);
}
EXPORT_SYMBOL(unregister_shrinker);

#define SHRINK_BATCH 128

static unsigned long
shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
                 unsigned long nr_pages_scanned, unsigned long lru_pages)
{
        unsigned long freed = 0;
        unsigned long long delta;
        long total_scan;
        long freeable;
        long nr;
        long new_nr;
        int nid = shrinkctl->nid;
        long batch_size = shrinker->batch ? shrinker->batch
                                          : SHRINK_BATCH;

        freeable = shrinker->count_objects(shrinker, shrinkctl);
        if (freeable == 0)
                return 0;

        /*
         * copy the current shrinker scan count into a local variable
         * and zero it so that other concurrent shrinker invocations
         * don't also do this scanning work.
         */
        nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);

        total_scan = nr;
        delta = (4 * nr_pages_scanned) / shrinker->seeks;
        delta *= freeable;
        do_div(delta, lru_pages + 1);
        total_scan += delta;
        if (total_scan < 0) {
                printk(KERN_ERR
                "shrink_slab: %pF negative objects to delete nr=%ld\n",
                       shrinker->scan_objects, total_scan);
                total_scan = freeable;
        }

        /*
         * We need to avoid excessive windup on filesystem shrinkers
         * due to large numbers of GFP_NOFS allocations causing the
         * shrinkers to return -1 all the time. This results in a large
         * nr being built up so when a shrink that can do some work
         * comes along it empties the entire cache due to nr >>>
         * freeable. This is bad for sustaining a working set in
         * memory.
         *
         * Hence only allow the shrinker to scan the entire cache when
         * a large delta change is calculated directly.
         */
        if (delta < freeable / 4)
                total_scan = min(total_scan, freeable / 2);

        /*
         * Avoid risking looping forever due to too large nr value:
         * never try to free more than twice the estimate number of
         * freeable entries.
         */
        if (total_scan > freeable * 2)
                total_scan = freeable * 2;

        trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
                                nr_pages_scanned, lru_pages,
                                freeable, delta, total_scan);

        /*
         * Normally, we should not scan less than batch_size objects in one
         * pass to avoid too frequent shrinker calls, but if the slab has less
         * than batch_size objects in total and we are really tight on memory,
         * we will try to reclaim all available objects, otherwise we can end
         * up failing allocations although there are plenty of reclaimable
         * objects spread over several slabs with usage less than the
         * batch_size.
         *
         * We detect the "tight on memory" situations by looking at the total
         * number of objects we want to scan (total_scan). If it is greater
         * than the total number of objects on slab (freeable), we must be
         * scanning at high prio and therefore should try to reclaim as much as
         * possible.
         */
        while (total_scan >= batch_size ||
               total_scan >= freeable) {
                unsigned long ret;
                unsigned long nr_to_scan = min(batch_size, total_scan);

                shrinkctl->nr_to_scan = nr_to_scan;
                ret = shrinker->scan_objects(shrinker, shrinkctl);
                if (ret == SHRINK_STOP)
                        break;
                freed += ret;

                count_vm_events(SLABS_SCANNED, nr_to_scan);
                total_scan -= nr_to_scan;

                cond_resched();
        }

        /*
         * move the unused scan count back into the shrinker in a
         * manner that handles concurrent updates. If we exhausted the
         * scan, there is no need to do an update.
         */
        if (total_scan > 0)
                new_nr = atomic_long_add_return(total_scan,
                                                &shrinker->nr_deferred[nid]);
        else
                new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);

        trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
        return freed;
}

/*
 * Call the shrink functions to age shrinkable caches
 *
 * Here we assume it costs one seek to replace a lru page and that it also
 * takes a seek to recreate a cache object.  With this in mind we age equal
 * percentages of the lru and ageable caches.  This should balance the seeks
 * generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increase the pressure on
 * slab to avoid swapping.
 *
 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
 *
 * `lru_pages' represents the number of on-LRU pages in all the zones which
 * are eligible for the caller's allocation attempt.  It is used for balancing
 * slab reclaim versus page reclaim.
 *
 * Returns the number of slab objects which we shrunk.
 */
unsigned long shrink_slab(struct shrink_control *shrinkctl,
                          unsigned long nr_pages_scanned,
                          unsigned long lru_pages)
{
        struct shrinker *shrinker;
        unsigned long freed = 0;

        if (nr_pages_scanned == 0)
                nr_pages_scanned = SWAP_CLUSTER_MAX;

        if (!down_read_trylock(&shrinker_rwsem)) {
                /*
                 * If we would return 0, our callers would understand that we
                 * have nothing else to shrink and give up trying. By returning
                 * 1 we keep it going and assume we'll be able to shrink next
                 * time.
                 */
                freed = 1;
                goto out;
        }

        list_for_each_entry(shrinker, &shrinker_list, list) {
                if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
                        shrinkctl->nid = 0;
                        freed += shrink_slab_node(shrinkctl, shrinker,
                                        nr_pages_scanned, lru_pages);
                        continue;
                }

                for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
                        if (node_online(shrinkctl->nid))
                                freed += shrink_slab_node(shrinkctl, shrinker,
                                                nr_pages_scanned, lru_pages);

                }
        }
        up_read(&shrinker_rwsem);
out:
        cond_resched();
        return freed;
}

static inline int is_page_cache_freeable(struct page *page)
{
        /*
         * A freeable page cache page is referenced only by the caller
         * that isolated the page, the page cache radix tree and
         * optional buffer heads at page->private.
         */
        return page_count(page) - page_has_private(page) == 2;
}

static int may_write_to_queue(struct backing_dev_info *bdi,
                              struct scan_control *sc)
{
        if (current->flags & PF_SWAPWRITE)
                return 1;
        if (!bdi_write_congested(bdi))
                return 1;
        if (bdi == current->backing_dev_info)
                return 1;
        return 0;
}

/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
                                struct page *page, int error)
{
        lock_page(page);
        if (page_mapping(page) == mapping)
                mapping_set_error(mapping, error);
        unlock_page(page);
}

/* possible outcome of pageout() */
typedef enum {
        /* failed to write page out, page is locked */
        PAGE_KEEP,
        /* move page to the active list, page is locked */
        PAGE_ACTIVATE,
        /* page has been sent to the disk successfully, page is unlocked */
        PAGE_SUCCESS,
        /* page is clean and locked */
        PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping,
                         struct scan_control *sc)
{
        /*
         * If the page is dirty, only perform writeback if that write
         * will be non-blocking.  To prevent this allocation from being
         * stalled by pagecache activity.  But note that there may be
         * stalls if we need to run get_block().  We could test
         * PagePrivate for that.
         *
         * If this process is currently in __generic_file_aio_write() against
         * this page's queue, we can perform writeback even if that
         * will block.
         *
         * If the page is swapcache, write it back even if that would
         * block, for some throttling. This happens by accident, because
         * swap_backing_dev_info is bust: it doesn't reflect the
         * congestion state of the swapdevs.  Easy to fix, if needed.
         */
        if (!is_page_cache_freeable(page))
                return PAGE_KEEP;
        if (!mapping) {
                /*
                 * Some data journaling orphaned pages can have
                 * page->mapping == NULL while being dirty with clean buffers.
                 */
                if (page_has_private(page)) {
                        if (try_to_free_buffers(page)) {
                                ClearPageDirty(page);
                                printk("%s: orphaned page\n", __func__);
                                return PAGE_CLEAN;
                        }
                }
                return PAGE_KEEP;
        }
        if (mapping->a_ops->writepage == NULL)
                return PAGE_ACTIVATE;
        if (!may_write_to_queue(mapping->backing_dev_info, sc))
                return PAGE_KEEP;

        if (clear_page_dirty_for_io(page)) {
                int res;
                struct writeback_control wbc = {
                        .sync_mode = WB_SYNC_NONE,
                        .nr_to_write = SWAP_CLUSTER_MAX,
                        .range_start = 0,
                        .range_end = LLONG_MAX,
                        .for_reclaim = 1,
                };

                SetPageReclaim(page);
                res = mapping->a_ops->writepage(page, &wbc);
                if (res < 0)
                        handle_write_error(mapping, page, res);
                if (res == AOP_WRITEPAGE_ACTIVATE) {
                        ClearPageReclaim(page);
                        return PAGE_ACTIVATE;
                }

                if (!PageWriteback(page)) {
                        /* synchronous write or broken a_ops? */
                        ClearPageReclaim(page);
                }
                trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
                inc_zone_page_state(page, NR_VMSCAN_WRITE);
                return PAGE_SUCCESS;
        }

        return PAGE_CLEAN;
}

/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
        BUG_ON(!PageLocked(page));
        BUG_ON(mapping != page_mapping(page));

        spin_lock_irq(&mapping->tree_lock);
        /*
         * The non racy check for a busy page.
         *
         * Must be careful with the order of the tests. When someone has
         * a ref to the page, it may be possible that they dirty it then
         * drop the reference. So if PageDirty is tested before page_count
         * here, then the following race may occur:
         *
         * get_user_pages(&page);
         * [user mapping goes away]
         * write_to(page);
         *                              !PageDirty(page)    [good]
         * SetPageDirty(page);
         * put_page(page);
         *                              !page_count(page)   [good, discard it]
         *
         * [oops, our write_to data is lost]
         *
         * Reversing the order of the tests ensures such a situation cannot
         * escape unnoticed. The smp_rmb is needed to ensure the page->flags
         * load is not satisfied before that of page->_count.
         *
         * Note that if SetPageDirty is always performed via set_page_dirty,
         * and thus under tree_lock, then this ordering is not required.
         */
        if (!page_freeze_refs(page, 2))
                goto cannot_free;
        /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
        if (unlikely(PageDirty(page))) {
                page_unfreeze_refs(page, 2);
                goto cannot_free;
        }

        if (PageSwapCache(page)) {
                swp_entry_t swap = { .val = page_private(page) };
                __delete_from_swap_cache(page);
                spin_unlock_irq(&mapping->tree_lock);
                swapcache_free(swap, page);
        } else {
                void (*freepage)(struct page *);

                freepage = mapping->a_ops->freepage;

                __delete_from_page_cache(page);
                spin_unlock_irq(&mapping->tree_lock);
                mem_cgroup_uncharge_cache_page(page);

                if (freepage != NULL)
                        freepage(page);
        }

        return 1;

cannot_free:
        spin_unlock_irq(&mapping->tree_lock);
        return 0;
}

/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref on
 * this page.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
        if (__remove_mapping(mapping, page)) {
                /*
                 * Unfreezing the refcount with 1 rather than 2 effectively
                 * drops the pagecache ref for us without requiring another
                 * atomic operation.
                 */
                page_unfreeze_refs(page, 1);
                return 1;
        }
        return 0;
}

/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * Page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */
void putback_lru_page(struct page *page)
{
        bool is_unevictable;
        int was_unevictable = PageUnevictable(page);

        VM_BUG_ON(PageLRU(page));

redo:
        ClearPageUnevictable(page);

        if (page_evictable(page)) {
                /*
                 * For evictable pages, we can use the cache.
                 * In event of a race, worst case is we end up with an
                 * unevictable page on [in]active list.
                 * We know how to handle that.
                 */
                is_unevictable = false;
                lru_cache_add(page);
        } else {
                /*
                 * Put unevictable pages directly on zone's unevictable
                 * list.
                 */
                is_unevictable = true;
                add_page_to_unevictable_list(page);
                /*
                 * When racing with an mlock or AS_UNEVICTABLE clearing
                 * (page is unlocked) make sure that if the other thread
                 * does not observe our setting of PG_lru and fails
                 * isolation/check_move_unevictable_pages,
                 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
                 * the page back to the evictable list.
                 *
                 * The other side is TestClearPageMlocked() or shmem_lock().
                 */
                smp_mb();
        }

        /*
         * page's status can change while we move it among lru. If an evictable
         * page is on unevictable list, it never be freed. To avoid that,
         * check after we added it to the list, again.
         */
        if (is_unevictable && page_evictable(page)) {
                if (!isolate_lru_page(page)) {
                        put_page(page);
                        goto redo;
                }
                /* This means someone else dropped this page from LRU
                 * So, it will be freed or putback to LRU again. There is
                 * nothing to do here.
                 */
        }

        if (was_unevictable && !is_unevictable)
                count_vm_event(UNEVICTABLE_PGRESCUED);
        else if (!was_unevictable && is_unevictable)
                count_vm_event(UNEVICTABLE_PGCULLED);

        put_page(page);         /* drop ref from isolate */
}

enum page_references {
        PAGEREF_RECLAIM,
        PAGEREF_RECLAIM_CLEAN,
        PAGEREF_KEEP,
        PAGEREF_ACTIVATE,
};

static enum page_references page_check_references(struct page *page,
                                                  struct scan_control *sc)
{
        int referenced_ptes, referenced_page;
        unsigned long vm_flags;

        referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
                                          &vm_flags);
        referenced_page = TestClearPageReferenced(page);

        /*
         * Mlock lost the isolation race with us.  Let try_to_unmap()
         * move the page to the unevictable list.
         */
        if (vm_flags & VM_LOCKED)
                return PAGEREF_RECLAIM;

        if (referenced_ptes) {
                if (PageSwapBacked(page))
                        return PAGEREF_ACTIVATE;
                /*
                 * All mapped pages start out with page table
                 * references from the instantiating fault, so we need
                 * to look twice if a mapped file page is used more
                 * than once.
                 *
                 * Mark it and spare it for another trip around the
                 * inactive list.  Another page table reference will
                 * lead to its activation.
                 *
                 * Note: the mark is set for activated pages as well
                 * so that recently deactivated but used pages are
                 * quickly recovered.
                 */
                SetPageReferenced(page);

                if (referenced_page || referenced_ptes > 1)
                        return PAGEREF_ACTIVATE;

                /*
                 * Activate file-backed executable pages after first usage.
                 */
                if (vm_flags & VM_EXEC)
                        return PAGEREF_ACTIVATE;

                return PAGEREF_KEEP;
        }

        /* Reclaim if clean, defer dirty pages to writeback */
        if (referenced_page && !PageSwapBacked(page))
                return PAGEREF_RECLAIM_CLEAN;

        return PAGEREF_RECLAIM;
}

/* Check if a page is dirty or under writeback */
static void page_check_dirty_writeback(struct page *page,
                                       bool *dirty, bool *writeback)
{
        struct address_space *mapping;

        /*
         * Anonymous pages are not handled by flushers and must be written
         * from reclaim context. Do not stall reclaim based on them
         */
        if (!page_is_file_cache(page)) {
                *dirty = false;
                *writeback = false;
                return;
        }

        /* By default assume that the page flags are accurate */
        *dirty = PageDirty(page);
        *writeback = PageWriteback(page);

        /* Verify dirty/writeback state if the filesystem supports it */
        if (!page_has_private(page))
                return;

        mapping = page_mapping(page);
        if (mapping && mapping->a_ops->is_dirty_writeback)
                mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}

773 /* 773 /*
774 * shrink_page_list() returns the number of reclaimed pages 774 * shrink_page_list() returns the number of reclaimed pages
775 */ 775 */
776 static unsigned long shrink_page_list(struct list_head *page_list, 776 static unsigned long shrink_page_list(struct list_head *page_list,
777 struct zone *zone, 777 struct zone *zone,
778 struct scan_control *sc, 778 struct scan_control *sc,
779 enum ttu_flags ttu_flags, 779 enum ttu_flags ttu_flags,
780 unsigned long *ret_nr_dirty, 780 unsigned long *ret_nr_dirty,
781 unsigned long *ret_nr_unqueued_dirty, 781 unsigned long *ret_nr_unqueued_dirty,
782 unsigned long *ret_nr_congested, 782 unsigned long *ret_nr_congested,
783 unsigned long *ret_nr_writeback, 783 unsigned long *ret_nr_writeback,
784 unsigned long *ret_nr_immediate, 784 unsigned long *ret_nr_immediate,
785 bool force_reclaim) 785 bool force_reclaim)
786 { 786 {
787 LIST_HEAD(ret_pages); 787 LIST_HEAD(ret_pages);
788 LIST_HEAD(free_pages); 788 LIST_HEAD(free_pages);
789 int pgactivate = 0; 789 int pgactivate = 0;
790 unsigned long nr_unqueued_dirty = 0; 790 unsigned long nr_unqueued_dirty = 0;
791 unsigned long nr_dirty = 0; 791 unsigned long nr_dirty = 0;
792 unsigned long nr_congested = 0; 792 unsigned long nr_congested = 0;
793 unsigned long nr_reclaimed = 0; 793 unsigned long nr_reclaimed = 0;
794 unsigned long nr_writeback = 0; 794 unsigned long nr_writeback = 0;
795 unsigned long nr_immediate = 0; 795 unsigned long nr_immediate = 0;
796 796
797 cond_resched(); 797 cond_resched();
798 798
799 mem_cgroup_uncharge_start(); 799 mem_cgroup_uncharge_start();
800 while (!list_empty(page_list)) { 800 while (!list_empty(page_list)) {
801 struct address_space *mapping; 801 struct address_space *mapping;
802 struct page *page; 802 struct page *page;
803 int may_enter_fs; 803 int may_enter_fs;
804 enum page_references references = PAGEREF_RECLAIM_CLEAN; 804 enum page_references references = PAGEREF_RECLAIM_CLEAN;
805 bool dirty, writeback; 805 bool dirty, writeback;
806 806
807 cond_resched(); 807 cond_resched();
808 808
809 page = lru_to_page(page_list); 809 page = lru_to_page(page_list);
810 list_del(&page->lru); 810 list_del(&page->lru);
811 811
812 if (!trylock_page(page)) 812 if (!trylock_page(page))
813 goto keep; 813 goto keep;
814 814
815 VM_BUG_ON(PageActive(page)); 815 VM_BUG_ON(PageActive(page));
816 VM_BUG_ON(page_zone(page) != zone); 816 VM_BUG_ON(page_zone(page) != zone);
817 817
818 sc->nr_scanned++; 818 sc->nr_scanned++;
819 819
820 if (unlikely(!page_evictable(page))) 820 if (unlikely(!page_evictable(page)))
821 goto cull_mlocked; 821 goto cull_mlocked;
822 822
823 if (!sc->may_unmap && page_mapped(page)) 823 if (!sc->may_unmap && page_mapped(page))
824 goto keep_locked; 824 goto keep_locked;
825 825
826 /* Double the slab pressure for mapped and swapcache pages */ 826 /* Double the slab pressure for mapped and swapcache pages */
827 if (page_mapped(page) || PageSwapCache(page)) 827 if (page_mapped(page) || PageSwapCache(page))
828 sc->nr_scanned++; 828 sc->nr_scanned++;
829 829
830 may_enter_fs = (sc->gfp_mask & __GFP_FS) || 830 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
831 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 831 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
832 832
833 /* 833 /*
834 * The number of dirty pages determines if a zone is marked 834 * The number of dirty pages determines if a zone is marked
835 * reclaim_congested which affects wait_iff_congested. kswapd 835 * reclaim_congested which affects wait_iff_congested. kswapd
836 * will stall and start writing pages if the tail of the LRU 836 * will stall and start writing pages if the tail of the LRU
837 * is all dirty unqueued pages. 837 * is all dirty unqueued pages.
838 */ 838 */
839 page_check_dirty_writeback(page, &dirty, &writeback); 839 page_check_dirty_writeback(page, &dirty, &writeback);
840 if (dirty || writeback) 840 if (dirty || writeback)
841 nr_dirty++; 841 nr_dirty++;
842 842
843 if (dirty && !writeback) 843 if (dirty && !writeback)
844 nr_unqueued_dirty++; 844 nr_unqueued_dirty++;
845 845
846 /* 846 /*
847 * Treat this page as congested if the underlying BDI is or if 847 * Treat this page as congested if the underlying BDI is or if
848 * pages are cycling through the LRU so quickly that the 848 * pages are cycling through the LRU so quickly that the
849 * pages marked for immediate reclaim are making it to the 849 * pages marked for immediate reclaim are making it to the
850 * end of the LRU a second time. 850 * end of the LRU a second time.
851 */ 851 */
852 mapping = page_mapping(page); 852 mapping = page_mapping(page);
853 if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || 853 if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
854 (writeback && PageReclaim(page))) 854 (writeback && PageReclaim(page)))
855 nr_congested++; 855 nr_congested++;
856 856
857 /* 857 /*
858 * If a page at the tail of the LRU is under writeback, there 858 * If a page at the tail of the LRU is under writeback, there
859 * are three cases to consider. 859 * are three cases to consider.
860 * 860 *
861 * 1) If reclaim is encountering an excessive number of pages 861 * 1) If reclaim is encountering an excessive number of pages
862 * under writeback and this page is both under writeback and 862 * under writeback and this page is both under writeback and
863 * PageReclaim then it indicates that pages are being queued 863 * PageReclaim then it indicates that pages are being queued
864 * for IO but are being recycled through the LRU before the 864 * for IO but are being recycled through the LRU before the
865 * IO can complete. Waiting on the page itself risks an 865 * IO can complete. Waiting on the page itself risks an
866 * indefinite stall if it is impossible to writeback the 866 * indefinite stall if it is impossible to writeback the
867 * page due to IO error or disconnected storage so instead 867 * page due to IO error or disconnected storage so instead
868 * note that the LRU is being scanned too quickly and the 868 * note that the LRU is being scanned too quickly and the
869 * caller can stall after page list has been processed. 869 * caller can stall after page list has been processed.
870 * 870 *
871 * 2) Global reclaim encounters a page, memcg encounters a 871 * 2) Global reclaim encounters a page, memcg encounters a
872 * page that is not marked for immediate reclaim or 872 * page that is not marked for immediate reclaim or
873 * the caller does not have __GFP_IO. In this case mark 873 * the caller does not have __GFP_IO. In this case mark
874 * the page for immediate reclaim and continue scanning. 874 * the page for immediate reclaim and continue scanning.
875 * 875 *
876 * __GFP_IO is checked because a loop driver thread might 876 * __GFP_IO is checked because a loop driver thread might
877 * enter reclaim, and deadlock if it waits on a page for 877 * enter reclaim, and deadlock if it waits on a page for
878 * which it is needed to do the write (loop masks off 878 * which it is needed to do the write (loop masks off
879 * __GFP_IO|__GFP_FS for this reason); but more thought 879 * __GFP_IO|__GFP_FS for this reason); but more thought
880 * would probably show more reasons. 880 * would probably show more reasons.
881 * 881 *
882 * Don't require __GFP_FS, since we're not going into the 882 * Don't require __GFP_FS, since we're not going into the
883 * FS, just waiting on its writeback completion. Worryingly, 883 * FS, just waiting on its writeback completion. Worryingly,
884 * ext4 gfs2 and xfs allocate pages with 884 * ext4 gfs2 and xfs allocate pages with
885 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing 885 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
886 * may_enter_fs here is liable to OOM on them. 886 * may_enter_fs here is liable to OOM on them.
887 * 887 *
888 * 3) memcg encounters a page that is not already marked 888 * 3) memcg encounters a page that is not already marked
889 * PageReclaim. memcg does not have any dirty pages 889 * PageReclaim. memcg does not have any dirty pages
890 * throttling so we could easily OOM just because too many 890 * throttling so we could easily OOM just because too many
891 * pages are in writeback and there is nothing else to 891 * pages are in writeback and there is nothing else to
892 * reclaim. Wait for the writeback to complete. 892 * reclaim. Wait for the writeback to complete.
893 */ 893 */
894 if (PageWriteback(page)) { 894 if (PageWriteback(page)) {
895 /* Case 1 above */ 895 /* Case 1 above */
896 if (current_is_kswapd() && 896 if (current_is_kswapd() &&
897 PageReclaim(page) && 897 PageReclaim(page) &&
898 zone_is_reclaim_writeback(zone)) { 898 zone_is_reclaim_writeback(zone)) {
899 nr_immediate++; 899 nr_immediate++;
900 goto keep_locked; 900 goto keep_locked;
901 901
902 /* Case 2 above */ 902 /* Case 2 above */
903 } else if (global_reclaim(sc) || 903 } else if (global_reclaim(sc) ||
904 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { 904 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
905 /* 905 /*
906 * This is slightly racy - end_page_writeback() 906 * This is slightly racy - end_page_writeback()
907 * might have just cleared PageReclaim, then 907 * might have just cleared PageReclaim, then
908 * setting PageReclaim here end up interpreted 908 * setting PageReclaim here end up interpreted
909 * as PageReadahead - but that does not matter 909 * as PageReadahead - but that does not matter
910 * enough to care. What we do want is for this 910 * enough to care. What we do want is for this
911 * page to have PageReclaim set next time memcg 911 * page to have PageReclaim set next time memcg
912 * reclaim reaches the tests above, so it will 912 * reclaim reaches the tests above, so it will
913 * then wait_on_page_writeback() to avoid OOM; 913 * then wait_on_page_writeback() to avoid OOM;
914 * and it's also appropriate in global reclaim. 914 * and it's also appropriate in global reclaim.
915 */ 915 */
916 SetPageReclaim(page); 916 SetPageReclaim(page);
917 nr_writeback++; 917 nr_writeback++;
918 918
919 goto keep_locked; 919 goto keep_locked;
920 920
921 /* Case 3 above */ 921 /* Case 3 above */
922 } else { 922 } else {
923 wait_on_page_writeback(page); 923 wait_on_page_writeback(page);
924 } 924 }
925 } 925 }
926 926
927 if (!force_reclaim) 927 if (!force_reclaim)
928 references = page_check_references(page, sc); 928 references = page_check_references(page, sc);
929 929
930 switch (references) { 930 switch (references) {
931 case PAGEREF_ACTIVATE: 931 case PAGEREF_ACTIVATE:
932 goto activate_locked; 932 goto activate_locked;
933 case PAGEREF_KEEP: 933 case PAGEREF_KEEP:
934 goto keep_locked; 934 goto keep_locked;
935 case PAGEREF_RECLAIM: 935 case PAGEREF_RECLAIM:
936 case PAGEREF_RECLAIM_CLEAN: 936 case PAGEREF_RECLAIM_CLEAN:
937 ; /* try to reclaim the page below */ 937 ; /* try to reclaim the page below */
938 } 938 }
939 939
940 /* 940 /*
941 * Anonymous process memory has backing store? 941 * Anonymous process memory has backing store?
942 * Try to allocate it some swap space here. 942 * Try to allocate it some swap space here.
943 */ 943 */
944 if (PageAnon(page) && !PageSwapCache(page)) { 944 if (PageAnon(page) && !PageSwapCache(page)) {
945 if (!(sc->gfp_mask & __GFP_IO)) 945 if (!(sc->gfp_mask & __GFP_IO))
946 goto keep_locked; 946 goto keep_locked;
947 if (!add_to_swap(page, page_list)) 947 if (!add_to_swap(page, page_list))
948 goto activate_locked; 948 goto activate_locked;
949 may_enter_fs = 1; 949 may_enter_fs = 1;
950 950
951 /* Adding to swap updated mapping */ 951 /* Adding to swap updated mapping */
952 mapping = page_mapping(page); 952 mapping = page_mapping(page);
953 } 953 }
954 954
955 /* 955 /*
956 * The page is mapped into the page tables of one or more 956 * The page is mapped into the page tables of one or more
957 * processes. Try to unmap it here. 957 * processes. Try to unmap it here.
958 */ 958 */
959 if (page_mapped(page) && mapping) { 959 if (page_mapped(page) && mapping) {
960 switch (try_to_unmap(page, ttu_flags)) { 960 switch (try_to_unmap(page, ttu_flags)) {
961 case SWAP_FAIL: 961 case SWAP_FAIL:
962 goto activate_locked; 962 goto activate_locked;
963 case SWAP_AGAIN: 963 case SWAP_AGAIN:
964 goto keep_locked; 964 goto keep_locked;
965 case SWAP_MLOCK: 965 case SWAP_MLOCK:
966 goto cull_mlocked; 966 goto cull_mlocked;
967 case SWAP_SUCCESS: 967 case SWAP_SUCCESS:
968 ; /* try to free the page below */ 968 ; /* try to free the page below */
969 } 969 }
970 } 970 }
971 971
972 if (PageDirty(page)) { 972 if (PageDirty(page)) {
973 /* 973 /*
974 * Only kswapd can writeback filesystem pages to 974 * Only kswapd can writeback filesystem pages to
975 * avoid risk of stack overflow but only writeback 975 * avoid risk of stack overflow but only writeback
976 * if many dirty pages have been encountered. 976 * if many dirty pages have been encountered.
977 */ 977 */
978 if (page_is_file_cache(page) && 978 if (page_is_file_cache(page) &&
979 (!current_is_kswapd() || 979 (!current_is_kswapd() ||
980 !zone_is_reclaim_dirty(zone))) { 980 !zone_is_reclaim_dirty(zone))) {
981 /* 981 /*
982 * Immediately reclaim when written back. 982 * Immediately reclaim when written back.
983 * Similar in principle to deactivate_page() 983 * Similar in principle to deactivate_page()
984 * except we already have the page isolated 984 * except we already have the page isolated
985 * and know it's dirty 985 * and know it's dirty
986 */ 986 */
987 inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); 987 inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
988 SetPageReclaim(page); 988 SetPageReclaim(page);
989 989
990 goto keep_locked; 990 goto keep_locked;
991 } 991 }
992 992
993 if (references == PAGEREF_RECLAIM_CLEAN) 993 if (references == PAGEREF_RECLAIM_CLEAN)
994 goto keep_locked; 994 goto keep_locked;
995 if (!may_enter_fs) 995 if (!may_enter_fs)
996 goto keep_locked; 996 goto keep_locked;
997 if (!sc->may_writepage) 997 if (!sc->may_writepage)
998 goto keep_locked; 998 goto keep_locked;
999 999
1000 /* Page is dirty, try to write it out here */ 1000 /* Page is dirty, try to write it out here */
1001 switch (pageout(page, mapping, sc)) { 1001 switch (pageout(page, mapping, sc)) {
1002 case PAGE_KEEP: 1002 case PAGE_KEEP:
1003 goto keep_locked; 1003 goto keep_locked;
1004 case PAGE_ACTIVATE: 1004 case PAGE_ACTIVATE:
1005 goto activate_locked; 1005 goto activate_locked;
1006 case PAGE_SUCCESS: 1006 case PAGE_SUCCESS:
1007 if (PageWriteback(page)) 1007 if (PageWriteback(page))
1008 goto keep; 1008 goto keep;
1009 if (PageDirty(page)) 1009 if (PageDirty(page))
1010 goto keep; 1010 goto keep;
1011 1011
1012 /* 1012 /*
1013 * A synchronous write - probably a ramdisk. Go 1013 * A synchronous write - probably a ramdisk. Go
1014 * ahead and try to reclaim the page. 1014 * ahead and try to reclaim the page.
1015 */ 1015 */
1016 if (!trylock_page(page)) 1016 if (!trylock_page(page))
1017 goto keep; 1017 goto keep;
1018 if (PageDirty(page) || PageWriteback(page)) 1018 if (PageDirty(page) || PageWriteback(page))
1019 goto keep_locked; 1019 goto keep_locked;
1020 mapping = page_mapping(page); 1020 mapping = page_mapping(page);
1021 case PAGE_CLEAN: 1021 case PAGE_CLEAN:
1022 ; /* try to free the page below */ 1022 ; /* try to free the page below */
1023 } 1023 }
1024 } 1024 }
1025 1025
1026 /* 1026 /*
1027 * If the page has buffers, try to free the buffer mappings 1027 * If the page has buffers, try to free the buffer mappings
1028 * associated with this page. If we succeed we try to free 1028 * associated with this page. If we succeed we try to free
1029 * the page as well. 1029 * the page as well.
1030 * 1030 *
1031 * We do this even if the page is PageDirty(). 1031 * We do this even if the page is PageDirty().
1032 * try_to_release_page() does not perform I/O, but it is 1032 * try_to_release_page() does not perform I/O, but it is
1033 * possible for a page to have PageDirty set, but it is actually 1033 * possible for a page to have PageDirty set, but it is actually
1034 * clean (all its buffers are clean). This happens if the 1034 * clean (all its buffers are clean). This happens if the
1035 * buffers were written out directly, with submit_bh(). ext3 1035 * buffers were written out directly, with submit_bh(). ext3
1036 * will do this, as well as the blockdev mapping. 1036 * will do this, as well as the blockdev mapping.
1037 * try_to_release_page() will discover that cleanness and will 1037 * try_to_release_page() will discover that cleanness and will
1038 * drop the buffers and mark the page clean - it can be freed. 1038 * drop the buffers and mark the page clean - it can be freed.
1039 * 1039 *
1040 * Rarely, pages can have buffers and no ->mapping. These are 1040 * Rarely, pages can have buffers and no ->mapping. These are
1041 * the pages which were not successfully invalidated in 1041 * the pages which were not successfully invalidated in
1042 * truncate_complete_page(). We try to drop those buffers here 1042 * truncate_complete_page(). We try to drop those buffers here
1043 * and if that worked, and the page is no longer mapped into 1043 * and if that worked, and the page is no longer mapped into
1044 * process address space (page_count == 1) it can be freed. 1044 * process address space (page_count == 1) it can be freed.
1045 * Otherwise, leave the page on the LRU so it is swappable. 1045 * Otherwise, leave the page on the LRU so it is swappable.
1046 */ 1046 */
1047 if (page_has_private(page)) { 1047 if (page_has_private(page)) {
1048 if (!try_to_release_page(page, sc->gfp_mask)) 1048 if (!try_to_release_page(page, sc->gfp_mask))
1049 goto activate_locked; 1049 goto activate_locked;
1050 if (!mapping && page_count(page) == 1) { 1050 if (!mapping && page_count(page) == 1) {
1051 unlock_page(page); 1051 unlock_page(page);
1052 if (put_page_testzero(page)) 1052 if (put_page_testzero(page))
1053 goto free_it; 1053 goto free_it;
1054 else { 1054 else {
1055 /* 1055 /*
1056 * rare race with speculative reference. 1056 * rare race with speculative reference.
1057 * the speculative reference will free 1057 * the speculative reference will free
1058 * this page shortly, so we may 1058 * this page shortly, so we may
1059 * increment nr_reclaimed here (and 1059 * increment nr_reclaimed here (and
1060 * leave it off the LRU). 1060 * leave it off the LRU).
1061 */ 1061 */
1062 nr_reclaimed++; 1062 nr_reclaimed++;
1063 continue; 1063 continue;
1064 } 1064 }
1065 } 1065 }
1066 } 1066 }
1067 1067
1068 if (!mapping || !__remove_mapping(mapping, page)) 1068 if (!mapping || !__remove_mapping(mapping, page))
1069 goto keep_locked; 1069 goto keep_locked;
1070 1070
1071 /* 1071 /*
1072 * At this point, we have no other references and there is 1072 * At this point, we have no other references and there is
1073 * no way to pick any more up (removed from LRU, removed 1073 * no way to pick any more up (removed from LRU, removed
1074 * from pagecache). Can use non-atomic bitops now (and 1074 * from pagecache). Can use non-atomic bitops now (and
1075 * we obviously don't have to worry about waking up a process 1075 * we obviously don't have to worry about waking up a process
1076 * waiting on the page lock, because there are no references. 1076 * waiting on the page lock, because there are no references.
1077 */ 1077 */
1078 __clear_page_locked(page); 1078 __clear_page_locked(page);
1079 free_it: 1079 free_it:
1080 nr_reclaimed++; 1080 nr_reclaimed++;
1081 1081
1082 /* 1082 /*
1083 * Is there need to periodically free_page_list? It would 1083 * Is there need to periodically free_page_list? It would
1084 * appear not, as the counts should be low 1084 * appear not, as the counts should be low
1085 */ 1085 */
1086 list_add(&page->lru, &free_pages); 1086 list_add(&page->lru, &free_pages);
1087 continue; 1087 continue;
1088 1088
1089 cull_mlocked: 1089 cull_mlocked:
1090 if (PageSwapCache(page)) 1090 if (PageSwapCache(page))
1091 try_to_free_swap(page); 1091 try_to_free_swap(page);
1092 unlock_page(page); 1092 unlock_page(page);
1093 putback_lru_page(page); 1093 putback_lru_page(page);
1094 continue; 1094 continue;
1095 1095
1096 activate_locked: 1096 activate_locked:
1097 /* Not a candidate for swapping, so reclaim swap space. */ 1097 /* Not a candidate for swapping, so reclaim swap space. */
1098 if (PageSwapCache(page) && vm_swap_full()) 1098 if (PageSwapCache(page) && vm_swap_full())
1099 try_to_free_swap(page); 1099 try_to_free_swap(page);
1100 VM_BUG_ON(PageActive(page)); 1100 VM_BUG_ON(PageActive(page));
1101 SetPageActive(page); 1101 SetPageActive(page);
1102 pgactivate++; 1102 pgactivate++;
1103 keep_locked: 1103 keep_locked:
1104 unlock_page(page); 1104 unlock_page(page);
1105 keep: 1105 keep:
1106 list_add(&page->lru, &ret_pages); 1106 list_add(&page->lru, &ret_pages);
1107 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 1107 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
1108 } 1108 }
1109 1109
1110 free_hot_cold_page_list(&free_pages, true); 1110 free_hot_cold_page_list(&free_pages, true);
1111 1111
1112 list_splice(&ret_pages, page_list); 1112 list_splice(&ret_pages, page_list);
1113 count_vm_events(PGACTIVATE, pgactivate); 1113 count_vm_events(PGACTIVATE, pgactivate);
1114 mem_cgroup_uncharge_end(); 1114 mem_cgroup_uncharge_end();
1115 *ret_nr_dirty += nr_dirty; 1115 *ret_nr_dirty += nr_dirty;
1116 *ret_nr_congested += nr_congested; 1116 *ret_nr_congested += nr_congested;
1117 *ret_nr_unqueued_dirty += nr_unqueued_dirty; 1117 *ret_nr_unqueued_dirty += nr_unqueued_dirty;
1118 *ret_nr_writeback += nr_writeback; 1118 *ret_nr_writeback += nr_writeback;
1119 *ret_nr_immediate += nr_immediate; 1119 *ret_nr_immediate += nr_immediate;
1120 return nr_reclaimed; 1120 return nr_reclaimed;
1121 } 1121 }
1122 1122
1123 unsigned long reclaim_clean_pages_from_list(struct zone *zone, 1123 unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1124 struct list_head *page_list) 1124 struct list_head *page_list)
1125 { 1125 {
1126 struct scan_control sc = { 1126 struct scan_control sc = {
1127 .gfp_mask = GFP_KERNEL, 1127 .gfp_mask = GFP_KERNEL,
1128 .priority = DEF_PRIORITY, 1128 .priority = DEF_PRIORITY,
1129 .may_unmap = 1, 1129 .may_unmap = 1,
1130 }; 1130 };
1131 unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; 1131 unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
1132 struct page *page, *next; 1132 struct page *page, *next;
1133 LIST_HEAD(clean_pages); 1133 LIST_HEAD(clean_pages);
1134 1134
1135 list_for_each_entry_safe(page, next, page_list, lru) { 1135 list_for_each_entry_safe(page, next, page_list, lru) {
1136 if (page_is_file_cache(page) && !PageDirty(page) && 1136 if (page_is_file_cache(page) && !PageDirty(page) &&
1137 !isolated_balloon_page(page)) { 1137 !isolated_balloon_page(page)) {
1138 ClearPageActive(page); 1138 ClearPageActive(page);
1139 list_move(&page->lru, &clean_pages); 1139 list_move(&page->lru, &clean_pages);
1140 } 1140 }
1141 } 1141 }
1142 1142
1143 ret = shrink_page_list(&clean_pages, zone, &sc, 1143 ret = shrink_page_list(&clean_pages, zone, &sc,
1144 TTU_UNMAP|TTU_IGNORE_ACCESS, 1144 TTU_UNMAP|TTU_IGNORE_ACCESS,
1145 &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); 1145 &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
1146 list_splice(&clean_pages, page_list); 1146 list_splice(&clean_pages, page_list);
1147 mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); 1147 mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
1148 return ret; 1148 return ret;
1149 } 1149 }
1150 1150
1151 /* 1151 /*
1152 * Attempt to remove the specified page from its LRU. Only take this page 1152 * Attempt to remove the specified page from its LRU. Only take this page
1153 * if it is of the appropriate PageActive status. Pages which are being 1153 * if it is of the appropriate PageActive status. Pages which are being
1154 * freed elsewhere are also ignored. 1154 * freed elsewhere are also ignored.
1155 * 1155 *
1156 * page: page to consider 1156 * page: page to consider
1157 * mode: one of the LRU isolation modes defined above 1157 * mode: one of the LRU isolation modes defined above
1158 * 1158 *
1159 * returns 0 on success, -ve errno on failure. 1159 * returns 0 on success, -ve errno on failure.
1160 */ 1160 */
1161 int __isolate_lru_page(struct page *page, isolate_mode_t mode) 1161 int __isolate_lru_page(struct page *page, isolate_mode_t mode)
1162 { 1162 {
1163 int ret = -EINVAL; 1163 int ret = -EINVAL;
1164 1164
1165 /* Only take pages on the LRU. */ 1165 /* Only take pages on the LRU. */
1166 if (!PageLRU(page)) 1166 if (!PageLRU(page))
1167 return ret; 1167 return ret;
1168 1168
1169 /* Compaction should not handle unevictable pages but CMA can do so */ 1169 /* Compaction should not handle unevictable pages but CMA can do so */
1170 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) 1170 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
1171 return ret; 1171 return ret;
1172 1172
1173 ret = -EBUSY; 1173 ret = -EBUSY;
1174 1174
1175 /* 1175 /*
1176 * To minimise LRU disruption, the caller can indicate that it only 1176 * To minimise LRU disruption, the caller can indicate that it only
1177 * wants to isolate pages it will be able to operate on without 1177 * wants to isolate pages it will be able to operate on without
1178 * blocking - clean pages for the most part. 1178 * blocking - clean pages for the most part.
1179 * 1179 *
1180 * ISOLATE_CLEAN means that only clean pages should be isolated. This 1180 * ISOLATE_CLEAN means that only clean pages should be isolated. This
1181 * is used by reclaim when it cannot write to backing storage 1181 * is used by reclaim when it cannot write to backing storage
1182 * 1182 *
1183 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages 1183 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
1184 * that it is possible to migrate without blocking 1184 * that it is possible to migrate without blocking
1185 */ 1185 */
1186 if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { 1186 if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
1187 /* All the caller can do on PageWriteback is block */ 1187 /* All the caller can do on PageWriteback is block */
1188 if (PageWriteback(page)) 1188 if (PageWriteback(page))
1189 return ret; 1189 return ret;
1190 1190
1191 if (PageDirty(page)) { 1191 if (PageDirty(page)) {
1192 struct address_space *mapping; 1192 struct address_space *mapping;
1193 1193
1194 /* ISOLATE_CLEAN means only clean pages */ 1194 /* ISOLATE_CLEAN means only clean pages */
1195 if (mode & ISOLATE_CLEAN) 1195 if (mode & ISOLATE_CLEAN)
1196 return ret; 1196 return ret;
1197 1197
1198 /* 1198 /*
1199 * Only pages without mappings or that have a 1199 * Only pages without mappings or that have a
1200 * ->migratepage callback are possible to migrate 1200 * ->migratepage callback are possible to migrate
1201 * without blocking 1201 * without blocking
1202 */ 1202 */
1203 mapping = page_mapping(page); 1203 mapping = page_mapping(page);
1204 if (mapping && !mapping->a_ops->migratepage) 1204 if (mapping && !mapping->a_ops->migratepage)
1205 return ret; 1205 return ret;
1206 } 1206 }
1207 } 1207 }
1208 1208
1209 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) 1209 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1210 return ret; 1210 return ret;
1211 1211
1212 if (likely(get_page_unless_zero(page))) { 1212 if (likely(get_page_unless_zero(page))) {
1213 /* 1213 /*
1214 * Be careful not to clear PageLRU until after we're 1214 * Be careful not to clear PageLRU until after we're
1215 * sure the page is not being freed elsewhere -- the 1215 * sure the page is not being freed elsewhere -- the
1216 * page release code relies on it. 1216 * page release code relies on it.
1217 */ 1217 */
1218 ClearPageLRU(page); 1218 ClearPageLRU(page);
1219 ret = 0; 1219 ret = 0;
1220 } 1220 }
1221 1221
1222 return ret; 1222 return ret;
1223 } 1223 }
1224 1224
1225 /* 1225 /*
1226 * zone->lru_lock is heavily contended. Some of the functions that 1226 * zone->lru_lock is heavily contended. Some of the functions that
1227 * shrink the lists perform better by taking out a batch of pages 1227 * shrink the lists perform better by taking out a batch of pages
1228 * and working on them outside the LRU lock. 1228 * and working on them outside the LRU lock.
1229 * 1229 *
1230 * For pagecache intensive workloads, this function is the hottest 1230 * For pagecache intensive workloads, this function is the hottest
1231 * spot in the kernel (apart from copy_*_user functions). 1231 * spot in the kernel (apart from copy_*_user functions).
1232 * 1232 *
1233 * Appropriate locks must be held before calling this function. 1233 * Appropriate locks must be held before calling this function.
1234 * 1234 *
1235 * @nr_to_scan: The number of pages to look through on the list. 1235 * @nr_to_scan: The number of pages to look through on the list.
1236 * @lruvec: The LRU vector to pull pages from. 1236 * @lruvec: The LRU vector to pull pages from.
1237 * @dst: The temp list to put pages on to. 1237 * @dst: The temp list to put pages on to.
1238 * @nr_scanned: The number of pages that were scanned. 1238 * @nr_scanned: The number of pages that were scanned.
1239 * @sc: The scan_control struct for this reclaim session 1239 * @sc: The scan_control struct for this reclaim session
1240 * @mode: One of the LRU isolation modes 1240 * @mode: One of the LRU isolation modes
1241 * @lru: LRU list id for isolating 1241 * @lru: LRU list id for isolating
1242 * 1242 *
1243 * returns how many pages were moved onto *@dst. 1243 * returns how many pages were moved onto *@dst.
1244 */ 1244 */
1245 static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1245 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1246 struct lruvec *lruvec, struct list_head *dst, 1246 struct lruvec *lruvec, struct list_head *dst,
1247 unsigned long *nr_scanned, struct scan_control *sc, 1247 unsigned long *nr_scanned, struct scan_control *sc,
1248 isolate_mode_t mode, enum lru_list lru) 1248 isolate_mode_t mode, enum lru_list lru)
1249 { 1249 {
1250 struct list_head *src = &lruvec->lists[lru]; 1250 struct list_head *src = &lruvec->lists[lru];
1251 unsigned long nr_taken = 0; 1251 unsigned long nr_taken = 0;
1252 unsigned long scan; 1252 unsigned long scan;
1253 1253
1254 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 1254 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1255 struct page *page; 1255 struct page *page;
1256 int nr_pages; 1256 int nr_pages;
1257 1257
1258 page = lru_to_page(src); 1258 page = lru_to_page(src);
1259 prefetchw_prev_lru_page(page, src, flags); 1259 prefetchw_prev_lru_page(page, src, flags);
1260 1260
1261 VM_BUG_ON(!PageLRU(page)); 1261 VM_BUG_ON(!PageLRU(page));
1262 1262
1263 switch (__isolate_lru_page(page, mode)) { 1263 switch (__isolate_lru_page(page, mode)) {
1264 case 0: 1264 case 0:
1265 nr_pages = hpage_nr_pages(page); 1265 nr_pages = hpage_nr_pages(page);
1266 mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); 1266 mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
1267 list_move(&page->lru, dst); 1267 list_move(&page->lru, dst);
1268 nr_taken += nr_pages; 1268 nr_taken += nr_pages;
1269 break; 1269 break;
1270 1270
1271 case -EBUSY: 1271 case -EBUSY:
1272 /* else it is being freed elsewhere */ 1272 /* else it is being freed elsewhere */
1273 list_move(&page->lru, src); 1273 list_move(&page->lru, src);
1274 continue; 1274 continue;
1275 1275
1276 default: 1276 default:
1277 BUG(); 1277 BUG();
1278 } 1278 }
1279 } 1279 }
1280 1280
1281 *nr_scanned = scan; 1281 *nr_scanned = scan;
1282 trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, 1282 trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
1283 nr_taken, mode, is_file_lru(lru)); 1283 nr_taken, mode, is_file_lru(lru));
1284 return nr_taken; 1284 return nr_taken;
1285 } 1285 }
1286 1286
1287 /** 1287 /**
1288 * isolate_lru_page - tries to isolate a page from its LRU list 1288 * isolate_lru_page - tries to isolate a page from its LRU list
1289 * @page: page to isolate from its LRU list 1289 * @page: page to isolate from its LRU list
1290 * 1290 *
1291 * Isolates a @page from an LRU list, clears PageLRU and adjusts the 1291 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
1292 * vmstat statistic corresponding to whatever LRU list the page was on. 1292 * vmstat statistic corresponding to whatever LRU list the page was on.
1293 * 1293 *
1294 * Returns 0 if the page was removed from an LRU list. 1294 * Returns 0 if the page was removed from an LRU list.
1295 * Returns -EBUSY if the page was not on an LRU list. 1295 * Returns -EBUSY if the page was not on an LRU list.
1296 * 1296 *
1297 * The returned page will have PageLRU() cleared. If it was found on 1297 * The returned page will have PageLRU() cleared. If it was found on
1298 * the active list, it will have PageActive set. If it was found on 1298 * the active list, it will have PageActive set. If it was found on
1299 * the unevictable list, it will have the PageUnevictable bit set. That flag 1299 * the unevictable list, it will have the PageUnevictable bit set. That flag
1300 * may need to be cleared by the caller before letting the page go. 1300 * may need to be cleared by the caller before letting the page go.
1301 * 1301 *
1302 * The vmstat statistic corresponding to the list on which the page was 1302 * The vmstat statistic corresponding to the list on which the page was
1303 * found will be decremented. 1303 * found will be decremented.
1304 * 1304 *
1305 * Restrictions: 1305 * Restrictions:
1306 * (1) Must be called with an elevated refcount on the page. This is a 1306 * (1) Must be called with an elevated refcount on the page. This is a
1307 * fundamental difference from isolate_lru_pages (which is called 1307 * fundamental difference from isolate_lru_pages (which is called
1308 * without a stable reference). 1308 * without a stable reference).
1309 * (2) the lru_lock must not be held. 1309 * (2) the lru_lock must not be held.
1310 * (3) interrupts must be enabled. 1310 * (3) interrupts must be enabled.
1311 */ 1311 */
1312 int isolate_lru_page(struct page *page) 1312 int isolate_lru_page(struct page *page)
1313 { 1313 {
1314 int ret = -EBUSY; 1314 int ret = -EBUSY;
1315 1315
1316 VM_BUG_ON(!page_count(page)); 1316 VM_BUG_ON(!page_count(page));
1317 1317
1318 if (PageLRU(page)) { 1318 if (PageLRU(page)) {
1319 struct zone *zone = page_zone(page); 1319 struct zone *zone = page_zone(page);
1320 struct lruvec *lruvec; 1320 struct lruvec *lruvec;
1321 1321
1322 spin_lock_irq(&zone->lru_lock); 1322 spin_lock_irq(&zone->lru_lock);
1323 lruvec = mem_cgroup_page_lruvec(page, zone); 1323 lruvec = mem_cgroup_page_lruvec(page, zone);
1324 if (PageLRU(page)) { 1324 if (PageLRU(page)) {
1325 int lru = page_lru(page); 1325 int lru = page_lru(page);
1326 get_page(page); 1326 get_page(page);
1327 ClearPageLRU(page); 1327 ClearPageLRU(page);
1328 del_page_from_lru_list(page, lruvec, lru); 1328 del_page_from_lru_list(page, lruvec, lru);
1329 ret = 0; 1329 ret = 0;
1330 } 1330 }
1331 spin_unlock_irq(&zone->lru_lock); 1331 spin_unlock_irq(&zone->lru_lock);
1332 } 1332 }
1333 return ret; 1333 return ret;
1334 } 1334 }
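
As a rough illustration of the restrictions listed above (not part of this file or this patch), a typical caller pattern looks like the sketch below: the caller already holds its own reference, isolates the page onto a private list, and unconditionally drops that reference afterwards. The helper name example_isolate and the choice of statistics counter are illustrative assumptions, not taken from this commit.

	/*
	 * Illustrative sketch only. Assumes the caller obtained a reference
	 * (restriction 1), does not hold zone->lru_lock (restriction 2) and
	 * has interrupts enabled (restriction 3).
	 */
	static void example_isolate(struct page *page, struct list_head *pagelist)
	{
		if (isolate_lru_page(page) == 0) {
			/* page is off its LRU; isolation took its own reference */
			list_add_tail(&page->lru, pagelist);
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));
		}
		/* drop the reference the caller took before calling */
		put_page(page);
	}
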
1335 1335
1336 /* 1336 /*
1337 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and 1337 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
1338 * then get rescheduled. When there is a massive number of tasks doing page 1338 * then get rescheduled. When there is a massive number of tasks doing page
1339 * allocation, such sleeping direct reclaimers may keep piling up on each CPU, 1339 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
1340 * the LRU list will go small and be scanned faster than necessary, leading to 1340 * the LRU list will go small and be scanned faster than necessary, leading to
1341 * unnecessary swapping, thrashing and OOM. 1341 * unnecessary swapping, thrashing and OOM.
1342 */ 1342 */
1343 static int too_many_isolated(struct zone *zone, int file, 1343 static int too_many_isolated(struct zone *zone, int file,
1344 struct scan_control *sc) 1344 struct scan_control *sc)
1345 { 1345 {
1346 unsigned long inactive, isolated; 1346 unsigned long inactive, isolated;
1347 1347
1348 if (current_is_kswapd()) 1348 if (current_is_kswapd())
1349 return 0; 1349 return 0;
1350 1350
1351 if (!global_reclaim(sc)) 1351 if (!global_reclaim(sc))
1352 return 0; 1352 return 0;
1353 1353
1354 if (file) { 1354 if (file) {
1355 inactive = zone_page_state(zone, NR_INACTIVE_FILE); 1355 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1356 isolated = zone_page_state(zone, NR_ISOLATED_FILE); 1356 isolated = zone_page_state(zone, NR_ISOLATED_FILE);
1357 } else { 1357 } else {
1358 inactive = zone_page_state(zone, NR_INACTIVE_ANON); 1358 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1359 isolated = zone_page_state(zone, NR_ISOLATED_ANON); 1359 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1360 } 1360 }
1361 1361
1362 /* 1362 /*
1363 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they 1363 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
1364 * won't get blocked by normal direct-reclaimers, forming a circular 1364 * won't get blocked by normal direct-reclaimers, forming a circular
1365 * deadlock. 1365 * deadlock.
1366 */ 1366 */
1367 if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) 1367 if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
1368 inactive >>= 3; 1368 inactive >>= 3;
1369 1369
1370 return isolated > inactive; 1370 return isolated > inactive;
1371 } 1371 }
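
The heuristic above can be read on its own: direct reclaim backs off once more pages are isolated from an LRU than remain on its inactive list, and callers that can do both I/O and FS work are throttled against a threshold eight times smaller, so GFP_NOIO/GFP_NOFS reclaimers are never stuck behind them. A minimal standalone model of the check (plain C with made-up numbers, not kernel code):

	#include <stdbool.h>
	#include <stdio.h>

	/* Simplified model of the check in too_many_isolated(). */
	static bool too_many_isolated_model(unsigned long inactive,
					    unsigned long isolated, bool gfp_io_fs)
	{
		/*
		 * Normal direct reclaimers (full GFP_IOFS) are measured against
		 * 1/8 of the inactive list; NOIO/NOFS callers keep the full
		 * threshold so they cannot deadlock behind them.
		 */
		if (gfp_io_fs)
			inactive >>= 3;
		return isolated > inactive;
	}

	int main(void)
	{
		printf("%d\n", too_many_isolated_model(10000, 1500, true));  /* 1: throttle */
		printf("%d\n", too_many_isolated_model(10000, 1500, false)); /* 0: proceed */
		return 0;
	}
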
1372 1372
1373 static noinline_for_stack void 1373 static noinline_for_stack void
1374 putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) 1374 putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1375 { 1375 {
1376 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1376 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1377 struct zone *zone = lruvec_zone(lruvec); 1377 struct zone *zone = lruvec_zone(lruvec);
1378 LIST_HEAD(pages_to_free); 1378 LIST_HEAD(pages_to_free);
1379 1379
1380 /* 1380 /*
1381 * Put back any unfreeable pages. 1381 * Put back any unfreeable pages.
1382 */ 1382 */
1383 while (!list_empty(page_list)) { 1383 while (!list_empty(page_list)) {
1384 struct page *page = lru_to_page(page_list); 1384 struct page *page = lru_to_page(page_list);
1385 int lru; 1385 int lru;
1386 1386
1387 VM_BUG_ON(PageLRU(page)); 1387 VM_BUG_ON(PageLRU(page));
1388 list_del(&page->lru); 1388 list_del(&page->lru);
1389 if (unlikely(!page_evictable(page))) { 1389 if (unlikely(!page_evictable(page))) {
1390 spin_unlock_irq(&zone->lru_lock); 1390 spin_unlock_irq(&zone->lru_lock);
1391 putback_lru_page(page); 1391 putback_lru_page(page);
1392 spin_lock_irq(&zone->lru_lock); 1392 spin_lock_irq(&zone->lru_lock);
1393 continue; 1393 continue;
1394 } 1394 }
1395 1395
1396 lruvec = mem_cgroup_page_lruvec(page, zone); 1396 lruvec = mem_cgroup_page_lruvec(page, zone);
1397 1397
1398 SetPageLRU(page); 1398 SetPageLRU(page);
1399 lru = page_lru(page); 1399 lru = page_lru(page);
1400 add_page_to_lru_list(page, lruvec, lru); 1400 add_page_to_lru_list(page, lruvec, lru);
1401 1401
1402 if (is_active_lru(lru)) { 1402 if (is_active_lru(lru)) {
1403 int file = is_file_lru(lru); 1403 int file = is_file_lru(lru);
1404 int numpages = hpage_nr_pages(page); 1404 int numpages = hpage_nr_pages(page);
1405 reclaim_stat->recent_rotated[file] += numpages; 1405 reclaim_stat->recent_rotated[file] += numpages;
1406 } 1406 }
1407 if (put_page_testzero(page)) { 1407 if (put_page_testzero(page)) {
1408 __ClearPageLRU(page); 1408 __ClearPageLRU(page);
1409 __ClearPageActive(page); 1409 __ClearPageActive(page);
1410 del_page_from_lru_list(page, lruvec, lru); 1410 del_page_from_lru_list(page, lruvec, lru);
1411 1411
1412 if (unlikely(PageCompound(page))) { 1412 if (unlikely(PageCompound(page))) {
1413 spin_unlock_irq(&zone->lru_lock); 1413 spin_unlock_irq(&zone->lru_lock);
1414 (*get_compound_page_dtor(page))(page); 1414 (*get_compound_page_dtor(page))(page);
1415 spin_lock_irq(&zone->lru_lock); 1415 spin_lock_irq(&zone->lru_lock);
1416 } else 1416 } else
1417 list_add(&page->lru, &pages_to_free); 1417 list_add(&page->lru, &pages_to_free);
1418 } 1418 }
1419 } 1419 }
1420 1420
1421 /* 1421 /*
1422 * To save our caller's stack, now use input list for pages to free. 1422 * To save our caller's stack, now use input list for pages to free.
1423 */ 1423 */
1424 list_splice(&pages_to_free, page_list); 1424 list_splice(&pages_to_free, page_list);
1425 } 1425 }
1426 1426
1427 /* 1427 /*
1428 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1428 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1429 * of reclaimed pages 1429 * of reclaimed pages
1430 */ 1430 */
1431 static noinline_for_stack unsigned long 1431 static noinline_for_stack unsigned long
1432 shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, 1432 shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1433 struct scan_control *sc, enum lru_list lru) 1433 struct scan_control *sc, enum lru_list lru)
1434 { 1434 {
1435 LIST_HEAD(page_list); 1435 LIST_HEAD(page_list);
1436 unsigned long nr_scanned; 1436 unsigned long nr_scanned;
1437 unsigned long nr_reclaimed = 0; 1437 unsigned long nr_reclaimed = 0;
1438 unsigned long nr_taken; 1438 unsigned long nr_taken;
1439 unsigned long nr_dirty = 0; 1439 unsigned long nr_dirty = 0;
1440 unsigned long nr_congested = 0; 1440 unsigned long nr_congested = 0;
1441 unsigned long nr_unqueued_dirty = 0; 1441 unsigned long nr_unqueued_dirty = 0;
1442 unsigned long nr_writeback = 0; 1442 unsigned long nr_writeback = 0;
1443 unsigned long nr_immediate = 0; 1443 unsigned long nr_immediate = 0;
1444 isolate_mode_t isolate_mode = 0; 1444 isolate_mode_t isolate_mode = 0;
1445 int file = is_file_lru(lru); 1445 int file = is_file_lru(lru);
1446 struct zone *zone = lruvec_zone(lruvec); 1446 struct zone *zone = lruvec_zone(lruvec);
1447 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1447 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1448 1448
1449 while (unlikely(too_many_isolated(zone, file, sc))) { 1449 while (unlikely(too_many_isolated(zone, file, sc))) {
1450 congestion_wait(BLK_RW_ASYNC, HZ/10); 1450 congestion_wait(BLK_RW_ASYNC, HZ/10);
1451 1451
1452 /* We are about to die and free our memory. Return now. */ 1452 /* We are about to die and free our memory. Return now. */
1453 if (fatal_signal_pending(current)) 1453 if (fatal_signal_pending(current))
1454 return SWAP_CLUSTER_MAX; 1454 return SWAP_CLUSTER_MAX;
1455 } 1455 }
1456 1456
1457 lru_add_drain(); 1457 lru_add_drain();
1458 1458
1459 if (!sc->may_unmap) 1459 if (!sc->may_unmap)
1460 isolate_mode |= ISOLATE_UNMAPPED; 1460 isolate_mode |= ISOLATE_UNMAPPED;
1461 if (!sc->may_writepage) 1461 if (!sc->may_writepage)
1462 isolate_mode |= ISOLATE_CLEAN; 1462 isolate_mode |= ISOLATE_CLEAN;
1463 1463
1464 spin_lock_irq(&zone->lru_lock); 1464 spin_lock_irq(&zone->lru_lock);
1465 1465
1466 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, 1466 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1467 &nr_scanned, sc, isolate_mode, lru); 1467 &nr_scanned, sc, isolate_mode, lru);
1468 1468
1469 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); 1469 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1470 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); 1470 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1471 1471
1472 if (global_reclaim(sc)) { 1472 if (global_reclaim(sc)) {
1473 zone->pages_scanned += nr_scanned; 1473 zone->pages_scanned += nr_scanned;
1474 if (current_is_kswapd()) 1474 if (current_is_kswapd())
1475 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); 1475 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
1476 else 1476 else
1477 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); 1477 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
1478 } 1478 }
1479 spin_unlock_irq(&zone->lru_lock); 1479 spin_unlock_irq(&zone->lru_lock);
1480 1480
1481 if (nr_taken == 0) 1481 if (nr_taken == 0)
1482 return 0; 1482 return 0;
1483 1483
1484 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, 1484 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
1485 &nr_dirty, &nr_unqueued_dirty, &nr_congested, 1485 &nr_dirty, &nr_unqueued_dirty, &nr_congested,
1486 &nr_writeback, &nr_immediate, 1486 &nr_writeback, &nr_immediate,
1487 false); 1487 false);
1488 1488
1489 spin_lock_irq(&zone->lru_lock); 1489 spin_lock_irq(&zone->lru_lock);
1490 1490
1491 reclaim_stat->recent_scanned[file] += nr_taken; 1491 reclaim_stat->recent_scanned[file] += nr_taken;
1492 1492
1493 if (global_reclaim(sc)) { 1493 if (global_reclaim(sc)) {
1494 if (current_is_kswapd()) 1494 if (current_is_kswapd())
1495 __count_zone_vm_events(PGSTEAL_KSWAPD, zone, 1495 __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
1496 nr_reclaimed); 1496 nr_reclaimed);
1497 else 1497 else
1498 __count_zone_vm_events(PGSTEAL_DIRECT, zone, 1498 __count_zone_vm_events(PGSTEAL_DIRECT, zone,
1499 nr_reclaimed); 1499 nr_reclaimed);
1500 } 1500 }
1501 1501
1502 putback_inactive_pages(lruvec, &page_list); 1502 putback_inactive_pages(lruvec, &page_list);
1503 1503
1504 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); 1504 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1505 1505
1506 spin_unlock_irq(&zone->lru_lock); 1506 spin_unlock_irq(&zone->lru_lock);
1507 1507
1508 free_hot_cold_page_list(&page_list, true); 1508 free_hot_cold_page_list(&page_list, true);
1509 1509
1510 /* 1510 /*
1511 * If reclaim is isolating dirty pages under writeback, it implies 1511 * If reclaim is isolating dirty pages under writeback, it implies
1512 * that the long-lived page allocation rate is exceeding the page 1512 * that the long-lived page allocation rate is exceeding the page
1513 * laundering rate. Either the global limits are not being effective 1513 * laundering rate. Either the global limits are not being effective
1514 * at throttling processes due to the page distribution throughout 1514 * at throttling processes due to the page distribution throughout
1515 * zones or there is heavy usage of a slow backing device. The 1515 * zones or there is heavy usage of a slow backing device. The
1516 * only option is to throttle from reclaim context which is not ideal 1516 * only option is to throttle from reclaim context which is not ideal
1517 * as there is no guarantee the dirtying process is throttled in the 1517 * as there is no guarantee the dirtying process is throttled in the
1518 * same way balance_dirty_pages() manages. 1518 * same way balance_dirty_pages() manages.
1519 * 1519 *
1520 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number 1520 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
1521 * of pages under writeback flagged for immediate reclaim and stall if any 1521 * of pages under writeback flagged for immediate reclaim and stall if any
1522 * are encountered in the nr_immediate check below. 1522 * are encountered in the nr_immediate check below.
1523 */ 1523 */
1524 if (nr_writeback && nr_writeback == nr_taken) 1524 if (nr_writeback && nr_writeback == nr_taken)
1525 zone_set_flag(zone, ZONE_WRITEBACK); 1525 zone_set_flag(zone, ZONE_WRITEBACK);
1526 1526
1527 /* 1527 /*
1528 * memcg will stall in page writeback so only consider forcibly 1528 * memcg will stall in page writeback so only consider forcibly
1529 * stalling for global reclaim 1529 * stalling for global reclaim
1530 */ 1530 */
1531 if (global_reclaim(sc)) { 1531 if (global_reclaim(sc)) {
1532 /* 1532 /*
1533 * Tag a zone as congested if all the dirty pages scanned were 1533 * Tag a zone as congested if all the dirty pages scanned were
1534 * backed by a congested BDI and wait_iff_congested will stall. 1534 * backed by a congested BDI and wait_iff_congested will stall.
1535 */ 1535 */
1536 if (nr_dirty && nr_dirty == nr_congested) 1536 if (nr_dirty && nr_dirty == nr_congested)
1537 zone_set_flag(zone, ZONE_CONGESTED); 1537 zone_set_flag(zone, ZONE_CONGESTED);
1538 1538
1539 /* 1539 /*
1540 * If dirty pages are scanned that are not queued for IO, it 1540 * If dirty pages are scanned that are not queued for IO, it
1541 * implies that flushers are not keeping up. In this case, flag 1541 * implies that flushers are not keeping up. In this case, flag
1542 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing 1542 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
1543 * pages from reclaim context. 1543 * pages from reclaim context.
1544 */ 1544 */
1545 if (nr_unqueued_dirty == nr_taken) 1545 if (nr_unqueued_dirty == nr_taken)
1546 zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); 1546 zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
1547 1547
1548 /* 1548 /*
1549 * If kswapd scans pages marked for immediate 1549 * If kswapd scans pages marked for immediate
1550 * reclaim and under writeback (nr_immediate), it implies 1550 * reclaim and under writeback (nr_immediate), it implies
1551 * that pages are cycling through the LRU faster than 1551 * that pages are cycling through the LRU faster than
1552 * they are written so also forcibly stall. 1552 * they are written so also forcibly stall.
1553 */ 1553 */
1554 if (nr_immediate) 1554 if (nr_immediate)
1555 congestion_wait(BLK_RW_ASYNC, HZ/10); 1555 congestion_wait(BLK_RW_ASYNC, HZ/10);
1556 } 1556 }
1557 1557
1558 /* 1558 /*
1559 * Stall direct reclaim for IO completions if underlying BDIs or zone 1559 * Stall direct reclaim for IO completions if underlying BDIs or zone
1560 * is congested. Allow kswapd to continue until it starts encountering 1560 * is congested. Allow kswapd to continue until it starts encountering
1561 * unqueued dirty pages or cycling through the LRU too quickly. 1561 * unqueued dirty pages or cycling through the LRU too quickly.
1562 */ 1562 */
1563 if (!sc->hibernation_mode && !current_is_kswapd()) 1563 if (!sc->hibernation_mode && !current_is_kswapd())
1564 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); 1564 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1565 1565
1566 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1566 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1567 zone_idx(zone), 1567 zone_idx(zone),
1568 nr_scanned, nr_reclaimed, 1568 nr_scanned, nr_reclaimed,
1569 sc->priority, 1569 sc->priority,
1570 trace_shrink_flags(file)); 1570 trace_shrink_flags(file));
1571 return nr_reclaimed; 1571 return nr_reclaimed;
1572 } 1572 }
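
The flag and stall decisions at the end of shrink_inactive_list() reduce to a small decision table over the counters filled in by shrink_page_list(). A standalone sketch of that mapping (plain C, not kernel code; hibernation_mode is ignored for brevity and the report() helper is an illustrative assumption):

	#include <stdbool.h>
	#include <stdio.h>

	struct shrink_stats {
		unsigned long nr_taken, nr_dirty, nr_congested;
		unsigned long nr_unqueued_dirty, nr_writeback, nr_immediate;
	};

	/* Mirrors the zone-flag/stall logic that follows shrink_page_list(). */
	static void report(const struct shrink_stats *s, bool global_reclaim,
			   bool is_kswapd)
	{
		if (s->nr_writeback && s->nr_writeback == s->nr_taken)
			puts("flag ZONE_WRITEBACK");
		if (global_reclaim) {
			if (s->nr_dirty && s->nr_dirty == s->nr_congested)
				puts("flag ZONE_CONGESTED");
			if (s->nr_unqueued_dirty == s->nr_taken)
				puts("flag ZONE_TAIL_LRU_DIRTY");
			if (s->nr_immediate)
				puts("stall in congestion_wait()");
		}
		if (!is_kswapd)
			puts("direct reclaim: wait_iff_congested() may stall");
	}

	int main(void)
	{
		struct shrink_stats s = { .nr_taken = 32, .nr_writeback = 32 };
		report(&s, true, false);
		return 0;
	}
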
1573 1573
1574 /* 1574 /*
1575 * This moves pages from the active list to the inactive list. 1575 * This moves pages from the active list to the inactive list.
1576 * 1576 *
1577 * We move them the other way if the page is referenced by one or more 1577 * We move them the other way if the page is referenced by one or more
1578 * processes, from rmap. 1578 * processes, from rmap.
1579 * 1579 *
1580 * If the pages are mostly unmapped, the processing is fast and it is 1580 * If the pages are mostly unmapped, the processing is fast and it is
1581 * appropriate to hold zone->lru_lock across the whole operation. But if 1581 * appropriate to hold zone->lru_lock across the whole operation. But if
1582 * the pages are mapped, the processing is slow (page_referenced()) so we 1582 * the pages are mapped, the processing is slow (page_referenced()) so we
1583 * should drop zone->lru_lock around each page. It's impossible to balance 1583 * should drop zone->lru_lock around each page. It's impossible to balance
1584 * this, so instead we remove the pages from the LRU while processing them. 1584 * this, so instead we remove the pages from the LRU while processing them.
1585 * It is safe to rely on PG_active against the non-LRU pages in here because 1585 * It is safe to rely on PG_active against the non-LRU pages in here because
1586 * nobody will play with that bit on a non-LRU page. 1586 * nobody will play with that bit on a non-LRU page.
1587 * 1587 *
1588 * The downside is that we have to touch page->_count against each page. 1588 * The downside is that we have to touch page->_count against each page.
1589 * But we had to alter page->flags anyway. 1589 * But we had to alter page->flags anyway.
1590 */ 1590 */
1591 1591
1592 static void move_active_pages_to_lru(struct lruvec *lruvec, 1592 static void move_active_pages_to_lru(struct lruvec *lruvec,
1593 struct list_head *list, 1593 struct list_head *list,
1594 struct list_head *pages_to_free, 1594 struct list_head *pages_to_free,
1595 enum lru_list lru) 1595 enum lru_list lru)
1596 { 1596 {
1597 struct zone *zone = lruvec_zone(lruvec); 1597 struct zone *zone = lruvec_zone(lruvec);
1598 unsigned long pgmoved = 0; 1598 unsigned long pgmoved = 0;
1599 struct page *page; 1599 struct page *page;
1600 int nr_pages; 1600 int nr_pages;
1601 1601
1602 while (!list_empty(list)) { 1602 while (!list_empty(list)) {
1603 page = lru_to_page(list); 1603 page = lru_to_page(list);
1604 lruvec = mem_cgroup_page_lruvec(page, zone); 1604 lruvec = mem_cgroup_page_lruvec(page, zone);
1605 1605
1606 VM_BUG_ON(PageLRU(page)); 1606 VM_BUG_ON(PageLRU(page));
1607 SetPageLRU(page); 1607 SetPageLRU(page);
1608 1608
1609 nr_pages = hpage_nr_pages(page); 1609 nr_pages = hpage_nr_pages(page);
1610 mem_cgroup_update_lru_size(lruvec, lru, nr_pages); 1610 mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
1611 list_move(&page->lru, &lruvec->lists[lru]); 1611 list_move(&page->lru, &lruvec->lists[lru]);
1612 pgmoved += nr_pages; 1612 pgmoved += nr_pages;
1613 1613
1614 if (put_page_testzero(page)) { 1614 if (put_page_testzero(page)) {
1615 __ClearPageLRU(page); 1615 __ClearPageLRU(page);
1616 __ClearPageActive(page); 1616 __ClearPageActive(page);
1617 del_page_from_lru_list(page, lruvec, lru); 1617 del_page_from_lru_list(page, lruvec, lru);
1618 1618
1619 if (unlikely(PageCompound(page))) { 1619 if (unlikely(PageCompound(page))) {
1620 spin_unlock_irq(&zone->lru_lock); 1620 spin_unlock_irq(&zone->lru_lock);
1621 (*get_compound_page_dtor(page))(page); 1621 (*get_compound_page_dtor(page))(page);
1622 spin_lock_irq(&zone->lru_lock); 1622 spin_lock_irq(&zone->lru_lock);
1623 } else 1623 } else
1624 list_add(&page->lru, pages_to_free); 1624 list_add(&page->lru, pages_to_free);
1625 } 1625 }
1626 } 1626 }
1627 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); 1627 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1628 if (!is_active_lru(lru)) 1628 if (!is_active_lru(lru))
1629 __count_vm_events(PGDEACTIVATE, pgmoved); 1629 __count_vm_events(PGDEACTIVATE, pgmoved);
1630 } 1630 }
1631 1631
1632 static void shrink_active_list(unsigned long nr_to_scan, 1632 static void shrink_active_list(unsigned long nr_to_scan,
1633 struct lruvec *lruvec, 1633 struct lruvec *lruvec,
1634 struct scan_control *sc, 1634 struct scan_control *sc,
1635 enum lru_list lru) 1635 enum lru_list lru)
1636 { 1636 {
1637 unsigned long nr_taken; 1637 unsigned long nr_taken;
1638 unsigned long nr_scanned; 1638 unsigned long nr_scanned;
1639 unsigned long vm_flags; 1639 unsigned long vm_flags;
1640 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1640 LIST_HEAD(l_hold); /* The pages which were snipped off */
1641 LIST_HEAD(l_active); 1641 LIST_HEAD(l_active);
1642 LIST_HEAD(l_inactive); 1642 LIST_HEAD(l_inactive);
1643 struct page *page; 1643 struct page *page;
1644 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1644 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1645 unsigned long nr_rotated = 0; 1645 unsigned long nr_rotated = 0;
1646 isolate_mode_t isolate_mode = 0; 1646 isolate_mode_t isolate_mode = 0;
1647 int file = is_file_lru(lru); 1647 int file = is_file_lru(lru);
1648 struct zone *zone = lruvec_zone(lruvec); 1648 struct zone *zone = lruvec_zone(lruvec);
1649 1649
1650 lru_add_drain(); 1650 lru_add_drain();
1651 1651
1652 if (!sc->may_unmap) 1652 if (!sc->may_unmap)
1653 isolate_mode |= ISOLATE_UNMAPPED; 1653 isolate_mode |= ISOLATE_UNMAPPED;
1654 if (!sc->may_writepage) 1654 if (!sc->may_writepage)
1655 isolate_mode |= ISOLATE_CLEAN; 1655 isolate_mode |= ISOLATE_CLEAN;
1656 1656
1657 spin_lock_irq(&zone->lru_lock); 1657 spin_lock_irq(&zone->lru_lock);
1658 1658
1659 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, 1659 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
1660 &nr_scanned, sc, isolate_mode, lru); 1660 &nr_scanned, sc, isolate_mode, lru);
1661 if (global_reclaim(sc)) 1661 if (global_reclaim(sc))
1662 zone->pages_scanned += nr_scanned; 1662 zone->pages_scanned += nr_scanned;
1663 1663
1664 reclaim_stat->recent_scanned[file] += nr_taken; 1664 reclaim_stat->recent_scanned[file] += nr_taken;
1665 1665
1666 __count_zone_vm_events(PGREFILL, zone, nr_scanned); 1666 __count_zone_vm_events(PGREFILL, zone, nr_scanned);
1667 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); 1667 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1668 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); 1668 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1669 spin_unlock_irq(&zone->lru_lock); 1669 spin_unlock_irq(&zone->lru_lock);
1670 1670
1671 while (!list_empty(&l_hold)) { 1671 while (!list_empty(&l_hold)) {
1672 cond_resched(); 1672 cond_resched();
1673 page = lru_to_page(&l_hold); 1673 page = lru_to_page(&l_hold);
1674 list_del(&page->lru); 1674 list_del(&page->lru);
1675 1675
1676 if (unlikely(!page_evictable(page))) { 1676 if (unlikely(!page_evictable(page))) {
1677 putback_lru_page(page); 1677 putback_lru_page(page);
1678 continue; 1678 continue;
1679 } 1679 }
1680 1680
1681 if (unlikely(buffer_heads_over_limit)) { 1681 if (unlikely(buffer_heads_over_limit)) {
1682 if (page_has_private(page) && trylock_page(page)) { 1682 if (page_has_private(page) && trylock_page(page)) {
1683 if (page_has_private(page)) 1683 if (page_has_private(page))
1684 try_to_release_page(page, 0); 1684 try_to_release_page(page, 0);
1685 unlock_page(page); 1685 unlock_page(page);
1686 } 1686 }
1687 } 1687 }
1688 1688
1689 if (page_referenced(page, 0, sc->target_mem_cgroup, 1689 if (page_referenced(page, 0, sc->target_mem_cgroup,
1690 &vm_flags)) { 1690 &vm_flags)) {
1691 nr_rotated += hpage_nr_pages(page); 1691 nr_rotated += hpage_nr_pages(page);
1692 /* 1692 /*
1693 * Identify referenced, file-backed active pages and 1693 * Identify referenced, file-backed active pages and
1694 * give them one more trip around the active list. So 1694 * give them one more trip around the active list. So
1695 * that executable code gets a better chance to stay in 1695 * that executable code gets a better chance to stay in
1696 * memory under moderate memory pressure. Anon pages 1696 * memory under moderate memory pressure. Anon pages
1697 * are not likely to be evicted by use-once streaming 1697 * are not likely to be evicted by use-once streaming
1698 * IO, plus JVM can create lots of anon VM_EXEC pages, 1698 * IO, plus JVM can create lots of anon VM_EXEC pages,
1699 * so we ignore them here. 1699 * so we ignore them here.
1700 */ 1700 */
1701 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { 1701 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
1702 list_add(&page->lru, &l_active); 1702 list_add(&page->lru, &l_active);
1703 continue; 1703 continue;
1704 } 1704 }
1705 } 1705 }
1706 1706
1707 ClearPageActive(page); /* we are de-activating */ 1707 ClearPageActive(page); /* we are de-activating */
1708 list_add(&page->lru, &l_inactive); 1708 list_add(&page->lru, &l_inactive);
1709 } 1709 }
1710 1710
1711 /* 1711 /*
1712 * Move pages back to the lru list. 1712 * Move pages back to the lru list.
1713 */ 1713 */
1714 spin_lock_irq(&zone->lru_lock); 1714 spin_lock_irq(&zone->lru_lock);
1715 /* 1715 /*
1716 * Count referenced pages from currently used mappings as rotated, 1716 * Count referenced pages from currently used mappings as rotated,
1717 * even though only some of them are actually re-activated. This 1717 * even though only some of them are actually re-activated. This
1718 * helps balance scan pressure between file and anonymous pages in 1718 * helps balance scan pressure between file and anonymous pages in
1719 * get_scan_ratio. 1719 * get_scan_ratio.
1720 */ 1720 */
1721 reclaim_stat->recent_rotated[file] += nr_rotated; 1721 reclaim_stat->recent_rotated[file] += nr_rotated;
1722 1722
1723 move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); 1723 move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
1724 move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); 1724 move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
1725 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); 1725 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1726 spin_unlock_irq(&zone->lru_lock); 1726 spin_unlock_irq(&zone->lru_lock);
1727 1727
1728 free_hot_cold_page_list(&l_hold, true); 1728 free_hot_cold_page_list(&l_hold, true);
1729 } 1729 }
1730 1730
1731 #ifdef CONFIG_SWAP 1731 #ifdef CONFIG_SWAP
1732 static int inactive_anon_is_low_global(struct zone *zone) 1732 static int inactive_anon_is_low_global(struct zone *zone)
1733 { 1733 {
1734 unsigned long active, inactive; 1734 unsigned long active, inactive;
1735 1735
1736 active = zone_page_state(zone, NR_ACTIVE_ANON); 1736 active = zone_page_state(zone, NR_ACTIVE_ANON);
1737 inactive = zone_page_state(zone, NR_INACTIVE_ANON); 1737 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1738 1738
1739 if (inactive * zone->inactive_ratio < active) 1739 if (inactive * zone->inactive_ratio < active)
1740 return 1; 1740 return 1;
1741 1741
1742 return 0; 1742 return 0;
1743 } 1743 }
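
The check above fires when inactive * inactive_ratio < active, i.e. when less than roughly 1/(inactive_ratio + 1) of the anon pages sit on the inactive list. A tiny standalone illustration (the inactive_ratio value of 3 is a made-up example; in the kernel it is derived from zone size):

	#include <stdbool.h>
	#include <stdio.h>

	/* Simplified model of inactive_anon_is_low_global(). */
	static bool inactive_anon_is_low_model(unsigned long active,
					       unsigned long inactive,
					       unsigned int inactive_ratio)
	{
		return inactive * inactive_ratio < active;
	}

	int main(void)
	{
		printf("%d\n", inactive_anon_is_low_model(300, 90, 3));  /* 1: 270 < 300 */
		printf("%d\n", inactive_anon_is_low_model(300, 120, 3)); /* 0: 360 >= 300 */
		return 0;
	}
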
1744 1744
1745 /** 1745 /**
1746 * inactive_anon_is_low - check if anonymous pages need to be deactivated 1746 * inactive_anon_is_low - check if anonymous pages need to be deactivated
1747 * @lruvec: LRU vector to check 1747 * @lruvec: LRU vector to check
1748 * 1748 *
1749 * Returns true if the zone does not have enough inactive anon pages, 1749 * Returns true if the zone does not have enough inactive anon pages,
1750 * meaning some active anon pages need to be deactivated. 1750 * meaning some active anon pages need to be deactivated.
1751 */ 1751 */
1752 static int inactive_anon_is_low(struct lruvec *lruvec) 1752 static int inactive_anon_is_low(struct lruvec *lruvec)
1753 { 1753 {
1754 /* 1754 /*
1755 * If we don't have swap space, anonymous page deactivation 1755 * If we don't have swap space, anonymous page deactivation
1756 * is pointless. 1756 * is pointless.
1757 */ 1757 */
1758 if (!total_swap_pages) 1758 if (!total_swap_pages)
1759 return 0; 1759 return 0;
1760 1760
1761 if (!mem_cgroup_disabled()) 1761 if (!mem_cgroup_disabled())
1762 return mem_cgroup_inactive_anon_is_low(lruvec); 1762 return mem_cgroup_inactive_anon_is_low(lruvec);
1763 1763
1764 return inactive_anon_is_low_global(lruvec_zone(lruvec)); 1764 return inactive_anon_is_low_global(lruvec_zone(lruvec));
1765 } 1765 }
1766 #else 1766 #else
1767 static inline int inactive_anon_is_low(struct lruvec *lruvec) 1767 static inline int inactive_anon_is_low(struct lruvec *lruvec)
1768 { 1768 {
1769 return 0; 1769 return 0;
1770 } 1770 }
1771 #endif 1771 #endif
1772 1772
1773 /** 1773 /**
1774 * inactive_file_is_low - check if file pages need to be deactivated 1774 * inactive_file_is_low - check if file pages need to be deactivated
1775 * @lruvec: LRU vector to check 1775 * @lruvec: LRU vector to check
1776 * 1776 *
1777 * When the system is doing streaming IO, memory pressure here 1777 * When the system is doing streaming IO, memory pressure here
1778 * ensures that active file pages get deactivated, until more 1778 * ensures that active file pages get deactivated, until more
1779 * than half of the file pages are on the inactive list. 1779 * than half of the file pages are on the inactive list.
1780 * 1780 *
1781 * Once we get to that situation, protect the system's working 1781 * Once we get to that situation, protect the system's working
1782 * set from being evicted by disabling active file page aging. 1782 * set from being evicted by disabling active file page aging.
1783 * 1783 *
1784 * This uses a different ratio than the anonymous pages, because 1784 * This uses a different ratio than the anonymous pages, because
1785 * the page cache uses a use-once replacement algorithm. 1785 * the page cache uses a use-once replacement algorithm.
1786 */ 1786 */
1787 static int inactive_file_is_low(struct lruvec *lruvec) 1787 static int inactive_file_is_low(struct lruvec *lruvec)
1788 { 1788 {
1789 unsigned long inactive; 1789 unsigned long inactive;
1790 unsigned long active; 1790 unsigned long active;
1791 1791
1792 inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); 1792 inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
1793 active = get_lru_size(lruvec, LRU_ACTIVE_FILE); 1793 active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
1794 1794
1795 return active > inactive; 1795 return active > inactive;
1796 } 1796 }
1797 1797
1798 static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) 1798 static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
1799 { 1799 {
1800 if (is_file_lru(lru)) 1800 if (is_file_lru(lru))
1801 return inactive_file_is_low(lruvec); 1801 return inactive_file_is_low(lruvec);
1802 else 1802 else
1803 return inactive_anon_is_low(lruvec); 1803 return inactive_anon_is_low(lruvec);
1804 } 1804 }
1805 1805
1806 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1806 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1807 struct lruvec *lruvec, struct scan_control *sc) 1807 struct lruvec *lruvec, struct scan_control *sc)
1808 { 1808 {
1809 if (is_active_lru(lru)) { 1809 if (is_active_lru(lru)) {
1810 if (inactive_list_is_low(lruvec, lru)) 1810 if (inactive_list_is_low(lruvec, lru))
1811 shrink_active_list(nr_to_scan, lruvec, sc, lru); 1811 shrink_active_list(nr_to_scan, lruvec, sc, lru);
1812 return 0; 1812 return 0;
1813 } 1813 }
1814 1814
1815 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); 1815 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
1816 } 1816 }
1817 1817
1818 static int vmscan_swappiness(struct scan_control *sc) 1818 static int vmscan_swappiness(struct scan_control *sc)
1819 { 1819 {
1820 if (global_reclaim(sc)) 1820 if (global_reclaim(sc))
1821 return vm_swappiness; 1821 return vm_swappiness;
1822 return mem_cgroup_swappiness(sc->target_mem_cgroup); 1822 return mem_cgroup_swappiness(sc->target_mem_cgroup);
1823 } 1823 }
1824 1824
1825 enum scan_balance { 1825 enum scan_balance {
1826 SCAN_EQUAL, 1826 SCAN_EQUAL,
1827 SCAN_FRACT, 1827 SCAN_FRACT,
1828 SCAN_ANON, 1828 SCAN_ANON,
1829 SCAN_FILE, 1829 SCAN_FILE,
1830 }; 1830 };
1831 1831
1832 /* 1832 /*
1833 * Determine how aggressively the anon and file LRU lists should be 1833 * Determine how aggressively the anon and file LRU lists should be
1834 * scanned. The relative value of each set of LRU lists is determined 1834 * scanned. The relative value of each set of LRU lists is determined
1835 * by looking at the fraction of the scanned pages that we rotated back 1835 * by looking at the fraction of the scanned pages that we rotated back
1836 * onto the active list instead of evicting. 1836 * onto the active list instead of evicting.
1837 * 1837 *
1838 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan 1838 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
1839 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan 1839 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
1840 */ 1840 */
1841 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, 1841 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1842 unsigned long *nr) 1842 unsigned long *nr)
1843 { 1843 {
1844 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1844 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1845 u64 fraction[2]; 1845 u64 fraction[2];
1846 u64 denominator = 0; /* gcc */ 1846 u64 denominator = 0; /* gcc */
1847 struct zone *zone = lruvec_zone(lruvec); 1847 struct zone *zone = lruvec_zone(lruvec);
1848 unsigned long anon_prio, file_prio; 1848 unsigned long anon_prio, file_prio;
1849 enum scan_balance scan_balance; 1849 enum scan_balance scan_balance;
1850 unsigned long anon, file, free; 1850 unsigned long anon, file;
1851 bool force_scan = false; 1851 bool force_scan = false;
1852 unsigned long ap, fp; 1852 unsigned long ap, fp;
1853 enum lru_list lru; 1853 enum lru_list lru;
1854 1854
1855 /* 1855 /*
1856 * If the zone or memcg is small, nr[l] can be 0. This 1856 * If the zone or memcg is small, nr[l] can be 0. This
1857 * results in no scanning on this priority and a potential 1857 * results in no scanning on this priority and a potential
1858 * priority drop. Global direct reclaim can go to the next 1858 * priority drop. Global direct reclaim can go to the next
1859 * zone and tends to have no problems. Global kswapd is for 1859 * zone and tends to have no problems. Global kswapd is for
1860 * zone balancing and it needs to scan a minimum amount. When 1860 * zone balancing and it needs to scan a minimum amount. When
1861 * reclaiming for a memcg, a priority drop can cause high 1861 * reclaiming for a memcg, a priority drop can cause high
1862 * latencies, so it's better to scan a minimum amount there as 1862 * latencies, so it's better to scan a minimum amount there as
1863 * well. 1863 * well.
1864 */ 1864 */
1865 if (current_is_kswapd() && !zone_reclaimable(zone)) 1865 if (current_is_kswapd() && !zone_reclaimable(zone))
1866 force_scan = true; 1866 force_scan = true;
1867 if (!global_reclaim(sc)) 1867 if (!global_reclaim(sc))
1868 force_scan = true; 1868 force_scan = true;
1869 1869
1870 /* If we have no swap space, do not bother scanning anon pages. */ 1870 /* If we have no swap space, do not bother scanning anon pages. */
1871 if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { 1871 if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
1872 scan_balance = SCAN_FILE; 1872 scan_balance = SCAN_FILE;
1873 goto out; 1873 goto out;
1874 } 1874 }
1875 1875
1876 /* 1876 /*
1877 * Global reclaim will swap to prevent OOM even with no 1877 * Global reclaim will swap to prevent OOM even with no
1878 * swappiness, but memcg users want to use this knob to 1878 * swappiness, but memcg users want to use this knob to
1879 * disable swapping for individual groups completely when 1879 * disable swapping for individual groups completely when
1880 * using the memory controller's swap limit feature would be 1880 * using the memory controller's swap limit feature would be
1881 * too expensive. 1881 * too expensive.
1882 */ 1882 */
1883 if (!global_reclaim(sc) && !vmscan_swappiness(sc)) { 1883 if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
1884 scan_balance = SCAN_FILE; 1884 scan_balance = SCAN_FILE;
1885 goto out; 1885 goto out;
1886 } 1886 }
1887 1887
1888 /* 1888 /*
1889 * Do not apply any pressure balancing cleverness when the 1889 * Do not apply any pressure balancing cleverness when the
1890 * system is close to OOM, scan both anon and file equally 1890 * system is close to OOM, scan both anon and file equally
1891 * (unless the swappiness setting disagrees with swapping). 1891 * (unless the swappiness setting disagrees with swapping).
1892 */ 1892 */
1893 if (!sc->priority && vmscan_swappiness(sc)) { 1893 if (!sc->priority && vmscan_swappiness(sc)) {
1894 scan_balance = SCAN_EQUAL; 1894 scan_balance = SCAN_EQUAL;
1895 goto out; 1895 goto out;
1896 } 1896 }
1897 1897
1898 anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
1899 get_lru_size(lruvec, LRU_INACTIVE_ANON);
1900 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1901 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1902
1903 /* 1898 /*
1904 * If it's foreseeable that reclaiming the file cache won't be 1899 * If it's foreseeable that reclaiming the file cache won't be
1905 * enough to get the zone back into a desirable shape, we have 1900 * enough to get the zone back into a desirable shape, we have
1906 * to swap. Better start now and leave the - probably heavily 1901 * to swap. Better start now and leave the - probably heavily
1907 * thrashing - remaining file pages alone. 1902 * thrashing - remaining file pages alone.
1908 */ 1903 */
1909 if (global_reclaim(sc)) { 1904 if (global_reclaim(sc)) {
1910 free = zone_page_state(zone, NR_FREE_PAGES); 1905 unsigned long zonefile;
1911 if (unlikely(file + free <= high_wmark_pages(zone))) { 1906 unsigned long zonefree;
1907
1908 zonefree = zone_page_state(zone, NR_FREE_PAGES);
1909 zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
1910 zone_page_state(zone, NR_INACTIVE_FILE);
1911
1912 if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
1912 scan_balance = SCAN_ANON; 1913 scan_balance = SCAN_ANON;
1913 goto out; 1914 goto out;
1914 } 1915 }
1915 } 1916 }
1916 1917
1917 /* 1918 /*
1918 * There is enough inactive page cache, do not reclaim 1919 * There is enough inactive page cache, do not reclaim
1919 * anything from the anonymous working set right now. 1920 * anything from the anonymous working set right now.
1920 */ 1921 */
1921 if (!inactive_file_is_low(lruvec)) { 1922 if (!inactive_file_is_low(lruvec)) {
1922 scan_balance = SCAN_FILE; 1923 scan_balance = SCAN_FILE;
1923 goto out; 1924 goto out;
1924 } 1925 }
1925 1926
1926 scan_balance = SCAN_FRACT; 1927 scan_balance = SCAN_FRACT;
1927 1928
1928 /* 1929 /*
1929 * With swappiness at 100, anonymous and file have the same priority. 1930 * With swappiness at 100, anonymous and file have the same priority.
1930 * This scanning priority is essentially the inverse of IO cost. 1931 * This scanning priority is essentially the inverse of IO cost.
1931 */ 1932 */
1932 anon_prio = vmscan_swappiness(sc); 1933 anon_prio = vmscan_swappiness(sc);
1933 file_prio = 200 - anon_prio; 1934 file_prio = 200 - anon_prio;
1934 1935
1935 /* 1936 /*
1936 * OK, so we have swap space and a fair amount of page cache 1937 * OK, so we have swap space and a fair amount of page cache
1937 * pages. We use the recently rotated / recently scanned 1938 * pages. We use the recently rotated / recently scanned
1938 * ratios to determine how valuable each cache is. 1939 * ratios to determine how valuable each cache is.
1939 * 1940 *
1940 * Because workloads change over time (and to avoid overflow) 1941 * Because workloads change over time (and to avoid overflow)
1941 * we keep these statistics as a floating average, which ends 1942 * we keep these statistics as a floating average, which ends
1942 * up weighing recent references more than old ones. 1943 * up weighing recent references more than old ones.
1943 * 1944 *
1944 * anon in [0], file in [1] 1945 * anon in [0], file in [1]
1945 */ 1946 */
1947
1948 anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
1949 get_lru_size(lruvec, LRU_INACTIVE_ANON);
1950 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1951 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1952
1946 spin_lock_irq(&zone->lru_lock); 1953 spin_lock_irq(&zone->lru_lock);
1947 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1954 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1948 reclaim_stat->recent_scanned[0] /= 2; 1955 reclaim_stat->recent_scanned[0] /= 2;
1949 reclaim_stat->recent_rotated[0] /= 2; 1956 reclaim_stat->recent_rotated[0] /= 2;
1950 } 1957 }
1951 1958
1952 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { 1959 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1953 reclaim_stat->recent_scanned[1] /= 2; 1960 reclaim_stat->recent_scanned[1] /= 2;
1954 reclaim_stat->recent_rotated[1] /= 2; 1961 reclaim_stat->recent_rotated[1] /= 2;
1955 } 1962 }
1956 1963
1957 /* 1964 /*
1958 * The amount of pressure on anon vs file pages is inversely 1965 * The amount of pressure on anon vs file pages is inversely
1959 * proportional to the fraction of recently scanned pages on 1966 * proportional to the fraction of recently scanned pages on
1960 * each list that were recently referenced and in active use. 1967 * each list that were recently referenced and in active use.
1961 */ 1968 */
1962 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); 1969 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
1963 ap /= reclaim_stat->recent_rotated[0] + 1; 1970 ap /= reclaim_stat->recent_rotated[0] + 1;
1964 1971
1965 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); 1972 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
1966 fp /= reclaim_stat->recent_rotated[1] + 1; 1973 fp /= reclaim_stat->recent_rotated[1] + 1;
1967 spin_unlock_irq(&zone->lru_lock); 1974 spin_unlock_irq(&zone->lru_lock);
1968 1975
1969 fraction[0] = ap; 1976 fraction[0] = ap;
1970 fraction[1] = fp; 1977 fraction[1] = fp;
1971 denominator = ap + fp + 1; 1978 denominator = ap + fp + 1;
1972 out: 1979 out:
1973 for_each_evictable_lru(lru) { 1980 for_each_evictable_lru(lru) {
1974 int file = is_file_lru(lru); 1981 int file = is_file_lru(lru);
1975 unsigned long size; 1982 unsigned long size;
1976 unsigned long scan; 1983 unsigned long scan;
1977 1984
1978 size = get_lru_size(lruvec, lru); 1985 size = get_lru_size(lruvec, lru);
1979 scan = size >> sc->priority; 1986 scan = size >> sc->priority;
1980 1987
1981 if (!scan && force_scan) 1988 if (!scan && force_scan)
1982 scan = min(size, SWAP_CLUSTER_MAX); 1989 scan = min(size, SWAP_CLUSTER_MAX);
1983 1990
1984 switch (scan_balance) { 1991 switch (scan_balance) {
1985 case SCAN_EQUAL: 1992 case SCAN_EQUAL:
1986 /* Scan lists relative to size */ 1993 /* Scan lists relative to size */
1987 break; 1994 break;
1988 case SCAN_FRACT: 1995 case SCAN_FRACT:
1989 /* 1996 /*
1990 * Scan types proportional to swappiness and 1997 * Scan types proportional to swappiness and
1991 * their relative recent reclaim efficiency. 1998 * their relative recent reclaim efficiency.
1992 */ 1999 */
1993 scan = div64_u64(scan * fraction[file], denominator); 2000 scan = div64_u64(scan * fraction[file], denominator);
1994 break; 2001 break;
1995 case SCAN_FILE: 2002 case SCAN_FILE:
1996 case SCAN_ANON: 2003 case SCAN_ANON:
1997 /* Scan one type exclusively */ 2004 /* Scan one type exclusively */
1998 if ((scan_balance == SCAN_FILE) != file) 2005 if ((scan_balance == SCAN_FILE) != file)
1999 scan = 0; 2006 scan = 0;
2000 break; 2007 break;
2001 default: 2008 default:
2002 /* Look ma, no brain */ 2009 /* Look ma, no brain */
2003 BUG(); 2010 BUG();
2004 } 2011 }
2005 nr[lru] = scan; 2012 nr[lru] = scan;
2006 } 2013 }
2007 } 2014 }
2008 2015
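The hunk above is where this commit's fix lands: the forced-anon decision now compares zone-wide file and free pages against the zone's high watermark instead of a memcg-local file count. A minimal, self-contained sketch of just that decision follows; struct zone_counts, pick_balance() and the sample numbers are invented for illustration and are not kernel API, and the no-swap and priority-0 cases are omitted.

#include <stdio.h>

/* Simplified stand-ins for the zone-wide counters used above. */
struct zone_counts {
	unsigned long zonefree;   /* NR_FREE_PAGES for the whole zone  */
	unsigned long zonefile;   /* NR_ACTIVE_FILE + NR_INACTIVE_FILE */
	unsigned long high_wmark; /* high_wmark_pages(zone)            */
};

enum scan_balance { SCAN_EQUAL, SCAN_FRACT, SCAN_ANON, SCAN_FILE };

/*
 * Mirror of the forced-anon check: only when the file pages of the
 * *whole zone* plus the free pages cannot reach the high watermark do
 * we fall back to scanning anon exclusively.  Before this commit, a
 * memcg-local file count was compared against the global watermark.
 */
static enum scan_balance pick_balance(const struct zone_counts *zc,
				      int global_reclaim, int swappiness)
{
	if (!global_reclaim && !swappiness)
		return SCAN_FILE;	/* memcg asked for no swapping */
	if (global_reclaim &&
	    zc->zonefile + zc->zonefree <= zc->high_wmark)
		return SCAN_ANON;	/* file cache alone cannot help */
	return SCAN_FRACT;		/* balance by recent rotation stats */
}

int main(void)
{
	struct zone_counts zc = { .zonefree = 1000, .zonefile = 2000,
				  .high_wmark = 4000 };

	/* zonefile + zonefree (3000) <= high_wmark (4000): force anon. */
	printf("balance = %d\n", pick_balance(&zc, 1, 60));
	return 0;
}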
2009 /* 2016 /*
2010 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 2017 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
2011 */ 2018 */
2012 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 2019 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2013 { 2020 {
2014 unsigned long nr[NR_LRU_LISTS]; 2021 unsigned long nr[NR_LRU_LISTS];
2015 unsigned long targets[NR_LRU_LISTS]; 2022 unsigned long targets[NR_LRU_LISTS];
2016 unsigned long nr_to_scan; 2023 unsigned long nr_to_scan;
2017 enum lru_list lru; 2024 enum lru_list lru;
2018 unsigned long nr_reclaimed = 0; 2025 unsigned long nr_reclaimed = 0;
2019 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 2026 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2020 struct blk_plug plug; 2027 struct blk_plug plug;
2021 bool scan_adjusted; 2028 bool scan_adjusted;
2022 2029
2023 get_scan_count(lruvec, sc, nr); 2030 get_scan_count(lruvec, sc, nr);
2024 2031
2025 /* Record the original scan target for proportional adjustments later */ 2032 /* Record the original scan target for proportional adjustments later */
2026 memcpy(targets, nr, sizeof(nr)); 2033 memcpy(targets, nr, sizeof(nr));
2027 2034
2028 /* 2035 /*
2029 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal 2036 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
2030 * event that can occur when there is little memory pressure e.g. 2037 * event that can occur when there is little memory pressure e.g.
2031 * multiple streaming readers/writers. Hence, we do not abort scanning 2038 * multiple streaming readers/writers. Hence, we do not abort scanning
2032 * when the requested number of pages has been reclaimed while scanning at 2039 * when the requested number of pages has been reclaimed while scanning at
2033 * DEF_PRIORITY, on the assumption that the fact that we are direct 2040 * DEF_PRIORITY, on the assumption that the fact that we are direct
2034 * reclaiming implies that kswapd is not keeping up and it is best to 2041 * reclaiming implies that kswapd is not keeping up and it is best to
2035 * do a batch of work at once. For memcg reclaim one check is made to 2042 * do a batch of work at once. For memcg reclaim one check is made to
2036 * abort proportional reclaim if either the file or anon lru has already 2043 * abort proportional reclaim if either the file or anon lru has already
2037 * dropped to zero at the first pass. 2044 * dropped to zero at the first pass.
2038 */ 2045 */
2039 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && 2046 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
2040 sc->priority == DEF_PRIORITY); 2047 sc->priority == DEF_PRIORITY);
2041 2048
2042 blk_start_plug(&plug); 2049 blk_start_plug(&plug);
2043 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 2050 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2044 nr[LRU_INACTIVE_FILE]) { 2051 nr[LRU_INACTIVE_FILE]) {
2045 unsigned long nr_anon, nr_file, percentage; 2052 unsigned long nr_anon, nr_file, percentage;
2046 unsigned long nr_scanned; 2053 unsigned long nr_scanned;
2047 2054
2048 for_each_evictable_lru(lru) { 2055 for_each_evictable_lru(lru) {
2049 if (nr[lru]) { 2056 if (nr[lru]) {
2050 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); 2057 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
2051 nr[lru] -= nr_to_scan; 2058 nr[lru] -= nr_to_scan;
2052 2059
2053 nr_reclaimed += shrink_list(lru, nr_to_scan, 2060 nr_reclaimed += shrink_list(lru, nr_to_scan,
2054 lruvec, sc); 2061 lruvec, sc);
2055 } 2062 }
2056 } 2063 }
2057 2064
2058 if (nr_reclaimed < nr_to_reclaim || scan_adjusted) 2065 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
2059 continue; 2066 continue;
2060 2067
2061 /* 2068 /*
2062 * For kswapd and memcg, reclaim at least the number of pages 2069 * For kswapd and memcg, reclaim at least the number of pages
2063 * requested. Ensure that the anon and file LRUs are scanned 2070 * requested. Ensure that the anon and file LRUs are scanned
2064 * proportionally to what was requested by get_scan_count(). We 2071 * proportionally to what was requested by get_scan_count(). We
2065 * stop reclaiming one LRU and reduce the amount of scanning 2072 * stop reclaiming one LRU and reduce the amount of scanning
2066 * proportionally to the original scan target. 2073 * proportionally to the original scan target.
2067 */ 2074 */
2068 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; 2075 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2069 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; 2076 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
2070 2077
2071 /* 2078 /*
2072 * It's just vindictive to attack the larger once the smaller 2079 * It's just vindictive to attack the larger once the smaller
2073 * has gone to zero. And given the way we stop scanning the 2080 * has gone to zero. And given the way we stop scanning the
2074 * smaller below, this makes sure that we only make one nudge 2081 * smaller below, this makes sure that we only make one nudge
2075 * towards proportionality once we've got nr_to_reclaim. 2082 * towards proportionality once we've got nr_to_reclaim.
2076 */ 2083 */
2077 if (!nr_file || !nr_anon) 2084 if (!nr_file || !nr_anon)
2078 break; 2085 break;
2079 2086
2080 if (nr_file > nr_anon) { 2087 if (nr_file > nr_anon) {
2081 unsigned long scan_target = targets[LRU_INACTIVE_ANON] + 2088 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2082 targets[LRU_ACTIVE_ANON] + 1; 2089 targets[LRU_ACTIVE_ANON] + 1;
2083 lru = LRU_BASE; 2090 lru = LRU_BASE;
2084 percentage = nr_anon * 100 / scan_target; 2091 percentage = nr_anon * 100 / scan_target;
2085 } else { 2092 } else {
2086 unsigned long scan_target = targets[LRU_INACTIVE_FILE] + 2093 unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2087 targets[LRU_ACTIVE_FILE] + 1; 2094 targets[LRU_ACTIVE_FILE] + 1;
2088 lru = LRU_FILE; 2095 lru = LRU_FILE;
2089 percentage = nr_file * 100 / scan_target; 2096 percentage = nr_file * 100 / scan_target;
2090 } 2097 }
2091 2098
2092 /* Stop scanning the smaller of the two LRUs */ 2099 /* Stop scanning the smaller of the two LRUs */
2093 nr[lru] = 0; 2100 nr[lru] = 0;
2094 nr[lru + LRU_ACTIVE] = 0; 2101 nr[lru + LRU_ACTIVE] = 0;
2095 2102
2096 /* 2103 /*
2097 * Recalculate the other LRU scan count based on its original 2104 * Recalculate the other LRU scan count based on its original
2098 * scan target and the percentage scanning already complete 2105 * scan target and the percentage scanning already complete
2099 */ 2106 */
2100 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; 2107 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2101 nr_scanned = targets[lru] - nr[lru]; 2108 nr_scanned = targets[lru] - nr[lru];
2102 nr[lru] = targets[lru] * (100 - percentage) / 100; 2109 nr[lru] = targets[lru] * (100 - percentage) / 100;
2103 nr[lru] -= min(nr[lru], nr_scanned); 2110 nr[lru] -= min(nr[lru], nr_scanned);
2104 2111
2105 lru += LRU_ACTIVE; 2112 lru += LRU_ACTIVE;
2106 nr_scanned = targets[lru] - nr[lru]; 2113 nr_scanned = targets[lru] - nr[lru];
2107 nr[lru] = targets[lru] * (100 - percentage) / 100; 2114 nr[lru] = targets[lru] * (100 - percentage) / 100;
2108 nr[lru] -= min(nr[lru], nr_scanned); 2115 nr[lru] -= min(nr[lru], nr_scanned);
2109 2116
2110 scan_adjusted = true; 2117 scan_adjusted = true;
2111 } 2118 }
2112 blk_finish_plug(&plug); 2119 blk_finish_plug(&plug);
2113 sc->nr_reclaimed += nr_reclaimed; 2120 sc->nr_reclaimed += nr_reclaimed;
2114 2121
2115 /* 2122 /*
2116 * Even if we did not try to evict anon pages at all, we want to 2123 * Even if we did not try to evict anon pages at all, we want to
2117 * rebalance the anon lru active/inactive ratio. 2124 * rebalance the anon lru active/inactive ratio.
2118 */ 2125 */
2119 if (inactive_anon_is_low(lruvec)) 2126 if (inactive_anon_is_low(lruvec))
2120 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 2127 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2121 sc, LRU_ACTIVE_ANON); 2128 sc, LRU_ACTIVE_ANON);
2122 2129
2123 throttle_vm_writeout(sc->gfp_mask); 2130 throttle_vm_writeout(sc->gfp_mask);
2124 } 2131 }
2125 2132
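The percentage arithmetic above (recomputing the remaining scan targets once one LRU type stops being scanned) is easier to see in isolation. A small stand-alone sketch, where rebalance() and the two-element arrays are simplified stand-ins for the kernel's nr[]/targets[] bookkeeping:

#include <stdio.h>

#define INACTIVE 0
#define ACTIVE   1

/*
 * Shrink the remaining scan targets of the surviving LRU type: keep only
 * (100 - percentage)% of the original target, minus what has already
 * been scanned, clamped at zero.
 */
static void rebalance(unsigned long nr[2], const unsigned long targets[2],
		      unsigned long percentage)
{
	int i;

	for (i = 0; i < 2; i++) {
		unsigned long scanned = targets[i] - nr[i];
		unsigned long left = targets[i] * (100 - percentage) / 100;

		nr[i] = left > scanned ? left - scanned : 0;
	}
}

int main(void)
{
	unsigned long file_targets[2] = { 800, 200 };	/* original targets */
	unsigned long nr_file[2]      = { 500, 150 };	/* still to scan    */
	/* Suppose the anon side has 30% of its original target left. */
	unsigned long percentage = 30;

	rebalance(nr_file, file_targets, percentage);
	/* Prints: file: inactive=260 active=90 */
	printf("file: inactive=%lu active=%lu\n",
	       nr_file[INACTIVE], nr_file[ACTIVE]);
	return 0;
}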
2126 /* Use reclaim/compaction for costly allocs or under memory pressure */ 2133 /* Use reclaim/compaction for costly allocs or under memory pressure */
2127 static bool in_reclaim_compaction(struct scan_control *sc) 2134 static bool in_reclaim_compaction(struct scan_control *sc)
2128 { 2135 {
2129 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && 2136 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2130 (sc->order > PAGE_ALLOC_COSTLY_ORDER || 2137 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
2131 sc->priority < DEF_PRIORITY - 2)) 2138 sc->priority < DEF_PRIORITY - 2))
2132 return true; 2139 return true;
2133 2140
2134 return false; 2141 return false;
2135 } 2142 }
2136 2143
2137 /* 2144 /*
2138 * Reclaim/compaction is used for high-order allocation requests. It reclaims 2145 * Reclaim/compaction is used for high-order allocation requests. It reclaims
2139 * order-0 pages before compacting the zone. should_continue_reclaim() returns 2146 * order-0 pages before compacting the zone. should_continue_reclaim() returns
2140 * true if more pages should be reclaimed such that when the page allocator 2147 * true if more pages should be reclaimed such that when the page allocator
2141 * calls try_to_compact_zone() that it will have enough free pages to succeed. 2148 * calls try_to_compact_zone() that it will have enough free pages to succeed.
2142 * It will give up earlier than that if there is difficulty reclaiming pages. 2149 * It will give up earlier than that if there is difficulty reclaiming pages.
2143 */ 2150 */
2144 static inline bool should_continue_reclaim(struct zone *zone, 2151 static inline bool should_continue_reclaim(struct zone *zone,
2145 unsigned long nr_reclaimed, 2152 unsigned long nr_reclaimed,
2146 unsigned long nr_scanned, 2153 unsigned long nr_scanned,
2147 struct scan_control *sc) 2154 struct scan_control *sc)
2148 { 2155 {
2149 unsigned long pages_for_compaction; 2156 unsigned long pages_for_compaction;
2150 unsigned long inactive_lru_pages; 2157 unsigned long inactive_lru_pages;
2151 2158
2152 /* If not in reclaim/compaction mode, stop */ 2159 /* If not in reclaim/compaction mode, stop */
2153 if (!in_reclaim_compaction(sc)) 2160 if (!in_reclaim_compaction(sc))
2154 return false; 2161 return false;
2155 2162
2156 /* Consider stopping depending on scan and reclaim activity */ 2163 /* Consider stopping depending on scan and reclaim activity */
2157 if (sc->gfp_mask & __GFP_REPEAT) { 2164 if (sc->gfp_mask & __GFP_REPEAT) {
2158 /* 2165 /*
2159 * For __GFP_REPEAT allocations, stop reclaiming if the 2166 * For __GFP_REPEAT allocations, stop reclaiming if the
2160 * full LRU list has been scanned and we are still failing 2167 * full LRU list has been scanned and we are still failing
2161 * to reclaim pages. This full LRU scan is potentially 2168 * to reclaim pages. This full LRU scan is potentially
2162 * expensive but a __GFP_REPEAT caller really wants to succeed 2169 * expensive but a __GFP_REPEAT caller really wants to succeed
2163 */ 2170 */
2164 if (!nr_reclaimed && !nr_scanned) 2171 if (!nr_reclaimed && !nr_scanned)
2165 return false; 2172 return false;
2166 } else { 2173 } else {
2167 /* 2174 /*
2168 * For non-__GFP_REPEAT allocations which can presumably 2175 * For non-__GFP_REPEAT allocations which can presumably
2169 * fail without consequence, stop if we failed to reclaim 2176 * fail without consequence, stop if we failed to reclaim
2170 * any pages from the last SWAP_CLUSTER_MAX number of 2177 * any pages from the last SWAP_CLUSTER_MAX number of
2171 * pages that were scanned. This will return to the 2178 * pages that were scanned. This will return to the
2172 * caller faster at the risk that reclaim/compaction and 2179 * caller faster at the risk that reclaim/compaction and
2173 * the resulting allocation attempt fail 2180 * the resulting allocation attempt fail
2174 */ 2181 */
2175 if (!nr_reclaimed) 2182 if (!nr_reclaimed)
2176 return false; 2183 return false;
2177 } 2184 }
2178 2185
2179 /* 2186 /*
2180 * If we have not reclaimed enough pages for compaction and the 2187 * If we have not reclaimed enough pages for compaction and the
2181 * inactive lists are large enough, continue reclaiming 2188 * inactive lists are large enough, continue reclaiming
2182 */ 2189 */
2183 pages_for_compaction = (2UL << sc->order); 2190 pages_for_compaction = (2UL << sc->order);
2184 inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); 2191 inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
2185 if (get_nr_swap_pages() > 0) 2192 if (get_nr_swap_pages() > 0)
2186 inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); 2193 inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
2187 if (sc->nr_reclaimed < pages_for_compaction && 2194 if (sc->nr_reclaimed < pages_for_compaction &&
2188 inactive_lru_pages > pages_for_compaction) 2195 inactive_lru_pages > pages_for_compaction)
2189 return true; 2196 return true;
2190 2197
2191 /* If compaction would go ahead or the allocation would succeed, stop */ 2198 /* If compaction would go ahead or the allocation would succeed, stop */
2192 switch (compaction_suitable(zone, sc->order)) { 2199 switch (compaction_suitable(zone, sc->order)) {
2193 case COMPACT_PARTIAL: 2200 case COMPACT_PARTIAL:
2194 case COMPACT_CONTINUE: 2201 case COMPACT_CONTINUE:
2195 return false; 2202 return false;
2196 default: 2203 default:
2197 return true; 2204 return true;
2198 } 2205 }
2199 } 2206 }
2200 2207
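A rough illustration of the page-count half of should_continue_reclaim(): reclaim keeps going while fewer than 2 << order pages have been reclaimed and the inactive lists still hold more than that. The helper name want_more_reclaim() and the sample numbers are assumptions made for this sketch; the compaction_suitable() check is omitted.

#include <stdio.h>

/*
 * Keep reclaiming while we have freed fewer than 2 << order pages and
 * the inactive lists still hold more than that to work with.
 */
static int want_more_reclaim(unsigned int order,
			     unsigned long nr_reclaimed,
			     unsigned long inactive_lru_pages)
{
	unsigned long pages_for_compaction = 2UL << order;

	return nr_reclaimed < pages_for_compaction &&
	       inactive_lru_pages > pages_for_compaction;
}

int main(void)
{
	/* order-9 (a 2MB huge page on x86): wants 1024 base pages spare. */
	printf("%d\n", want_more_reclaim(9, 300, 5000));  /* 1: continue */
	printf("%d\n", want_more_reclaim(9, 1500, 5000)); /* 0: enough   */
	return 0;
}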
2201 static void shrink_zone(struct zone *zone, struct scan_control *sc) 2208 static void shrink_zone(struct zone *zone, struct scan_control *sc)
2202 { 2209 {
2203 unsigned long nr_reclaimed, nr_scanned; 2210 unsigned long nr_reclaimed, nr_scanned;
2204 2211
2205 do { 2212 do {
2206 struct mem_cgroup *root = sc->target_mem_cgroup; 2213 struct mem_cgroup *root = sc->target_mem_cgroup;
2207 struct mem_cgroup_reclaim_cookie reclaim = { 2214 struct mem_cgroup_reclaim_cookie reclaim = {
2208 .zone = zone, 2215 .zone = zone,
2209 .priority = sc->priority, 2216 .priority = sc->priority,
2210 }; 2217 };
2211 struct mem_cgroup *memcg; 2218 struct mem_cgroup *memcg;
2212 2219
2213 nr_reclaimed = sc->nr_reclaimed; 2220 nr_reclaimed = sc->nr_reclaimed;
2214 nr_scanned = sc->nr_scanned; 2221 nr_scanned = sc->nr_scanned;
2215 2222
2216 memcg = mem_cgroup_iter(root, NULL, &reclaim); 2223 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2217 do { 2224 do {
2218 struct lruvec *lruvec; 2225 struct lruvec *lruvec;
2219 2226
2220 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2227 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2221 2228
2222 shrink_lruvec(lruvec, sc); 2229 shrink_lruvec(lruvec, sc);
2223 2230
2224 /* 2231 /*
2225 * Direct reclaim and kswapd have to scan all memory 2232 * Direct reclaim and kswapd have to scan all memory
2226 * cgroups to fulfill the overall scan target for the 2233 * cgroups to fulfill the overall scan target for the
2227 * zone. 2234 * zone.
2228 * 2235 *
2229 * Limit reclaim, on the other hand, only cares about 2236 * Limit reclaim, on the other hand, only cares about
2230 * nr_to_reclaim pages to be reclaimed and it will 2237 * nr_to_reclaim pages to be reclaimed and it will
2231 * retry with decreasing priority if one round over the 2238 * retry with decreasing priority if one round over the
2232 * whole hierarchy is not sufficient. 2239 * whole hierarchy is not sufficient.
2233 */ 2240 */
2234 if (!global_reclaim(sc) && 2241 if (!global_reclaim(sc) &&
2235 sc->nr_reclaimed >= sc->nr_to_reclaim) { 2242 sc->nr_reclaimed >= sc->nr_to_reclaim) {
2236 mem_cgroup_iter_break(root, memcg); 2243 mem_cgroup_iter_break(root, memcg);
2237 break; 2244 break;
2238 } 2245 }
2239 memcg = mem_cgroup_iter(root, memcg, &reclaim); 2246 memcg = mem_cgroup_iter(root, memcg, &reclaim);
2240 } while (memcg); 2247 } while (memcg);
2241 2248
2242 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, 2249 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
2243 sc->nr_scanned - nr_scanned, 2250 sc->nr_scanned - nr_scanned,
2244 sc->nr_reclaimed - nr_reclaimed); 2251 sc->nr_reclaimed - nr_reclaimed);
2245 2252
2246 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, 2253 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
2247 sc->nr_scanned - nr_scanned, sc)); 2254 sc->nr_scanned - nr_scanned, sc));
2248 } 2255 }
2249 2256
2250 /* Returns true if compaction should go ahead for a high-order request */ 2257 /* Returns true if compaction should go ahead for a high-order request */
2251 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) 2258 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2252 { 2259 {
2253 unsigned long balance_gap, watermark; 2260 unsigned long balance_gap, watermark;
2254 bool watermark_ok; 2261 bool watermark_ok;
2255 2262
2256 /* Do not consider compaction for orders reclaim is meant to satisfy */ 2263 /* Do not consider compaction for orders reclaim is meant to satisfy */
2257 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) 2264 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
2258 return false; 2265 return false;
2259 2266
2260 /* 2267 /*
2261 * Compaction takes time to run and there are potentially other 2268 * Compaction takes time to run and there are potentially other
2262 * callers using the pages just freed. Continue reclaiming until 2269 * callers using the pages just freed. Continue reclaiming until
2263 * there is a buffer of free pages available to give compaction 2270 * there is a buffer of free pages available to give compaction
2264 * a reasonable chance of completing and allocating the page 2271 * a reasonable chance of completing and allocating the page
2265 */ 2272 */
2266 balance_gap = min(low_wmark_pages(zone), 2273 balance_gap = min(low_wmark_pages(zone),
2267 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2274 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2268 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2275 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2269 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); 2276 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
2270 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); 2277 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
2271 2278
2272 /* 2279 /*
2273 * If compaction is deferred, reclaim up to a point where 2280 * If compaction is deferred, reclaim up to a point where
2274 * compaction will have a chance of success when re-enabled 2281 * compaction will have a chance of success when re-enabled
2275 */ 2282 */
2276 if (compaction_deferred(zone, sc->order)) 2283 if (compaction_deferred(zone, sc->order))
2277 return watermark_ok; 2284 return watermark_ok;
2278 2285
2279 /* If compaction is not ready to start, keep reclaiming */ 2286 /* If compaction is not ready to start, keep reclaiming */
2280 if (!compaction_suitable(zone, sc->order)) 2287 if (!compaction_suitable(zone, sc->order))
2281 return false; 2288 return false;
2282 2289
2283 return watermark_ok; 2290 return watermark_ok;
2284 } 2291 }
2285 2292
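The free-page target that compaction_ready() compares against is high_wmark + balance_gap + (2 << order). A numeric sketch with made-up zone values, taking the balance-gap ratio as 100 (the value this kernel uses for KSWAPD_ZONE_BALANCE_GAP_RATIO); compaction_watermark() is an ad-hoc name, not a kernel function.

#include <stdio.h>

#define BALANCE_GAP_RATIO 100	/* stand-in for KSWAPD_ZONE_BALANCE_GAP_RATIO */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/* Free-page level reclaim aims for before handing over to compaction. */
static unsigned long compaction_watermark(unsigned long low_wmark,
					  unsigned long high_wmark,
					  unsigned long managed_pages,
					  unsigned int order)
{
	unsigned long balance_gap;

	balance_gap = min_ul(low_wmark,
			     (managed_pages + BALANCE_GAP_RATIO - 1) /
			     BALANCE_GAP_RATIO);
	return high_wmark + balance_gap + (2UL << order);
}

int main(void)
{
	/* Hypothetical 1 GB zone: 262144 pages, low/high wmarks 2000/3000. */
	/* Prints: watermark = 6024 pages */
	printf("watermark = %lu pages\n",
	       compaction_watermark(2000, 3000, 262144, 9));
	return 0;
}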
2286 /* 2293 /*
2287 * This is the direct reclaim path, for page-allocating processes. We only 2294 * This is the direct reclaim path, for page-allocating processes. We only
2288 * try to reclaim pages from zones which will satisfy the caller's allocation 2295 * try to reclaim pages from zones which will satisfy the caller's allocation
2289 * request. 2296 * request.
2290 * 2297 *
2291 * We reclaim from a zone even if that zone is over high_wmark_pages(zone). 2298 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
2292 * Because: 2299 * Because:
2293 * a) The caller may be trying to free *extra* pages to satisfy a higher-order 2300 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
2294 * allocation or 2301 * allocation or
2295 * b) The target zone may be at high_wmark_pages(zone) but the lower zones 2302 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
2296 * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' 2303 * must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
2297 * zone defense algorithm. 2304 * zone defense algorithm.
2298 * 2305 *
2299 * If a zone is deemed to be full of pinned pages then just give it a light 2306 * If a zone is deemed to be full of pinned pages then just give it a light
2300 * scan and then give up on it. 2307 * scan and then give up on it.
2301 * 2308 *
2302 * This function returns true if a zone is being reclaimed for a costly 2309 * This function returns true if a zone is being reclaimed for a costly
2303 * high-order allocation and compaction is ready to begin. This indicates to 2310 * high-order allocation and compaction is ready to begin. This indicates to
2304 * the caller that it should consider retrying the allocation instead of 2311 * the caller that it should consider retrying the allocation instead of
2305 * further reclaim. 2312 * further reclaim.
2306 */ 2313 */
2307 static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) 2314 static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2308 { 2315 {
2309 struct zoneref *z; 2316 struct zoneref *z;
2310 struct zone *zone; 2317 struct zone *zone;
2311 unsigned long nr_soft_reclaimed; 2318 unsigned long nr_soft_reclaimed;
2312 unsigned long nr_soft_scanned; 2319 unsigned long nr_soft_scanned;
2313 bool aborted_reclaim = false; 2320 bool aborted_reclaim = false;
2314 2321
2315 /* 2322 /*
2316 * If the number of buffer_heads in the machine exceeds the maximum 2323 * If the number of buffer_heads in the machine exceeds the maximum
2317 * allowed level, force direct reclaim to scan the highmem zone as 2324 * allowed level, force direct reclaim to scan the highmem zone as
2318 * highmem pages could be pinning lowmem pages storing buffer_heads 2325 * highmem pages could be pinning lowmem pages storing buffer_heads
2319 */ 2326 */
2320 if (buffer_heads_over_limit) 2327 if (buffer_heads_over_limit)
2321 sc->gfp_mask |= __GFP_HIGHMEM; 2328 sc->gfp_mask |= __GFP_HIGHMEM;
2322 2329
2323 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2330 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2324 gfp_zone(sc->gfp_mask), sc->nodemask) { 2331 gfp_zone(sc->gfp_mask), sc->nodemask) {
2325 if (!populated_zone(zone)) 2332 if (!populated_zone(zone))
2326 continue; 2333 continue;
2327 /* 2334 /*
2328 * Take care that memory controller reclaim has only a small influence 2335 * Take care that memory controller reclaim has only a small influence
2329 * on the global LRU. 2336 * on the global LRU.
2330 */ 2337 */
2331 if (global_reclaim(sc)) { 2338 if (global_reclaim(sc)) {
2332 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2339 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2333 continue; 2340 continue;
2334 if (sc->priority != DEF_PRIORITY && 2341 if (sc->priority != DEF_PRIORITY &&
2335 !zone_reclaimable(zone)) 2342 !zone_reclaimable(zone))
2336 continue; /* Let kswapd poll it */ 2343 continue; /* Let kswapd poll it */
2337 if (IS_ENABLED(CONFIG_COMPACTION)) { 2344 if (IS_ENABLED(CONFIG_COMPACTION)) {
2338 /* 2345 /*
2339 * If we already have plenty of memory free for 2346 * If we already have plenty of memory free for
2340 * compaction in this zone, don't free any more. 2347 * compaction in this zone, don't free any more.
2341 * Even though compaction is invoked for any 2348 * Even though compaction is invoked for any
2342 * non-zero order, only frequent costly order 2349 * non-zero order, only frequent costly order
2343 * reclamation is disruptive enough to become a 2350 * reclamation is disruptive enough to become a
2344 * noticeable problem, like transparent huge 2351 * noticeable problem, like transparent huge
2345 * page allocations. 2352 * page allocations.
2346 */ 2353 */
2347 if (compaction_ready(zone, sc)) { 2354 if (compaction_ready(zone, sc)) {
2348 aborted_reclaim = true; 2355 aborted_reclaim = true;
2349 continue; 2356 continue;
2350 } 2357 }
2351 } 2358 }
2352 /* 2359 /*
2353 * This steals pages from memory cgroups over softlimit 2360 * This steals pages from memory cgroups over softlimit
2354 * and returns the number of reclaimed pages and 2361 * and returns the number of reclaimed pages and
2355 * scanned pages. This works for global memory pressure 2362 * scanned pages. This works for global memory pressure
2356 * and balancing, not for a memcg's limit. 2363 * and balancing, not for a memcg's limit.
2357 */ 2364 */
2358 nr_soft_scanned = 0; 2365 nr_soft_scanned = 0;
2359 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, 2366 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2360 sc->order, sc->gfp_mask, 2367 sc->order, sc->gfp_mask,
2361 &nr_soft_scanned); 2368 &nr_soft_scanned);
2362 sc->nr_reclaimed += nr_soft_reclaimed; 2369 sc->nr_reclaimed += nr_soft_reclaimed;
2363 sc->nr_scanned += nr_soft_scanned; 2370 sc->nr_scanned += nr_soft_scanned;
2364 /* need some check to avoid more shrink_zone() */ 2371 /* need some check to avoid more shrink_zone() */
2365 } 2372 }
2366 2373
2367 shrink_zone(zone, sc); 2374 shrink_zone(zone, sc);
2368 } 2375 }
2369 2376
2370 return aborted_reclaim; 2377 return aborted_reclaim;
2371 } 2378 }
2372 2379
2373 /* All zones in zonelist are unreclaimable? */ 2380 /* All zones in zonelist are unreclaimable? */
2374 static bool all_unreclaimable(struct zonelist *zonelist, 2381 static bool all_unreclaimable(struct zonelist *zonelist,
2375 struct scan_control *sc) 2382 struct scan_control *sc)
2376 { 2383 {
2377 struct zoneref *z; 2384 struct zoneref *z;
2378 struct zone *zone; 2385 struct zone *zone;
2379 2386
2380 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2387 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2381 gfp_zone(sc->gfp_mask), sc->nodemask) { 2388 gfp_zone(sc->gfp_mask), sc->nodemask) {
2382 if (!populated_zone(zone)) 2389 if (!populated_zone(zone))
2383 continue; 2390 continue;
2384 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2391 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2385 continue; 2392 continue;
2386 if (zone_reclaimable(zone)) 2393 if (zone_reclaimable(zone))
2387 return false; 2394 return false;
2388 } 2395 }
2389 2396
2390 return true; 2397 return true;
2391 } 2398 }
2392 2399
2393 /* 2400 /*
2394 * This is the main entry point to direct page reclaim. 2401 * This is the main entry point to direct page reclaim.
2395 * 2402 *
2396 * If a full scan of the inactive list fails to free enough memory then we 2403 * If a full scan of the inactive list fails to free enough memory then we
2397 * are "out of memory" and something needs to be killed. 2404 * are "out of memory" and something needs to be killed.
2398 * 2405 *
2399 * If the caller is !__GFP_FS then the probability of a failure is reasonably 2406 * If the caller is !__GFP_FS then the probability of a failure is reasonably
2400 * high - the zone may be full of dirty or under-writeback pages, which this 2407 * high - the zone may be full of dirty or under-writeback pages, which this
2401 * caller can't do much about. We kick the writeback threads and take explicit 2408 * caller can't do much about. We kick the writeback threads and take explicit
2402 * naps in the hope that some of these pages can be written. But if the 2409 * naps in the hope that some of these pages can be written. But if the
2403 * allocating task holds filesystem locks which prevent writeout this might not 2410 * allocating task holds filesystem locks which prevent writeout this might not
2404 * work, and the allocation attempt will fail. 2411 * work, and the allocation attempt will fail.
2405 * 2412 *
2406 * returns: 0, if no pages reclaimed 2413 * returns: 0, if no pages reclaimed
2407 * else, the number of pages reclaimed 2414 * else, the number of pages reclaimed
2408 */ 2415 */
2409 static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 2416 static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2410 struct scan_control *sc, 2417 struct scan_control *sc,
2411 struct shrink_control *shrink) 2418 struct shrink_control *shrink)
2412 { 2419 {
2413 unsigned long total_scanned = 0; 2420 unsigned long total_scanned = 0;
2414 struct reclaim_state *reclaim_state = current->reclaim_state; 2421 struct reclaim_state *reclaim_state = current->reclaim_state;
2415 struct zoneref *z; 2422 struct zoneref *z;
2416 struct zone *zone; 2423 struct zone *zone;
2417 unsigned long writeback_threshold; 2424 unsigned long writeback_threshold;
2418 bool aborted_reclaim; 2425 bool aborted_reclaim;
2419 2426
2420 delayacct_freepages_start(); 2427 delayacct_freepages_start();
2421 2428
2422 if (global_reclaim(sc)) 2429 if (global_reclaim(sc))
2423 count_vm_event(ALLOCSTALL); 2430 count_vm_event(ALLOCSTALL);
2424 2431
2425 do { 2432 do {
2426 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, 2433 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
2427 sc->priority); 2434 sc->priority);
2428 sc->nr_scanned = 0; 2435 sc->nr_scanned = 0;
2429 aborted_reclaim = shrink_zones(zonelist, sc); 2436 aborted_reclaim = shrink_zones(zonelist, sc);
2430 2437
2431 /* 2438 /*
2432 * Don't shrink slabs when reclaiming memory from over limit 2439 * Don't shrink slabs when reclaiming memory from over limit
2433 * cgroups but do shrink slab at least once when aborting 2440 * cgroups but do shrink slab at least once when aborting
2434 * reclaim for compaction to avoid unevenly scanning file/anon 2441 * reclaim for compaction to avoid unevenly scanning file/anon
2435 * LRU pages over slab pages. 2442 * LRU pages over slab pages.
2436 */ 2443 */
2437 if (global_reclaim(sc)) { 2444 if (global_reclaim(sc)) {
2438 unsigned long lru_pages = 0; 2445 unsigned long lru_pages = 0;
2439 2446
2440 nodes_clear(shrink->nodes_to_scan); 2447 nodes_clear(shrink->nodes_to_scan);
2441 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2448 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2442 gfp_zone(sc->gfp_mask), sc->nodemask) { 2449 gfp_zone(sc->gfp_mask), sc->nodemask) {
2443 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2450 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2444 continue; 2451 continue;
2445 2452
2446 lru_pages += zone_reclaimable_pages(zone); 2453 lru_pages += zone_reclaimable_pages(zone);
2447 node_set(zone_to_nid(zone), 2454 node_set(zone_to_nid(zone),
2448 shrink->nodes_to_scan); 2455 shrink->nodes_to_scan);
2449 } 2456 }
2450 2457
2451 shrink_slab(shrink, sc->nr_scanned, lru_pages); 2458 shrink_slab(shrink, sc->nr_scanned, lru_pages);
2452 if (reclaim_state) { 2459 if (reclaim_state) {
2453 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2460 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2454 reclaim_state->reclaimed_slab = 0; 2461 reclaim_state->reclaimed_slab = 0;
2455 } 2462 }
2456 } 2463 }
2457 total_scanned += sc->nr_scanned; 2464 total_scanned += sc->nr_scanned;
2458 if (sc->nr_reclaimed >= sc->nr_to_reclaim) 2465 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
2459 goto out; 2466 goto out;
2460 2467
2461 /* 2468 /*
2462 * If we're having trouble reclaiming, start doing 2469 * If we're having trouble reclaiming, start doing
2463 * writepage even in laptop mode. 2470 * writepage even in laptop mode.
2464 */ 2471 */
2465 if (sc->priority < DEF_PRIORITY - 2) 2472 if (sc->priority < DEF_PRIORITY - 2)
2466 sc->may_writepage = 1; 2473 sc->may_writepage = 1;
2467 2474
2468 /* 2475 /*
2469 * Try to write back as many pages as we just scanned. This 2476 * Try to write back as many pages as we just scanned. This
2470 * tends to cause slow streaming writers to write data to the 2477 * tends to cause slow streaming writers to write data to the
2471 * disk smoothly, at the dirtying rate, which is nice. But 2478 * disk smoothly, at the dirtying rate, which is nice. But
2472 * that's undesirable in laptop mode, where we *want* lumpy 2479 * that's undesirable in laptop mode, where we *want* lumpy
2473 * writeout. So in laptop mode, write out the whole world. 2480 * writeout. So in laptop mode, write out the whole world.
2474 */ 2481 */
2475 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; 2482 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
2476 if (total_scanned > writeback_threshold) { 2483 if (total_scanned > writeback_threshold) {
2477 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, 2484 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
2478 WB_REASON_TRY_TO_FREE_PAGES); 2485 WB_REASON_TRY_TO_FREE_PAGES);
2479 sc->may_writepage = 1; 2486 sc->may_writepage = 1;
2480 } 2487 }
2481 } while (--sc->priority >= 0 && !aborted_reclaim); 2488 } while (--sc->priority >= 0 && !aborted_reclaim);
2482 2489
2483 out: 2490 out:
2484 delayacct_freepages_end(); 2491 delayacct_freepages_end();
2485 2492
2486 if (sc->nr_reclaimed) 2493 if (sc->nr_reclaimed)
2487 return sc->nr_reclaimed; 2494 return sc->nr_reclaimed;
2488 2495
2489 /* 2496 /*
2490 * As hibernation is going on, kswapd is frozen so that it can't mark 2497 * As hibernation is going on, kswapd is frozen so that it can't mark
2491 * the zone as all_unreclaimable. Thus we bypass the all_unreclaimable 2498 * the zone as all_unreclaimable. Thus we bypass the all_unreclaimable
2492 * check. 2499 * check.
2493 */ 2500 */
2494 if (oom_killer_disabled) 2501 if (oom_killer_disabled)
2495 return 0; 2502 return 0;
2496 2503
2497 /* Aborted reclaim to try compaction? don't OOM, then */ 2504 /* Aborted reclaim to try compaction? don't OOM, then */
2498 if (aborted_reclaim) 2505 if (aborted_reclaim)
2499 return 1; 2506 return 1;
2500 2507
2501 /* top priority shrink_zones still had more to do? don't OOM, then */ 2508 /* top priority shrink_zones still had more to do? don't OOM, then */
2502 if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) 2509 if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
2503 return 1; 2510 return 1;
2504 2511
2505 return 0; 2512 return 0;
2506 } 2513 }
2507 2514
2508 static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) 2515 static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2509 { 2516 {
2510 struct zone *zone; 2517 struct zone *zone;
2511 unsigned long pfmemalloc_reserve = 0; 2518 unsigned long pfmemalloc_reserve = 0;
2512 unsigned long free_pages = 0; 2519 unsigned long free_pages = 0;
2513 int i; 2520 int i;
2514 bool wmark_ok; 2521 bool wmark_ok;
2515 2522
2516 for (i = 0; i <= ZONE_NORMAL; i++) { 2523 for (i = 0; i <= ZONE_NORMAL; i++) {
2517 zone = &pgdat->node_zones[i]; 2524 zone = &pgdat->node_zones[i];
2518 if (!populated_zone(zone)) 2525 if (!populated_zone(zone))
2519 continue; 2526 continue;
2520 2527
2521 pfmemalloc_reserve += min_wmark_pages(zone); 2528 pfmemalloc_reserve += min_wmark_pages(zone);
2522 free_pages += zone_page_state(zone, NR_FREE_PAGES); 2529 free_pages += zone_page_state(zone, NR_FREE_PAGES);
2523 } 2530 }
2524 2531
2525 /* If there are no reserves (unexpected config) then do not throttle */ 2532 /* If there are no reserves (unexpected config) then do not throttle */
2526 if (!pfmemalloc_reserve) 2533 if (!pfmemalloc_reserve)
2527 return true; 2534 return true;
2528 2535
2529 wmark_ok = free_pages > pfmemalloc_reserve / 2; 2536 wmark_ok = free_pages > pfmemalloc_reserve / 2;
2530 2537
2531 /* kswapd must be awake if processes are being throttled */ 2538 /* kswapd must be awake if processes are being throttled */
2532 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { 2539 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2533 pgdat->classzone_idx = min(pgdat->classzone_idx, 2540 pgdat->classzone_idx = min(pgdat->classzone_idx,
2534 (enum zone_type)ZONE_NORMAL); 2541 (enum zone_type)ZONE_NORMAL);
2535 wake_up_interruptible(&pgdat->kswapd_wait); 2542 wake_up_interruptible(&pgdat->kswapd_wait);
2536 } 2543 }
2537 2544
2538 return wmark_ok; 2545 return wmark_ok;
2539 } 2546 }
2540 2547
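pfmemalloc_watermark_ok() above reduces to one comparison: the free pages of the usable zones must exceed half of their summed min watermarks. A toy version with invented per-zone numbers; pfmemalloc_ok() and the flat arrays are stand-ins for the pgdat walk.

#include <stdio.h>

/*
 * Sum the min watermarks (the PFMEMALLOC reserve) and the free pages of
 * the usable zones; direct reclaimers get throttled once free pages drop
 * to half of the reserve or below.
 */
static int pfmemalloc_ok(const unsigned long *min_wmark,
			 const unsigned long *zone_free, int nr_zones)
{
	unsigned long reserve = 0, nr_free = 0;
	int i;

	for (i = 0; i < nr_zones; i++) {
		reserve += min_wmark[i];
		nr_free += zone_free[i];
	}
	if (!reserve)		/* no reserves configured: never throttle */
		return 1;
	return nr_free > reserve / 2;
}

int main(void)
{
	unsigned long wmark[2]     = { 1000, 4000 };	/* ZONE_DMA, ZONE_NORMAL */
	unsigned long zone_free[2] = { 300, 1500 };

	/* 1800 free vs 5000 / 2 = 2500 reserve: throttle (prints 0). */
	printf("%d\n", pfmemalloc_ok(wmark, zone_free, 2));
	return 0;
}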
2541 /* 2548 /*
2542 * Throttle direct reclaimers if backing storage is backed by the network 2549 * Throttle direct reclaimers if backing storage is backed by the network
2543 * and the PFMEMALLOC reserve for the preferred node is getting dangerously 2550 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
2544 * depleted. kswapd will continue to make progress and wake the processes 2551 * depleted. kswapd will continue to make progress and wake the processes
2545 * when the low watermark is reached. 2552 * when the low watermark is reached.
2546 * 2553 *
2547 * Returns true if a fatal signal was delivered during throttling. If this 2554 * Returns true if a fatal signal was delivered during throttling. If this
2548 * happens, the page allocator should not consider triggering the OOM killer. 2555 * happens, the page allocator should not consider triggering the OOM killer.
2549 */ 2556 */
2550 static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, 2557 static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2551 nodemask_t *nodemask) 2558 nodemask_t *nodemask)
2552 { 2559 {
2553 struct zoneref *z; 2560 struct zoneref *z;
2554 struct zone *zone; 2561 struct zone *zone;
2555 pg_data_t *pgdat = NULL; 2562 pg_data_t *pgdat = NULL;
2556 2563
2557 /* 2564 /*
2558 * Kernel threads should not be throttled as they may be indirectly 2565 * Kernel threads should not be throttled as they may be indirectly
2559 * responsible for cleaning pages necessary for reclaim to make forward 2566 * responsible for cleaning pages necessary for reclaim to make forward
2560 * progress. kjournald for example may enter direct reclaim while 2567 * progress. kjournald for example may enter direct reclaim while
2561 * committing a transaction, where throttling it could force other 2568 * committing a transaction, where throttling it could force other
2562 * processes to block on log_wait_commit(). 2569 * processes to block on log_wait_commit().
2563 */ 2570 */
2564 if (current->flags & PF_KTHREAD) 2571 if (current->flags & PF_KTHREAD)
2565 goto out; 2572 goto out;
2566 2573
2567 /* 2574 /*
2568 * If a fatal signal is pending, this process should not throttle. 2575 * If a fatal signal is pending, this process should not throttle.
2569 * It should return quickly so it can exit and free its memory 2576 * It should return quickly so it can exit and free its memory
2570 */ 2577 */
2571 if (fatal_signal_pending(current)) 2578 if (fatal_signal_pending(current))
2572 goto out; 2579 goto out;
2573 2580
2574 /* 2581 /*
2575 * Check if the pfmemalloc reserves are ok by finding the first node 2582 * Check if the pfmemalloc reserves are ok by finding the first node
2576 * with a usable ZONE_NORMAL or lower zone. The expectation is that 2583 * with a usable ZONE_NORMAL or lower zone. The expectation is that
2577 * GFP_KERNEL will be required for allocating network buffers when 2584 * GFP_KERNEL will be required for allocating network buffers when
2578 * swapping over the network so ZONE_HIGHMEM is unusable. 2585 * swapping over the network so ZONE_HIGHMEM is unusable.
2579 * 2586 *
2580 * Throttling is based on the first usable node and throttled processes 2587 * Throttling is based on the first usable node and throttled processes
2581 * wait on a queue until kswapd makes progress and wakes them. There 2588 * wait on a queue until kswapd makes progress and wakes them. There
2582 * is an affinity then between processes waking up and where reclaim 2589 * is an affinity then between processes waking up and where reclaim
2583 * progress has been made assuming the process wakes on the same node. 2590 * progress has been made assuming the process wakes on the same node.
2584 * More importantly, processes running on remote nodes will not compete 2591 * More importantly, processes running on remote nodes will not compete
2585 * for remote pfmemalloc reserves and processes on different nodes 2592 * for remote pfmemalloc reserves and processes on different nodes
2586 * should make reasonable progress. 2593 * should make reasonable progress.
2587 */ 2594 */
2588 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2595 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2589 gfp_mask, nodemask) { 2596 gfp_mask, nodemask) {
2590 if (zone_idx(zone) > ZONE_NORMAL) 2597 if (zone_idx(zone) > ZONE_NORMAL)
2591 continue; 2598 continue;
2592 2599
2593 /* Throttle based on the first usable node */ 2600 /* Throttle based on the first usable node */
2594 pgdat = zone->zone_pgdat; 2601 pgdat = zone->zone_pgdat;
2595 if (pfmemalloc_watermark_ok(pgdat)) 2602 if (pfmemalloc_watermark_ok(pgdat))
2596 goto out; 2603 goto out;
2597 break; 2604 break;
2598 } 2605 }
2599 2606
2600 /* If no zone was usable by the allocation flags then do not throttle */ 2607 /* If no zone was usable by the allocation flags then do not throttle */
2601 if (!pgdat) 2608 if (!pgdat)
2602 goto out; 2609 goto out;
2603 2610
2604 /* Account for the throttling */ 2611 /* Account for the throttling */
2605 count_vm_event(PGSCAN_DIRECT_THROTTLE); 2612 count_vm_event(PGSCAN_DIRECT_THROTTLE);
2606 2613
2607 /* 2614 /*
2608 * If the caller cannot enter the filesystem, it's possible that it 2615 * If the caller cannot enter the filesystem, it's possible that it
2609 * is due to the caller holding an FS lock or performing a journal 2616 * is due to the caller holding an FS lock or performing a journal
2610 * transaction in the case of a filesystem like ext[3|4]. In this case, 2617 * transaction in the case of a filesystem like ext[3|4]. In this case,
2611 * it is not safe to block on pfmemalloc_wait as kswapd could be 2618 * it is not safe to block on pfmemalloc_wait as kswapd could be
2612 * blocked waiting on the same lock. Instead, throttle for up to a 2619 * blocked waiting on the same lock. Instead, throttle for up to a
2613 * second before continuing. 2620 * second before continuing.
2614 */ 2621 */
2615 if (!(gfp_mask & __GFP_FS)) { 2622 if (!(gfp_mask & __GFP_FS)) {
2616 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, 2623 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2617 pfmemalloc_watermark_ok(pgdat), HZ); 2624 pfmemalloc_watermark_ok(pgdat), HZ);
2618 2625
2619 goto check_pending; 2626 goto check_pending;
2620 } 2627 }
2621 2628
2622 /* Throttle until kswapd wakes the process */ 2629 /* Throttle until kswapd wakes the process */
2623 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, 2630 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2624 pfmemalloc_watermark_ok(pgdat)); 2631 pfmemalloc_watermark_ok(pgdat));
2625 2632
2626 check_pending: 2633 check_pending:
2627 if (fatal_signal_pending(current)) 2634 if (fatal_signal_pending(current))
2628 return true; 2635 return true;
2629 2636
2630 out: 2637 out:
2631 return false; 2638 return false;
2632 } 2639 }
2633 2640
2634 unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 2641 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2635 gfp_t gfp_mask, nodemask_t *nodemask) 2642 gfp_t gfp_mask, nodemask_t *nodemask)
2636 { 2643 {
2637 unsigned long nr_reclaimed; 2644 unsigned long nr_reclaimed;
2638 struct scan_control sc = { 2645 struct scan_control sc = {
2639 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), 2646 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
2640 .may_writepage = !laptop_mode, 2647 .may_writepage = !laptop_mode,
2641 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2648 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2642 .may_unmap = 1, 2649 .may_unmap = 1,
2643 .may_swap = 1, 2650 .may_swap = 1,
2644 .order = order, 2651 .order = order,
2645 .priority = DEF_PRIORITY, 2652 .priority = DEF_PRIORITY,
2646 .target_mem_cgroup = NULL, 2653 .target_mem_cgroup = NULL,
2647 .nodemask = nodemask, 2654 .nodemask = nodemask,
2648 }; 2655 };
2649 struct shrink_control shrink = { 2656 struct shrink_control shrink = {
2650 .gfp_mask = sc.gfp_mask, 2657 .gfp_mask = sc.gfp_mask,
2651 }; 2658 };
2652 2659
2653 /* 2660 /*
2654 * Do not enter reclaim if fatal signal was delivered while throttled. 2661 * Do not enter reclaim if fatal signal was delivered while throttled.
2655 * 1 is returned so that the page allocator does not OOM kill at this 2662 * 1 is returned so that the page allocator does not OOM kill at this
2656 * point. 2663 * point.
2657 */ 2664 */
2658 if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) 2665 if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
2659 return 1; 2666 return 1;
2660 2667
2661 trace_mm_vmscan_direct_reclaim_begin(order, 2668 trace_mm_vmscan_direct_reclaim_begin(order,
2662 sc.may_writepage, 2669 sc.may_writepage,
2663 gfp_mask); 2670 gfp_mask);
2664 2671
2665 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); 2672 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2666 2673
2667 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); 2674 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
2668 2675
2669 return nr_reclaimed; 2676 return nr_reclaimed;
2670 } 2677 }
2671 2678
2672 #ifdef CONFIG_MEMCG 2679 #ifdef CONFIG_MEMCG
2673 2680
2674 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, 2681 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2675 gfp_t gfp_mask, bool noswap, 2682 gfp_t gfp_mask, bool noswap,
2676 struct zone *zone, 2683 struct zone *zone,
2677 unsigned long *nr_scanned) 2684 unsigned long *nr_scanned)
2678 { 2685 {
2679 struct scan_control sc = { 2686 struct scan_control sc = {
2680 .nr_scanned = 0, 2687 .nr_scanned = 0,
2681 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2688 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2682 .may_writepage = !laptop_mode, 2689 .may_writepage = !laptop_mode,
2683 .may_unmap = 1, 2690 .may_unmap = 1,
2684 .may_swap = !noswap, 2691 .may_swap = !noswap,
2685 .order = 0, 2692 .order = 0,
2686 .priority = 0, 2693 .priority = 0,
2687 .target_mem_cgroup = memcg, 2694 .target_mem_cgroup = memcg,
2688 }; 2695 };
2689 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2696 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2690 2697
2691 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2698 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2692 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2699 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2693 2700
2694 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, 2701 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
2695 sc.may_writepage, 2702 sc.may_writepage,
2696 sc.gfp_mask); 2703 sc.gfp_mask);
2697 2704
2698 /* 2705 /*
2699 * NOTE: Although we can get the priority field, using it 2706 * NOTE: Although we can get the priority field, using it
2700 * here is not a good idea, since it limits the pages we can scan. 2707 * here is not a good idea, since it limits the pages we can scan.
2701 * If we don't reclaim here, the shrink_zone from balance_pgdat 2708 * If we don't reclaim here, the shrink_zone from balance_pgdat
2702 * will pick up pages from other mem cgroups as well. We hack 2709 * will pick up pages from other mem cgroups as well. We hack
2703 * the priority and make it zero. 2710 * the priority and make it zero.
2704 */ 2711 */
2705 shrink_lruvec(lruvec, &sc); 2712 shrink_lruvec(lruvec, &sc);
2706 2713
2707 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2714 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2708 2715
2709 *nr_scanned = sc.nr_scanned; 2716 *nr_scanned = sc.nr_scanned;
2710 return sc.nr_reclaimed; 2717 return sc.nr_reclaimed;
2711 } 2718 }
2712 2719
2713 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, 2720 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2714 gfp_t gfp_mask, 2721 gfp_t gfp_mask,
2715 bool noswap) 2722 bool noswap)
2716 { 2723 {
2717 struct zonelist *zonelist; 2724 struct zonelist *zonelist;
2718 unsigned long nr_reclaimed; 2725 unsigned long nr_reclaimed;
2719 int nid; 2726 int nid;
2720 struct scan_control sc = { 2727 struct scan_control sc = {
2721 .may_writepage = !laptop_mode, 2728 .may_writepage = !laptop_mode,
2722 .may_unmap = 1, 2729 .may_unmap = 1,
2723 .may_swap = !noswap, 2730 .may_swap = !noswap,
2724 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2731 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2725 .order = 0, 2732 .order = 0,
2726 .priority = DEF_PRIORITY, 2733 .priority = DEF_PRIORITY,
2727 .target_mem_cgroup = memcg, 2734 .target_mem_cgroup = memcg,
2728 .nodemask = NULL, /* we don't care about the placement */ 2735 .nodemask = NULL, /* we don't care about the placement */
2729 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2736 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2730 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 2737 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2731 }; 2738 };
2732 struct shrink_control shrink = { 2739 struct shrink_control shrink = {
2733 .gfp_mask = sc.gfp_mask, 2740 .gfp_mask = sc.gfp_mask,
2734 }; 2741 };
2735 2742
2736 /* 2743 /*
2737 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't 2744 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
2738 * care where we get pages from. So the node where we start the 2745 * care where we get pages from. So the node where we start the
2739 * scan does not need to be the current node. 2746 * scan does not need to be the current node.
2740 */ 2747 */
2741 nid = mem_cgroup_select_victim_node(memcg); 2748 nid = mem_cgroup_select_victim_node(memcg);
2742 2749
2743 zonelist = NODE_DATA(nid)->node_zonelists; 2750 zonelist = NODE_DATA(nid)->node_zonelists;
2744 2751
2745 trace_mm_vmscan_memcg_reclaim_begin(0, 2752 trace_mm_vmscan_memcg_reclaim_begin(0,
2746 sc.may_writepage, 2753 sc.may_writepage,
2747 sc.gfp_mask); 2754 sc.gfp_mask);
2748 2755
2749 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); 2756 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2750 2757
2751 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 2758 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2752 2759
2753 return nr_reclaimed; 2760 return nr_reclaimed;
2754 } 2761 }
2755 #endif 2762 #endif
2756 2763
2757 static void age_active_anon(struct zone *zone, struct scan_control *sc) 2764 static void age_active_anon(struct zone *zone, struct scan_control *sc)
2758 { 2765 {
2759 struct mem_cgroup *memcg; 2766 struct mem_cgroup *memcg;
2760 2767
2761 if (!total_swap_pages) 2768 if (!total_swap_pages)
2762 return; 2769 return;
2763 2770
2764 memcg = mem_cgroup_iter(NULL, NULL, NULL); 2771 memcg = mem_cgroup_iter(NULL, NULL, NULL);
2765 do { 2772 do {
2766 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2773 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2767 2774
2768 if (inactive_anon_is_low(lruvec)) 2775 if (inactive_anon_is_low(lruvec))
2769 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 2776 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2770 sc, LRU_ACTIVE_ANON); 2777 sc, LRU_ACTIVE_ANON);
2771 2778
2772 memcg = mem_cgroup_iter(NULL, memcg, NULL); 2779 memcg = mem_cgroup_iter(NULL, memcg, NULL);
2773 } while (memcg); 2780 } while (memcg);
2774 } 2781 }
2775 2782
2776 static bool zone_balanced(struct zone *zone, int order, 2783 static bool zone_balanced(struct zone *zone, int order,
2777 unsigned long balance_gap, int classzone_idx) 2784 unsigned long balance_gap, int classzone_idx)
2778 { 2785 {
2779 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + 2786 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
2780 balance_gap, classzone_idx, 0)) 2787 balance_gap, classzone_idx, 0))
2781 return false; 2788 return false;
2782 2789
2783 if (IS_ENABLED(CONFIG_COMPACTION) && order && 2790 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2784 !compaction_suitable(zone, order)) 2791 !compaction_suitable(zone, order))
2785 return false; 2792 return false;
2786 2793
2787 return true; 2794 return true;
2788 } 2795 }
2789 2796
2790 /* 2797 /*
2791 * pgdat_balanced() is used when checking if a node is balanced. 2798 * pgdat_balanced() is used when checking if a node is balanced.
2792 * 2799 *
2793 * For order-0, all zones must be balanced! 2800 * For order-0, all zones must be balanced!
2794 * 2801 *
2795 * For high-order allocations only zones that meet watermarks and are in a 2802 * For high-order allocations only zones that meet watermarks and are in a
2796 * zone allowed by the caller's classzone_idx are added to balanced_pages. The 2803 * zone allowed by the caller's classzone_idx are added to balanced_pages. The
2797 * total of balanced pages must be at least 25% of the zones allowed by 2804 * total of balanced pages must be at least 25% of the zones allowed by
2798 * classzone_idx for the node to be considered balanced. Forcing all zones to 2805 * classzone_idx for the node to be considered balanced. Forcing all zones to
2799 * be balanced for high orders can cause excessive reclaim when there are 2806 * be balanced for high orders can cause excessive reclaim when there are
2800 * imbalanced zones. 2807 * imbalanced zones.
2801 * The choice of 25% is due to 2808 * The choice of 25% is due to
2802 * o a 16M DMA zone that is balanced will not balance a zone on any 2809 * o a 16M DMA zone that is balanced will not balance a zone on any
2803 * reasonable sized machine 2810 * reasonable sized machine
2804 * o On all other machines, the top zone must be at least a reasonable 2811 * o On all other machines, the top zone must be at least a reasonable
2805 * percentage of the middle zones. For example, on 32-bit x86, highmem 2812 * percentage of the middle zones. For example, on 32-bit x86, highmem
2806 * would need to be at least 256M for it to balance a whole node. 2813 * would need to be at least 256M for it to balance a whole node.
2807 * Similarly, on x86-64 the Normal zone would need to be at least 1G 2814 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2808 * to balance a node on its own. These seemed like reasonable ratios. 2815 * to balance a node on its own. These seemed like reasonable ratios.
2809 */ 2816 */
2810 static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) 2817 static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2811 { 2818 {
2812 unsigned long managed_pages = 0; 2819 unsigned long managed_pages = 0;
2813 unsigned long balanced_pages = 0; 2820 unsigned long balanced_pages = 0;
2814 int i; 2821 int i;
2815 2822
2816 /* Check the watermark levels */ 2823 /* Check the watermark levels */
2817 for (i = 0; i <= classzone_idx; i++) { 2824 for (i = 0; i <= classzone_idx; i++) {
2818 struct zone *zone = pgdat->node_zones + i; 2825 struct zone *zone = pgdat->node_zones + i;
2819 2826
2820 if (!populated_zone(zone)) 2827 if (!populated_zone(zone))
2821 continue; 2828 continue;
2822 2829
2823 managed_pages += zone->managed_pages; 2830 managed_pages += zone->managed_pages;
2824 2831
2825 /* 2832 /*
2826 * A special case here: 2833 * A special case here:
2827 * 2834 *
2828 * balance_pgdat() skips over all_unreclaimable after 2835 * balance_pgdat() skips over all_unreclaimable after
2829 * DEF_PRIORITY. Effectively, it considers them balanced so 2836 * DEF_PRIORITY. Effectively, it considers them balanced so
2830 * they must be considered balanced here as well! 2837 * they must be considered balanced here as well!
2831 */ 2838 */
2832 if (!zone_reclaimable(zone)) { 2839 if (!zone_reclaimable(zone)) {
2833 balanced_pages += zone->managed_pages; 2840 balanced_pages += zone->managed_pages;
2834 continue; 2841 continue;
2835 } 2842 }
2836 2843
2837 if (zone_balanced(zone, order, 0, i)) 2844 if (zone_balanced(zone, order, 0, i))
2838 balanced_pages += zone->managed_pages; 2845 balanced_pages += zone->managed_pages;
2839 else if (!order) 2846 else if (!order)
2840 return false; 2847 return false;
2841 } 2848 }
2842 2849
2843 if (order) 2850 if (order)
2844 return balanced_pages >= (managed_pages >> 2); 2851 return balanced_pages >= (managed_pages >> 2);
2845 else 2852 else
2846 return true; 2853 return true;
2847 } 2854 }
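The 25% threshold described in the comment above reduces to a single shift: with a nonzero order, the node counts as balanced once balanced_pages >= managed_pages >> 2. A minimal userspace sketch of that arithmetic, using made-up zone sizes and assumed per-zone watermark results rather than real kernel data:

#include <stdio.h>

/* Model of the pgdat_balanced() order > 0 check: a node counts as
 * balanced when at least a quarter of its managed pages sit in zones
 * that individually meet their watermarks. Values are illustrative. */
int main(void)
{
	/* hypothetical zone sizes in pages: DMA, Normal, HighMem */
	unsigned long managed[] = { 4096, 221184, 32768 };
	int meets_watermark[] = { 1, 0, 1 };	/* assumed per-zone results */
	unsigned long managed_pages = 0, balanced_pages = 0;

	for (int i = 0; i < 3; i++) {
		managed_pages += managed[i];
		if (meets_watermark[i])
			balanced_pages += managed[i];
	}

	printf("balanced %lu of %lu pages, need %lu -> %s\n",
	       balanced_pages, managed_pages, managed_pages >> 2,
	       balanced_pages >= (managed_pages >> 2) ? "balanced" : "not balanced");
	return 0;
}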
2848 2855
2849 /* 2856 /*
2850 * Prepare kswapd for sleeping. This verifies that there are no processes 2857 * Prepare kswapd for sleeping. This verifies that there are no processes
2851 * waiting in throttle_direct_reclaim() and that watermarks have been met. 2858 * waiting in throttle_direct_reclaim() and that watermarks have been met.
2852 * 2859 *
2853 * Returns true if kswapd is ready to sleep 2860 * Returns true if kswapd is ready to sleep
2854 */ 2861 */
2855 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, 2862 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2856 int classzone_idx) 2863 int classzone_idx)
2857 { 2864 {
2858 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2865 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2859 if (remaining) 2866 if (remaining)
2860 return false; 2867 return false;
2861 2868
2862 /* 2869 /*
2863 * There is a potential race between when kswapd checks its watermarks 2870 * There is a potential race between when kswapd checks its watermarks
2864 * and a process gets throttled. There is also a potential race if 2871 * and a process gets throttled. There is also a potential race if
2865 * processes get throttled, kswapd wakes, a large process exits thereby 2872 * processes get throttled, kswapd wakes, a large process exits thereby
2866 * balancing the zones that causes kswapd to miss a wakeup. If kswapd 2873 * balancing the zones that causes kswapd to miss a wakeup. If kswapd
2867 * is going to sleep, no process should be sleeping on pfmemalloc_wait 2874 * is going to sleep, no process should be sleeping on pfmemalloc_wait
2868 * so wake them now if necessary. If necessary, processes will wake 2875 * so wake them now if necessary. If necessary, processes will wake
2869 * kswapd and get throttled again 2876 * kswapd and get throttled again
2870 */ 2877 */
2871 if (waitqueue_active(&pgdat->pfmemalloc_wait)) { 2878 if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
2872 wake_up(&pgdat->pfmemalloc_wait); 2879 wake_up(&pgdat->pfmemalloc_wait);
2873 return false; 2880 return false;
2874 } 2881 }
2875 2882
2876 return pgdat_balanced(pgdat, order, classzone_idx); 2883 return pgdat_balanced(pgdat, order, classzone_idx);
2877 } 2884 }
2878 2885
2879 /* 2886 /*
2880 * kswapd shrinks the zone by the number of pages required to reach 2887 * kswapd shrinks the zone by the number of pages required to reach
2881 * the high watermark. 2888 * the high watermark.
2882 * 2889 *
2883 * Returns true if kswapd scanned at least the requested number of pages to 2890 * Returns true if kswapd scanned at least the requested number of pages to
2884 * reclaim or if the lack of progress was due to pages under writeback. 2891 * reclaim or if the lack of progress was due to pages under writeback.
2885 * This is used to determine if the scanning priority needs to be raised. 2892 * This is used to determine if the scanning priority needs to be raised.
2886 */ 2893 */
2887 static bool kswapd_shrink_zone(struct zone *zone, 2894 static bool kswapd_shrink_zone(struct zone *zone,
2888 int classzone_idx, 2895 int classzone_idx,
2889 struct scan_control *sc, 2896 struct scan_control *sc,
2890 unsigned long lru_pages, 2897 unsigned long lru_pages,
2891 unsigned long *nr_attempted) 2898 unsigned long *nr_attempted)
2892 { 2899 {
2893 int testorder = sc->order; 2900 int testorder = sc->order;
2894 unsigned long balance_gap; 2901 unsigned long balance_gap;
2895 struct reclaim_state *reclaim_state = current->reclaim_state; 2902 struct reclaim_state *reclaim_state = current->reclaim_state;
2896 struct shrink_control shrink = { 2903 struct shrink_control shrink = {
2897 .gfp_mask = sc->gfp_mask, 2904 .gfp_mask = sc->gfp_mask,
2898 }; 2905 };
2899 bool lowmem_pressure; 2906 bool lowmem_pressure;
2900 2907
2901 /* Reclaim above the high watermark. */ 2908 /* Reclaim above the high watermark. */
2902 sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); 2909 sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
2903 2910
2904 /* 2911 /*
2905 * Kswapd reclaims only single pages with compaction enabled. Trying 2912 * Kswapd reclaims only single pages with compaction enabled. Trying
2906 * too hard to reclaim until contiguous free pages have become 2913 * too hard to reclaim until contiguous free pages have become
2907 * available can hurt performance by evicting too much useful data 2914 * available can hurt performance by evicting too much useful data
2908 * from memory. Do not reclaim more than needed for compaction. 2915 * from memory. Do not reclaim more than needed for compaction.
2909 */ 2916 */
2910 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && 2917 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2911 compaction_suitable(zone, sc->order) != 2918 compaction_suitable(zone, sc->order) !=
2912 COMPACT_SKIPPED) 2919 COMPACT_SKIPPED)
2913 testorder = 0; 2920 testorder = 0;
2914 2921
2915 /* 2922 /*
2916 * We put equal pressure on every zone, unless one zone has way too 2923 * We put equal pressure on every zone, unless one zone has way too
2917 * many pages free already. The "too many pages" is defined as the 2924 * many pages free already. The "too many pages" is defined as the
2918 * high wmark plus a "gap" where the gap is either the low 2925 * high wmark plus a "gap" where the gap is either the low
2919 * watermark or 1% of the zone, whichever is smaller. 2926 * watermark or 1% of the zone, whichever is smaller.
2920 */ 2927 */
2921 balance_gap = min(low_wmark_pages(zone), 2928 balance_gap = min(low_wmark_pages(zone),
2922 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2929 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2923 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2930 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2924 2931
2925 /* 2932 /*
2926 * If there is no low memory pressure or the zone is balanced then no 2933 * If there is no low memory pressure or the zone is balanced then no
2927 * reclaim is necessary 2934 * reclaim is necessary
2928 */ 2935 */
2929 lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); 2936 lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
2930 if (!lowmem_pressure && zone_balanced(zone, testorder, 2937 if (!lowmem_pressure && zone_balanced(zone, testorder,
2931 balance_gap, classzone_idx)) 2938 balance_gap, classzone_idx))
2932 return true; 2939 return true;
2933 2940
2934 shrink_zone(zone, sc); 2941 shrink_zone(zone, sc);
2935 nodes_clear(shrink.nodes_to_scan); 2942 nodes_clear(shrink.nodes_to_scan);
2936 node_set(zone_to_nid(zone), shrink.nodes_to_scan); 2943 node_set(zone_to_nid(zone), shrink.nodes_to_scan);
2937 2944
2938 reclaim_state->reclaimed_slab = 0; 2945 reclaim_state->reclaimed_slab = 0;
2939 shrink_slab(&shrink, sc->nr_scanned, lru_pages); 2946 shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2940 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2947 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2941 2948
2942 /* Account for the number of pages attempted to reclaim */ 2949 /* Account for the number of pages attempted to reclaim */
2943 *nr_attempted += sc->nr_to_reclaim; 2950 *nr_attempted += sc->nr_to_reclaim;
2944 2951
2945 zone_clear_flag(zone, ZONE_WRITEBACK); 2952 zone_clear_flag(zone, ZONE_WRITEBACK);
2946 2953
2947 /* 2954 /*
2948 * If a zone reaches its high watermark, consider it to be no longer 2955 * If a zone reaches its high watermark, consider it to be no longer
2949 * congested. It's possible there are dirty pages backed by congested 2956 * congested. It's possible there are dirty pages backed by congested
2950 * BDIs but as pressure is relieved, speculatively avoid congestion 2957 * BDIs but as pressure is relieved, speculatively avoid congestion
2951 * waits. 2958 * waits.
2952 */ 2959 */
2953 if (zone_reclaimable(zone) && 2960 if (zone_reclaimable(zone) &&
2954 zone_balanced(zone, testorder, 0, classzone_idx)) { 2961 zone_balanced(zone, testorder, 0, classzone_idx)) {
2955 zone_clear_flag(zone, ZONE_CONGESTED); 2962 zone_clear_flag(zone, ZONE_CONGESTED);
2956 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); 2963 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
2957 } 2964 }
2958 2965
2959 return sc->nr_scanned >= sc->nr_to_reclaim; 2966 return sc->nr_scanned >= sc->nr_to_reclaim;
2960 } 2967 }
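The balance gap computed above is the smaller of the low watermark and roughly 1% of the zone, using a rounding-up divide. A small userspace sketch of that calculation with assumed numbers (KSWAPD_ZONE_BALANCE_GAP_RATIO is taken to be 100 here, consistent with the "1%" wording in the comment):

#include <stdio.h>

#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100	/* assumed: ~1% of the zone */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* hypothetical zone: 1 GiB of 4 KiB pages, low watermark of 1500 pages */
	unsigned long managed_pages = 262144;
	unsigned long low_wmark = 1500;

	/* round the 1% share up, then take the smaller of the two bounds */
	unsigned long one_percent =
		(managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
		KSWAPD_ZONE_BALANCE_GAP_RATIO;
	unsigned long balance_gap = min_ul(low_wmark, one_percent);

	printf("gap = min(%lu, %lu) = %lu pages\n",
	       low_wmark, one_percent, balance_gap);
	return 0;
}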
2961 2968
2962 /* 2969 /*
2963 * For kswapd, balance_pgdat() will work across all this node's zones until 2970 * For kswapd, balance_pgdat() will work across all this node's zones until
2964 * they are all at high_wmark_pages(zone). 2971 * they are all at high_wmark_pages(zone).
2965 * 2972 *
2966 * Returns the final order kswapd was reclaiming at 2973 * Returns the final order kswapd was reclaiming at
2967 * 2974 *
2968 * There is special handling here for zones which are full of pinned pages. 2975 * There is special handling here for zones which are full of pinned pages.
2969 * This can happen if the pages are all mlocked, or if they are all used by 2976 * This can happen if the pages are all mlocked, or if they are all used by
2970 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. 2977 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
2971 * What we do is to detect the case where all pages in the zone have been 2978 * What we do is to detect the case where all pages in the zone have been
2972 * scanned twice and there has been zero successful reclaim. Mark the zone as 2979 * scanned twice and there has been zero successful reclaim. Mark the zone as
2973 * dead and from now on, only perform a short scan. Basically we're polling 2980 * dead and from now on, only perform a short scan. Basically we're polling
2974 * the zone for when the problem goes away. 2981 * the zone for when the problem goes away.
2975 * 2982 *
2976 * kswapd scans the zones in the highmem->normal->dma direction. It skips 2983 * kswapd scans the zones in the highmem->normal->dma direction. It skips
2977 * zones which have free_pages > high_wmark_pages(zone), but once a zone is 2984 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
2978 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the 2985 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
2979 * lower zones regardless of the number of free pages in the lower zones. This 2986 * lower zones regardless of the number of free pages in the lower zones. This
2980 * interoperates with the page allocator fallback scheme to ensure that aging 2987 * interoperates with the page allocator fallback scheme to ensure that aging
2981 * of pages is balanced across the zones. 2988 * of pages is balanced across the zones.
2982 */ 2989 */
2983 static unsigned long balance_pgdat(pg_data_t *pgdat, int order, 2990 static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2984 int *classzone_idx) 2991 int *classzone_idx)
2985 { 2992 {
2986 int i; 2993 int i;
2987 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2994 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2988 unsigned long nr_soft_reclaimed; 2995 unsigned long nr_soft_reclaimed;
2989 unsigned long nr_soft_scanned; 2996 unsigned long nr_soft_scanned;
2990 struct scan_control sc = { 2997 struct scan_control sc = {
2991 .gfp_mask = GFP_KERNEL, 2998 .gfp_mask = GFP_KERNEL,
2992 .priority = DEF_PRIORITY, 2999 .priority = DEF_PRIORITY,
2993 .may_unmap = 1, 3000 .may_unmap = 1,
2994 .may_swap = 1, 3001 .may_swap = 1,
2995 .may_writepage = !laptop_mode, 3002 .may_writepage = !laptop_mode,
2996 .order = order, 3003 .order = order,
2997 .target_mem_cgroup = NULL, 3004 .target_mem_cgroup = NULL,
2998 }; 3005 };
2999 count_vm_event(PAGEOUTRUN); 3006 count_vm_event(PAGEOUTRUN);
3000 3007
3001 do { 3008 do {
3002 unsigned long lru_pages = 0; 3009 unsigned long lru_pages = 0;
3003 unsigned long nr_attempted = 0; 3010 unsigned long nr_attempted = 0;
3004 bool raise_priority = true; 3011 bool raise_priority = true;
3005 bool pgdat_needs_compaction = (order > 0); 3012 bool pgdat_needs_compaction = (order > 0);
3006 3013
3007 sc.nr_reclaimed = 0; 3014 sc.nr_reclaimed = 0;
3008 3015
3009 /* 3016 /*
3010 * Scan in the highmem->dma direction for the highest 3017 * Scan in the highmem->dma direction for the highest
3011 * zone which needs scanning 3018 * zone which needs scanning
3012 */ 3019 */
3013 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 3020 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
3014 struct zone *zone = pgdat->node_zones + i; 3021 struct zone *zone = pgdat->node_zones + i;
3015 3022
3016 if (!populated_zone(zone)) 3023 if (!populated_zone(zone))
3017 continue; 3024 continue;
3018 3025
3019 if (sc.priority != DEF_PRIORITY && 3026 if (sc.priority != DEF_PRIORITY &&
3020 !zone_reclaimable(zone)) 3027 !zone_reclaimable(zone))
3021 continue; 3028 continue;
3022 3029
3023 /* 3030 /*
3024 * Do some background aging of the anon list, to give 3031 * Do some background aging of the anon list, to give
3025 * pages a chance to be referenced before reclaiming. 3032 * pages a chance to be referenced before reclaiming.
3026 */ 3033 */
3027 age_active_anon(zone, &sc); 3034 age_active_anon(zone, &sc);
3028 3035
3029 /* 3036 /*
3030 * If the number of buffer_heads in the machine 3037 * If the number of buffer_heads in the machine
3031 * exceeds the maximum allowed level and this node 3038 * exceeds the maximum allowed level and this node
3032 * has a highmem zone, force kswapd to reclaim from 3039 * has a highmem zone, force kswapd to reclaim from
3033 * it to relieve lowmem pressure. 3040 * it to relieve lowmem pressure.
3034 */ 3041 */
3035 if (buffer_heads_over_limit && is_highmem_idx(i)) { 3042 if (buffer_heads_over_limit && is_highmem_idx(i)) {
3036 end_zone = i; 3043 end_zone = i;
3037 break; 3044 break;
3038 } 3045 }
3039 3046
3040 if (!zone_balanced(zone, order, 0, 0)) { 3047 if (!zone_balanced(zone, order, 0, 0)) {
3041 end_zone = i; 3048 end_zone = i;
3042 break; 3049 break;
3043 } else { 3050 } else {
3044 /* 3051 /*
3045 * If balanced, clear the dirty and congested 3052 * If balanced, clear the dirty and congested
3046 * flags 3053 * flags
3047 */ 3054 */
3048 zone_clear_flag(zone, ZONE_CONGESTED); 3055 zone_clear_flag(zone, ZONE_CONGESTED);
3049 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); 3056 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
3050 } 3057 }
3051 } 3058 }
3052 3059
3053 if (i < 0) 3060 if (i < 0)
3054 goto out; 3061 goto out;
3055 3062
3056 for (i = 0; i <= end_zone; i++) { 3063 for (i = 0; i <= end_zone; i++) {
3057 struct zone *zone = pgdat->node_zones + i; 3064 struct zone *zone = pgdat->node_zones + i;
3058 3065
3059 if (!populated_zone(zone)) 3066 if (!populated_zone(zone))
3060 continue; 3067 continue;
3061 3068
3062 lru_pages += zone_reclaimable_pages(zone); 3069 lru_pages += zone_reclaimable_pages(zone);
3063 3070
3064 /* 3071 /*
3065 * If any zone is currently balanced then kswapd will 3072 * If any zone is currently balanced then kswapd will
3066 * not call compaction as it is expected that the 3073 * not call compaction as it is expected that the
3067 * necessary pages are already available. 3074 * necessary pages are already available.
3068 */ 3075 */
3069 if (pgdat_needs_compaction && 3076 if (pgdat_needs_compaction &&
3070 zone_watermark_ok(zone, order, 3077 zone_watermark_ok(zone, order,
3071 low_wmark_pages(zone), 3078 low_wmark_pages(zone),
3072 *classzone_idx, 0)) 3079 *classzone_idx, 0))
3073 pgdat_needs_compaction = false; 3080 pgdat_needs_compaction = false;
3074 } 3081 }
3075 3082
3076 /* 3083 /*
3077 * If we're having trouble reclaiming, start doing writepage 3084 * If we're having trouble reclaiming, start doing writepage
3078 * even in laptop mode. 3085 * even in laptop mode.
3079 */ 3086 */
3080 if (sc.priority < DEF_PRIORITY - 2) 3087 if (sc.priority < DEF_PRIORITY - 2)
3081 sc.may_writepage = 1; 3088 sc.may_writepage = 1;
3082 3089
3083 /* 3090 /*
3084 * Now scan the zone in the dma->highmem direction, stopping 3091 * Now scan the zone in the dma->highmem direction, stopping
3085 * at the last zone which needs scanning. 3092 * at the last zone which needs scanning.
3086 * 3093 *
3087 * We do this because the page allocator works in the opposite 3094 * We do this because the page allocator works in the opposite
3088 * direction. This prevents the page allocator from allocating 3095 * direction. This prevents the page allocator from allocating
3089 * pages behind kswapd's direction of progress, which would 3096 * pages behind kswapd's direction of progress, which would
3090 * cause too much scanning of the lower zones. 3097 * cause too much scanning of the lower zones.
3091 */ 3098 */
3092 for (i = 0; i <= end_zone; i++) { 3099 for (i = 0; i <= end_zone; i++) {
3093 struct zone *zone = pgdat->node_zones + i; 3100 struct zone *zone = pgdat->node_zones + i;
3094 3101
3095 if (!populated_zone(zone)) 3102 if (!populated_zone(zone))
3096 continue; 3103 continue;
3097 3104
3098 if (sc.priority != DEF_PRIORITY && 3105 if (sc.priority != DEF_PRIORITY &&
3099 !zone_reclaimable(zone)) 3106 !zone_reclaimable(zone))
3100 continue; 3107 continue;
3101 3108
3102 sc.nr_scanned = 0; 3109 sc.nr_scanned = 0;
3103 3110
3104 nr_soft_scanned = 0; 3111 nr_soft_scanned = 0;
3105 /* 3112 /*
3106 * Call soft limit reclaim before calling shrink_zone. 3113 * Call soft limit reclaim before calling shrink_zone.
3107 */ 3114 */
3108 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, 3115 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
3109 order, sc.gfp_mask, 3116 order, sc.gfp_mask,
3110 &nr_soft_scanned); 3117 &nr_soft_scanned);
3111 sc.nr_reclaimed += nr_soft_reclaimed; 3118 sc.nr_reclaimed += nr_soft_reclaimed;
3112 3119
3113 /* 3120 /*
3114 * There should be no need to raise the scanning 3121 * There should be no need to raise the scanning
3115 * priority if enough pages are already being scanned 3122 * priority if enough pages are already being scanned
3116 * that the high watermark would be met at 100% 3123 * that the high watermark would be met at 100%
3117 * efficiency. 3124 * efficiency.
3118 */ 3125 */
3119 if (kswapd_shrink_zone(zone, end_zone, &sc, 3126 if (kswapd_shrink_zone(zone, end_zone, &sc,
3120 lru_pages, &nr_attempted)) 3127 lru_pages, &nr_attempted))
3121 raise_priority = false; 3128 raise_priority = false;
3122 } 3129 }
3123 3130
3124 /* 3131 /*
3125 * If the low watermark is met there is no need for processes 3132 * If the low watermark is met there is no need for processes
3126 * to be throttled on pfmemalloc_wait as they should now be 3133 * to be throttled on pfmemalloc_wait as they should now be
3127 * able to safely make forward progress. Wake them 3134 * able to safely make forward progress. Wake them
3128 */ 3135 */
3129 if (waitqueue_active(&pgdat->pfmemalloc_wait) && 3136 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3130 pfmemalloc_watermark_ok(pgdat)) 3137 pfmemalloc_watermark_ok(pgdat))
3131 wake_up(&pgdat->pfmemalloc_wait); 3138 wake_up(&pgdat->pfmemalloc_wait);
3132 3139
3133 /* 3140 /*
3134 * Fragmentation may mean that the system cannot be rebalanced 3141 * Fragmentation may mean that the system cannot be rebalanced
3135 * for high-order allocations in all zones. If twice the 3142 * for high-order allocations in all zones. If twice the
3136 * allocation size has been reclaimed and the zones are still 3143 * allocation size has been reclaimed and the zones are still
3137 * not balanced then recheck the watermarks at order-0 to 3144 * not balanced then recheck the watermarks at order-0 to
3138 * prevent kswapd reclaiming excessively. Assume that a 3145 * prevent kswapd reclaiming excessively. Assume that a
3139 * process that requested a high-order allocation can direct reclaim/compact. 3146 * process that requested a high-order allocation can direct reclaim/compact.
3140 */ 3147 */
3141 if (order && sc.nr_reclaimed >= 2UL << order) 3148 if (order && sc.nr_reclaimed >= 2UL << order)
3142 order = sc.order = 0; 3149 order = sc.order = 0;
3143 3150
3144 /* Check if kswapd should be suspending */ 3151 /* Check if kswapd should be suspending */
3145 if (try_to_freeze() || kthread_should_stop()) 3152 if (try_to_freeze() || kthread_should_stop())
3146 break; 3153 break;
3147 3154
3148 /* 3155 /*
3149 * Compact if necessary and kswapd is reclaiming at least the 3156 * Compact if necessary and kswapd is reclaiming at least the
3150 * high watermark number of pages as requested 3157 * high watermark number of pages as requested
3151 */ 3158 */
3152 if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) 3159 if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
3153 compact_pgdat(pgdat, order); 3160 compact_pgdat(pgdat, order);
3154 3161
3155 /* 3162 /*
3156 * Raise priority if scanning rate is too low or there was no 3163 * Raise priority if scanning rate is too low or there was no
3157 * progress in reclaiming pages 3164 * progress in reclaiming pages
3158 */ 3165 */
3159 if (raise_priority || !sc.nr_reclaimed) 3166 if (raise_priority || !sc.nr_reclaimed)
3160 sc.priority--; 3167 sc.priority--;
3161 } while (sc.priority >= 1 && 3168 } while (sc.priority >= 1 &&
3162 !pgdat_balanced(pgdat, order, *classzone_idx)); 3169 !pgdat_balanced(pgdat, order, *classzone_idx));
3163 3170
3164 out: 3171 out:
3165 /* 3172 /*
3166 * Return the order we were reclaiming at so prepare_kswapd_sleep() 3173 * Return the order we were reclaiming at so prepare_kswapd_sleep()
3167 * makes a decision on the order we were last reclaiming at. However, 3174 * makes a decision on the order we were last reclaiming at. However,
3168 * if another caller entered the allocator slow path while kswapd 3175 * if another caller entered the allocator slow path while kswapd
3169 * was awake, order will remain at the higher level 3176 * was awake, order will remain at the higher level
3170 */ 3177 */
3171 *classzone_idx = end_zone; 3178 *classzone_idx = end_zone;
3172 return order; 3179 return order;
3173 } 3180 }
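The high-order back-off near the end of the loop above compares progress against twice the requested allocation size, i.e. 2UL << order pages, before dropping back to order-0. A trivial sketch of what that threshold works out to for a few orders (illustrative only, not kernel code):

#include <stdio.h>

int main(void)
{
	/* "twice the allocation size" used by balance_pgdat()'s back-off:
	 * once this many pages have been reclaimed for a high-order
	 * request, kswapd rechecks the watermarks at order-0. */
	for (int order = 1; order <= 4; order++)
		printf("order %d: allocation %2d pages, back-off after %2lu pages\n",
		       order, 1 << order, 2UL << order);
	return 0;
}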
3174 3181
3175 static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) 3182 static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3176 { 3183 {
3177 long remaining = 0; 3184 long remaining = 0;
3178 DEFINE_WAIT(wait); 3185 DEFINE_WAIT(wait);
3179 3186
3180 if (freezing(current) || kthread_should_stop()) 3187 if (freezing(current) || kthread_should_stop())
3181 return; 3188 return;
3182 3189
3183 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 3190 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3184 3191
3185 /* Try to sleep for a short interval */ 3192 /* Try to sleep for a short interval */
3186 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { 3193 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
3187 remaining = schedule_timeout(HZ/10); 3194 remaining = schedule_timeout(HZ/10);
3188 finish_wait(&pgdat->kswapd_wait, &wait); 3195 finish_wait(&pgdat->kswapd_wait, &wait);
3189 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 3196 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3190 } 3197 }
3191 3198
3192 /* 3199 /*
3193 * After a short sleep, check if it was a premature sleep. If not, then 3200 * After a short sleep, check if it was a premature sleep. If not, then
3194 * go fully to sleep until explicitly woken up. 3201 * go fully to sleep until explicitly woken up.
3195 */ 3202 */
3196 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { 3203 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
3197 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 3204 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
3198 3205
3199 /* 3206 /*
3200 * vmstat counters are not perfectly accurate and the estimated 3207 * vmstat counters are not perfectly accurate and the estimated
3201 * value for counters such as NR_FREE_PAGES can deviate from the 3208 * value for counters such as NR_FREE_PAGES can deviate from the
3202 * true value by nr_online_cpus * threshold. To avoid the zone 3209 * true value by nr_online_cpus * threshold. To avoid the zone
3203 * watermarks being breached while under pressure, we reduce the 3210 * watermarks being breached while under pressure, we reduce the
3204 * per-cpu vmstat threshold while kswapd is awake and restore 3211 * per-cpu vmstat threshold while kswapd is awake and restore
3205 * them before going back to sleep. 3212 * them before going back to sleep.
3206 */ 3213 */
3207 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 3214 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
3208 3215
3209 /* 3216 /*
3210 * Compaction records what page blocks it recently failed to 3217 * Compaction records what page blocks it recently failed to
3211 * isolate pages from and skips them in the future scanning. 3218 * isolate pages from and skips them in the future scanning.
3212 * When kswapd is going to sleep, it is reasonable to assume 3219 * When kswapd is going to sleep, it is reasonable to assume
3213 * that pages and compaction may succeed so reset the cache. 3220 * that pages and compaction may succeed so reset the cache.
3214 */ 3221 */
3215 reset_isolation_suitable(pgdat); 3222 reset_isolation_suitable(pgdat);
3216 3223
3217 if (!kthread_should_stop()) 3224 if (!kthread_should_stop())
3218 schedule(); 3225 schedule();
3219 3226
3220 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 3227 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
3221 } else { 3228 } else {
3222 if (remaining) 3229 if (remaining)
3223 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); 3230 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
3224 else 3231 else
3225 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); 3232 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
3226 } 3233 }
3227 finish_wait(&pgdat->kswapd_wait, &wait); 3234 finish_wait(&pgdat->kswapd_wait, &wait);
3228 } 3235 }
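The worst-case drift mentioned in the vmstat comment above is bounded by nr_online_cpus * threshold, since each CPU may hold up to one threshold's worth of unflushed counter deltas. A rough userspace illustration with assumed values (the real threshold is computed per zone, so these numbers are only indicative):

#include <stdio.h>

int main(void)
{
	/* Each online CPU may cache up to 'threshold' unflushed page deltas,
	 * so a counter such as NR_FREE_PAGES can be off by
	 * nr_online_cpus * threshold in the worst case. Assumed values. */
	unsigned int nr_online_cpus = 16;
	unsigned int threshold = 125;	/* assumed per-cpu stat threshold */

	printf("worst-case counter error: %u pages (~%u KiB with 4 KiB pages)\n",
	       nr_online_cpus * threshold,
	       nr_online_cpus * threshold * 4);
	return 0;
}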
3229 3236
3230 /* 3237 /*
3231 * The background pageout daemon, started as a kernel thread 3238 * The background pageout daemon, started as a kernel thread
3232 * from the init process. 3239 * from the init process.
3233 * 3240 *
3234 * This basically trickles out pages so that we have _some_ 3241 * This basically trickles out pages so that we have _some_
3235 * free memory available even if there is no other activity 3242 * free memory available even if there is no other activity
3236 * that frees anything up. This is needed for things like routing 3243 * that frees anything up. This is needed for things like routing
3237 * etc, where we otherwise might have all activity going on in 3244 * etc, where we otherwise might have all activity going on in
3238 * asynchronous contexts that cannot page things out. 3245 * asynchronous contexts that cannot page things out.
3239 * 3246 *
3240 * If there are applications that are active memory-allocators 3247 * If there are applications that are active memory-allocators
3241 * (most normal use), this basically shouldn't matter. 3248 * (most normal use), this basically shouldn't matter.
3242 */ 3249 */
3243 static int kswapd(void *p) 3250 static int kswapd(void *p)
3244 { 3251 {
3245 unsigned long order, new_order; 3252 unsigned long order, new_order;
3246 unsigned balanced_order; 3253 unsigned balanced_order;
3247 int classzone_idx, new_classzone_idx; 3254 int classzone_idx, new_classzone_idx;
3248 int balanced_classzone_idx; 3255 int balanced_classzone_idx;
3249 pg_data_t *pgdat = (pg_data_t*)p; 3256 pg_data_t *pgdat = (pg_data_t*)p;
3250 struct task_struct *tsk = current; 3257 struct task_struct *tsk = current;
3251 3258
3252 struct reclaim_state reclaim_state = { 3259 struct reclaim_state reclaim_state = {
3253 .reclaimed_slab = 0, 3260 .reclaimed_slab = 0,
3254 }; 3261 };
3255 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 3262 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3256 3263
3257 lockdep_set_current_reclaim_state(GFP_KERNEL); 3264 lockdep_set_current_reclaim_state(GFP_KERNEL);
3258 3265
3259 if (!cpumask_empty(cpumask)) 3266 if (!cpumask_empty(cpumask))
3260 set_cpus_allowed_ptr(tsk, cpumask); 3267 set_cpus_allowed_ptr(tsk, cpumask);
3261 current->reclaim_state = &reclaim_state; 3268 current->reclaim_state = &reclaim_state;
3262 3269
3263 /* 3270 /*
3264 * Tell the memory management that we're a "memory allocator", 3271 * Tell the memory management that we're a "memory allocator",
3265 * and that if we need more memory we should get access to it 3272 * and that if we need more memory we should get access to it
3266 * regardless (see "__alloc_pages()"). "kswapd" should 3273 * regardless (see "__alloc_pages()"). "kswapd" should
3267 * never get caught in the normal page freeing logic. 3274 * never get caught in the normal page freeing logic.
3268 * 3275 *
3269 * (Kswapd normally doesn't need memory anyway, but sometimes 3276 * (Kswapd normally doesn't need memory anyway, but sometimes
3270 * you need a small amount of memory in order to be able to 3277 * you need a small amount of memory in order to be able to
3271 * page out something else, and this flag essentially protects 3278 * page out something else, and this flag essentially protects
3272 * us from recursively trying to free more memory as we're 3279 * us from recursively trying to free more memory as we're
3273 * trying to free the first piece of memory in the first place). 3280 * trying to free the first piece of memory in the first place).
3274 */ 3281 */
3275 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 3282 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
3276 set_freezable(); 3283 set_freezable();
3277 3284
3278 order = new_order = 0; 3285 order = new_order = 0;
3279 balanced_order = 0; 3286 balanced_order = 0;
3280 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 3287 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
3281 balanced_classzone_idx = classzone_idx; 3288 balanced_classzone_idx = classzone_idx;
3282 for ( ; ; ) { 3289 for ( ; ; ) {
3283 bool ret; 3290 bool ret;
3284 3291
3285 /* 3292 /*
3286 * If the last balance_pgdat was unsuccessful it's unlikely a 3293 * If the last balance_pgdat was unsuccessful it's unlikely a
3287 * new request of a similar or harder type will succeed soon 3294 * new request of a similar or harder type will succeed soon
3288 * so consider going to sleep on the basis we reclaimed at 3295 * so consider going to sleep on the basis we reclaimed at
3289 */ 3296 */
3290 if (balanced_classzone_idx >= new_classzone_idx && 3297 if (balanced_classzone_idx >= new_classzone_idx &&
3291 balanced_order == new_order) { 3298 balanced_order == new_order) {
3292 new_order = pgdat->kswapd_max_order; 3299 new_order = pgdat->kswapd_max_order;
3293 new_classzone_idx = pgdat->classzone_idx; 3300 new_classzone_idx = pgdat->classzone_idx;
3294 pgdat->kswapd_max_order = 0; 3301 pgdat->kswapd_max_order = 0;
3295 pgdat->classzone_idx = pgdat->nr_zones - 1; 3302 pgdat->classzone_idx = pgdat->nr_zones - 1;
3296 } 3303 }
3297 3304
3298 if (order < new_order || classzone_idx > new_classzone_idx) { 3305 if (order < new_order || classzone_idx > new_classzone_idx) {
3299 /* 3306 /*
3300 * Don't sleep if someone wants a larger 'order' 3307 * Don't sleep if someone wants a larger 'order'
3301 * allocation or has tighter zone constraints 3308 * allocation or has tighter zone constraints
3302 */ 3309 */
3303 order = new_order; 3310 order = new_order;
3304 classzone_idx = new_classzone_idx; 3311 classzone_idx = new_classzone_idx;
3305 } else { 3312 } else {
3306 kswapd_try_to_sleep(pgdat, balanced_order, 3313 kswapd_try_to_sleep(pgdat, balanced_order,
3307 balanced_classzone_idx); 3314 balanced_classzone_idx);
3308 order = pgdat->kswapd_max_order; 3315 order = pgdat->kswapd_max_order;
3309 classzone_idx = pgdat->classzone_idx; 3316 classzone_idx = pgdat->classzone_idx;
3310 new_order = order; 3317 new_order = order;
3311 new_classzone_idx = classzone_idx; 3318 new_classzone_idx = classzone_idx;
3312 pgdat->kswapd_max_order = 0; 3319 pgdat->kswapd_max_order = 0;
3313 pgdat->classzone_idx = pgdat->nr_zones - 1; 3320 pgdat->classzone_idx = pgdat->nr_zones - 1;
3314 } 3321 }
3315 3322
3316 ret = try_to_freeze(); 3323 ret = try_to_freeze();
3317 if (kthread_should_stop()) 3324 if (kthread_should_stop())
3318 break; 3325 break;
3319 3326
3320 /* 3327 /*
3321 * We can speed up thawing tasks if we don't call balance_pgdat 3328 * We can speed up thawing tasks if we don't call balance_pgdat
3322 * after returning from the refrigerator 3329 * after returning from the refrigerator
3323 */ 3330 */
3324 if (!ret) { 3331 if (!ret) {
3325 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 3332 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
3326 balanced_classzone_idx = classzone_idx; 3333 balanced_classzone_idx = classzone_idx;
3327 balanced_order = balance_pgdat(pgdat, order, 3334 balanced_order = balance_pgdat(pgdat, order,
3328 &balanced_classzone_idx); 3335 &balanced_classzone_idx);
3329 } 3336 }
3330 } 3337 }
3331 3338
3332 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); 3339 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
3333 current->reclaim_state = NULL; 3340 current->reclaim_state = NULL;
3334 lockdep_clear_current_reclaim_state(); 3341 lockdep_clear_current_reclaim_state();
3335 3342
3336 return 0; 3343 return 0;
3337 } 3344 }
3338 3345
3339 /* 3346 /*
3340 * A zone is low on free memory, so wake its kswapd task to service it. 3347 * A zone is low on free memory, so wake its kswapd task to service it.
3341 */ 3348 */
3342 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) 3349 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3343 { 3350 {
3344 pg_data_t *pgdat; 3351 pg_data_t *pgdat;
3345 3352
3346 if (!populated_zone(zone)) 3353 if (!populated_zone(zone))
3347 return; 3354 return;
3348 3355
3349 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 3356 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
3350 return; 3357 return;
3351 pgdat = zone->zone_pgdat; 3358 pgdat = zone->zone_pgdat;
3352 if (pgdat->kswapd_max_order < order) { 3359 if (pgdat->kswapd_max_order < order) {
3353 pgdat->kswapd_max_order = order; 3360 pgdat->kswapd_max_order = order;
3354 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); 3361 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
3355 } 3362 }
3356 if (!waitqueue_active(&pgdat->kswapd_wait)) 3363 if (!waitqueue_active(&pgdat->kswapd_wait))
3357 return; 3364 return;
3358 if (zone_balanced(zone, order, 0, 0)) 3365 if (zone_balanced(zone, order, 0, 0))
3359 return; 3366 return;
3360 3367
3361 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); 3368 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
3362 wake_up_interruptible(&pgdat->kswapd_wait); 3369 wake_up_interruptible(&pgdat->kswapd_wait);
3363 } 3370 }
3364 3371
3365 #ifdef CONFIG_HIBERNATION 3372 #ifdef CONFIG_HIBERNATION
3366 /* 3373 /*
3367 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of 3374 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
3368 * freed pages. 3375 * freed pages.
3369 * 3376 *
3370 * Rather than trying to age LRUs the aim is to preserve the overall 3377 * Rather than trying to age LRUs the aim is to preserve the overall
3371 * LRU order by reclaiming preferentially 3378 * LRU order by reclaiming preferentially
3372 * inactive > active > active referenced > active mapped 3379 * inactive > active > active referenced > active mapped
3373 */ 3380 */
3374 unsigned long shrink_all_memory(unsigned long nr_to_reclaim) 3381 unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3375 { 3382 {
3376 struct reclaim_state reclaim_state; 3383 struct reclaim_state reclaim_state;
3377 struct scan_control sc = { 3384 struct scan_control sc = {
3378 .gfp_mask = GFP_HIGHUSER_MOVABLE, 3385 .gfp_mask = GFP_HIGHUSER_MOVABLE,
3379 .may_swap = 1, 3386 .may_swap = 1,
3380 .may_unmap = 1, 3387 .may_unmap = 1,
3381 .may_writepage = 1, 3388 .may_writepage = 1,
3382 .nr_to_reclaim = nr_to_reclaim, 3389 .nr_to_reclaim = nr_to_reclaim,
3383 .hibernation_mode = 1, 3390 .hibernation_mode = 1,
3384 .order = 0, 3391 .order = 0,
3385 .priority = DEF_PRIORITY, 3392 .priority = DEF_PRIORITY,
3386 }; 3393 };
3387 struct shrink_control shrink = { 3394 struct shrink_control shrink = {
3388 .gfp_mask = sc.gfp_mask, 3395 .gfp_mask = sc.gfp_mask,
3389 }; 3396 };
3390 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 3397 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3391 struct task_struct *p = current; 3398 struct task_struct *p = current;
3392 unsigned long nr_reclaimed; 3399 unsigned long nr_reclaimed;
3393 3400
3394 p->flags |= PF_MEMALLOC; 3401 p->flags |= PF_MEMALLOC;
3395 lockdep_set_current_reclaim_state(sc.gfp_mask); 3402 lockdep_set_current_reclaim_state(sc.gfp_mask);
3396 reclaim_state.reclaimed_slab = 0; 3403 reclaim_state.reclaimed_slab = 0;
3397 p->reclaim_state = &reclaim_state; 3404 p->reclaim_state = &reclaim_state;
3398 3405
3399 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); 3406 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
3400 3407
3401 p->reclaim_state = NULL; 3408 p->reclaim_state = NULL;
3402 lockdep_clear_current_reclaim_state(); 3409 lockdep_clear_current_reclaim_state();
3403 p->flags &= ~PF_MEMALLOC; 3410 p->flags &= ~PF_MEMALLOC;
3404 3411
3405 return nr_reclaimed; 3412 return nr_reclaimed;
3406 } 3413 }
3407 #endif /* CONFIG_HIBERNATION */ 3414 #endif /* CONFIG_HIBERNATION */
3408 3415
3409 /* It's optimal to keep kswapds on the same CPUs as their memory, but 3416 /* It's optimal to keep kswapds on the same CPUs as their memory, but
3410 not required for correctness. So if the last cpu in a node goes 3417 not required for correctness. So if the last cpu in a node goes
3411 away, we get changed to run anywhere: as the first one comes back, 3418 away, we get changed to run anywhere: as the first one comes back,
3412 restore their cpu bindings. */ 3419 restore their cpu bindings. */
3413 static int cpu_callback(struct notifier_block *nfb, unsigned long action, 3420 static int cpu_callback(struct notifier_block *nfb, unsigned long action,
3414 void *hcpu) 3421 void *hcpu)
3415 { 3422 {
3416 int nid; 3423 int nid;
3417 3424
3418 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { 3425 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
3419 for_each_node_state(nid, N_MEMORY) { 3426 for_each_node_state(nid, N_MEMORY) {
3420 pg_data_t *pgdat = NODE_DATA(nid); 3427 pg_data_t *pgdat = NODE_DATA(nid);
3421 const struct cpumask *mask; 3428 const struct cpumask *mask;
3422 3429
3423 mask = cpumask_of_node(pgdat->node_id); 3430 mask = cpumask_of_node(pgdat->node_id);
3424 3431
3425 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) 3432 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
3426 /* One of our CPUs online: restore mask */ 3433 /* One of our CPUs online: restore mask */
3427 set_cpus_allowed_ptr(pgdat->kswapd, mask); 3434 set_cpus_allowed_ptr(pgdat->kswapd, mask);
3428 } 3435 }
3429 } 3436 }
3430 return NOTIFY_OK; 3437 return NOTIFY_OK;
3431 } 3438 }
3432 3439
3433 /* 3440 /*
3434 * This kswapd start function will be called by init and node-hot-add. 3441 * This kswapd start function will be called by init and node-hot-add.
3435 * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added. 3442 * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
3436 */ 3443 */
3437 int kswapd_run(int nid) 3444 int kswapd_run(int nid)
3438 { 3445 {
3439 pg_data_t *pgdat = NODE_DATA(nid); 3446 pg_data_t *pgdat = NODE_DATA(nid);
3440 int ret = 0; 3447 int ret = 0;
3441 3448
3442 if (pgdat->kswapd) 3449 if (pgdat->kswapd)
3443 return 0; 3450 return 0;
3444 3451
3445 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); 3452 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
3446 if (IS_ERR(pgdat->kswapd)) { 3453 if (IS_ERR(pgdat->kswapd)) {
3447 /* failure at boot is fatal */ 3454 /* failure at boot is fatal */
3448 BUG_ON(system_state == SYSTEM_BOOTING); 3455 BUG_ON(system_state == SYSTEM_BOOTING);
3449 pr_err("Failed to start kswapd on node %d\n", nid); 3456 pr_err("Failed to start kswapd on node %d\n", nid);
3450 ret = PTR_ERR(pgdat->kswapd); 3457 ret = PTR_ERR(pgdat->kswapd);
3451 pgdat->kswapd = NULL; 3458 pgdat->kswapd = NULL;
3452 } 3459 }
3453 return ret; 3460 return ret;
3454 } 3461 }
3455 3462
3456 /* 3463 /*
3457 * Called by memory hotplug when all memory in a node is offlined. Caller must 3464 * Called by memory hotplug when all memory in a node is offlined. Caller must
3458 * hold lock_memory_hotplug(). 3465 * hold lock_memory_hotplug().
3459 */ 3466 */
3460 void kswapd_stop(int nid) 3467 void kswapd_stop(int nid)
3461 { 3468 {
3462 struct task_struct *kswapd = NODE_DATA(nid)->kswapd; 3469 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
3463 3470
3464 if (kswapd) { 3471 if (kswapd) {
3465 kthread_stop(kswapd); 3472 kthread_stop(kswapd);
3466 NODE_DATA(nid)->kswapd = NULL; 3473 NODE_DATA(nid)->kswapd = NULL;
3467 } 3474 }
3468 } 3475 }
3469 3476
3470 static int __init kswapd_init(void) 3477 static int __init kswapd_init(void)
3471 { 3478 {
3472 int nid; 3479 int nid;
3473 3480
3474 swap_setup(); 3481 swap_setup();
3475 for_each_node_state(nid, N_MEMORY) 3482 for_each_node_state(nid, N_MEMORY)
3476 kswapd_run(nid); 3483 kswapd_run(nid);
3477 hotcpu_notifier(cpu_callback, 0); 3484 hotcpu_notifier(cpu_callback, 0);
3478 return 0; 3485 return 0;
3479 } 3486 }
3480 3487
3481 module_init(kswapd_init) 3488 module_init(kswapd_init)
3482 3489
3483 #ifdef CONFIG_NUMA 3490 #ifdef CONFIG_NUMA
3484 /* 3491 /*
3485 * Zone reclaim mode 3492 * Zone reclaim mode
3486 * 3493 *
3487 * If non-zero call zone_reclaim when the number of free pages falls below 3494 * If non-zero call zone_reclaim when the number of free pages falls below
3488 * the watermarks. 3495 * the watermarks.
3489 */ 3496 */
3490 int zone_reclaim_mode __read_mostly; 3497 int zone_reclaim_mode __read_mostly;
3491 3498
3492 #define RECLAIM_OFF 0 3499 #define RECLAIM_OFF 0
3493 #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ 3500 #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
3494 #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 3501 #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
3495 #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 3502 #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
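These bits are what the vm.zone_reclaim_mode sysctl stores, so a mode value is a small bitmask rather than an enum; for example a value of 3 means RECLAIM_ZONE | RECLAIM_WRITE. A short userspace sketch that decodes a few values (illustrative only, not kernel code):

#include <stdio.h>

#define RECLAIM_ZONE	(1 << 0)
#define RECLAIM_WRITE	(1 << 1)
#define RECLAIM_SWAP	(1 << 2)

/* Print which reclaim behaviours a zone_reclaim_mode value enables. */
static void decode(int mode)
{
	printf("mode %d:%s%s%s%s\n", mode,
	       mode == 0 ? " off" : "",
	       (mode & RECLAIM_ZONE)  ? " zone-reclaim" : "",
	       (mode & RECLAIM_WRITE) ? " write-dirty"  : "",
	       (mode & RECLAIM_SWAP)  ? " swap"         : "");
}

int main(void)
{
	decode(0);
	decode(1);
	decode(3);
	decode(7);
	return 0;
}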
3496 3503
3497 /* 3504 /*
3498 * Priority for ZONE_RECLAIM. This determines the fraction of pages 3505 * Priority for ZONE_RECLAIM. This determines the fraction of pages
3499 * of a node considered for each zone_reclaim. 4 scans 1/16th of 3506 * of a node considered for each zone_reclaim. 4 scans 1/16th of
3500 * a zone. 3507 * a zone.
3501 */ 3508 */
3502 #define ZONE_RECLAIM_PRIORITY 4 3509 #define ZONE_RECLAIM_PRIORITY 4
3503 3510
3504 /* 3511 /*
3505 * Percentage of pages in a zone that must be unmapped for zone_reclaim to 3512 * Percentage of pages in a zone that must be unmapped for zone_reclaim to
3506 * occur. 3513 * occur.
3507 */ 3514 */
3508 int sysctl_min_unmapped_ratio = 1; 3515 int sysctl_min_unmapped_ratio = 1;
3509 3516
3510 /* 3517 /*
3511 * If the number of slab pages in a zone grows beyond this percentage then 3518 * If the number of slab pages in a zone grows beyond this percentage then
3512 * slab reclaim needs to occur. 3519 * slab reclaim needs to occur.
3513 */ 3520 */
3514 int sysctl_min_slab_ratio = 5; 3521 int sysctl_min_slab_ratio = 5;
3515 3522
3516 static inline unsigned long zone_unmapped_file_pages(struct zone *zone) 3523 static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
3517 { 3524 {
3518 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); 3525 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
3519 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + 3526 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
3520 zone_page_state(zone, NR_ACTIVE_FILE); 3527 zone_page_state(zone, NR_ACTIVE_FILE);
3521 3528
3522 /* 3529 /*
3523 * It's possible for there to be more file mapped pages than 3530 * It's possible for there to be more file mapped pages than
3524 * accounted for by the pages on the file LRU lists because 3531 * accounted for by the pages on the file LRU lists because
3525 * tmpfs pages accounted for as ANON can also be FILE_MAPPED 3532 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
3526 */ 3533 */
3527 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; 3534 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
3528 } 3535 }
3529 3536
3530 /* Work out how many page cache pages we can reclaim in this reclaim_mode */ 3537 /* Work out how many page cache pages we can reclaim in this reclaim_mode */
3531 static long zone_pagecache_reclaimable(struct zone *zone) 3538 static long zone_pagecache_reclaimable(struct zone *zone)
3532 { 3539 {
3533 long nr_pagecache_reclaimable; 3540 long nr_pagecache_reclaimable;
3534 long delta = 0; 3541 long delta = 0;
3535 3542
3536 /* 3543 /*
3537 * If RECLAIM_SWAP is set, then all file pages are considered 3544 * If RECLAIM_SWAP is set, then all file pages are considered
3538 * potentially reclaimable. Otherwise, we have to worry about 3545 * potentially reclaimable. Otherwise, we have to worry about
3539 * pages like swapcache and zone_unmapped_file_pages() provides 3546 * pages like swapcache and zone_unmapped_file_pages() provides
3540 * a better estimate 3547 * a better estimate
3541 */ 3548 */
3542 if (zone_reclaim_mode & RECLAIM_SWAP) 3549 if (zone_reclaim_mode & RECLAIM_SWAP)
3543 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); 3550 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
3544 else 3551 else
3545 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); 3552 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
3546 3553
3547 /* If we can't clean pages, remove dirty pages from consideration */ 3554 /* If we can't clean pages, remove dirty pages from consideration */
3548 if (!(zone_reclaim_mode & RECLAIM_WRITE)) 3555 if (!(zone_reclaim_mode & RECLAIM_WRITE))
3549 delta += zone_page_state(zone, NR_FILE_DIRTY); 3556 delta += zone_page_state(zone, NR_FILE_DIRTY);
3550 3557
3551 /* Watch for any possible underflows due to delta */ 3558 /* Watch for any possible underflows due to delta */
3552 if (unlikely(delta > nr_pagecache_reclaimable)) 3559 if (unlikely(delta > nr_pagecache_reclaimable))
3553 delta = nr_pagecache_reclaimable; 3560 delta = nr_pagecache_reclaimable;
3554 3561
3555 return nr_pagecache_reclaimable - delta; 3562 return nr_pagecache_reclaimable - delta;
3556 } 3563 }
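/*
 * Worked example with illustrative numbers, assuming RECLAIM_SWAP and
 * RECLAIM_WRITE are both clear: 10000 pages on the file LRUs, 3000 of them
 * mapped, 500 dirty.
 *
 *	nr_pagecache_reclaimable = 10000 - 3000 = 7000	(unmapped file pages)
 *	delta                    = 500			(dirty, no writeback allowed)
 *	return value             = 7000 - 500   = 6500
 *
 * With RECLAIM_SWAP set, the starting point would instead be the zone's
 * full NR_FILE_PAGES count.
 */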
3557 3564
3558 /* 3565 /*
3559 * Try to free up some pages from this zone through reclaim. 3566 * Try to free up some pages from this zone through reclaim.
3560 */ 3567 */
3561 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 3568 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3562 { 3569 {
3563 /* Minimum pages needed in order to stay on node */ 3570 /* Minimum pages needed in order to stay on node */
3564 const unsigned long nr_pages = 1 << order; 3571 const unsigned long nr_pages = 1 << order;
3565 struct task_struct *p = current; 3572 struct task_struct *p = current;
3566 struct reclaim_state reclaim_state; 3573 struct reclaim_state reclaim_state;
3567 struct scan_control sc = { 3574 struct scan_control sc = {
3568 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 3575 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3569 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3576 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3570 .may_swap = 1, 3577 .may_swap = 1,
3571 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), 3578 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3572 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), 3579 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
3573 .order = order, 3580 .order = order,
3574 .priority = ZONE_RECLAIM_PRIORITY, 3581 .priority = ZONE_RECLAIM_PRIORITY,
3575 }; 3582 };
3576 struct shrink_control shrink = { 3583 struct shrink_control shrink = {
3577 .gfp_mask = sc.gfp_mask, 3584 .gfp_mask = sc.gfp_mask,
3578 }; 3585 };
3579 unsigned long nr_slab_pages0, nr_slab_pages1; 3586 unsigned long nr_slab_pages0, nr_slab_pages1;
3580 3587
3581 cond_resched(); 3588 cond_resched();
3582 /* 3589 /*
3583 * We need to be able to allocate from the reserves for RECLAIM_SWAP 3590 * We need to be able to allocate from the reserves for RECLAIM_SWAP
3584 * and we also need to be able to write out pages for RECLAIM_WRITE 3591 * and we also need to be able to write out pages for RECLAIM_WRITE
3585 * and RECLAIM_SWAP. 3592 * and RECLAIM_SWAP.
3586 */ 3593 */
3587 p->flags |= PF_MEMALLOC | PF_SWAPWRITE; 3594 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
3588 lockdep_set_current_reclaim_state(gfp_mask); 3595 lockdep_set_current_reclaim_state(gfp_mask);
3589 reclaim_state.reclaimed_slab = 0; 3596 reclaim_state.reclaimed_slab = 0;
3590 p->reclaim_state = &reclaim_state; 3597 p->reclaim_state = &reclaim_state;
3591 3598
3592 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { 3599 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
3593 /* 3600 /*
3594 * Free memory by calling shrink zone with increasing 3601 * Free memory by calling shrink zone with increasing
3595 * priorities until we have enough memory freed. 3602 * priorities until we have enough memory freed.
3596 */ 3603 */
3597 do { 3604 do {
3598 shrink_zone(zone, &sc); 3605 shrink_zone(zone, &sc);
3599 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); 3606 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
3600 } 3607 }
3601 3608
3602 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 3609 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3603 if (nr_slab_pages0 > zone->min_slab_pages) { 3610 if (nr_slab_pages0 > zone->min_slab_pages) {
3604 /* 3611 /*
3605 * shrink_slab() does not currently allow us to determine how 3612 * shrink_slab() does not currently allow us to determine how
3606 * many pages were freed in this zone. So we take the current 3613 * many pages were freed in this zone. So we take the current
3607 * number of slab pages and shake the slab until it is reduced 3614 * number of slab pages and shake the slab until it is reduced
3608 * by the same nr_pages that we used for reclaiming unmapped 3615 * by the same nr_pages that we used for reclaiming unmapped
3609 * pages. 3616 * pages.
3610 */ 3617 */
3611 nodes_clear(shrink.nodes_to_scan); 3618 nodes_clear(shrink.nodes_to_scan);
3612 node_set(zone_to_nid(zone), shrink.nodes_to_scan); 3619 node_set(zone_to_nid(zone), shrink.nodes_to_scan);
3613 for (;;) { 3620 for (;;) {
3614 unsigned long lru_pages = zone_reclaimable_pages(zone); 3621 unsigned long lru_pages = zone_reclaimable_pages(zone);
3615 3622
3616 /* No reclaimable slab or very low memory pressure */ 3623 /* No reclaimable slab or very low memory pressure */
3617 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) 3624 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
3618 break; 3625 break;
3619 3626
3620 /* Freed enough memory */ 3627 /* Freed enough memory */
3621 nr_slab_pages1 = zone_page_state(zone, 3628 nr_slab_pages1 = zone_page_state(zone,
3622 NR_SLAB_RECLAIMABLE); 3629 NR_SLAB_RECLAIMABLE);
3623 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) 3630 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
3624 break; 3631 break;
3625 } 3632 }
3626 3633
3627 /* 3634 /*
3628 * Update nr_reclaimed by the number of slab pages we 3635 * Update nr_reclaimed by the number of slab pages we
3629 * reclaimed from this zone. 3636 * reclaimed from this zone.
3630 */ 3637 */
3631 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 3638 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3632 if (nr_slab_pages1 < nr_slab_pages0) 3639 if (nr_slab_pages1 < nr_slab_pages0)
3633 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; 3640 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
3634 } 3641 }
3635 3642
3636 p->reclaim_state = NULL; 3643 p->reclaim_state = NULL;
3637 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 3644 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
3638 lockdep_clear_current_reclaim_state(); 3645 lockdep_clear_current_reclaim_state();
3639 return sc.nr_reclaimed >= nr_pages; 3646 return sc.nr_reclaimed >= nr_pages;
3640 } 3647 }
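/*
 * Worked example of the success criterion above, assuming SWAP_CLUSTER_MAX
 * is 32 as defined in include/linux/swap.h: for an order-3 allocation,
 *
 *	nr_pages         = 1 << 3      = 8
 *	sc.nr_to_reclaim = max(8, 32)  = 32
 *
 * so each pass aims for 32 pages, but __zone_reclaim() reports success as
 * soon as at least 8 pages (page cache and/or slab) have been freed.
 */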
3641 3648
3642 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 3649 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3643 { 3650 {
3644 int node_id; 3651 int node_id;
3645 int ret; 3652 int ret;
3646 3653
3647 /* 3654 /*
3648 * Zone reclaim reclaims unmapped file backed pages and 3655 * Zone reclaim reclaims unmapped file backed pages and
3649 * slab pages if we are over the defined limits. 3656 * slab pages if we are over the defined limits.
3650 * 3657 *
3651 * A small portion of unmapped file backed pages is needed for 3658 * A small portion of unmapped file backed pages is needed for
3652 * file I/O otherwise pages read by file I/O will be immediately 3659 * file I/O otherwise pages read by file I/O will be immediately
3653 * thrown out if the zone is overallocated. So we do not reclaim 3660 * thrown out if the zone is overallocated. So we do not reclaim
3654 * if less than a specified percentage of the zone is used by 3661 * if less than a specified percentage of the zone is used by
3655 * unmapped file backed pages. 3662 * unmapped file backed pages.
3656 */ 3663 */
3657 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && 3664 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
3658 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) 3665 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
3659 return ZONE_RECLAIM_FULL; 3666 return ZONE_RECLAIM_FULL;
3660 3667
3661 if (!zone_reclaimable(zone)) 3668 if (!zone_reclaimable(zone))
3662 return ZONE_RECLAIM_FULL; 3669 return ZONE_RECLAIM_FULL;
3663 3670
3664 /* 3671 /*
3665 * Do not scan if the allocation should not be delayed. 3672 * Do not scan if the allocation should not be delayed.
3666 */ 3673 */
3667 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) 3674 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
3668 return ZONE_RECLAIM_NOSCAN; 3675 return ZONE_RECLAIM_NOSCAN;
3669 3676
3670 /* 3677 /*
3671 * Only run zone reclaim on the local zone or on zones that do not 3678 * Only run zone reclaim on the local zone or on zones that do not
3672 * have associated processors. This will favor the local processor 3679 * have associated processors. This will favor the local processor
3673 * over remote processors and spread off node memory allocations 3680 * over remote processors and spread off node memory allocations
3674 * as wide as possible. 3681 * as wide as possible.
3675 */ 3682 */
3676 node_id = zone_to_nid(zone); 3683 node_id = zone_to_nid(zone);
3677 if (node_state(node_id, N_CPU) && node_id != numa_node_id()) 3684 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
3678 return ZONE_RECLAIM_NOSCAN; 3685 return ZONE_RECLAIM_NOSCAN;
3679 3686
3680 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) 3687 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
3681 return ZONE_RECLAIM_NOSCAN; 3688 return ZONE_RECLAIM_NOSCAN;
3682 3689
3683 ret = __zone_reclaim(zone, gfp_mask, order); 3690 ret = __zone_reclaim(zone, gfp_mask, order);
3684 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); 3691 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
3685 3692
3686 if (!ret) 3693 if (!ret)
3687 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); 3694 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
3688 3695
3689 return ret; 3696 return ret;
3690 } 3697 }
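/*
 * Condensed, hedged sketch of how the caller reacts to these return codes;
 * the real logic is in get_page_from_freelist() in mm/page_alloc.c and also
 * rechecks the zone watermark before allocating on success.  The helper name
 * is made up for illustration.
 */
static bool try_zone_reclaim_sketch(struct zone *zone, gfp_t gfp_mask,
				    unsigned int order)
{
	switch (zone_reclaim(zone, gfp_mask, order)) {
	case ZONE_RECLAIM_NOSCAN:	/* reclaim was not attempted */
	case ZONE_RECLAIM_FULL:		/* scanned but nothing reclaimable */
		return false;		/* caller moves on to the next zone */
	default:
		return true;		/* some progress; recheck the watermark */
	}
}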
3691 #endif 3698 #endif
3692 3699
3693 /* 3700 /*
3694 * page_evictable - test whether a page is evictable 3701 * page_evictable - test whether a page is evictable
3695 * @page: the page to test 3702 * @page: the page to test
3696 * 3703 *
3697 * Test whether page is evictable--i.e., should be placed on active/inactive 3704 * Test whether page is evictable--i.e., should be placed on active/inactive
3698 * lists vs unevictable list. 3705 * lists vs unevictable list.
3699 * 3706 *
3700 * Reasons page might not be evictable: 3707 * Reasons page might not be evictable:
3701 * (1) page's mapping marked unevictable 3708 * (1) page's mapping marked unevictable
3702 * (2) page is part of an mlocked VMA 3709 * (2) page is part of an mlocked VMA
3703 * 3710 *
3704 */ 3711 */
3705 int page_evictable(struct page *page) 3712 int page_evictable(struct page *page)
3706 { 3713 {
3707 return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); 3714 return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
3708 } 3715 }
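/*
 * Illustration of the two cases listed above:
 *
 *	mlock(addr, len)	  -> the VMA's pages get PG_mlocked, so
 *				     PageMlocked(page) is true;
 *	shmctl(id, SHM_LOCK, NULL) -> shmem marks the file's address_space
 *				     with mapping_set_unevictable().
 *
 * Either condition makes page_evictable() return 0 and keeps the page on
 * the unevictable LRU until the lock goes away.
 */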
3709 3716
3710 #ifdef CONFIG_SHMEM 3717 #ifdef CONFIG_SHMEM
3711 /** 3718 /**
3712 * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list 3719 * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
3713 * @pages: array of pages to check 3720 * @pages: array of pages to check
3714 * @nr_pages: number of pages to check 3721 * @nr_pages: number of pages to check
3715 * 3722 *
3716 * Checks pages for evictability and moves them to the appropriate lru list. 3723 * Checks pages for evictability and moves them to the appropriate lru list.
3717 * 3724 *
3718 * This function is only used for SysV IPC SHM_UNLOCK. 3725 * This function is only used for SysV IPC SHM_UNLOCK.
3719 */ 3726 */
3720 void check_move_unevictable_pages(struct page **pages, int nr_pages) 3727 void check_move_unevictable_pages(struct page **pages, int nr_pages)
3721 { 3728 {
3722 struct lruvec *lruvec; 3729 struct lruvec *lruvec;
3723 struct zone *zone = NULL; 3730 struct zone *zone = NULL;
3724 int pgscanned = 0; 3731 int pgscanned = 0;
3725 int pgrescued = 0; 3732 int pgrescued = 0;
3726 int i; 3733 int i;
3727 3734
3728 for (i = 0; i < nr_pages; i++) { 3735 for (i = 0; i < nr_pages; i++) {
3729 struct page *page = pages[i]; 3736 struct page *page = pages[i];
3730 struct zone *pagezone; 3737 struct zone *pagezone;
3731 3738
3732 pgscanned++; 3739 pgscanned++;
3733 pagezone = page_zone(page); 3740 pagezone = page_zone(page);
3734 if (pagezone != zone) { 3741 if (pagezone != zone) {
3735 if (zone) 3742 if (zone)
3736 spin_unlock_irq(&zone->lru_lock); 3743 spin_unlock_irq(&zone->lru_lock);
3737 zone = pagezone; 3744 zone = pagezone;
3738 spin_lock_irq(&zone->lru_lock); 3745 spin_lock_irq(&zone->lru_lock);
3739 } 3746 }
3740 lruvec = mem_cgroup_page_lruvec(page, zone); 3747 lruvec = mem_cgroup_page_lruvec(page, zone);
3741 3748
3742 if (!PageLRU(page) || !PageUnevictable(page)) 3749 if (!PageLRU(page) || !PageUnevictable(page))
3743 continue; 3750 continue;
3744 3751
3745 if (page_evictable(page)) { 3752 if (page_evictable(page)) {
3746 enum lru_list lru = page_lru_base_type(page); 3753 enum lru_list lru = page_lru_base_type(page);
3747 3754
3748 VM_BUG_ON(PageActive(page)); 3755 VM_BUG_ON(PageActive(page));
3749 ClearPageUnevictable(page); 3756 ClearPageUnevictable(page);
3750 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); 3757 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
3751 add_page_to_lru_list(page, lruvec, lru); 3758 add_page_to_lru_list(page, lruvec, lru);
3752 pgrescued++; 3759 pgrescued++;
3753 } 3760 }
3754 } 3761 }
3755 3762
3756 if (zone) { 3763 if (zone) {
3757 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); 3764 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
3758 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); 3765 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
3759 spin_unlock_irq(&zone->lru_lock); 3766 spin_unlock_irq(&zone->lru_lock);
3760 } 3767 }
3761 } 3768 }
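/*
 * Hedged sketch of the only caller pattern: shmem_unlock_mapping() in
 * mm/shmem.c walks the no-longer-locked mapping with a pagevec and feeds
 * each batch through check_move_unevictable_pages().  The helper below is
 * illustrative only.
 */
static void shm_unlock_rescue_sketch(struct page **batch, int nr_in_batch)
{
	if (nr_in_batch)
		check_move_unevictable_pages(batch, nr_in_batch);
	cond_resched();	/* mappings can be large; batching keeps latency down */
}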
3762 #endif /* CONFIG_SHMEM */ 3769 #endif /* CONFIG_SHMEM */
3763 3770
3764 static void warn_scan_unevictable_pages(void) 3771 static void warn_scan_unevictable_pages(void)
3765 { 3772 {
3766 printk_once(KERN_WARNING 3773 printk_once(KERN_WARNING
3767 "%s: The scan_unevictable_pages sysctl/node-interface has been " 3774 "%s: The scan_unevictable_pages sysctl/node-interface has been "
3768 "disabled for lack of a legitimate use case. If you have " 3775 "disabled for lack of a legitimate use case. If you have "
3769 "one, please send an email to linux-mm@kvack.org.\n", 3776 "one, please send an email to linux-mm@kvack.org.\n",
3770 current->comm); 3777 current->comm);
3771 } 3778 }
3772 3779
3773 /* 3780 /*
3774 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of 3781 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
3775 * all nodes' unevictable lists for evictable pages 3782 * all nodes' unevictable lists for evictable pages
3776 */ 3783 */
3777 unsigned long scan_unevictable_pages; 3784 unsigned long scan_unevictable_pages;
3778 3785
3779 int scan_unevictable_handler(struct ctl_table *table, int write, 3786 int scan_unevictable_handler(struct ctl_table *table, int write,
3780 void __user *buffer, 3787 void __user *buffer,
3781 size_t *length, loff_t *ppos) 3788 size_t *length, loff_t *ppos)
3782 { 3789 {
3783 warn_scan_unevictable_pages(); 3790 warn_scan_unevictable_pages();
3784 proc_doulongvec_minmax(table, write, buffer, length, ppos); 3791 proc_doulongvec_minmax(table, write, buffer, length, ppos);
3785 scan_unevictable_pages = 0; 3792 scan_unevictable_pages = 0;
3786 return 0; 3793 return 0;
3787 } 3794 }
3788 3795
3789 #ifdef CONFIG_NUMA 3796 #ifdef CONFIG_NUMA
3790 /* 3797 /*
3791 * per node 'scan_unevictable_pages' attribute. On demand re-scan of 3798 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
3792 * a specified node's per zone unevictable lists for evictable pages. 3799 * a specified node's per zone unevictable lists for evictable pages.
3793 */ 3800 */
3794 3801
3795 static ssize_t read_scan_unevictable_node(struct device *dev, 3802 static ssize_t read_scan_unevictable_node(struct device *dev,
3796 struct device_attribute *attr, 3803 struct device_attribute *attr,
3797 char *buf) 3804 char *buf)
3798 { 3805 {
3799 warn_scan_unevictable_pages(); 3806 warn_scan_unevictable_pages();
3800 return sprintf(buf, "0\n"); /* always zero; should fit... */ 3807 return sprintf(buf, "0\n"); /* always zero; should fit... */
3801 } 3808 }
3802 3809
3803 static ssize_t write_scan_unevictable_node(struct device *dev, 3810 static ssize_t write_scan_unevictable_node(struct device *dev,
3804 struct device_attribute *attr, 3811 struct device_attribute *attr,
3805 const char *buf, size_t count) 3812 const char *buf, size_t count)
3806 { 3813 {
3807 warn_scan_unevictable_pages(); 3814 warn_scan_unevictable_pages();
3808 return 1; 3815 return 1;
3809 } 3816 }
3810 3817
3811 3818
3812 static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, 3819 static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
3813 read_scan_unevictable_node, 3820 read_scan_unevictable_node,
3814 write_scan_unevictable_node); 3821 write_scan_unevictable_node);
3815 3822
3816 int scan_unevictable_register_node(struct node *node) 3823 int scan_unevictable_register_node(struct node *node)
3817 { 3824 {
3818 return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages); 3825 return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
3819 } 3826 }
3820 3827
3821 void scan_unevictable_unregister_node(struct node *node) 3828 void scan_unevictable_unregister_node(struct node *node)