Commit 2ce666c175bb502cae050c97364acf48449b9d6a

Authored by Mel Gorman
Committed by Jiri Slaby
1 parent 2d37a72e40

mm: vmscan: use proportional scanning during direct reclaim and full scan at DEF_PRIORITY

commit 1a501907bbea8e6ebb0b16cf6db9e9cbf1d2c813 upstream.

Commit "mm: vmscan: obey proportional scanning requirements for kswapd"
ensured that file/anon lists were scanned proportionally for reclaim from
kswapd but ignored it for direct reclaim.  The intent was to minimse
direct reclaim latency but Yuanhan Liu pointer out that it substitutes one
long stall for many small stalls and distorts aging for normal workloads
like streaming readers/writers.  Hugh Dickins pointed out that a
side-effect of the same commit was that when one LRU list dropped to zero
that the entirety of the other list was shrunk leading to excessive
reclaim in memcgs.  This patch scans the file/anon lists proportionally
for direct reclaim to similarly age page whether reclaimed by kswapd or
direct reclaim but takes care to abort reclaim if one LRU drops to zero
after reclaiming the requested number of pages.

Based on ext4 and using the Intel VM scalability test

                                              3.15.0-rc5            3.15.0-rc5
                                                shrinker            proportion
Unit  lru-file-readonce    elapsed      5.3500 (  0.00%)      5.4200 ( -1.31%)
Unit  lru-file-readonce time_range      0.2700 (  0.00%)      0.1400 ( 48.15%)
Unit  lru-file-readonce time_stddv      0.1148 (  0.00%)      0.0536 ( 53.33%)
Unit lru-file-readtwice    elapsed      8.1700 (  0.00%)      8.1700 (  0.00%)
Unit lru-file-readtwice time_range      0.4300 (  0.00%)      0.2300 ( 46.51%)
Unit lru-file-readtwice time_stddv      0.1650 (  0.00%)      0.0971 ( 41.16%)

The test cases run multiple dd instances reading sparse files.  The results
are within the noise for the small test machine.  The impact of the patch
is more noticeable in the vmstats:

                            3.15.0-rc5  3.15.0-rc5
                              shrinker  proportion
Minor Faults                     35154       36784
Major Faults                       611        1305
Swap Ins                           394        1651
Swap Outs                         4394        5891
Allocation stalls               118616       44781
Direct pages scanned           4935171     4602313
Kswapd pages scanned          15921292    16258483
Kswapd pages reclaimed        15913301    16248305
Direct pages reclaimed         4933368     4601133
Kswapd efficiency                  99%         99%
Kswapd velocity             670088.047  682555.961
Direct efficiency                  99%         99%
Direct velocity             207709.217  193212.133
Percentage direct scans            23%         22%
Page writes by reclaim        4858.000    6232.000
Page writes file                   464         341
Page writes anon                  4394        5891

Note that there are fewer allocation stalls even though the amount
of direct reclaim scanning is roughly the same.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Tested-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Cc: Bob Liu <bob.liu@oracle.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>

Showing 1 changed file with 25 additions and 11 deletions (mm/vmscan.c)

1 /* 1 /*
2 * linux/mm/vmscan.c 2 * linux/mm/vmscan.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * 5 *
6 * Swap reorganised 29.12.95, Stephen Tweedie. 6 * Swap reorganised 29.12.95, Stephen Tweedie.
7 * kswapd added: 7.1.96 sct 7 * kswapd added: 7.1.96 sct
8 * Removed kswapd_ctl limits, and swap out as many pages as needed 8 * Removed kswapd_ctl limits, and swap out as many pages as needed
9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel. 9 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
10 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). 10 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
11 * Multiqueue VM started 5.8.00, Rik van Riel. 11 * Multiqueue VM started 5.8.00, Rik van Riel.
12 */ 12 */
13 13
14 #include <linux/mm.h> 14 #include <linux/mm.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/gfp.h> 16 #include <linux/gfp.h>
17 #include <linux/kernel_stat.h> 17 #include <linux/kernel_stat.h>
18 #include <linux/swap.h> 18 #include <linux/swap.h>
19 #include <linux/pagemap.h> 19 #include <linux/pagemap.h>
20 #include <linux/init.h> 20 #include <linux/init.h>
21 #include <linux/highmem.h> 21 #include <linux/highmem.h>
22 #include <linux/vmpressure.h> 22 #include <linux/vmpressure.h>
23 #include <linux/vmstat.h> 23 #include <linux/vmstat.h>
24 #include <linux/file.h> 24 #include <linux/file.h>
25 #include <linux/writeback.h> 25 #include <linux/writeback.h>
26 #include <linux/blkdev.h> 26 #include <linux/blkdev.h>
27 #include <linux/buffer_head.h> /* for try_to_release_page(), 27 #include <linux/buffer_head.h> /* for try_to_release_page(),
28 buffer_heads_over_limit */ 28 buffer_heads_over_limit */
29 #include <linux/mm_inline.h> 29 #include <linux/mm_inline.h>
30 #include <linux/backing-dev.h> 30 #include <linux/backing-dev.h>
31 #include <linux/rmap.h> 31 #include <linux/rmap.h>
32 #include <linux/topology.h> 32 #include <linux/topology.h>
33 #include <linux/cpu.h> 33 #include <linux/cpu.h>
34 #include <linux/cpuset.h> 34 #include <linux/cpuset.h>
35 #include <linux/compaction.h> 35 #include <linux/compaction.h>
36 #include <linux/notifier.h> 36 #include <linux/notifier.h>
37 #include <linux/rwsem.h> 37 #include <linux/rwsem.h>
38 #include <linux/delay.h> 38 #include <linux/delay.h>
39 #include <linux/kthread.h> 39 #include <linux/kthread.h>
40 #include <linux/freezer.h> 40 #include <linux/freezer.h>
41 #include <linux/memcontrol.h> 41 #include <linux/memcontrol.h>
42 #include <linux/delayacct.h> 42 #include <linux/delayacct.h>
43 #include <linux/sysctl.h> 43 #include <linux/sysctl.h>
44 #include <linux/oom.h> 44 #include <linux/oom.h>
45 #include <linux/prefetch.h> 45 #include <linux/prefetch.h>
46 46
47 #include <asm/tlbflush.h> 47 #include <asm/tlbflush.h>
48 #include <asm/div64.h> 48 #include <asm/div64.h>
49 49
50 #include <linux/swapops.h> 50 #include <linux/swapops.h>
51 #include <linux/balloon_compaction.h> 51 #include <linux/balloon_compaction.h>
52 52
53 #include "internal.h" 53 #include "internal.h"
54 54
55 #define CREATE_TRACE_POINTS 55 #define CREATE_TRACE_POINTS
56 #include <trace/events/vmscan.h> 56 #include <trace/events/vmscan.h>
57 57
58 struct scan_control { 58 struct scan_control {
59 /* Incremented by the number of inactive pages that were scanned */ 59 /* Incremented by the number of inactive pages that were scanned */
60 unsigned long nr_scanned; 60 unsigned long nr_scanned;
61 61
62 /* Number of pages freed so far during a call to shrink_zones() */ 62 /* Number of pages freed so far during a call to shrink_zones() */
63 unsigned long nr_reclaimed; 63 unsigned long nr_reclaimed;
64 64
65 /* How many pages shrink_list() should reclaim */ 65 /* How many pages shrink_list() should reclaim */
66 unsigned long nr_to_reclaim; 66 unsigned long nr_to_reclaim;
67 67
68 unsigned long hibernation_mode; 68 unsigned long hibernation_mode;
69 69
70 /* This context's GFP mask */ 70 /* This context's GFP mask */
71 gfp_t gfp_mask; 71 gfp_t gfp_mask;
72 72
73 int may_writepage; 73 int may_writepage;
74 74
75 /* Can mapped pages be reclaimed? */ 75 /* Can mapped pages be reclaimed? */
76 int may_unmap; 76 int may_unmap;
77 77
78 /* Can pages be swapped as part of reclaim? */ 78 /* Can pages be swapped as part of reclaim? */
79 int may_swap; 79 int may_swap;
80 80
81 int order; 81 int order;
82 82
83 /* Scan (total_size >> priority) pages at once */ 83 /* Scan (total_size >> priority) pages at once */
84 int priority; 84 int priority;
85 85
86 /* 86 /*
87 * The memory cgroup that hit its limit and as a result is the 87 * The memory cgroup that hit its limit and as a result is the
88 * primary target of this reclaim invocation. 88 * primary target of this reclaim invocation.
89 */ 89 */
90 struct mem_cgroup *target_mem_cgroup; 90 struct mem_cgroup *target_mem_cgroup;
91 91
92 /* 92 /*
93 * Nodemask of nodes allowed by the caller. If NULL, all nodes 93 * Nodemask of nodes allowed by the caller. If NULL, all nodes
94 * are scanned. 94 * are scanned.
95 */ 95 */
96 nodemask_t *nodemask; 96 nodemask_t *nodemask;
97 }; 97 };
98 98
99 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 99 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
100 100
101 #ifdef ARCH_HAS_PREFETCH 101 #ifdef ARCH_HAS_PREFETCH
102 #define prefetch_prev_lru_page(_page, _base, _field) \ 102 #define prefetch_prev_lru_page(_page, _base, _field) \
103 do { \ 103 do { \
104 if ((_page)->lru.prev != _base) { \ 104 if ((_page)->lru.prev != _base) { \
105 struct page *prev; \ 105 struct page *prev; \
106 \ 106 \
107 prev = lru_to_page(&(_page->lru)); \ 107 prev = lru_to_page(&(_page->lru)); \
108 prefetch(&prev->_field); \ 108 prefetch(&prev->_field); \
109 } \ 109 } \
110 } while (0) 110 } while (0)
111 #else 111 #else
112 #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) 112 #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
113 #endif 113 #endif
114 114
115 #ifdef ARCH_HAS_PREFETCHW 115 #ifdef ARCH_HAS_PREFETCHW
116 #define prefetchw_prev_lru_page(_page, _base, _field) \ 116 #define prefetchw_prev_lru_page(_page, _base, _field) \
117 do { \ 117 do { \
118 if ((_page)->lru.prev != _base) { \ 118 if ((_page)->lru.prev != _base) { \
119 struct page *prev; \ 119 struct page *prev; \
120 \ 120 \
121 prev = lru_to_page(&(_page->lru)); \ 121 prev = lru_to_page(&(_page->lru)); \
122 prefetchw(&prev->_field); \ 122 prefetchw(&prev->_field); \
123 } \ 123 } \
124 } while (0) 124 } while (0)
125 #else 125 #else
126 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) 126 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
127 #endif 127 #endif
128 128
129 /* 129 /*
130 * From 0 .. 100. Higher means more swappy. 130 * From 0 .. 100. Higher means more swappy.
131 */ 131 */
132 int vm_swappiness = 60; 132 int vm_swappiness = 60;
133 unsigned long vm_total_pages; /* The total number of pages which the VM controls */ 133 unsigned long vm_total_pages; /* The total number of pages which the VM controls */
134 134
135 static LIST_HEAD(shrinker_list); 135 static LIST_HEAD(shrinker_list);
136 static DECLARE_RWSEM(shrinker_rwsem); 136 static DECLARE_RWSEM(shrinker_rwsem);
137 137
138 #ifdef CONFIG_MEMCG 138 #ifdef CONFIG_MEMCG
139 static bool global_reclaim(struct scan_control *sc) 139 static bool global_reclaim(struct scan_control *sc)
140 { 140 {
141 return !sc->target_mem_cgroup; 141 return !sc->target_mem_cgroup;
142 } 142 }
143 #else 143 #else
144 static bool global_reclaim(struct scan_control *sc) 144 static bool global_reclaim(struct scan_control *sc)
145 { 145 {
146 return true; 146 return true;
147 } 147 }
148 #endif 148 #endif
149 149
150 static unsigned long zone_reclaimable_pages(struct zone *zone) 150 static unsigned long zone_reclaimable_pages(struct zone *zone)
151 { 151 {
152 int nr; 152 int nr;
153 153
154 nr = zone_page_state(zone, NR_ACTIVE_FILE) + 154 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
155 zone_page_state(zone, NR_INACTIVE_FILE); 155 zone_page_state(zone, NR_INACTIVE_FILE);
156 156
157 if (get_nr_swap_pages() > 0) 157 if (get_nr_swap_pages() > 0)
158 nr += zone_page_state(zone, NR_ACTIVE_ANON) + 158 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
159 zone_page_state(zone, NR_INACTIVE_ANON); 159 zone_page_state(zone, NR_INACTIVE_ANON);
160 160
161 return nr; 161 return nr;
162 } 162 }
163 163
164 bool zone_reclaimable(struct zone *zone) 164 bool zone_reclaimable(struct zone *zone)
165 { 165 {
166 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; 166 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
167 } 167 }
168 168
169 static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) 169 static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
170 { 170 {
171 if (!mem_cgroup_disabled()) 171 if (!mem_cgroup_disabled())
172 return mem_cgroup_get_lru_size(lruvec, lru); 172 return mem_cgroup_get_lru_size(lruvec, lru);
173 173
174 return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); 174 return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
175 } 175 }
176 176
177 /* 177 /*
178 * Add a shrinker callback to be called from the vm. 178 * Add a shrinker callback to be called from the vm.
179 */ 179 */
180 int register_shrinker(struct shrinker *shrinker) 180 int register_shrinker(struct shrinker *shrinker)
181 { 181 {
182 size_t size = sizeof(*shrinker->nr_deferred); 182 size_t size = sizeof(*shrinker->nr_deferred);
183 183
184 /* 184 /*
185 * If we only have one possible node in the system anyway, save 185 * If we only have one possible node in the system anyway, save
186 * ourselves the trouble and disable NUMA aware behavior. This way we 186 * ourselves the trouble and disable NUMA aware behavior. This way we
187 * will save memory and some small loop time later. 187 * will save memory and some small loop time later.
188 */ 188 */
189 if (nr_node_ids == 1) 189 if (nr_node_ids == 1)
190 shrinker->flags &= ~SHRINKER_NUMA_AWARE; 190 shrinker->flags &= ~SHRINKER_NUMA_AWARE;
191 191
192 if (shrinker->flags & SHRINKER_NUMA_AWARE) 192 if (shrinker->flags & SHRINKER_NUMA_AWARE)
193 size *= nr_node_ids; 193 size *= nr_node_ids;
194 194
195 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); 195 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
196 if (!shrinker->nr_deferred) 196 if (!shrinker->nr_deferred)
197 return -ENOMEM; 197 return -ENOMEM;
198 198
199 down_write(&shrinker_rwsem); 199 down_write(&shrinker_rwsem);
200 list_add_tail(&shrinker->list, &shrinker_list); 200 list_add_tail(&shrinker->list, &shrinker_list);
201 up_write(&shrinker_rwsem); 201 up_write(&shrinker_rwsem);
202 return 0; 202 return 0;
203 } 203 }
204 EXPORT_SYMBOL(register_shrinker); 204 EXPORT_SYMBOL(register_shrinker);
205 205
206 /* 206 /*
207 * Remove one 207 * Remove one
208 */ 208 */
209 void unregister_shrinker(struct shrinker *shrinker) 209 void unregister_shrinker(struct shrinker *shrinker)
210 { 210 {
211 down_write(&shrinker_rwsem); 211 down_write(&shrinker_rwsem);
212 list_del(&shrinker->list); 212 list_del(&shrinker->list);
213 up_write(&shrinker_rwsem); 213 up_write(&shrinker_rwsem);
214 kfree(shrinker->nr_deferred); 214 kfree(shrinker->nr_deferred);
215 } 215 }
216 EXPORT_SYMBOL(unregister_shrinker); 216 EXPORT_SYMBOL(unregister_shrinker);
217 217
218 #define SHRINK_BATCH 128 218 #define SHRINK_BATCH 128
219 219
220 static unsigned long 220 static unsigned long
221 shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, 221 shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
222 unsigned long nr_pages_scanned, unsigned long lru_pages) 222 unsigned long nr_pages_scanned, unsigned long lru_pages)
223 { 223 {
224 unsigned long freed = 0; 224 unsigned long freed = 0;
225 unsigned long long delta; 225 unsigned long long delta;
226 long total_scan; 226 long total_scan;
227 long freeable; 227 long freeable;
228 long nr; 228 long nr;
229 long new_nr; 229 long new_nr;
230 int nid = shrinkctl->nid; 230 int nid = shrinkctl->nid;
231 long batch_size = shrinker->batch ? shrinker->batch 231 long batch_size = shrinker->batch ? shrinker->batch
232 : SHRINK_BATCH; 232 : SHRINK_BATCH;
233 233
234 freeable = shrinker->count_objects(shrinker, shrinkctl); 234 freeable = shrinker->count_objects(shrinker, shrinkctl);
235 if (freeable == 0) 235 if (freeable == 0)
236 return 0; 236 return 0;
237 237
238 /* 238 /*
239 * copy the current shrinker scan count into a local variable 239 * copy the current shrinker scan count into a local variable
240 * and zero it so that other concurrent shrinker invocations 240 * and zero it so that other concurrent shrinker invocations
241 * don't also do this scanning work. 241 * don't also do this scanning work.
242 */ 242 */
243 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); 243 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
244 244
245 total_scan = nr; 245 total_scan = nr;
246 delta = (4 * nr_pages_scanned) / shrinker->seeks; 246 delta = (4 * nr_pages_scanned) / shrinker->seeks;
247 delta *= freeable; 247 delta *= freeable;
248 do_div(delta, lru_pages + 1); 248 do_div(delta, lru_pages + 1);
249 total_scan += delta; 249 total_scan += delta;
250 if (total_scan < 0) { 250 if (total_scan < 0) {
251 printk(KERN_ERR 251 printk(KERN_ERR
252 "shrink_slab: %pF negative objects to delete nr=%ld\n", 252 "shrink_slab: %pF negative objects to delete nr=%ld\n",
253 shrinker->scan_objects, total_scan); 253 shrinker->scan_objects, total_scan);
254 total_scan = freeable; 254 total_scan = freeable;
255 } 255 }
256 256
257 /* 257 /*
258 * We need to avoid excessive windup on filesystem shrinkers 258 * We need to avoid excessive windup on filesystem shrinkers
259 * due to large numbers of GFP_NOFS allocations causing the 259 * due to large numbers of GFP_NOFS allocations causing the
260 * shrinkers to return -1 all the time. This results in a large 260 * shrinkers to return -1 all the time. This results in a large
261 * nr being built up so when a shrink that can do some work 261 * nr being built up so when a shrink that can do some work
262 * comes along it empties the entire cache due to nr >>> 262 * comes along it empties the entire cache due to nr >>>
263 * freeable. This is bad for sustaining a working set in 263 * freeable. This is bad for sustaining a working set in
264 * memory. 264 * memory.
265 * 265 *
266 * Hence only allow the shrinker to scan the entire cache when 266 * Hence only allow the shrinker to scan the entire cache when
267 * a large delta change is calculated directly. 267 * a large delta change is calculated directly.
268 */ 268 */
269 if (delta < freeable / 4) 269 if (delta < freeable / 4)
270 total_scan = min(total_scan, freeable / 2); 270 total_scan = min(total_scan, freeable / 2);
271 271
272 /* 272 /*
273 * Avoid risking looping forever due to too large nr value: 273 * Avoid risking looping forever due to too large nr value:
274 * never try to free more than twice the estimate number of 274 * never try to free more than twice the estimate number of
275 * freeable entries. 275 * freeable entries.
276 */ 276 */
277 if (total_scan > freeable * 2) 277 if (total_scan > freeable * 2)
278 total_scan = freeable * 2; 278 total_scan = freeable * 2;
279 279
280 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, 280 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
281 nr_pages_scanned, lru_pages, 281 nr_pages_scanned, lru_pages,
282 freeable, delta, total_scan); 282 freeable, delta, total_scan);
283 283
284 /* 284 /*
285 * Normally, we should not scan less than batch_size objects in one 285 * Normally, we should not scan less than batch_size objects in one
286 * pass to avoid too frequent shrinker calls, but if the slab has less 286 * pass to avoid too frequent shrinker calls, but if the slab has less
287 * than batch_size objects in total and we are really tight on memory, 287 * than batch_size objects in total and we are really tight on memory,
288 * we will try to reclaim all available objects, otherwise we can end 288 * we will try to reclaim all available objects, otherwise we can end
289 * up failing allocations although there are plenty of reclaimable 289 * up failing allocations although there are plenty of reclaimable
290 * objects spread over several slabs with usage less than the 290 * objects spread over several slabs with usage less than the
291 * batch_size. 291 * batch_size.
292 * 292 *
293 * We detect the "tight on memory" situations by looking at the total 293 * We detect the "tight on memory" situations by looking at the total
294 * number of objects we want to scan (total_scan). If it is greater 294 * number of objects we want to scan (total_scan). If it is greater
295 * than the total number of objects on slab (freeable), we must be 295 * than the total number of objects on slab (freeable), we must be
296 * scanning at high prio and therefore should try to reclaim as much as 296 * scanning at high prio and therefore should try to reclaim as much as
297 * possible. 297 * possible.
298 */ 298 */
299 while (total_scan >= batch_size || 299 while (total_scan >= batch_size ||
300 total_scan >= freeable) { 300 total_scan >= freeable) {
301 unsigned long ret; 301 unsigned long ret;
302 unsigned long nr_to_scan = min(batch_size, total_scan); 302 unsigned long nr_to_scan = min(batch_size, total_scan);
303 303
304 shrinkctl->nr_to_scan = nr_to_scan; 304 shrinkctl->nr_to_scan = nr_to_scan;
305 ret = shrinker->scan_objects(shrinker, shrinkctl); 305 ret = shrinker->scan_objects(shrinker, shrinkctl);
306 if (ret == SHRINK_STOP) 306 if (ret == SHRINK_STOP)
307 break; 307 break;
308 freed += ret; 308 freed += ret;
309 309
310 count_vm_events(SLABS_SCANNED, nr_to_scan); 310 count_vm_events(SLABS_SCANNED, nr_to_scan);
311 total_scan -= nr_to_scan; 311 total_scan -= nr_to_scan;
312 312
313 cond_resched(); 313 cond_resched();
314 } 314 }
315 315
316 /* 316 /*
317 * move the unused scan count back into the shrinker in a 317 * move the unused scan count back into the shrinker in a
318 * manner that handles concurrent updates. If we exhausted the 318 * manner that handles concurrent updates. If we exhausted the
319 * scan, there is no need to do an update. 319 * scan, there is no need to do an update.
320 */ 320 */
321 if (total_scan > 0) 321 if (total_scan > 0)
322 new_nr = atomic_long_add_return(total_scan, 322 new_nr = atomic_long_add_return(total_scan,
323 &shrinker->nr_deferred[nid]); 323 &shrinker->nr_deferred[nid]);
324 else 324 else
325 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); 325 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
326 326
327 trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); 327 trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
328 return freed; 328 return freed;
329 } 329 }
330 330
331 /* 331 /*
332 * Call the shrink functions to age shrinkable caches 332 * Call the shrink functions to age shrinkable caches
333 * 333 *
334 * Here we assume it costs one seek to replace a lru page and that it also 334 * Here we assume it costs one seek to replace a lru page and that it also
335 * takes a seek to recreate a cache object. With this in mind we age equal 335 * takes a seek to recreate a cache object. With this in mind we age equal
336 * percentages of the lru and ageable caches. This should balance the seeks 336 * percentages of the lru and ageable caches. This should balance the seeks
337 * generated by these structures. 337 * generated by these structures.
338 * 338 *
339 * If the vm encountered mapped pages on the LRU it increase the pressure on 339 * If the vm encountered mapped pages on the LRU it increase the pressure on
340 * slab to avoid swapping. 340 * slab to avoid swapping.
341 * 341 *
342 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. 342 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
343 * 343 *
344 * `lru_pages' represents the number of on-LRU pages in all the zones which 344 * `lru_pages' represents the number of on-LRU pages in all the zones which
345 * are eligible for the caller's allocation attempt. It is used for balancing 345 * are eligible for the caller's allocation attempt. It is used for balancing
346 * slab reclaim versus page reclaim. 346 * slab reclaim versus page reclaim.
347 * 347 *
348 * Returns the number of slab objects which we shrunk. 348 * Returns the number of slab objects which we shrunk.
349 */ 349 */
350 unsigned long shrink_slab(struct shrink_control *shrinkctl, 350 unsigned long shrink_slab(struct shrink_control *shrinkctl,
351 unsigned long nr_pages_scanned, 351 unsigned long nr_pages_scanned,
352 unsigned long lru_pages) 352 unsigned long lru_pages)
353 { 353 {
354 struct shrinker *shrinker; 354 struct shrinker *shrinker;
355 unsigned long freed = 0; 355 unsigned long freed = 0;
356 356
357 if (nr_pages_scanned == 0) 357 if (nr_pages_scanned == 0)
358 nr_pages_scanned = SWAP_CLUSTER_MAX; 358 nr_pages_scanned = SWAP_CLUSTER_MAX;
359 359
360 if (!down_read_trylock(&shrinker_rwsem)) { 360 if (!down_read_trylock(&shrinker_rwsem)) {
361 /* 361 /*
362 * If we would return 0, our callers would understand that we 362 * If we would return 0, our callers would understand that we
363 * have nothing else to shrink and give up trying. By returning 363 * have nothing else to shrink and give up trying. By returning
364 * 1 we keep it going and assume we'll be able to shrink next 364 * 1 we keep it going and assume we'll be able to shrink next
365 * time. 365 * time.
366 */ 366 */
367 freed = 1; 367 freed = 1;
368 goto out; 368 goto out;
369 } 369 }
370 370
371 list_for_each_entry(shrinker, &shrinker_list, list) { 371 list_for_each_entry(shrinker, &shrinker_list, list) {
372 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { 372 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
373 shrinkctl->nid = 0; 373 shrinkctl->nid = 0;
374 freed += shrink_slab_node(shrinkctl, shrinker, 374 freed += shrink_slab_node(shrinkctl, shrinker,
375 nr_pages_scanned, lru_pages); 375 nr_pages_scanned, lru_pages);
376 continue; 376 continue;
377 } 377 }
378 378
379 for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { 379 for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
380 if (node_online(shrinkctl->nid)) 380 if (node_online(shrinkctl->nid))
381 freed += shrink_slab_node(shrinkctl, shrinker, 381 freed += shrink_slab_node(shrinkctl, shrinker,
382 nr_pages_scanned, lru_pages); 382 nr_pages_scanned, lru_pages);
383 383
384 } 384 }
385 } 385 }
386 up_read(&shrinker_rwsem); 386 up_read(&shrinker_rwsem);
387 out: 387 out:
388 cond_resched(); 388 cond_resched();
389 return freed; 389 return freed;
390 } 390 }
391 391
392 static inline int is_page_cache_freeable(struct page *page) 392 static inline int is_page_cache_freeable(struct page *page)
393 { 393 {
394 /* 394 /*
395 * A freeable page cache page is referenced only by the caller 395 * A freeable page cache page is referenced only by the caller
396 * that isolated the page, the page cache radix tree and 396 * that isolated the page, the page cache radix tree and
397 * optional buffer heads at page->private. 397 * optional buffer heads at page->private.
398 */ 398 */
399 return page_count(page) - page_has_private(page) == 2; 399 return page_count(page) - page_has_private(page) == 2;
400 } 400 }
401 401
402 static int may_write_to_queue(struct backing_dev_info *bdi, 402 static int may_write_to_queue(struct backing_dev_info *bdi,
403 struct scan_control *sc) 403 struct scan_control *sc)
404 { 404 {
405 if (current->flags & PF_SWAPWRITE) 405 if (current->flags & PF_SWAPWRITE)
406 return 1; 406 return 1;
407 if (!bdi_write_congested(bdi)) 407 if (!bdi_write_congested(bdi))
408 return 1; 408 return 1;
409 if (bdi == current->backing_dev_info) 409 if (bdi == current->backing_dev_info)
410 return 1; 410 return 1;
411 return 0; 411 return 0;
412 } 412 }
413 413
414 /* 414 /*
415 * We detected a synchronous write error writing a page out. Probably 415 * We detected a synchronous write error writing a page out. Probably
416 * -ENOSPC. We need to propagate that into the address_space for a subsequent 416 * -ENOSPC. We need to propagate that into the address_space for a subsequent
417 * fsync(), msync() or close(). 417 * fsync(), msync() or close().
418 * 418 *
419 * The tricky part is that after writepage we cannot touch the mapping: nothing 419 * The tricky part is that after writepage we cannot touch the mapping: nothing
420 * prevents it from being freed up. But we have a ref on the page and once 420 * prevents it from being freed up. But we have a ref on the page and once
421 * that page is locked, the mapping is pinned. 421 * that page is locked, the mapping is pinned.
422 * 422 *
423 * We're allowed to run sleeping lock_page() here because we know the caller has 423 * We're allowed to run sleeping lock_page() here because we know the caller has
424 * __GFP_FS. 424 * __GFP_FS.
425 */ 425 */
426 static void handle_write_error(struct address_space *mapping, 426 static void handle_write_error(struct address_space *mapping,
427 struct page *page, int error) 427 struct page *page, int error)
428 { 428 {
429 lock_page(page); 429 lock_page(page);
430 if (page_mapping(page) == mapping) 430 if (page_mapping(page) == mapping)
431 mapping_set_error(mapping, error); 431 mapping_set_error(mapping, error);
432 unlock_page(page); 432 unlock_page(page);
433 } 433 }
434 434
435 /* possible outcome of pageout() */ 435 /* possible outcome of pageout() */
436 typedef enum { 436 typedef enum {
437 /* failed to write page out, page is locked */ 437 /* failed to write page out, page is locked */
438 PAGE_KEEP, 438 PAGE_KEEP,
439 /* move page to the active list, page is locked */ 439 /* move page to the active list, page is locked */
440 PAGE_ACTIVATE, 440 PAGE_ACTIVATE,
441 /* page has been sent to the disk successfully, page is unlocked */ 441 /* page has been sent to the disk successfully, page is unlocked */
442 PAGE_SUCCESS, 442 PAGE_SUCCESS,
443 /* page is clean and locked */ 443 /* page is clean and locked */
444 PAGE_CLEAN, 444 PAGE_CLEAN,
445 } pageout_t; 445 } pageout_t;
446 446
447 /* 447 /*
448 * pageout is called by shrink_page_list() for each dirty page. 448 * pageout is called by shrink_page_list() for each dirty page.
449 * Calls ->writepage(). 449 * Calls ->writepage().
450 */ 450 */
451 static pageout_t pageout(struct page *page, struct address_space *mapping, 451 static pageout_t pageout(struct page *page, struct address_space *mapping,
452 struct scan_control *sc) 452 struct scan_control *sc)
453 { 453 {
454 /* 454 /*
455 * If the page is dirty, only perform writeback if that write 455 * If the page is dirty, only perform writeback if that write
456 * will be non-blocking. To prevent this allocation from being 456 * will be non-blocking. To prevent this allocation from being
457 * stalled by pagecache activity. But note that there may be 457 * stalled by pagecache activity. But note that there may be
458 * stalls if we need to run get_block(). We could test 458 * stalls if we need to run get_block(). We could test
459 * PagePrivate for that. 459 * PagePrivate for that.
460 * 460 *
461 * If this process is currently in __generic_file_aio_write() against 461 * If this process is currently in __generic_file_aio_write() against
462 * this page's queue, we can perform writeback even if that 462 * this page's queue, we can perform writeback even if that
463 * will block. 463 * will block.
464 * 464 *
465 * If the page is swapcache, write it back even if that would 465 * If the page is swapcache, write it back even if that would
466 * block, for some throttling. This happens by accident, because 466 * block, for some throttling. This happens by accident, because
467 * swap_backing_dev_info is bust: it doesn't reflect the 467 * swap_backing_dev_info is bust: it doesn't reflect the
468 * congestion state of the swapdevs. Easy to fix, if needed. 468 * congestion state of the swapdevs. Easy to fix, if needed.
469 */ 469 */
470 if (!is_page_cache_freeable(page)) 470 if (!is_page_cache_freeable(page))
471 return PAGE_KEEP; 471 return PAGE_KEEP;
472 if (!mapping) { 472 if (!mapping) {
473 /* 473 /*
474 * Some data journaling orphaned pages can have 474 * Some data journaling orphaned pages can have
475 * page->mapping == NULL while being dirty with clean buffers. 475 * page->mapping == NULL while being dirty with clean buffers.
476 */ 476 */
477 if (page_has_private(page)) { 477 if (page_has_private(page)) {
478 if (try_to_free_buffers(page)) { 478 if (try_to_free_buffers(page)) {
479 ClearPageDirty(page); 479 ClearPageDirty(page);
480 printk("%s: orphaned page\n", __func__); 480 printk("%s: orphaned page\n", __func__);
481 return PAGE_CLEAN; 481 return PAGE_CLEAN;
482 } 482 }
483 } 483 }
484 return PAGE_KEEP; 484 return PAGE_KEEP;
485 } 485 }
486 if (mapping->a_ops->writepage == NULL) 486 if (mapping->a_ops->writepage == NULL)
487 return PAGE_ACTIVATE; 487 return PAGE_ACTIVATE;
488 if (!may_write_to_queue(mapping->backing_dev_info, sc)) 488 if (!may_write_to_queue(mapping->backing_dev_info, sc))
489 return PAGE_KEEP; 489 return PAGE_KEEP;
490 490
491 if (clear_page_dirty_for_io(page)) { 491 if (clear_page_dirty_for_io(page)) {
492 int res; 492 int res;
493 struct writeback_control wbc = { 493 struct writeback_control wbc = {
494 .sync_mode = WB_SYNC_NONE, 494 .sync_mode = WB_SYNC_NONE,
495 .nr_to_write = SWAP_CLUSTER_MAX, 495 .nr_to_write = SWAP_CLUSTER_MAX,
496 .range_start = 0, 496 .range_start = 0,
497 .range_end = LLONG_MAX, 497 .range_end = LLONG_MAX,
498 .for_reclaim = 1, 498 .for_reclaim = 1,
499 }; 499 };
500 500
501 SetPageReclaim(page); 501 SetPageReclaim(page);
502 res = mapping->a_ops->writepage(page, &wbc); 502 res = mapping->a_ops->writepage(page, &wbc);
503 if (res < 0) 503 if (res < 0)
504 handle_write_error(mapping, page, res); 504 handle_write_error(mapping, page, res);
505 if (res == AOP_WRITEPAGE_ACTIVATE) { 505 if (res == AOP_WRITEPAGE_ACTIVATE) {
506 ClearPageReclaim(page); 506 ClearPageReclaim(page);
507 return PAGE_ACTIVATE; 507 return PAGE_ACTIVATE;
508 } 508 }
509 509
510 if (!PageWriteback(page)) { 510 if (!PageWriteback(page)) {
511 /* synchronous write or broken a_ops? */ 511 /* synchronous write or broken a_ops? */
512 ClearPageReclaim(page); 512 ClearPageReclaim(page);
513 } 513 }
514 trace_mm_vmscan_writepage(page, trace_reclaim_flags(page)); 514 trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
515 inc_zone_page_state(page, NR_VMSCAN_WRITE); 515 inc_zone_page_state(page, NR_VMSCAN_WRITE);
516 return PAGE_SUCCESS; 516 return PAGE_SUCCESS;
517 } 517 }
518 518
519 return PAGE_CLEAN; 519 return PAGE_CLEAN;
520 } 520 }
521 521
522 /* 522 /*
523 * Same as remove_mapping, but if the page is removed from the mapping, it 523 * Same as remove_mapping, but if the page is removed from the mapping, it
524 * gets returned with a refcount of 0. 524 * gets returned with a refcount of 0.
525 */ 525 */
526 static int __remove_mapping(struct address_space *mapping, struct page *page) 526 static int __remove_mapping(struct address_space *mapping, struct page *page)
527 { 527 {
528 BUG_ON(!PageLocked(page)); 528 BUG_ON(!PageLocked(page));
529 BUG_ON(mapping != page_mapping(page)); 529 BUG_ON(mapping != page_mapping(page));
530 530
531 spin_lock_irq(&mapping->tree_lock); 531 spin_lock_irq(&mapping->tree_lock);
532 /* 532 /*
533 * The non racy check for a busy page. 533 * The non racy check for a busy page.
534 * 534 *
535 * Must be careful with the order of the tests. When someone has 535 * Must be careful with the order of the tests. When someone has
536 * a ref to the page, it may be possible that they dirty it then 536 * a ref to the page, it may be possible that they dirty it then
537 * drop the reference. So if PageDirty is tested before page_count 537 * drop the reference. So if PageDirty is tested before page_count
538 * here, then the following race may occur: 538 * here, then the following race may occur:
539 * 539 *
540 * get_user_pages(&page); 540 * get_user_pages(&page);
541 * [user mapping goes away] 541 * [user mapping goes away]
542 * write_to(page); 542 * write_to(page);
543 * !PageDirty(page) [good] 543 * !PageDirty(page) [good]
544 * SetPageDirty(page); 544 * SetPageDirty(page);
545 * put_page(page); 545 * put_page(page);
546 * !page_count(page) [good, discard it] 546 * !page_count(page) [good, discard it]
547 * 547 *
548 * [oops, our write_to data is lost] 548 * [oops, our write_to data is lost]
549 * 549 *
550 * Reversing the order of the tests ensures such a situation cannot 550 * Reversing the order of the tests ensures such a situation cannot
551 * escape unnoticed. The smp_rmb is needed to ensure the page->flags 551 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
552 * load is not satisfied before that of page->_count. 552 * load is not satisfied before that of page->_count.
553 * 553 *
554 * Note that if SetPageDirty is always performed via set_page_dirty, 554 * Note that if SetPageDirty is always performed via set_page_dirty,
555 * and thus under tree_lock, then this ordering is not required. 555 * and thus under tree_lock, then this ordering is not required.
556 */ 556 */
557 if (!page_freeze_refs(page, 2)) 557 if (!page_freeze_refs(page, 2))
558 goto cannot_free; 558 goto cannot_free;
559 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ 559 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
560 if (unlikely(PageDirty(page))) { 560 if (unlikely(PageDirty(page))) {
561 page_unfreeze_refs(page, 2); 561 page_unfreeze_refs(page, 2);
562 goto cannot_free; 562 goto cannot_free;
563 } 563 }
564 564
565 if (PageSwapCache(page)) { 565 if (PageSwapCache(page)) {
566 swp_entry_t swap = { .val = page_private(page) }; 566 swp_entry_t swap = { .val = page_private(page) };
567 __delete_from_swap_cache(page); 567 __delete_from_swap_cache(page);
568 spin_unlock_irq(&mapping->tree_lock); 568 spin_unlock_irq(&mapping->tree_lock);
569 swapcache_free(swap, page); 569 swapcache_free(swap, page);
570 } else { 570 } else {
571 void (*freepage)(struct page *); 571 void (*freepage)(struct page *);
572 572
573 freepage = mapping->a_ops->freepage; 573 freepage = mapping->a_ops->freepage;
574 574
575 __delete_from_page_cache(page); 575 __delete_from_page_cache(page);
576 spin_unlock_irq(&mapping->tree_lock); 576 spin_unlock_irq(&mapping->tree_lock);
577 mem_cgroup_uncharge_cache_page(page); 577 mem_cgroup_uncharge_cache_page(page);
578 578
579 if (freepage != NULL) 579 if (freepage != NULL)
580 freepage(page); 580 freepage(page);
581 } 581 }
582 582
583 return 1; 583 return 1;
584 584
585 cannot_free: 585 cannot_free:
586 spin_unlock_irq(&mapping->tree_lock); 586 spin_unlock_irq(&mapping->tree_lock);
587 return 0; 587 return 0;
588 } 588 }
589 589
590 /* 590 /*
591 * Attempt to detach a locked page from its ->mapping. If it is dirty or if 591 * Attempt to detach a locked page from its ->mapping. If it is dirty or if
592 * someone else has a ref on the page, abort and return 0. If it was 592 * someone else has a ref on the page, abort and return 0. If it was
593 * successfully detached, return 1. Assumes the caller has a single ref on 593 * successfully detached, return 1. Assumes the caller has a single ref on
594 * this page. 594 * this page.
595 */ 595 */
596 int remove_mapping(struct address_space *mapping, struct page *page) 596 int remove_mapping(struct address_space *mapping, struct page *page)
597 { 597 {
598 if (__remove_mapping(mapping, page)) { 598 if (__remove_mapping(mapping, page)) {
599 /* 599 /*
600 * Unfreezing the refcount with 1 rather than 2 effectively 600 * Unfreezing the refcount with 1 rather than 2 effectively
601 * drops the pagecache ref for us without requiring another 601 * drops the pagecache ref for us without requiring another
602 * atomic operation. 602 * atomic operation.
603 */ 603 */
604 page_unfreeze_refs(page, 1); 604 page_unfreeze_refs(page, 1);
605 return 1; 605 return 1;
606 } 606 }
607 return 0; 607 return 0;
608 } 608 }
609 609
610 /** 610 /**
611 * putback_lru_page - put previously isolated page onto appropriate LRU list 611 * putback_lru_page - put previously isolated page onto appropriate LRU list
612 * @page: page to be put back to appropriate lru list 612 * @page: page to be put back to appropriate lru list
613 * 613 *
614 * Add previously isolated @page to appropriate LRU list. 614 * Add previously isolated @page to appropriate LRU list.
615 * Page may still be unevictable for other reasons. 615 * Page may still be unevictable for other reasons.
616 * 616 *
617 * lru_lock must not be held, interrupts must be enabled. 617 * lru_lock must not be held, interrupts must be enabled.
618 */ 618 */
619 void putback_lru_page(struct page *page) 619 void putback_lru_page(struct page *page)
620 { 620 {
621 bool is_unevictable; 621 bool is_unevictable;
622 int was_unevictable = PageUnevictable(page); 622 int was_unevictable = PageUnevictable(page);
623 623
624 VM_BUG_ON(PageLRU(page)); 624 VM_BUG_ON(PageLRU(page));
625 625
626 redo: 626 redo:
627 ClearPageUnevictable(page); 627 ClearPageUnevictable(page);
628 628
629 if (page_evictable(page)) { 629 if (page_evictable(page)) {
630 /* 630 /*
631 * For evictable pages, we can use the cache. 631 * For evictable pages, we can use the cache.
632 * In event of a race, worst case is we end up with an 632 * In event of a race, worst case is we end up with an
633 * unevictable page on [in]active list. 633 * unevictable page on [in]active list.
634 * We know how to handle that. 634 * We know how to handle that.
635 */ 635 */
636 is_unevictable = false; 636 is_unevictable = false;
637 lru_cache_add(page); 637 lru_cache_add(page);
638 } else { 638 } else {
639 /* 639 /*
640 * Put unevictable pages directly on zone's unevictable 640 * Put unevictable pages directly on zone's unevictable
641 * list. 641 * list.
642 */ 642 */
643 is_unevictable = true; 643 is_unevictable = true;
644 add_page_to_unevictable_list(page); 644 add_page_to_unevictable_list(page);
645 /* 645 /*
646 * When racing with an mlock or AS_UNEVICTABLE clearing 646 * When racing with an mlock or AS_UNEVICTABLE clearing
647 * (page is unlocked) make sure that if the other thread 647 * (page is unlocked) make sure that if the other thread
648 * does not observe our setting of PG_lru and fails 648 * does not observe our setting of PG_lru and fails
649 * isolation/check_move_unevictable_pages, 649 * isolation/check_move_unevictable_pages,
650 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move 650 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
651 * the page back to the evictable list. 651 * the page back to the evictable list.
652 * 652 *
653 * The other side is TestClearPageMlocked() or shmem_lock(). 653 * The other side is TestClearPageMlocked() or shmem_lock().
654 */ 654 */
655 smp_mb(); 655 smp_mb();
656 } 656 }
657 657
658 /* 658 /*
659 * page's status can change while we move it among lru. If an evictable 659 * page's status can change while we move it among lru. If an evictable
660 * page is on unevictable list, it never be freed. To avoid that, 660 * page is on unevictable list, it never be freed. To avoid that,
661 * check after we added it to the list, again. 661 * check after we added it to the list, again.
662 */ 662 */
663 if (is_unevictable && page_evictable(page)) { 663 if (is_unevictable && page_evictable(page)) {
664 if (!isolate_lru_page(page)) { 664 if (!isolate_lru_page(page)) {
665 put_page(page); 665 put_page(page);
666 goto redo; 666 goto redo;
667 } 667 }
668 /* This means someone else dropped this page from LRU 668 /* This means someone else dropped this page from LRU
669 * So, it will be freed or putback to LRU again. There is 669 * So, it will be freed or putback to LRU again. There is
670 * nothing to do here. 670 * nothing to do here.
671 */ 671 */
672 } 672 }
673 673
674 if (was_unevictable && !is_unevictable) 674 if (was_unevictable && !is_unevictable)
675 count_vm_event(UNEVICTABLE_PGRESCUED); 675 count_vm_event(UNEVICTABLE_PGRESCUED);
676 else if (!was_unevictable && is_unevictable) 676 else if (!was_unevictable && is_unevictable)
677 count_vm_event(UNEVICTABLE_PGCULLED); 677 count_vm_event(UNEVICTABLE_PGCULLED);
678 678
679 put_page(page); /* drop ref from isolate */ 679 put_page(page); /* drop ref from isolate */
680 } 680 }
681 681
682 enum page_references { 682 enum page_references {
683 PAGEREF_RECLAIM, 683 PAGEREF_RECLAIM,
684 PAGEREF_RECLAIM_CLEAN, 684 PAGEREF_RECLAIM_CLEAN,
685 PAGEREF_KEEP, 685 PAGEREF_KEEP,
686 PAGEREF_ACTIVATE, 686 PAGEREF_ACTIVATE,
687 }; 687 };
688 688
689 static enum page_references page_check_references(struct page *page, 689 static enum page_references page_check_references(struct page *page,
690 struct scan_control *sc) 690 struct scan_control *sc)
691 { 691 {
692 int referenced_ptes, referenced_page; 692 int referenced_ptes, referenced_page;
693 unsigned long vm_flags; 693 unsigned long vm_flags;
694 694
695 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, 695 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
696 &vm_flags); 696 &vm_flags);
697 referenced_page = TestClearPageReferenced(page); 697 referenced_page = TestClearPageReferenced(page);
698 698
699 /* 699 /*
700 * Mlock lost the isolation race with us. Let try_to_unmap() 700 * Mlock lost the isolation race with us. Let try_to_unmap()
701 * move the page to the unevictable list. 701 * move the page to the unevictable list.
702 */ 702 */
703 if (vm_flags & VM_LOCKED) 703 if (vm_flags & VM_LOCKED)
704 return PAGEREF_RECLAIM; 704 return PAGEREF_RECLAIM;
705 705
706 if (referenced_ptes) { 706 if (referenced_ptes) {
707 if (PageSwapBacked(page)) 707 if (PageSwapBacked(page))
708 return PAGEREF_ACTIVATE; 708 return PAGEREF_ACTIVATE;
709 /* 709 /*
710 * All mapped pages start out with page table 710 * All mapped pages start out with page table
711 * references from the instantiating fault, so we need 711 * references from the instantiating fault, so we need
712 * to look twice if a mapped file page is used more 712 * to look twice if a mapped file page is used more
713 * than once. 713 * than once.
714 * 714 *
715 * Mark it and spare it for another trip around the 715 * Mark it and spare it for another trip around the
716 * inactive list. Another page table reference will 716 * inactive list. Another page table reference will
717 * lead to its activation. 717 * lead to its activation.
718 * 718 *
719 * Note: the mark is set for activated pages as well 719 * Note: the mark is set for activated pages as well
720 * so that recently deactivated but used pages are 720 * so that recently deactivated but used pages are
721 * quickly recovered. 721 * quickly recovered.
722 */ 722 */
723 SetPageReferenced(page); 723 SetPageReferenced(page);
724 724
725 if (referenced_page || referenced_ptes > 1) 725 if (referenced_page || referenced_ptes > 1)
726 return PAGEREF_ACTIVATE; 726 return PAGEREF_ACTIVATE;
727 727
728 /* 728 /*
729 * Activate file-backed executable pages after first usage. 729 * Activate file-backed executable pages after first usage.
730 */ 730 */
731 if (vm_flags & VM_EXEC) 731 if (vm_flags & VM_EXEC)
732 return PAGEREF_ACTIVATE; 732 return PAGEREF_ACTIVATE;
733 733
734 return PAGEREF_KEEP; 734 return PAGEREF_KEEP;
735 } 735 }
736 736
737 /* Reclaim if clean, defer dirty pages to writeback */ 737 /* Reclaim if clean, defer dirty pages to writeback */
738 if (referenced_page && !PageSwapBacked(page)) 738 if (referenced_page && !PageSwapBacked(page))
739 return PAGEREF_RECLAIM_CLEAN; 739 return PAGEREF_RECLAIM_CLEAN;
740 740
741 return PAGEREF_RECLAIM; 741 return PAGEREF_RECLAIM;
742 } 742 }
743 743
744 /* Check if a page is dirty or under writeback */ 744 /* Check if a page is dirty or under writeback */
745 static void page_check_dirty_writeback(struct page *page, 745 static void page_check_dirty_writeback(struct page *page,
746 bool *dirty, bool *writeback) 746 bool *dirty, bool *writeback)
747 { 747 {
748 struct address_space *mapping; 748 struct address_space *mapping;
749 749
750 /* 750 /*
751 * Anonymous pages are not handled by flushers and must be written 751 * Anonymous pages are not handled by flushers and must be written
752 * from reclaim context. Do not stall reclaim based on them 752 * from reclaim context. Do not stall reclaim based on them
753 */ 753 */
754 if (!page_is_file_cache(page)) { 754 if (!page_is_file_cache(page)) {
755 *dirty = false; 755 *dirty = false;
756 *writeback = false; 756 *writeback = false;
757 return; 757 return;
758 } 758 }
759 759
760 /* By default assume that the page flags are accurate */ 760 /* By default assume that the page flags are accurate */
761 *dirty = PageDirty(page); 761 *dirty = PageDirty(page);
762 *writeback = PageWriteback(page); 762 *writeback = PageWriteback(page);
763 763
764 /* Verify dirty/writeback state if the filesystem supports it */ 764 /* Verify dirty/writeback state if the filesystem supports it */
765 if (!page_has_private(page)) 765 if (!page_has_private(page))
766 return; 766 return;
767 767
768 mapping = page_mapping(page); 768 mapping = page_mapping(page);
769 if (mapping && mapping->a_ops->is_dirty_writeback) 769 if (mapping && mapping->a_ops->is_dirty_writeback)
770 mapping->a_ops->is_dirty_writeback(page, dirty, writeback); 770 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
771 } 771 }
772 772
773 /* 773 /*
774 * shrink_page_list() returns the number of reclaimed pages 774 * shrink_page_list() returns the number of reclaimed pages
775 */ 775 */
776 static unsigned long shrink_page_list(struct list_head *page_list, 776 static unsigned long shrink_page_list(struct list_head *page_list,
777 struct zone *zone, 777 struct zone *zone,
778 struct scan_control *sc, 778 struct scan_control *sc,
779 enum ttu_flags ttu_flags, 779 enum ttu_flags ttu_flags,
780 unsigned long *ret_nr_dirty, 780 unsigned long *ret_nr_dirty,
781 unsigned long *ret_nr_unqueued_dirty, 781 unsigned long *ret_nr_unqueued_dirty,
782 unsigned long *ret_nr_congested, 782 unsigned long *ret_nr_congested,
783 unsigned long *ret_nr_writeback, 783 unsigned long *ret_nr_writeback,
784 unsigned long *ret_nr_immediate, 784 unsigned long *ret_nr_immediate,
785 bool force_reclaim) 785 bool force_reclaim)
786 { 786 {
787 LIST_HEAD(ret_pages); 787 LIST_HEAD(ret_pages);
788 LIST_HEAD(free_pages); 788 LIST_HEAD(free_pages);
789 int pgactivate = 0; 789 int pgactivate = 0;
790 unsigned long nr_unqueued_dirty = 0; 790 unsigned long nr_unqueued_dirty = 0;
791 unsigned long nr_dirty = 0; 791 unsigned long nr_dirty = 0;
792 unsigned long nr_congested = 0; 792 unsigned long nr_congested = 0;
793 unsigned long nr_reclaimed = 0; 793 unsigned long nr_reclaimed = 0;
794 unsigned long nr_writeback = 0; 794 unsigned long nr_writeback = 0;
795 unsigned long nr_immediate = 0; 795 unsigned long nr_immediate = 0;
796 796
797 cond_resched(); 797 cond_resched();
798 798
799 mem_cgroup_uncharge_start(); 799 mem_cgroup_uncharge_start();
800 while (!list_empty(page_list)) { 800 while (!list_empty(page_list)) {
801 struct address_space *mapping; 801 struct address_space *mapping;
802 struct page *page; 802 struct page *page;
803 int may_enter_fs; 803 int may_enter_fs;
804 enum page_references references = PAGEREF_RECLAIM_CLEAN; 804 enum page_references references = PAGEREF_RECLAIM_CLEAN;
805 bool dirty, writeback; 805 bool dirty, writeback;
806 806
807 cond_resched(); 807 cond_resched();
808 808
809 page = lru_to_page(page_list); 809 page = lru_to_page(page_list);
810 list_del(&page->lru); 810 list_del(&page->lru);
811 811
812 if (!trylock_page(page)) 812 if (!trylock_page(page))
813 goto keep; 813 goto keep;
814 814
815 VM_BUG_ON(PageActive(page)); 815 VM_BUG_ON(PageActive(page));
816 VM_BUG_ON(page_zone(page) != zone); 816 VM_BUG_ON(page_zone(page) != zone);
817 817
818 sc->nr_scanned++; 818 sc->nr_scanned++;
819 819
820 if (unlikely(!page_evictable(page))) 820 if (unlikely(!page_evictable(page)))
821 goto cull_mlocked; 821 goto cull_mlocked;
822 822
823 if (!sc->may_unmap && page_mapped(page)) 823 if (!sc->may_unmap && page_mapped(page))
824 goto keep_locked; 824 goto keep_locked;
825 825
826 /* Double the slab pressure for mapped and swapcache pages */ 826 /* Double the slab pressure for mapped and swapcache pages */
827 if (page_mapped(page) || PageSwapCache(page)) 827 if (page_mapped(page) || PageSwapCache(page))
828 sc->nr_scanned++; 828 sc->nr_scanned++;
829 829
830 may_enter_fs = (sc->gfp_mask & __GFP_FS) || 830 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
831 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 831 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
832 832
833 /* 833 /*
834 * The number of dirty pages determines if a zone is marked 834 * The number of dirty pages determines if a zone is marked
835 * reclaim_congested which affects wait_iff_congested. kswapd 835 * reclaim_congested which affects wait_iff_congested. kswapd
836 * will stall and start writing pages if the tail of the LRU 836 * will stall and start writing pages if the tail of the LRU
837 * is all dirty unqueued pages. 837 * is all dirty unqueued pages.
838 */ 838 */
839 page_check_dirty_writeback(page, &dirty, &writeback); 839 page_check_dirty_writeback(page, &dirty, &writeback);
840 if (dirty || writeback) 840 if (dirty || writeback)
841 nr_dirty++; 841 nr_dirty++;
842 842
		if (dirty && !writeback)
			nr_unqueued_dirty++;

		/*
		 * Treat this page as congested if the underlying BDI is or if
		 * pages are cycling through the LRU so quickly that the
		 * pages marked for immediate reclaim are making it to the
		 * end of the LRU a second time.
		 */
		mapping = page_mapping(page);
		if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
		    (writeback && PageReclaim(page)))
			nr_congested++;

		/*
		 * If a page at the tail of the LRU is under writeback, there
		 * are three cases to consider.
		 *
		 * 1) If reclaim is encountering an excessive number of pages
		 *    under writeback and this page is both under writeback and
		 *    PageReclaim then it indicates that pages are being queued
		 *    for IO but are being recycled through the LRU before the
		 *    IO can complete. Waiting on the page itself risks an
		 *    indefinite stall if it is impossible to writeback the
		 *    page due to IO error or disconnected storage so instead
		 *    note that the LRU is being scanned too quickly and the
		 *    caller can stall after page list has been processed.
		 *
		 * 2) Global reclaim encounters a page, memcg encounters a
		 *    page that is not marked for immediate reclaim or
		 *    the caller does not have __GFP_IO. In this case mark
		 *    the page for immediate reclaim and continue scanning.
		 *
		 *    __GFP_IO is checked because a loop driver thread might
		 *    enter reclaim, and deadlock if it waits on a page for
		 *    which it is needed to do the write (loop masks off
		 *    __GFP_IO|__GFP_FS for this reason); but more thought
		 *    would probably show more reasons.
		 *
		 *    Don't require __GFP_FS, since we're not going into the
		 *    FS, just waiting on its writeback completion. Worryingly,
		 *    ext4 gfs2 and xfs allocate pages with
		 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
		 *    may_enter_fs here is liable to OOM on them.
		 *
		 * 3) memcg encounters a page that is not already marked
		 *    PageReclaim. memcg does not have any dirty pages
		 *    throttling so we could easily OOM just because too many
		 *    pages are in writeback and there is nothing else to
		 *    reclaim. Wait for the writeback to complete.
		 */
		if (PageWriteback(page)) {
			/* Case 1 above */
			if (current_is_kswapd() &&
			    PageReclaim(page) &&
			    zone_is_reclaim_writeback(zone)) {
				nr_immediate++;
				goto keep_locked;

			/* Case 2 above */
			} else if (global_reclaim(sc) ||
			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
				/*
				 * This is slightly racy - end_page_writeback()
				 * might have just cleared PageReclaim, then
				 * setting PageReclaim here ends up interpreted
				 * as PageReadahead - but that does not matter
				 * enough to care. What we do want is for this
				 * page to have PageReclaim set next time memcg
				 * reclaim reaches the tests above, so it will
				 * then wait_on_page_writeback() to avoid OOM;
				 * and it's also appropriate in global reclaim.
				 */
				SetPageReclaim(page);
				nr_writeback++;

				goto keep_locked;

			/* Case 3 above */
			} else {
				wait_on_page_writeback(page);
			}
		}

		if (!force_reclaim)
			references = page_check_references(page, sc);

		switch (references) {
		case PAGEREF_ACTIVATE:
			goto activate_locked;
		case PAGEREF_KEEP:
			goto keep_locked;
		case PAGEREF_RECLAIM:
		case PAGEREF_RECLAIM_CLEAN:
			; /* try to reclaim the page below */
		}

		/*
		 * Anonymous process memory has backing store?
		 * Try to allocate it some swap space here.
		 */
		if (PageAnon(page) && !PageSwapCache(page)) {
			if (!(sc->gfp_mask & __GFP_IO))
				goto keep_locked;
			if (!add_to_swap(page, page_list))
				goto activate_locked;
			may_enter_fs = 1;

			/* Adding to swap updated mapping */
			mapping = page_mapping(page);
		}

		/*
		 * The page is mapped into the page tables of one or more
		 * processes. Try to unmap it here.
		 */
		if (page_mapped(page) && mapping) {
			switch (try_to_unmap(page, ttu_flags)) {
			case SWAP_FAIL:
				goto activate_locked;
			case SWAP_AGAIN:
				goto keep_locked;
			case SWAP_MLOCK:
				goto cull_mlocked;
			case SWAP_SUCCESS:
				; /* try to free the page below */
			}
		}

		if (PageDirty(page)) {
			/*
			 * Only kswapd can writeback filesystem pages to
			 * avoid risk of stack overflow but only writeback
			 * if many dirty pages have been encountered.
			 */
			if (page_is_file_cache(page) &&
					(!current_is_kswapd() ||
					 !zone_is_reclaim_dirty(zone))) {
				/*
				 * Immediately reclaim when written back.
				 * Similar in principle to deactivate_page()
				 * except we already have the page isolated
				 * and know it's dirty
				 */
				inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
				SetPageReclaim(page);

				goto keep_locked;
			}

			if (references == PAGEREF_RECLAIM_CLEAN)
				goto keep_locked;
			if (!may_enter_fs)
				goto keep_locked;
			if (!sc->may_writepage)
				goto keep_locked;

			/* Page is dirty, try to write it out here */
			switch (pageout(page, mapping, sc)) {
			case PAGE_KEEP:
				goto keep_locked;
			case PAGE_ACTIVATE:
				goto activate_locked;
			case PAGE_SUCCESS:
				if (PageWriteback(page))
					goto keep;
				if (PageDirty(page))
					goto keep;

				/*
				 * A synchronous write - probably a ramdisk. Go
				 * ahead and try to reclaim the page.
				 */
				if (!trylock_page(page))
					goto keep;
				if (PageDirty(page) || PageWriteback(page))
					goto keep_locked;
				mapping = page_mapping(page);
			case PAGE_CLEAN:
				; /* try to free the page below */
			}
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 *
		 * We do this even if the page is PageDirty().
		 * try_to_release_page() does not perform I/O, but it is
		 * possible for a page to have PageDirty set, but it is actually
		 * clean (all its buffers are clean). This happens if the
		 * buffers were written out directly, with submit_bh(). ext3
		 * will do this, as well as the blockdev mapping.
		 * try_to_release_page() will discover that cleanness and will
		 * drop the buffers and mark the page clean - it can be freed.
		 *
		 * Rarely, pages can have buffers and no ->mapping. These are
		 * the pages which were not successfully invalidated in
		 * truncate_complete_page(). We try to drop those buffers here
		 * and if that worked, and the page is no longer mapped into
		 * process address space (page_count == 1) it can be freed.
		 * Otherwise, leave the page on the LRU so it is swappable.
		 */
		if (page_has_private(page)) {
			if (!try_to_release_page(page, sc->gfp_mask))
				goto activate_locked;
			if (!mapping && page_count(page) == 1) {
				unlock_page(page);
				if (put_page_testzero(page))
					goto free_it;
				else {
					/*
					 * rare race with speculative reference.
					 * the speculative reference will free
					 * this page shortly, so we may
					 * increment nr_reclaimed here (and
					 * leave it off the LRU).
					 */
					nr_reclaimed++;
					continue;
				}
			}
		}

		if (!mapping || !__remove_mapping(mapping, page))
			goto keep_locked;

		/*
		 * At this point, we have no other references and there is
		 * no way to pick any more up (removed from LRU, removed
		 * from pagecache). Can use non-atomic bitops now (and
		 * we obviously don't have to worry about waking up a process
		 * waiting on the page lock, because there are no references.
		 */
		__clear_page_locked(page);
free_it:
		nr_reclaimed++;

		/*
		 * Is there need to periodically free_page_list? It would
		 * appear not as the counts should be low
		 */
		list_add(&page->lru, &free_pages);
		continue;

cull_mlocked:
		if (PageSwapCache(page))
			try_to_free_swap(page);
		unlock_page(page);
		putback_lru_page(page);
		continue;

activate_locked:
		/* Not a candidate for swapping, so reclaim swap space. */
		if (PageSwapCache(page) && vm_swap_full())
			try_to_free_swap(page);
		VM_BUG_ON(PageActive(page));
		SetPageActive(page);
		pgactivate++;
keep_locked:
		unlock_page(page);
keep:
		list_add(&page->lru, &ret_pages);
		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
	}

	free_hot_cold_page_list(&free_pages, 1);

	list_splice(&ret_pages, page_list);
	count_vm_events(PGACTIVATE, pgactivate);
	mem_cgroup_uncharge_end();
	*ret_nr_dirty += nr_dirty;
	*ret_nr_congested += nr_congested;
	*ret_nr_unqueued_dirty += nr_unqueued_dirty;
	*ret_nr_writeback += nr_writeback;
	*ret_nr_immediate += nr_immediate;
	return nr_reclaimed;
}

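/*
 * A rough map of how shrink_page_list()'s out-parameters are consumed by
 * shrink_inactive_list() further down (summary only, no new logic):
 *
 *   nr_dirty/nr_congested  - if every dirty page scanned was also backed by
 *                            a congested BDI, the zone is tagged
 *                            ZONE_CONGESTED
 *   nr_unqueued_dirty      - if it matches nr_taken, the zone is tagged
 *                            ZONE_TAIL_LRU_DIRTY so kswapd starts writing
 *                            pages from reclaim context
 *   nr_writeback           - if it matches nr_taken, the zone is tagged
 *                            ZONE_WRITEBACK
 *   nr_immediate           - pages that were both under writeback and
 *                            PageReclaim; any such page makes the global
 *                            reclaimer stall briefly in congestion_wait()
 */
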
unsigned long reclaim_clean_pages_from_list(struct zone *zone,
					    struct list_head *page_list)
{
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.priority = DEF_PRIORITY,
		.may_unmap = 1,
	};
	unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
	struct page *page, *next;
	LIST_HEAD(clean_pages);

	list_for_each_entry_safe(page, next, page_list, lru) {
		if (page_is_file_cache(page) && !PageDirty(page) &&
		    !isolated_balloon_page(page)) {
			ClearPageActive(page);
			list_move(&page->lru, &clean_pages);
		}
	}

	ret = shrink_page_list(&clean_pages, zone, &sc,
			TTU_UNMAP|TTU_IGNORE_ACCESS,
			&dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
	list_splice(&clean_pages, page_list);
	mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
	return ret;
}

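/*
 * Note on reclaim_clean_pages_from_list(): only clean file pages (and no
 * isolated balloon pages) are handed to shrink_page_list(), with
 * TTU_IGNORE_ACCESS and force_reclaim == true so recent references are
 * ignored. Callers wanting a specific page range freed without blocking on
 * I/O - the contiguous allocation path, for example - are the intended
 * users; dirty or writeback pages are simply left on page_list.
 */
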
/*
 * Attempt to remove the specified page from its LRU. Only take this page
 * if it is of the appropriate PageActive status. Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes defined above
 *
 * returns 0 on success, -ve errno on failure.
 */
int __isolate_lru_page(struct page *page, isolate_mode_t mode)
{
	int ret = -EINVAL;

	/* Only take pages on the LRU. */
	if (!PageLRU(page))
		return ret;

	/* Compaction should not handle unevictable pages but CMA can do so */
	if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
		return ret;

	ret = -EBUSY;

	/*
	 * To minimise LRU disruption, the caller can indicate that it only
	 * wants to isolate pages it will be able to operate on without
	 * blocking - clean pages for the most part.
	 *
	 * ISOLATE_CLEAN means that only clean pages should be isolated. This
	 * is used by reclaim when it cannot write to backing storage.
	 *
	 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
	 * that are possible to migrate without blocking.
	 */
	if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
		/* All the caller can do on PageWriteback is block */
		if (PageWriteback(page))
			return ret;

		if (PageDirty(page)) {
			struct address_space *mapping;

			/* ISOLATE_CLEAN means only clean pages */
			if (mode & ISOLATE_CLEAN)
				return ret;

			/*
			 * Only pages without mappings or that have a
			 * ->migratepage callback are possible to migrate
			 * without blocking
			 */
			mapping = page_mapping(page);
			if (mapping && !mapping->a_ops->migratepage)
				return ret;
		}
	}

	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
		return ret;

	if (likely(get_page_unless_zero(page))) {
		/*
		 * Be careful not to clear PageLRU until after we're
		 * sure the page is not being freed elsewhere -- the
		 * page release code relies on it.
		 */
		ClearPageLRU(page);
		ret = 0;
	}

	return ret;
}

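/*
 * Example of how @mode is built by the callers below (shrink_inactive_list()
 * and shrink_active_list()): starting from 0, ISOLATE_UNMAPPED is added when
 * sc->may_unmap is clear and ISOLATE_CLEAN when sc->may_writepage is clear,
 * so a scan that may not write back pages never isolates dirty ones in the
 * first place.
 */
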
/*
 * zone->lru_lock is heavily contended. Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan:	The number of pages to look through on the list.
 * @lruvec:	The LRU vector to pull pages from.
 * @dst:	The temp list to put pages on to.
 * @nr_scanned:	The number of pages that were scanned.
 * @sc:		The scan_control struct for this reclaim session
 * @mode:	One of the LRU isolation modes
 * @lru:	LRU list id for isolating
 *
 * returns how many pages were moved onto *@dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
		struct lruvec *lruvec, struct list_head *dst,
		unsigned long *nr_scanned, struct scan_control *sc,
		isolate_mode_t mode, enum lru_list lru)
{
	struct list_head *src = &lruvec->lists[lru];
	unsigned long nr_taken = 0;
	unsigned long scan;

	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
		struct page *page;
		int nr_pages;

		page = lru_to_page(src);
		prefetchw_prev_lru_page(page, src, flags);

		VM_BUG_ON(!PageLRU(page));

		switch (__isolate_lru_page(page, mode)) {
		case 0:
			nr_pages = hpage_nr_pages(page);
			mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
			list_move(&page->lru, dst);
			nr_taken += nr_pages;
			break;

		case -EBUSY:
			/* else it is being freed elsewhere */
			list_move(&page->lru, src);
			continue;

		default:
			BUG();
		}
	}

	*nr_scanned = scan;
	trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
				    nr_taken, mode, is_file_lru(lru));
	return nr_taken;
}

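/*
 * Accounting detail in isolate_lru_pages() above: @scan counts list entries
 * while nr_taken counts base pages via hpage_nr_pages(), so isolating one
 * transparent huge page advances scan by 1 but nr_taken by HPAGE_PMD_NR
 * (512 with 2MB huge pages over 4KB base pages).
 */
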
/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * The returned page will have PageLRU() cleared. If it was found on
 * the active list, it will have PageActive set. If it was found on
 * the unevictable list, it will have the PageUnevictable bit set. That flag
 * may need to be cleared by the caller before letting the page go.
 *
 * The vmstat statistic corresponding to the list on which the page was
 * found will be decremented.
 *
 * Restrictions:
 * (1) Must be called with an elevated refcount on the page. This is a
 *     fundamental difference from isolate_lru_pages (which is called
 *     without a stable reference).
 * (2) the lru_lock must not be held.
 * (3) interrupts must be enabled.
 */
int isolate_lru_page(struct page *page)
{
	int ret = -EBUSY;

	VM_BUG_ON(!page_count(page));

	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);
		struct lruvec *lruvec;

		spin_lock_irq(&zone->lru_lock);
		lruvec = mem_cgroup_page_lruvec(page, zone);
		if (PageLRU(page)) {
			int lru = page_lru(page);
			get_page(page);
			ClearPageLRU(page);
			del_page_from_lru_list(page, lruvec, lru);
			ret = 0;
		}
		spin_unlock_irq(&zone->lru_lock);
	}
	return ret;
}

/*
 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
 * then get rescheduled. When there are massive numbers of tasks doing page
 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
 * the LRU list will go small and be scanned faster than necessary, leading to
 * unnecessary swapping, thrashing and OOM.
 */
static int too_many_isolated(struct zone *zone, int file,
			     struct scan_control *sc)
{
	unsigned long inactive, isolated;

	if (current_is_kswapd())
		return 0;

	if (!global_reclaim(sc))
		return 0;

	if (file) {
		inactive = zone_page_state(zone, NR_INACTIVE_FILE);
		isolated = zone_page_state(zone, NR_ISOLATED_FILE);
	} else {
		inactive = zone_page_state(zone, NR_INACTIVE_ANON);
		isolated = zone_page_state(zone, NR_ISOLATED_ANON);
	}

	/*
	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
	 * won't get blocked by normal direct-reclaimers, forming a circular
	 * deadlock.
	 */
	if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
		inactive >>= 3;

	return isolated > inactive;
}

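/*
 * Illustrative numbers for the check above: with 80000 inactive file pages,
 * an ordinary GFP_KERNEL direct reclaimer carries both __GFP_IO and __GFP_FS,
 * so inactive is shifted down to 10000 and it is throttled once more than
 * 10000 pages sit isolated; a GFP_NOFS caller is only throttled past 80000,
 * which is the "allowed to isolate more pages" exception.
 */
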
static noinline_for_stack void
putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
{
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
	struct zone *zone = lruvec_zone(lruvec);
	LIST_HEAD(pages_to_free);

	/*
	 * Put back any unfreeable pages.
	 */
	while (!list_empty(page_list)) {
		struct page *page = lru_to_page(page_list);
		int lru;

		VM_BUG_ON(PageLRU(page));
		list_del(&page->lru);
		if (unlikely(!page_evictable(page))) {
			spin_unlock_irq(&zone->lru_lock);
			putback_lru_page(page);
			spin_lock_irq(&zone->lru_lock);
			continue;
		}

		lruvec = mem_cgroup_page_lruvec(page, zone);

		SetPageLRU(page);
		lru = page_lru(page);
		add_page_to_lru_list(page, lruvec, lru);

		if (is_active_lru(lru)) {
			int file = is_file_lru(lru);
			int numpages = hpage_nr_pages(page);
			reclaim_stat->recent_rotated[file] += numpages;
		}
		if (put_page_testzero(page)) {
			__ClearPageLRU(page);
			__ClearPageActive(page);
			del_page_from_lru_list(page, lruvec, lru);

			if (unlikely(PageCompound(page))) {
				spin_unlock_irq(&zone->lru_lock);
				(*get_compound_page_dtor(page))(page);
				spin_lock_irq(&zone->lru_lock);
			} else
				list_add(&page->lru, &pages_to_free);
		}
	}

	/*
	 * To save our caller's stack, now use input list for pages to free.
	 */
	list_splice(&pages_to_free, page_list);
}

/*
 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
 * of reclaimed pages
 */
static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
		     struct scan_control *sc, enum lru_list lru)
{
	LIST_HEAD(page_list);
	unsigned long nr_scanned;
	unsigned long nr_reclaimed = 0;
	unsigned long nr_taken;
	unsigned long nr_dirty = 0;
	unsigned long nr_congested = 0;
	unsigned long nr_unqueued_dirty = 0;
	unsigned long nr_writeback = 0;
	unsigned long nr_immediate = 0;
	isolate_mode_t isolate_mode = 0;
	int file = is_file_lru(lru);
	struct zone *zone = lruvec_zone(lruvec);
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;

	while (unlikely(too_many_isolated(zone, file, sc))) {
		congestion_wait(BLK_RW_ASYNC, HZ/10);

		/* We are about to die and free our memory. Return now. */
		if (fatal_signal_pending(current))
			return SWAP_CLUSTER_MAX;
	}

	lru_add_drain();

	if (!sc->may_unmap)
		isolate_mode |= ISOLATE_UNMAPPED;
	if (!sc->may_writepage)
		isolate_mode |= ISOLATE_CLEAN;

	spin_lock_irq(&zone->lru_lock);

	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
				     &nr_scanned, sc, isolate_mode, lru);

	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);

	if (global_reclaim(sc)) {
		zone->pages_scanned += nr_scanned;
		if (current_is_kswapd())
			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
		else
			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
	}
	spin_unlock_irq(&zone->lru_lock);

	if (nr_taken == 0)
		return 0;

	nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
				&nr_dirty, &nr_unqueued_dirty, &nr_congested,
				&nr_writeback, &nr_immediate,
				false);

	spin_lock_irq(&zone->lru_lock);

	reclaim_stat->recent_scanned[file] += nr_taken;

	if (global_reclaim(sc)) {
		if (current_is_kswapd())
			__count_zone_vm_events(PGSTEAL_KSWAPD, zone,
					       nr_reclaimed);
		else
			__count_zone_vm_events(PGSTEAL_DIRECT, zone,
					       nr_reclaimed);
	}

	putback_inactive_pages(lruvec, &page_list);

	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);

	spin_unlock_irq(&zone->lru_lock);

	free_hot_cold_page_list(&page_list, 1);

	/*
	 * If reclaim is isolating dirty pages under writeback, it implies
	 * that the long-lived page allocation rate is exceeding the page
	 * laundering rate. Either the global limits are not being effective
	 * at throttling processes due to the page distribution throughout
	 * zones or there is heavy usage of a slow backing device. The
	 * only option is to throttle from reclaim context which is not ideal
	 * as there is no guarantee the dirtying process is throttled in the
	 * same way balance_dirty_pages() manages.
	 *
	 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
	 * of pages under writeback that are flagged for immediate reclaim
	 * and stall if any are encountered in the nr_immediate check below.
	 */
	if (nr_writeback && nr_writeback == nr_taken)
		zone_set_flag(zone, ZONE_WRITEBACK);

	/*
	 * memcg will stall in page writeback so only consider forcibly
	 * stalling for global reclaim
	 */
	if (global_reclaim(sc)) {
		/*
		 * Tag a zone as congested if all the dirty pages scanned were
		 * backed by a congested BDI and wait_iff_congested will stall.
		 */
		if (nr_dirty && nr_dirty == nr_congested)
			zone_set_flag(zone, ZONE_CONGESTED);

		/*
		 * If dirty pages are scanned that are not queued for IO, it
		 * implies that flushers are not keeping up. In this case, flag
		 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
		 * pages from reclaim context.
		 */
		if (nr_unqueued_dirty == nr_taken)
			zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);

		/*
		 * If kswapd scans pages marked for immediate
		 * reclaim and under writeback (nr_immediate), it implies
		 * that pages are cycling through the LRU faster than
		 * they are written so also forcibly stall.
		 */
		if (nr_immediate)
			congestion_wait(BLK_RW_ASYNC, HZ/10);
	}

	/*
	 * Stall direct reclaim for IO completions if underlying BDIs or zone
	 * is congested. Allow kswapd to continue until it starts encountering
	 * unqueued dirty pages or cycling through the LRU too quickly.
	 */
	if (!sc->hibernation_mode && !current_is_kswapd())
		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);

	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
		zone_idx(zone),
		nr_scanned, nr_reclaimed,
		sc->priority,
		trace_shrink_flags(file));
	return nr_reclaimed;
}

/*
 * This moves pages from the active list to the inactive list.
 *
 * We move them the other way if the page is referenced by one or more
 * processes, from rmap.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold zone->lru_lock across the whole operation. But if
 * the pages are mapped, the processing is slow (page_referenced()) so we
 * should drop zone->lru_lock around each page. It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_count against each page.
 * But we had to alter page->flags anyway.
 */

static void move_active_pages_to_lru(struct lruvec *lruvec,
				     struct list_head *list,
				     struct list_head *pages_to_free,
				     enum lru_list lru)
{
	struct zone *zone = lruvec_zone(lruvec);
	unsigned long pgmoved = 0;
	struct page *page;
	int nr_pages;

	while (!list_empty(list)) {
		page = lru_to_page(list);
		lruvec = mem_cgroup_page_lruvec(page, zone);

		VM_BUG_ON(PageLRU(page));
		SetPageLRU(page);

		nr_pages = hpage_nr_pages(page);
		mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
		list_move(&page->lru, &lruvec->lists[lru]);
		pgmoved += nr_pages;

		if (put_page_testzero(page)) {
			__ClearPageLRU(page);
			__ClearPageActive(page);
			del_page_from_lru_list(page, lruvec, lru);

			if (unlikely(PageCompound(page))) {
				spin_unlock_irq(&zone->lru_lock);
				(*get_compound_page_dtor(page))(page);
				spin_lock_irq(&zone->lru_lock);
			} else
				list_add(&page->lru, pages_to_free);
		}
	}
	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
	if (!is_active_lru(lru))
		__count_vm_events(PGDEACTIVATE, pgmoved);
}

static void shrink_active_list(unsigned long nr_to_scan,
			       struct lruvec *lruvec,
			       struct scan_control *sc,
			       enum lru_list lru)
{
	unsigned long nr_taken;
	unsigned long nr_scanned;
	unsigned long vm_flags;
	LIST_HEAD(l_hold);	/* The pages which were snipped off */
	LIST_HEAD(l_active);
	LIST_HEAD(l_inactive);
	struct page *page;
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
	unsigned long nr_rotated = 0;
	isolate_mode_t isolate_mode = 0;
	int file = is_file_lru(lru);
	struct zone *zone = lruvec_zone(lruvec);

	lru_add_drain();

	if (!sc->may_unmap)
		isolate_mode |= ISOLATE_UNMAPPED;
	if (!sc->may_writepage)
		isolate_mode |= ISOLATE_CLEAN;

	spin_lock_irq(&zone->lru_lock);

	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
				     &nr_scanned, sc, isolate_mode, lru);
	if (global_reclaim(sc))
		zone->pages_scanned += nr_scanned;

	reclaim_stat->recent_scanned[file] += nr_taken;

	__count_zone_vm_events(PGREFILL, zone, nr_scanned);
	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
	spin_unlock_irq(&zone->lru_lock);

	while (!list_empty(&l_hold)) {
		cond_resched();
		page = lru_to_page(&l_hold);
		list_del(&page->lru);

		if (unlikely(!page_evictable(page))) {
			putback_lru_page(page);
			continue;
		}

		if (unlikely(buffer_heads_over_limit)) {
			if (page_has_private(page) && trylock_page(page)) {
				if (page_has_private(page))
					try_to_release_page(page, 0);
				unlock_page(page);
			}
		}

		if (page_referenced(page, 0, sc->target_mem_cgroup,
				    &vm_flags)) {
			nr_rotated += hpage_nr_pages(page);
			/*
			 * Identify referenced, file-backed active pages and
			 * give them one more trip around the active list. So
			 * that executable code gets a better chance to stay in
			 * memory under moderate memory pressure. Anon pages
			 * are not likely to be evicted by use-once streaming
			 * IO, plus JVM can create lots of anon VM_EXEC pages,
			 * so we ignore them here.
			 */
			if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
				list_add(&page->lru, &l_active);
				continue;
			}
		}

		ClearPageActive(page);	/* we are de-activating */
		list_add(&page->lru, &l_inactive);
	}

	/*
	 * Move pages back to the lru list.
	 */
	spin_lock_irq(&zone->lru_lock);
	/*
	 * Count referenced pages from currently used mappings as rotated,
	 * even though only some of them are actually re-activated. This
	 * helps balance scan pressure between file and anonymous pages in
	 * get_scan_ratio.
	 */
	reclaim_stat->recent_rotated[file] += nr_rotated;

	move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
	move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
	spin_unlock_irq(&zone->lru_lock);

	free_hot_cold_page_list(&l_hold, 1);
}

#ifdef CONFIG_SWAP
static int inactive_anon_is_low_global(struct zone *zone)
{
	unsigned long active, inactive;

	active = zone_page_state(zone, NR_ACTIVE_ANON);
	inactive = zone_page_state(zone, NR_INACTIVE_ANON);

	if (inactive * zone->inactive_ratio < active)
		return 1;

	return 0;
}

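/*
 * Worked example for inactive_anon_is_low_global(), using an illustrative
 * ratio: zone->inactive_ratio scales with zone size, so take 3 as a plausible
 * value. With 600000 active and 150000 inactive anon pages the test reads
 * 150000 * 3 = 450000 < 600000, so the function returns 1 and
 * shrink_active_list() will be asked to deactivate some anon pages.
 */
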
/**
 * inactive_anon_is_low - check if anonymous pages need to be deactivated
 * @lruvec: LRU vector to check
 *
 * Returns true if the zone does not have enough inactive anon pages,
 * meaning some active anon pages need to be deactivated.
 */
static int inactive_anon_is_low(struct lruvec *lruvec)
{
	/*
	 * If we don't have swap space, anonymous page deactivation
	 * is pointless.
	 */
	if (!total_swap_pages)
		return 0;

	if (!mem_cgroup_disabled())
		return mem_cgroup_inactive_anon_is_low(lruvec);

	return inactive_anon_is_low_global(lruvec_zone(lruvec));
}
#else
static inline int inactive_anon_is_low(struct lruvec *lruvec)
{
	return 0;
}
#endif

/**
 * inactive_file_is_low - check if file pages need to be deactivated
 * @lruvec: LRU vector to check
 *
 * When the system is doing streaming IO, memory pressure here
 * ensures that active file pages get deactivated, until more
 * than half of the file pages are on the inactive list.
 *
 * Once we get to that situation, protect the system's working
 * set from being evicted by disabling active file page aging.
 *
 * This uses a different ratio than the anonymous pages, because
 * the page cache uses a use-once replacement algorithm.
 */
static int inactive_file_is_low(struct lruvec *lruvec)
{
	unsigned long inactive;
	unsigned long active;

	inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
	active = get_lru_size(lruvec, LRU_ACTIVE_FILE);

	return active > inactive;
}

static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
{
	if (is_file_lru(lru))
		return inactive_file_is_low(lruvec);
	else
		return inactive_anon_is_low(lruvec);
}

static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
				 struct lruvec *lruvec, struct scan_control *sc)
{
	if (is_active_lru(lru)) {
		if (inactive_list_is_low(lruvec, lru))
			shrink_active_list(nr_to_scan, lruvec, sc, lru);
		return 0;
	}

	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}

static int vmscan_swappiness(struct scan_control *sc)
{
	if (global_reclaim(sc))
		return vm_swappiness;
	return mem_cgroup_swappiness(sc->target_mem_cgroup);
}

enum scan_balance {
	SCAN_EQUAL,
	SCAN_FRACT,
	SCAN_ANON,
	SCAN_FILE,
};

/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.  The relative value of each set of LRU lists is determined
 * by looking at the fraction of the pages scanned we did rotate back
 * onto the active list instead of evict.
 *
 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
 */
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
			   unsigned long *nr)
{
	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
	u64 fraction[2];
	u64 denominator = 0;	/* gcc */
	struct zone *zone = lruvec_zone(lruvec);
	unsigned long anon_prio, file_prio;
	enum scan_balance scan_balance;
	unsigned long anon, file, free;
	bool force_scan = false;
	unsigned long ap, fp;
	enum lru_list lru;

	/*
	 * If the zone or memcg is small, nr[l] can be 0.  This
	 * results in no scanning on this priority and a potential
	 * priority drop.  Global direct reclaim can go to the next
	 * zone and tends to have no problems. Global kswapd is for
	 * zone balancing and it needs to scan a minimum amount. When
	 * reclaiming for a memcg, a priority drop can cause high
	 * latencies, so it's better to scan a minimum amount there as
	 * well.
	 */
	if (current_is_kswapd() && !zone_reclaimable(zone))
		force_scan = true;
	if (!global_reclaim(sc))
		force_scan = true;

	/* If we have no swap space, do not bother scanning anon pages. */
	if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	/*
	 * Global reclaim will swap to prevent OOM even with no
	 * swappiness, but memcg users want to use this knob to
	 * disable swapping for individual groups completely when
	 * using the memory controller's swap limit feature would be
	 * too expensive.
	 */
	if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	/*
	 * Do not apply any pressure balancing cleverness when the
	 * system is close to OOM, scan both anon and file equally
	 * (unless the swappiness setting disagrees with swapping).
	 */
	if (!sc->priority && vmscan_swappiness(sc)) {
		scan_balance = SCAN_EQUAL;
		goto out;
	}

	anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
		get_lru_size(lruvec, LRU_INACTIVE_ANON);
	file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
		get_lru_size(lruvec, LRU_INACTIVE_FILE);

	/*
	 * If it's foreseeable that reclaiming the file cache won't be
	 * enough to get the zone back into a desirable shape, we have
	 * to swap.  Better start now and leave the - probably heavily
	 * thrashing - remaining file pages alone.
	 */
	if (global_reclaim(sc)) {
		free = zone_page_state(zone, NR_FREE_PAGES);
		if (unlikely(file + free <= high_wmark_pages(zone))) {
			scan_balance = SCAN_ANON;
			goto out;
		}
	}

	/*
	 * There is enough inactive page cache, do not reclaim
	 * anything from the anonymous working set right now.
	 */
	if (!inactive_file_is_low(lruvec)) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	scan_balance = SCAN_FRACT;

	/*
	 * With swappiness at 100, anonymous and file have the same priority.
	 * This scanning priority is essentially the inverse of IO cost.
	 */
	anon_prio = vmscan_swappiness(sc);
	file_prio = 200 - anon_prio;

	/*
	 * OK, so we have swap space and a fair amount of page cache
	 * pages.  We use the recently rotated / recently scanned
	 * ratios to determine how valuable each cache is.
	 *
	 * Because workloads change over time (and to avoid overflow)
	 * we keep these statistics as a floating average, which ends
	 * up weighing recent references more than old ones.
	 *
	 * anon in [0], file in [1]
	 */
	spin_lock_irq(&zone->lru_lock);
	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
		reclaim_stat->recent_scanned[0] /= 2;
		reclaim_stat->recent_rotated[0] /= 2;
	}

	if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
		reclaim_stat->recent_scanned[1] /= 2;
		reclaim_stat->recent_rotated[1] /= 2;
	}

	/*
	 * The amount of pressure on anon vs file pages is inversely
	 * proportional to the fraction of recently scanned pages on
	 * each list that were recently referenced and in active use.
	 */
	ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
	ap /= reclaim_stat->recent_rotated[0] + 1;

	fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
	fp /= reclaim_stat->recent_rotated[1] + 1;
	spin_unlock_irq(&zone->lru_lock);

	fraction[0] = ap;
	fraction[1] = fp;
	denominator = ap + fp + 1;
out:
	for_each_evictable_lru(lru) {
		int file = is_file_lru(lru);
		unsigned long size;
		unsigned long scan;

		size = get_lru_size(lruvec, lru);
		scan = size >> sc->priority;

		if (!scan && force_scan)
			scan = min(size, SWAP_CLUSTER_MAX);

		switch (scan_balance) {
		case SCAN_EQUAL:
			/* Scan lists relative to size */
			break;
		case SCAN_FRACT:
			/*
			 * Scan types proportional to swappiness and
			 * their relative recent reclaim efficiency.
			 */
			scan = div64_u64(scan * fraction[file], denominator);
			break;
		case SCAN_FILE:
		case SCAN_ANON:
			/* Scan one type exclusively */
			if ((scan_balance == SCAN_FILE) != file)
				scan = 0;
			break;
		default:
			/* Look ma, no brain */
			BUG();
		}
		nr[lru] = scan;
	}
}
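
To make the SCAN_FRACT arithmetic above concrete, here is a standalone userspace sketch of the split; the variable names mirror the kernel code, but every number is invented and the program is purely illustrative, not part of this patch or of vmscan.c.

/* Illustrative sketch of the SCAN_FRACT split in get_scan_count().
 * All figures are hypothetical; only the arithmetic mirrors the code above.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned long swappiness = 60;		/* e.g. default vm_swappiness */
	unsigned long anon_prio = swappiness;
	unsigned long file_prio = 200 - anon_prio;

	/* Hypothetical recent_scanned/recent_rotated samples: [0]=anon, [1]=file */
	unsigned long recent_scanned[2] = { 10000, 40000 };
	unsigned long recent_rotated[2] = {  8000,  2000 };

	uint64_t ap = anon_prio * (recent_scanned[0] + 1) / (recent_rotated[0] + 1);
	uint64_t fp = file_prio * (recent_scanned[1] + 1) / (recent_rotated[1] + 1);
	uint64_t denominator = ap + fp + 1;

	/* A hypothetical LRU of 1M pages scanned at priority 12 */
	unsigned long size = 1UL << 20;
	unsigned long scan = size >> 12;

	printf("anon share: %llu of %lu pages\n",
	       (unsigned long long)(scan * ap / denominator), scan);
	printf("file share: %llu of %lu pages\n",
	       (unsigned long long)(scan * fp / denominator), scan);
	return 0;
}

With these made-up inputs the heavily-rotated anon list gets only a handful of the 256 scan slots while the mostly use-once file list gets nearly all of them, which is the intended effect of weighting by recent rotation.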

/*
 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
 */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
	unsigned long nr[NR_LRU_LISTS];
	unsigned long targets[NR_LRU_LISTS];
	unsigned long nr_to_scan;
	enum lru_list lru;
	unsigned long nr_reclaimed = 0;
	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
	struct blk_plug plug;
-	bool scan_adjusted = false;
+	bool scan_adjusted;

	get_scan_count(lruvec, sc, nr);

	/* Record the original scan target for proportional adjustments later */
	memcpy(targets, nr, sizeof(nr));

+	/*
+	 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
+	 * event that can occur when there is little memory pressure e.g.
+	 * multiple streaming readers/writers. Hence, we do not abort scanning
+	 * when the requested number of pages are reclaimed when scanning at
+	 * DEF_PRIORITY on the assumption that the fact we are direct
+	 * reclaiming implies that kswapd is not keeping up and it is best to
+	 * do a batch of work at once. For memcg reclaim one check is made to
+	 * abort proportional reclaim if either the file or anon lru has already
+	 * dropped to zero at the first pass.
+	 */
+	scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
+			 sc->priority == DEF_PRIORITY);
+
	blk_start_plug(&plug);
	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
					nr[LRU_INACTIVE_FILE]) {
		unsigned long nr_anon, nr_file, percentage;
		unsigned long nr_scanned;

		for_each_evictable_lru(lru) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;

				nr_reclaimed += shrink_list(lru, nr_to_scan,
							    lruvec, sc);
			}
		}

		if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
			continue;

		/*
-		 * For global direct reclaim, reclaim only the number of pages
-		 * requested. Less care is taken to scan proportionally as it
-		 * is more important to minimise direct reclaim stall latency
-		 * than it is to properly age the LRU lists.
-		 */
-		if (global_reclaim(sc) && !current_is_kswapd())
-			break;
-
-		/*
		 * For kswapd and memcg, reclaim at least the number of pages
-		 * requested. Ensure that the anon and file LRUs shrink
+		 * requested. Ensure that the anon and file LRUs are scanned
		 * proportionally what was requested by get_scan_count(). We
		 * stop reclaiming one LRU and reduce the amount scanning
		 * proportional to the original scan target.
		 */
		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
+
+		/*
+		 * It's just vindictive to attack the larger once the smaller
+		 * has gone to zero.  And given the way we stop scanning the
+		 * smaller below, this makes sure that we only make one nudge
+		 * towards proportionality once we've got nr_to_reclaim.
+		 */
+		if (!nr_file || !nr_anon)
+			break;

		if (nr_file > nr_anon) {
			unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
						targets[LRU_ACTIVE_ANON] + 1;
			lru = LRU_BASE;
			percentage = nr_anon * 100 / scan_target;
		} else {
			unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
						targets[LRU_ACTIVE_FILE] + 1;
			lru = LRU_FILE;
			percentage = nr_file * 100 / scan_target;
		}

		/* Stop scanning the smaller of the LRU */
		nr[lru] = 0;
		nr[lru + LRU_ACTIVE] = 0;

		/*
		 * Recalculate the other LRU scan count based on its original
		 * scan target and the percentage scanning already complete
		 */
		lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
		nr_scanned = targets[lru] - nr[lru];
		nr[lru] = targets[lru] * (100 - percentage) / 100;
		nr[lru] -= min(nr[lru], nr_scanned);

		lru += LRU_ACTIVE;
		nr_scanned = targets[lru] - nr[lru];
		nr[lru] = targets[lru] * (100 - percentage) / 100;
		nr[lru] -= min(nr[lru], nr_scanned);

		scan_adjusted = true;
	}
	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;

	/*
	 * Even if we did not try to evict anon pages at all, we want to
	 * rebalance the anon lru active/inactive ratio.
	 */
	if (inactive_anon_is_low(lruvec))
		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
				   sc, LRU_ACTIVE_ANON);

	throttle_vm_writeout(sc->gfp_mask);
}
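
The proportional trim at the heart of this patch boils down to a small piece of percentage arithmetic: once nr_to_reclaim has been met, the smaller LRU stops and the other LRU may only scan the same fraction of its original target. The following sketch walks through that arithmetic with invented numbers; it is an editor's illustration, not kernel code.

/* Illustrative sketch of the proportional trim performed by shrink_lruvec()
 * after nr_to_reclaim pages have been reclaimed.  All figures are invented.
 */
#include <stdio.h>

int main(void)
{
	/* Hypothetical original targets from get_scan_count(): [inactive, active] */
	unsigned long anon_target[2] = {  400,  100 };
	unsigned long file_target[2] = { 8000, 2000 };

	/* Hypothetical work still outstanding when nr_to_reclaim was hit */
	unsigned long anon_left[2] = {  300,   75 };
	unsigned long file_left[2] = { 6000, 1500 };

	unsigned long nr_anon = anon_left[0] + anon_left[1];
	unsigned long nr_file = file_left[0] + file_left[1];

	/* The smaller list (anon here) stops; its unfinished percentage caps
	 * how much of the file targets may still be scanned. */
	unsigned long scan_target = anon_target[0] + anon_target[1] + 1;
	unsigned long percentage = nr_anon * 100 / scan_target;

	for (int i = 0; i < 2; i++) {
		unsigned long nr_scanned = file_target[i] - file_left[i];
		unsigned long remaining = file_target[i] * (100 - percentage) / 100;
		remaining -= (remaining < nr_scanned) ? remaining : nr_scanned;
		printf("file[%d]: %lu of %lu still to scan\n",
		       i, remaining, file_target[i]);
	}
	printf("nr_file=%lu nr_anon=%lu percentage=%lu%%\n",
	       nr_file, nr_anon, percentage);
	return 0;
}

In this example roughly a quarter of the anon target was consumed before the reclaim goal was met, so the remaining file scan is cut back to the same proportion rather than being allowed to run to completion, which is what keeps the two lists aging at comparable rates.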

/* Use reclaim/compaction for costly allocs or under memory pressure */
static bool in_reclaim_compaction(struct scan_control *sc)
{
	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
			(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
			 sc->priority < DEF_PRIORITY - 2))
		return true;

	return false;
}

/*
 * Reclaim/compaction is used for high-order allocation requests. It reclaims
 * order-0 pages before compacting the zone. should_continue_reclaim() returns
 * true if more pages should be reclaimed such that when the page allocator
 * calls try_to_compact_zone() that it will have enough free pages to succeed.
 * It will give up earlier than that if there is difficulty reclaiming pages.
 */
static inline bool should_continue_reclaim(struct zone *zone,
					unsigned long nr_reclaimed,
					unsigned long nr_scanned,
					struct scan_control *sc)
{
	unsigned long pages_for_compaction;
	unsigned long inactive_lru_pages;

	/* If not in reclaim/compaction mode, stop */
	if (!in_reclaim_compaction(sc))
		return false;

	/* Consider stopping depending on scan and reclaim activity */
	if (sc->gfp_mask & __GFP_REPEAT) {
		/*
		 * For __GFP_REPEAT allocations, stop reclaiming if the
		 * full LRU list has been scanned and we are still failing
		 * to reclaim pages. This full LRU scan is potentially
		 * expensive but a __GFP_REPEAT caller really wants to succeed
		 */
		if (!nr_reclaimed && !nr_scanned)
			return false;
	} else {
		/*
		 * For non-__GFP_REPEAT allocations which can presumably
		 * fail without consequence, stop if we failed to reclaim
		 * any pages from the last SWAP_CLUSTER_MAX number of
		 * pages that were scanned. This will return to the
		 * caller faster at the risk reclaim/compaction and
		 * the resulting allocation attempt fails
		 */
		if (!nr_reclaimed)
			return false;
	}

	/*
	 * If we have not reclaimed enough pages for compaction and the
	 * inactive lists are large enough, continue reclaiming
	 */
	pages_for_compaction = (2UL << sc->order);
	inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
	if (get_nr_swap_pages() > 0)
		inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
	if (sc->nr_reclaimed < pages_for_compaction &&
			inactive_lru_pages > pages_for_compaction)
		return true;

	/* If compaction would go ahead or the allocation would succeed, stop */
	switch (compaction_suitable(zone, sc->order)) {
	case COMPACT_PARTIAL:
	case COMPACT_CONTINUE:
		return false;
	default:
		return true;
	}
}
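
For a feel of the "enough pages for compaction" target used above, the sketch below prints the 2UL << order threshold for a few allocation orders; the example orders are assumptions chosen for illustration only.

/* Editor's sketch of the pages_for_compaction target.  The listed orders
 * (e.g. order-9 for a 2MB THP with 4K pages) are illustrative assumptions.
 */
#include <stdio.h>

int main(void)
{
	int orders[] = { 3, 4, 9 };

	for (unsigned i = 0; i < sizeof(orders) / sizeof(orders[0]); i++) {
		unsigned long pages_for_compaction = 2UL << orders[i];
		printf("order-%d: keep reclaiming until ~%lu pages (%lu KB at 4K/page)\n",
		       orders[i], pages_for_compaction, pages_for_compaction * 4);
	}
	return 0;
}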

static void shrink_zone(struct zone *zone, struct scan_control *sc)
{
	unsigned long nr_reclaimed, nr_scanned;

	do {
		struct mem_cgroup *root = sc->target_mem_cgroup;
		struct mem_cgroup_reclaim_cookie reclaim = {
			.zone = zone,
			.priority = sc->priority,
		};
		struct mem_cgroup *memcg;

		nr_reclaimed = sc->nr_reclaimed;
		nr_scanned = sc->nr_scanned;

		memcg = mem_cgroup_iter(root, NULL, &reclaim);
		do {
			struct lruvec *lruvec;

			lruvec = mem_cgroup_zone_lruvec(zone, memcg);

			shrink_lruvec(lruvec, sc);

			/*
			 * Direct reclaim and kswapd have to scan all memory
			 * cgroups to fulfill the overall scan target for the
			 * zone.
			 *
			 * Limit reclaim, on the other hand, only cares about
			 * nr_to_reclaim pages to be reclaimed and it will
			 * retry with decreasing priority if one round over the
			 * whole hierarchy is not sufficient.
			 */
			if (!global_reclaim(sc) &&
					sc->nr_reclaimed >= sc->nr_to_reclaim) {
				mem_cgroup_iter_break(root, memcg);
				break;
			}
			memcg = mem_cgroup_iter(root, memcg, &reclaim);
		} while (memcg);

		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
			   sc->nr_scanned - nr_scanned,
			   sc->nr_reclaimed - nr_reclaimed);

	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
					 sc->nr_scanned - nr_scanned, sc));
}

/* Returns true if compaction should go ahead for a high-order request */
static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
	unsigned long balance_gap, watermark;
	bool watermark_ok;

	/* Do not consider compaction for orders reclaim is meant to satisfy */
	if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
		return false;

	/*
	 * Compaction takes time to run and there are potentially other
	 * callers using the pages just freed. Continue reclaiming until
	 * there is a buffer of free pages available to give compaction
	 * a reasonable chance of completing and allocating the page
	 */
	balance_gap = min(low_wmark_pages(zone),
		(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
			KSWAPD_ZONE_BALANCE_GAP_RATIO);
	watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
	watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);

	/*
	 * If compaction is deferred, reclaim up to a point where
	 * compaction will have a chance of success when re-enabled
	 */
	if (compaction_deferred(zone, sc->order))
		return watermark_ok;

	/* If compaction is not ready to start, keep reclaiming */
	if (!compaction_suitable(zone, sc->order))
		return false;

	return watermark_ok;
}
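
The watermark that compaction_ready() tests can be followed with plain integer arithmetic. The sketch below reproduces it in userspace with invented zone figures; KSWAPD_ZONE_BALANCE_GAP_RATIO is hard-coded here as an assumption and every number is hypothetical.

/* Editor's sketch of the compaction_ready() watermark arithmetic.
 * The ratio constant and all zone figures are assumptions for illustration.
 */
#include <stdio.h>

#define KSWAPD_ZONE_BALANCE_GAP_RATIO	100	/* assumed value */

int main(void)
{
	unsigned long managed_pages = 1UL << 18;	/* ~1GB of 4K pages, hypothetical */
	unsigned long low_wmark = 2048;			/* hypothetical */
	unsigned long high_wmark = 2560;		/* hypothetical */
	int order = 9;					/* THP-sized request */

	unsigned long ratio_gap = (managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
				  KSWAPD_ZONE_BALANCE_GAP_RATIO;
	unsigned long balance_gap = (ratio_gap < low_wmark) ? ratio_gap : low_wmark;
	unsigned long watermark = high_wmark + balance_gap + (2UL << order);

	printf("reclaim until roughly %lu free pages before handing over to compaction\n",
	       watermark);
	return 0;
}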

/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
 * Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
 *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
 *    zone defense algorithm.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 *
 * This function returns true if a zone is being reclaimed for a costly
 * high-order allocation and compaction is ready to begin. This indicates to
 * the caller that it should consider retrying the allocation instead of
 * further reclaim.
 */
static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
	struct zoneref *z;
	struct zone *zone;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	bool aborted_reclaim = false;

	/*
	 * If the number of buffer_heads in the machine exceeds the maximum
	 * allowed level, force direct reclaim to scan the highmem zone as
	 * highmem pages could be pinning lowmem pages storing buffer_heads
	 */
	if (buffer_heads_over_limit)
		sc->gfp_mask |= __GFP_HIGHMEM;

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_zone(sc->gfp_mask), sc->nodemask) {
		if (!populated_zone(zone))
			continue;
		/*
		 * Take care memory controller reclaiming has small influence
		 * to global LRU.
		 */
		if (global_reclaim(sc)) {
			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
				continue;
			if (sc->priority != DEF_PRIORITY &&
			    !zone_reclaimable(zone))
				continue;	/* Let kswapd poll it */
			if (IS_ENABLED(CONFIG_COMPACTION)) {
				/*
				 * If we already have plenty of memory free for
				 * compaction in this zone, don't free any more.
				 * Even though compaction is invoked for any
				 * non-zero order, only frequent costly order
				 * reclamation is disruptive enough to become a
				 * noticeable problem, like transparent huge
				 * page allocations.
				 */
				if (compaction_ready(zone, sc)) {
					aborted_reclaim = true;
					continue;
				}
			}
			/*
			 * This steals pages from memory cgroups over softlimit
			 * and returns the number of reclaimed pages and
			 * scanned pages. This works for global memory pressure
			 * and balancing, not for a memcg's limit.
			 */
			nr_soft_scanned = 0;
			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
						sc->order, sc->gfp_mask,
						&nr_soft_scanned);
			sc->nr_reclaimed += nr_soft_reclaimed;
			sc->nr_scanned += nr_soft_scanned;
			/* need some check for avoid more shrink_zone() */
		}

		shrink_zone(zone, sc);
	}

	return aborted_reclaim;
}

/* All zones in zonelist are unreclaimable? */
static bool all_unreclaimable(struct zonelist *zonelist,
		struct scan_control *sc)
{
	struct zoneref *z;
	struct zone *zone;

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
			gfp_zone(sc->gfp_mask), sc->nodemask) {
		if (!populated_zone(zone))
			continue;
		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
			continue;
		if (zone_reclaimable(zone))
			return false;
	}

	return true;
}

/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take explicit
 * naps in the hope that some of these pages can be written.  But if the
 * allocating task holds filesystem locks which prevent writeout this might not
 * work, and the allocation attempt will fail.
 *
 * returns:	0, if no pages reclaimed
 *		else, the number of pages reclaimed
 */
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
					struct scan_control *sc,
					struct shrink_control *shrink)
{
	unsigned long total_scanned = 0;
	struct reclaim_state *reclaim_state = current->reclaim_state;
	struct zoneref *z;
	struct zone *zone;
	unsigned long writeback_threshold;
	bool aborted_reclaim;

	delayacct_freepages_start();

	if (global_reclaim(sc))
		count_vm_event(ALLOCSTALL);

	do {
		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
				sc->priority);
		sc->nr_scanned = 0;
		aborted_reclaim = shrink_zones(zonelist, sc);

		/*
		 * Don't shrink slabs when reclaiming memory from over limit
		 * cgroups but do shrink slab at least once when aborting
		 * reclaim for compaction to avoid unevenly scanning file/anon
		 * LRU pages over slab pages.
		 */
		if (global_reclaim(sc)) {
			unsigned long lru_pages = 0;

			nodes_clear(shrink->nodes_to_scan);
			for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_zone(sc->gfp_mask), sc->nodemask) {
				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
					continue;

				lru_pages += zone_reclaimable_pages(zone);
				node_set(zone_to_nid(zone),
					 shrink->nodes_to_scan);
			}

			shrink_slab(shrink, sc->nr_scanned, lru_pages);
			if (reclaim_state) {
				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
				reclaim_state->reclaimed_slab = 0;
			}
		}
		total_scanned += sc->nr_scanned;
		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
			goto out;

		/*
		 * If we're getting trouble reclaiming, start doing
		 * writepage even in laptop mode.
		 */
		if (sc->priority < DEF_PRIORITY - 2)
			sc->may_writepage = 1;

		/*
		 * Try to write back as many pages as we just scanned. This
		 * tends to cause slow streaming writers to write data to the
		 * disk smoothly, at the dirtying rate, which is nice. But
		 * that's undesirable in laptop mode, where we *want* lumpy
		 * writeout.  So in laptop mode, write out the whole world.
		 */
		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
		if (total_scanned > writeback_threshold) {
			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
						WB_REASON_TRY_TO_FREE_PAGES);
			sc->may_writepage = 1;
		}
	} while (--sc->priority >= 0 && !aborted_reclaim);

out:
	delayacct_freepages_end();

	if (sc->nr_reclaimed)
		return sc->nr_reclaimed;

	/*
	 * As hibernation is going on, kswapd is freezed so that it can't mark
	 * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
	 * check.
	 */
	if (oom_killer_disabled)
		return 0;

	/* Aborted reclaim to try compaction? don't OOM, then */
	if (aborted_reclaim)
		return 1;

	/* top priority shrink_zones still had more to do? don't OOM, then */
	if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
		return 1;

	return 0;
}

static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
{
	struct zone *zone;
	unsigned long pfmemalloc_reserve = 0;
	unsigned long free_pages = 0;
	int i;
	bool wmark_ok;

	for (i = 0; i <= ZONE_NORMAL; i++) {
		zone = &pgdat->node_zones[i];
		if (!populated_zone(zone))
			continue;

		pfmemalloc_reserve += min_wmark_pages(zone);
		free_pages += zone_page_state(zone, NR_FREE_PAGES);
	}

	/* If there are no reserves (unexpected config) then do not throttle */
	if (!pfmemalloc_reserve)
		return true;

	wmark_ok = free_pages > pfmemalloc_reserve / 2;

	/* kswapd must be awake if processes are being throttled */
	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
		pgdat->classzone_idx = min(pgdat->classzone_idx,
						(enum zone_type)ZONE_NORMAL);
		wake_up_interruptible(&pgdat->kswapd_wait);
	}

	return wmark_ok;
}
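
The throttling decision above is simply "free pages across ZONE_NORMAL and below versus half of their summed min watermarks". A small userspace sketch of the same check, with invented per-zone figures, is shown below for illustration only.

/* Editor's sketch of the pfmemalloc_watermark_ok() check; all numbers invented. */
#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	/* Hypothetical figures for ZONE_DMA..ZONE_NORMAL */
	unsigned long min_wmark[] = { 128, 1024, 4096 };
	unsigned long nr_free[]   = { 200,  900, 1500 };
	unsigned long pfmemalloc_reserve = 0, free_pages = 0;

	for (unsigned i = 0; i < sizeof(min_wmark) / sizeof(min_wmark[0]); i++) {
		pfmemalloc_reserve += min_wmark[i];
		free_pages += nr_free[i];
	}

	bool wmark_ok = free_pages > pfmemalloc_reserve / 2;
	printf("free=%lu reserve=%lu -> %s\n", free_pages, pfmemalloc_reserve,
	       wmark_ok ? "do not throttle" : "throttle and wake kswapd");
	return 0;
}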

/*
 * Throttle direct reclaimers if backing storage is backed by the network
 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
 * depleted. kswapd will continue to make progress and wake the processes
 * when the low watermark is reached.
 *
 * Returns true if a fatal signal was delivered during throttling. If this
 * happens, the page allocator should not consider triggering the OOM killer.
 */
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
					nodemask_t *nodemask)
{
	struct zoneref *z;
	struct zone *zone;
	pg_data_t *pgdat = NULL;

	/*
	 * Kernel threads should not be throttled as they may be indirectly
	 * responsible for cleaning pages necessary for reclaim to make forward
	 * progress. kjournald for example may enter direct reclaim while
	 * committing a transaction where throttling it could forcing other
	 * processes to block on log_wait_commit().
	 */
	if (current->flags & PF_KTHREAD)
		goto out;

	/*
	 * If a fatal signal is pending, this process should not throttle.
	 * It should return quickly so it can exit and free its memory
	 */
	if (fatal_signal_pending(current))
		goto out;

	/*
	 * Check if the pfmemalloc reserves are ok by finding the first node
	 * with a usable ZONE_NORMAL or lower zone. The expectation is that
	 * GFP_KERNEL will be required for allocating network buffers when
	 * swapping over the network so ZONE_HIGHMEM is unusable.
	 *
	 * Throttling is based on the first usable node and throttled processes
	 * wait on a queue until kswapd makes progress and wakes them. There
	 * is an affinity then between processes waking up and where reclaim
	 * progress has been made assuming the process wakes on the same node.
	 * More importantly, processes running on remote nodes will not compete
	 * for remote pfmemalloc reserves and processes on different nodes
	 * should make reasonable progress.
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_mask, nodemask) {
		if (zone_idx(zone) > ZONE_NORMAL)
			continue;

		/* Throttle based on the first usable node */
		pgdat = zone->zone_pgdat;
		if (pfmemalloc_watermark_ok(pgdat))
			goto out;
		break;
	}

	/* If no zone was usable by the allocation flags then do not throttle */
	if (!pgdat)
		goto out;

	/* Account for the throttling */
	count_vm_event(PGSCAN_DIRECT_THROTTLE);

	/*
	 * If the caller cannot enter the filesystem, it's possible that it
	 * is due to the caller holding an FS lock or performing a journal
	 * transaction in the case of a filesystem like ext[3|4]. In this case,
	 * it is not safe to block on pfmemalloc_wait as kswapd could be
	 * blocked waiting on the same lock. Instead, throttle for up to a
	 * second before continuing.
	 */
	if (!(gfp_mask & __GFP_FS)) {
		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
			pfmemalloc_watermark_ok(pgdat), HZ);

		goto check_pending;
	}

	/* Throttle until kswapd wakes the process */
	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
		pfmemalloc_watermark_ok(pgdat));

check_pending:
	if (fatal_signal_pending(current))
		return true;

out:
	return false;
}

unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
				gfp_t gfp_mask, nodemask_t *nodemask)
{
	unsigned long nr_reclaimed;
	struct scan_control sc = {
		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
		.may_writepage = !laptop_mode,
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.may_unmap = 1,
		.may_swap = 1,
		.order = order,
		.priority = DEF_PRIORITY,
		.target_mem_cgroup = NULL,
		.nodemask = nodemask,
	};
	struct shrink_control shrink = {
		.gfp_mask = sc.gfp_mask,
	};

	/*
	 * Do not enter reclaim if fatal signal was delivered while throttled.
	 * 1 is returned so that the page allocator does not OOM kill at this
	 * point.
	 */
	if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
		return 1;

	trace_mm_vmscan_direct_reclaim_begin(order,
				sc.may_writepage,
				gfp_mask);

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);

	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);

	return nr_reclaimed;
}
2657 2671
2658 #ifdef CONFIG_MEMCG 2672 #ifdef CONFIG_MEMCG
2659 2673
2660 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, 2674 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2661 gfp_t gfp_mask, bool noswap, 2675 gfp_t gfp_mask, bool noswap,
2662 struct zone *zone, 2676 struct zone *zone,
2663 unsigned long *nr_scanned) 2677 unsigned long *nr_scanned)
2664 { 2678 {
2665 struct scan_control sc = { 2679 struct scan_control sc = {
2666 .nr_scanned = 0, 2680 .nr_scanned = 0,
2667 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2681 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2668 .may_writepage = !laptop_mode, 2682 .may_writepage = !laptop_mode,
2669 .may_unmap = 1, 2683 .may_unmap = 1,
2670 .may_swap = !noswap, 2684 .may_swap = !noswap,
2671 .order = 0, 2685 .order = 0,
2672 .priority = 0, 2686 .priority = 0,
2673 .target_mem_cgroup = memcg, 2687 .target_mem_cgroup = memcg,
2674 }; 2688 };
2675 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2689 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2676 2690
2677 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2691 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2678 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2692 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2679 2693
2680 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, 2694 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
2681 sc.may_writepage, 2695 sc.may_writepage,
2682 sc.gfp_mask); 2696 sc.gfp_mask);
2683 2697
        /*
         * NOTE: Although we can get the priority field, using it
         * here is not a good idea, since it limits the pages we can scan.
         * If we don't reclaim here, the shrink_zone from balance_pgdat
         * will pick up pages from other mem cgroups as well. We hack
         * the priority and make it zero.
         */
        shrink_lruvec(lruvec, &sc);

        trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);

        *nr_scanned = sc.nr_scanned;
        return sc.nr_reclaimed;
}

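For illustration only: the sc.gfp_mask assignment above keeps the reclaim-relevant bits of the caller's mask and takes everything else from GFP_HIGHUSER_MOVABLE. The standalone userspace sketch below mirrors that bit-combining pattern with made-up flag values; the FAKE_* constants are stand-ins, not the real GFP definitions.

#include <stdio.h>

/* Hypothetical stand-ins for the real GFP constants. */
#define FAKE_GFP_IO             (1u << 0)
#define FAKE_GFP_FS             (1u << 1)
#define FAKE_GFP_HIGHMEM        (1u << 2)
#define FAKE_GFP_MOVABLE        (1u << 3)
#define FAKE_RECLAIM_MASK       (FAKE_GFP_IO | FAKE_GFP_FS)
#define FAKE_HIGHUSER_MOVABLE   (FAKE_GFP_IO | FAKE_GFP_FS | \
                                 FAKE_GFP_HIGHMEM | FAKE_GFP_MOVABLE)

int main(void)
{
        unsigned int caller_gfp = FAKE_GFP_IO; /* caller allows IO but not FS */

        /* Reclaim bits come from the caller, the rest from the default mask. */
        unsigned int sc_gfp = (caller_gfp & FAKE_RECLAIM_MASK) |
                              (FAKE_HIGHUSER_MOVABLE & ~FAKE_RECLAIM_MASK);

        printf("sc.gfp_mask = %#x\n", sc_gfp); /* IO|HIGHMEM|MOVABLE, no FS */
        return 0;
}
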
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                           gfp_t gfp_mask,
                                           bool noswap)
{
        struct zonelist *zonelist;
        unsigned long nr_reclaimed;
        int nid;
        struct scan_control sc = {
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
                .may_swap = !noswap,
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .order = 0,
                .priority = DEF_PRIORITY,
                .target_mem_cgroup = memcg,
                .nodemask = NULL, /* we don't care about the placement */
                .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                                (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
        };
        struct shrink_control shrink = {
                .gfp_mask = sc.gfp_mask,
        };

        /*
         * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
         * care from which node we get pages, so the node where we start the
         * scan does not need to be the current node.
         */
        nid = mem_cgroup_select_victim_node(memcg);

        zonelist = NODE_DATA(nid)->node_zonelists;

        trace_mm_vmscan_memcg_reclaim_begin(0,
                                            sc.may_writepage,
                                            sc.gfp_mask);

        nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);

        trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);

        return nr_reclaimed;
}
#endif

static void age_active_anon(struct zone *zone, struct scan_control *sc)
{
        struct mem_cgroup *memcg;

        if (!total_swap_pages)
                return;

        memcg = mem_cgroup_iter(NULL, NULL, NULL);
        do {
                struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);

                if (inactive_anon_is_low(lruvec))
                        shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                           sc, LRU_ACTIVE_ANON);

                memcg = mem_cgroup_iter(NULL, memcg, NULL);
        } while (memcg);
}

static bool zone_balanced(struct zone *zone, int order,
                          unsigned long balance_gap, int classzone_idx)
{
        if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
                                    balance_gap, classzone_idx, 0))
                return false;

        if (IS_ENABLED(CONFIG_COMPACTION) && order &&
            !compaction_suitable(zone, order))
                return false;

        return true;
}

/*
 * pgdat_balanced() is used when checking if a node is balanced.
 *
 * For order-0, all zones must be balanced!
 *
 * For high-order allocations only zones that meet watermarks and are in a
 * zone allowed by the caller's classzone_idx are added to balanced_pages. The
 * total of balanced pages must be at least 25% of the zones allowed by
 * classzone_idx for the node to be considered balanced. Forcing all zones to
 * be balanced for high orders can cause excessive reclaim when there are
 * imbalanced zones.
 * The choice of 25% is due to
 *   o a 16M DMA zone that is balanced will not balance a zone on any
 *     reasonably sized machine
 *   o On all other machines, the top zone must be at least a reasonable
 *     percentage of the middle zones. For example, on 32-bit x86, highmem
 *     would need to be at least 256M for it to balance a whole node.
 *     Similarly, on x86-64 the Normal zone would need to be at least 1G
 *     to balance a node on its own. These seemed like reasonable ratios.
 */
static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
        unsigned long managed_pages = 0;
        unsigned long balanced_pages = 0;
        int i;

        /* Check the watermark levels */
        for (i = 0; i <= classzone_idx; i++) {
                struct zone *zone = pgdat->node_zones + i;

                if (!populated_zone(zone))
                        continue;

                managed_pages += zone->managed_pages;

                /*
                 * A special case here:
                 *
                 * balance_pgdat() skips over all_unreclaimable after
                 * DEF_PRIORITY. Effectively, it considers them balanced so
                 * they must be considered balanced here as well!
                 */
                if (!zone_reclaimable(zone)) {
                        balanced_pages += zone->managed_pages;
                        continue;
                }

                if (zone_balanced(zone, order, 0, i))
                        balanced_pages += zone->managed_pages;
                else if (!order)
                        return false;
        }

        if (order)
                return balanced_pages >= (managed_pages >> 2);
        else
                return true;
}

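To make the 25% test above concrete, here is a small standalone sketch (invented zone sizes, not data from any real machine) that applies the same balanced_pages >= managed_pages >> 2 check a node would see for a high-order request:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical per-zone data for one node: managed pages plus whether the
 * zone currently meets its high watermark (as zone_balanced() would report).
 */
struct fake_zone {
        const char *name;
        unsigned long managed_pages;
        bool balanced;
};

int main(void)
{
        struct fake_zone zones[] = {
                { "DMA",    4000,    true  },
                { "DMA32",  900000,  false },
                { "Normal", 3000000, true  },
        };
        unsigned long managed = 0, balanced = 0;

        for (unsigned int i = 0; i < sizeof(zones) / sizeof(zones[0]); i++) {
                managed += zones[i].managed_pages;
                if (zones[i].balanced)
                        balanced += zones[i].managed_pages;
        }

        /* For order > 0: balanced if at least a quarter of the managed pages
         * sit in zones that meet their watermarks (managed >> 2 == 25%).
         */
        printf("node balanced for high order: %s\n",
               balanced >= (managed >> 2) ? "yes" : "no");
        return 0;
}
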
/*
 * Prepare kswapd for sleeping. This verifies that there are no processes
 * waiting in throttle_direct_reclaim() and that watermarks have been met.
 *
 * Returns true if kswapd is ready to sleep
 */
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
                                        int classzone_idx)
{
        /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
        if (remaining)
                return false;

        /*
         * There is a potential race between when kswapd checks its watermarks
         * and a process gets throttled. There is also a potential race if
         * processes get throttled, kswapd wakes, a large process exits thereby
         * balancing the zones, which causes kswapd to miss a wakeup. If kswapd
         * is going to sleep, no process should be sleeping on pfmemalloc_wait
         * so wake them now if necessary. If necessary, processes will wake
         * kswapd and get throttled again
         */
        if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
                wake_up(&pgdat->pfmemalloc_wait);
                return false;
        }

        return pgdat_balanced(pgdat, order, classzone_idx);
}

/*
 * kswapd shrinks the zone by the number of pages required to reach
 * the high watermark.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
 * reclaim or if the lack of progress was due to pages under writeback.
 * This is used to determine if the scanning priority needs to be raised.
 */
static bool kswapd_shrink_zone(struct zone *zone,
                               int classzone_idx,
                               struct scan_control *sc,
                               unsigned long lru_pages,
                               unsigned long *nr_attempted)
{
        int testorder = sc->order;
        unsigned long balance_gap;
        struct reclaim_state *reclaim_state = current->reclaim_state;
        struct shrink_control shrink = {
                .gfp_mask = sc->gfp_mask,
        };
        bool lowmem_pressure;

        /* Reclaim above the high watermark. */
        sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));

        /*
         * Kswapd reclaims only single pages with compaction enabled. Trying
         * too hard to reclaim until contiguous free pages have become
         * available can hurt performance by evicting too much useful data
         * from memory. Do not reclaim more than needed for compaction.
         */
        if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
                        compaction_suitable(zone, sc->order) !=
                                COMPACT_SKIPPED)
                testorder = 0;

        /*
         * We put equal pressure on every zone, unless one zone has way too
         * many pages free already. The "too many pages" is defined as the
         * high wmark plus a "gap" where the gap is either the low
         * watermark or 1% of the zone, whichever is smaller.
         */
        balance_gap = min(low_wmark_pages(zone),
                (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
                KSWAPD_ZONE_BALANCE_GAP_RATIO);

        /*
         * If there is no low memory pressure or the zone is balanced then no
         * reclaim is necessary
         */
        lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
        if (!lowmem_pressure && zone_balanced(zone, testorder,
                                                balance_gap, classzone_idx))
                return true;

        shrink_zone(zone, sc);
        nodes_clear(shrink.nodes_to_scan);
        node_set(zone_to_nid(zone), shrink.nodes_to_scan);

        reclaim_state->reclaimed_slab = 0;
        shrink_slab(&shrink, sc->nr_scanned, lru_pages);
        sc->nr_reclaimed += reclaim_state->reclaimed_slab;

        /* Account for the number of pages we attempted to reclaim */
        *nr_attempted += sc->nr_to_reclaim;

        zone_clear_flag(zone, ZONE_WRITEBACK);

        /*
         * If a zone reaches its high watermark, consider it to be no longer
         * congested. It's possible there are dirty pages backed by congested
         * BDIs but as pressure is relieved, speculatively avoid congestion
         * waits.
         */
        if (zone_reclaimable(zone) &&
            zone_balanced(zone, testorder, 0, classzone_idx)) {
                zone_clear_flag(zone, ZONE_CONGESTED);
                zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
        }

        return sc->nr_scanned >= sc->nr_to_reclaim;
}

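As a rough illustration of the balance_gap calculation above, the sketch below uses invented numbers and assumes the ratio constant corresponds to the 1% described in the comment; the real KSWAPD_ZONE_BALANCE_GAP_RATIO value may differ.

#include <stdio.h>

#define FAKE_BALANCE_GAP_RATIO  100     /* assumed stand-in for the 1% above */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long managed_pages = 2000000; /* hypothetical zone size */
        unsigned long low_wmark = 12000;       /* hypothetical low watermark */

        /* Round the 1% up, then cap the gap at the low watermark. */
        unsigned long gap = min_ul(low_wmark,
                        (managed_pages + FAKE_BALANCE_GAP_RATIO - 1) /
                        FAKE_BALANCE_GAP_RATIO);

        /* Prints 12000 here: the low watermark is below 1% of the zone. */
        printf("balance_gap = %lu pages\n", gap);
        return 0;
}
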
/*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at high_wmark_pages(zone).
 *
 * Returns the final order kswapd was reclaiming at
 *
 * There is special handling here for zones which are full of pinned pages.
 * This can happen if the pages are all mlocked, or if they are all used by
 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
 * What we do is to detect the case where all pages in the zone have been
 * scanned twice and there has been zero successful reclaim. Mark the zone as
 * dead and from now on, only perform a short scan. Basically we're polling
 * the zone for when the problem goes away.
 *
 * kswapd scans the zones in the highmem->normal->dma direction. It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
 * lower zones regardless of the number of free pages in the lower zones. This
 * interoperates with the page allocator fallback scheme to ensure that aging
 * of pages is balanced across the zones.
 */
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                                                        int *classzone_idx)
{
        int i;
        int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
        unsigned long nr_soft_reclaimed;
        unsigned long nr_soft_scanned;
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
                .priority = DEF_PRIORITY,
                .may_unmap = 1,
                .may_swap = 1,
                .may_writepage = !laptop_mode,
                .order = order,
                .target_mem_cgroup = NULL,
        };
        count_vm_event(PAGEOUTRUN);

        do {
                unsigned long lru_pages = 0;
                unsigned long nr_attempted = 0;
                bool raise_priority = true;
                bool pgdat_needs_compaction = (order > 0);

                sc.nr_reclaimed = 0;

                /*
                 * Scan in the highmem->dma direction for the highest
                 * zone which needs scanning
                 */
                for (i = pgdat->nr_zones - 1; i >= 0; i--) {
                        struct zone *zone = pgdat->node_zones + i;

                        if (!populated_zone(zone))
                                continue;

                        if (sc.priority != DEF_PRIORITY &&
                            !zone_reclaimable(zone))
                                continue;

                        /*
                         * Do some background aging of the anon list, to give
                         * pages a chance to be referenced before reclaiming.
                         */
                        age_active_anon(zone, &sc);

                        /*
                         * If the number of buffer_heads in the machine
                         * exceeds the maximum allowed level and this node
                         * has a highmem zone, force kswapd to reclaim from
                         * it to relieve lowmem pressure.
                         */
                        if (buffer_heads_over_limit && is_highmem_idx(i)) {
                                end_zone = i;
                                break;
                        }

                        if (!zone_balanced(zone, order, 0, 0)) {
                                end_zone = i;
                                break;
                        } else {
                                /*
                                 * If balanced, clear the dirty and congested
                                 * flags
                                 */
                                zone_clear_flag(zone, ZONE_CONGESTED);
                                zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
                        }
                }

                if (i < 0)
                        goto out;

                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;

                        if (!populated_zone(zone))
                                continue;

                        lru_pages += zone_reclaimable_pages(zone);

                        /*
                         * If any zone is currently balanced then kswapd will
                         * not call compaction as it is expected that the
                         * necessary pages are already available.
                         */
                        if (pgdat_needs_compaction &&
                                        zone_watermark_ok(zone, order,
                                                low_wmark_pages(zone),
                                                *classzone_idx, 0))
                                pgdat_needs_compaction = false;
                }

                /*
                 * If we're having trouble reclaiming, start doing writepage
                 * even in laptop mode.
                 */
                if (sc.priority < DEF_PRIORITY - 2)
                        sc.may_writepage = 1;

                /*
                 * Now scan the zone in the dma->highmem direction, stopping
                 * at the last zone which needs scanning.
                 *
                 * We do this because the page allocator works in the opposite
                 * direction. This prevents the page allocator from allocating
                 * pages behind kswapd's direction of progress, which would
                 * cause too much scanning of the lower zones.
                 */
                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;

                        if (!populated_zone(zone))
                                continue;

                        if (sc.priority != DEF_PRIORITY &&
                            !zone_reclaimable(zone))
                                continue;

                        sc.nr_scanned = 0;

                        nr_soft_scanned = 0;
                        /*
                         * Call soft limit reclaim before calling shrink_zone.
                         */
                        nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
                                                        order, sc.gfp_mask,
                                                        &nr_soft_scanned);
                        sc.nr_reclaimed += nr_soft_reclaimed;

                        /*
                         * There should be no need to raise the scanning
                         * priority if enough pages are already being scanned
                         * that the high watermark would be met at 100%
                         * efficiency.
                         */
                        if (kswapd_shrink_zone(zone, end_zone, &sc,
                                        lru_pages, &nr_attempted))
                                raise_priority = false;
                }

                /*
                 * If the low watermark is met there is no need for processes
                 * to be throttled on pfmemalloc_wait as they should now be
                 * able to safely make forward progress. Wake them
                 */
                if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
                                pfmemalloc_watermark_ok(pgdat))
                        wake_up(&pgdat->pfmemalloc_wait);

                /*
                 * Fragmentation may mean that the system cannot be rebalanced
                 * for high-order allocations in all zones. If twice the
                 * allocation size has been reclaimed and the zones are still
                 * not balanced then recheck the watermarks at order-0 to
                 * prevent kswapd reclaiming excessively. Assume that a
                 * process that requested a high-order allocation can itself
                 * direct reclaim/compact.
                 */
                if (order && sc.nr_reclaimed >= 2UL << order)
                        order = sc.order = 0;

                /* Check if kswapd should be suspending */
                if (try_to_freeze() || kthread_should_stop())
                        break;

                /*
                 * Compact if necessary and kswapd is reclaiming at least the
                 * high watermark number of pages as requested
                 */
                if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
                        compact_pgdat(pgdat, order);

                /*
                 * Raise priority if scanning rate is too low or there was no
                 * progress in reclaiming pages
                 */
                if (raise_priority || !sc.nr_reclaimed)
                        sc.priority--;
        } while (sc.priority >= 1 &&
                 !pgdat_balanced(pgdat, order, *classzone_idx));

out:
        /*
         * Return the order we were reclaiming at so prepare_kswapd_sleep()
         * makes a decision on the order we were last reclaiming at. However,
         * if another caller entered the allocator slow path while kswapd
         * was awake, order will remain at the higher level
         */
        *classzone_idx = end_zone;
        return order;
}

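The order-0 fallback in the loop above triggers once at least twice the requested allocation size has been reclaimed, i.e. 2UL << order pages. A minimal standalone sketch of that threshold, with made-up reclaim counts:

#include <stdio.h>

int main(void)
{
        int order = 3;                          /* hypothetical 8-page request */
        unsigned long nr_reclaimed = 20;        /* pages reclaimed so far */

        /* Twice the allocation size: 2 * (1 << order) == 2UL << order. */
        unsigned long threshold = 2UL << order;

        if (order && nr_reclaimed >= threshold)
                order = 0;      /* fall back to order-0 watermark checks */

        printf("threshold = %lu pages, order now %d\n", threshold, order);
        return 0;
}
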
static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
        long remaining = 0;
        DEFINE_WAIT(wait);

        if (freezing(current) || kthread_should_stop())
                return;

        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);

        /* Try to sleep for a short interval */
        if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
                remaining = schedule_timeout(HZ/10);
                finish_wait(&pgdat->kswapd_wait, &wait);
                prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
        }

        /*
         * After a short sleep, check if it was a premature sleep. If not, then
         * go fully to sleep until explicitly woken up.
         */
        if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
                trace_mm_vmscan_kswapd_sleep(pgdat->node_id);

                /*
                 * vmstat counters are not perfectly accurate and the estimated
                 * value for counters such as NR_FREE_PAGES can deviate from the
                 * true value by nr_online_cpus * threshold. To avoid the zone
                 * watermarks being breached while under pressure, we reduce the
                 * per-cpu vmstat threshold while kswapd is awake and restore
                 * them before going back to sleep.
                 */
                set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);

                /*
                 * Compaction records what page blocks it recently failed to
                 * isolate pages from and skips them in the future scanning.
                 * When kswapd is going to sleep, it is reasonable to assume
                 * that pages and compaction may succeed so reset the cache.
                 */
                reset_isolation_suitable(pgdat);

                if (!kthread_should_stop())
                        schedule();

                set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
        } else {
                if (remaining)
                        count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
                else
                        count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
        }
        finish_wait(&pgdat->kswapd_wait, &wait);
}

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
static int kswapd(void *p)
{
        unsigned long order, new_order;
        unsigned balanced_order;
        int classzone_idx, new_classzone_idx;
        int balanced_classzone_idx;
        pg_data_t *pgdat = (pg_data_t*)p;
        struct task_struct *tsk = current;

        struct reclaim_state reclaim_state = {
                .reclaimed_slab = 0,
        };
        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

        lockdep_set_current_reclaim_state(GFP_KERNEL);

        if (!cpumask_empty(cpumask))
                set_cpus_allowed_ptr(tsk, cpumask);
        current->reclaim_state = &reclaim_state;

        /*
         * Tell the memory management that we're a "memory allocator",
         * and that if we need more memory we should get access to it
         * regardless (see "__alloc_pages()"). "kswapd" should
         * never get caught in the normal page freeing logic.
         *
         * (Kswapd normally doesn't need memory anyway, but sometimes
         * you need a small amount of memory in order to be able to
         * page out something else, and this flag essentially protects
         * us from recursively trying to free more memory as we're
         * trying to free the first piece of memory in the first place).
         */
        tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
        set_freezable();

        order = new_order = 0;
        balanced_order = 0;
        classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
        balanced_classzone_idx = classzone_idx;
        for ( ; ; ) {
                bool ret;

                /*
                 * If the last balance_pgdat was unsuccessful it's unlikely a
                 * new request of a similar or harder type will succeed soon
                 * so consider going to sleep on the basis we reclaimed at
                 */
                if (balanced_classzone_idx >= new_classzone_idx &&
                                        balanced_order == new_order) {
                        new_order = pgdat->kswapd_max_order;
                        new_classzone_idx = pgdat->classzone_idx;
                        pgdat->kswapd_max_order = 0;
                        pgdat->classzone_idx = pgdat->nr_zones - 1;
                }

                if (order < new_order || classzone_idx > new_classzone_idx) {
                        /*
                         * Don't sleep if someone wants a larger 'order'
                         * allocation or has tighter zone constraints
                         */
                        order = new_order;
                        classzone_idx = new_classzone_idx;
                } else {
                        kswapd_try_to_sleep(pgdat, balanced_order,
                                                balanced_classzone_idx);
                        order = pgdat->kswapd_max_order;
                        classzone_idx = pgdat->classzone_idx;
                        new_order = order;
                        new_classzone_idx = classzone_idx;
                        pgdat->kswapd_max_order = 0;
                        pgdat->classzone_idx = pgdat->nr_zones - 1;
                }

                ret = try_to_freeze();
                if (kthread_should_stop())
                        break;

                /*
                 * We can speed up thawing tasks if we don't call balance_pgdat
                 * after returning from the refrigerator
                 */
                if (!ret) {
                        trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
                        balanced_classzone_idx = classzone_idx;
                        balanced_order = balance_pgdat(pgdat, order,
                                                &balanced_classzone_idx);
                }
        }

        tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
        current->reclaim_state = NULL;
        lockdep_clear_current_reclaim_state();

        return 0;
}

/*
 * A zone is low on free memory, so wake its kswapd task to service it.
 */
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
        pg_data_t *pgdat;

        if (!populated_zone(zone))
                return;

        if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                return;
        pgdat = zone->zone_pgdat;
        if (pgdat->kswapd_max_order < order) {
                pgdat->kswapd_max_order = order;
                pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
        }
        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;
        if (zone_balanced(zone, order, 0, 0))
                return;

        trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
        wake_up_interruptible(&pgdat->kswapd_wait);
}

#ifdef CONFIG_HIBERNATION
/*
 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
 * freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
        struct reclaim_state reclaim_state;
        struct scan_control sc = {
                .gfp_mask = GFP_HIGHUSER_MOVABLE,
                .may_swap = 1,
                .may_unmap = 1,
                .may_writepage = 1,
                .nr_to_reclaim = nr_to_reclaim,
                .hibernation_mode = 1,
                .order = 0,
                .priority = DEF_PRIORITY,
        };
        struct shrink_control shrink = {
                .gfp_mask = sc.gfp_mask,
        };
        struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
        struct task_struct *p = current;
        unsigned long nr_reclaimed;

        p->flags |= PF_MEMALLOC;
        lockdep_set_current_reclaim_state(sc.gfp_mask);
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;

        nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);

        p->reclaim_state = NULL;
        lockdep_clear_current_reclaim_state();
        p->flags &= ~PF_MEMALLOC;

        return nr_reclaimed;
}
#endif /* CONFIG_HIBERNATION */

/* It's optimal to keep kswapds on the same CPUs as their memory, but
   not required for correctness.  So if the last cpu in a node goes
   away, we get changed to run anywhere: as the first one comes back,
   restore their cpu bindings. */
static int cpu_callback(struct notifier_block *nfb, unsigned long action,
                        void *hcpu)
{
        int nid;

        if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
                for_each_node_state(nid, N_MEMORY) {
                        pg_data_t *pgdat = NODE_DATA(nid);
                        const struct cpumask *mask;

                        mask = cpumask_of_node(pgdat->node_id);

                        if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
                                /* One of our CPUs online: restore mask */
                                set_cpus_allowed_ptr(pgdat->kswapd, mask);
                }
        }
        return NOTIFY_OK;
}

/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
 */
int kswapd_run(int nid)
{
        pg_data_t *pgdat = NODE_DATA(nid);
        int ret = 0;

        if (pgdat->kswapd)
                return 0;

        pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
        if (IS_ERR(pgdat->kswapd)) {
                /* failure at boot is fatal */
                BUG_ON(system_state == SYSTEM_BOOTING);
                pr_err("Failed to start kswapd on node %d\n", nid);
                ret = PTR_ERR(pgdat->kswapd);
                pgdat->kswapd = NULL;
        }
        return ret;
}

/*
 * Called by memory hotplug when all memory in a node is offlined. Caller must
 * hold lock_memory_hotplug().
 */
void kswapd_stop(int nid)
{
        struct task_struct *kswapd = NODE_DATA(nid)->kswapd;

        if (kswapd) {
                kthread_stop(kswapd);
                NODE_DATA(nid)->kswapd = NULL;
        }
}

static int __init kswapd_init(void)
{
        int nid;

        swap_setup();
        for_each_node_state(nid, N_MEMORY)
                kswapd_run(nid);
        hotcpu_notifier(cpu_callback, 0);
        return 0;
}

module_init(kswapd_init)

#ifdef CONFIG_NUMA
/*
 * Zone reclaim mode
 *
 * If non-zero call zone_reclaim when the number of free pages falls below
 * the watermarks.
 */
int zone_reclaim_mode __read_mostly;

#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0)     /* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1<<1)    /* Writeout pages during reclaim */
#define RECLAIM_SWAP (1<<2)     /* Swap pages out during reclaim */

/*
 * Priority for ZONE_RECLAIM. This determines the fraction of pages
 * of a node considered for each zone_reclaim. 4 scans 1/16th of
 * a zone.
 */
#define ZONE_RECLAIM_PRIORITY 4

/*
 * Percentage of pages in a zone that must be unmapped for zone_reclaim to
 * occur.
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * If the number of slab pages in a zone grows beyond this percentage then
 * slab reclaim needs to occur.
 */
int sysctl_min_slab_ratio = 5;

static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
{
        unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
        unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
                zone_page_state(zone, NR_ACTIVE_FILE);

        /*
         * It's possible for there to be more file mapped pages than
         * accounted for by the pages on the file LRU lists because
         * tmpfs pages accounted for as ANON can also be FILE_MAPPED
         */
        return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
}

3516 /* Work out how many page cache pages we can reclaim in this reclaim_mode */ 3530 /* Work out how many page cache pages we can reclaim in this reclaim_mode */
3517 static long zone_pagecache_reclaimable(struct zone *zone) 3531 static long zone_pagecache_reclaimable(struct zone *zone)
3518 { 3532 {
3519 long nr_pagecache_reclaimable; 3533 long nr_pagecache_reclaimable;
3520 long delta = 0; 3534 long delta = 0;
3521 3535
3522 /* 3536 /*
3523 * If RECLAIM_SWAP is set, then all file pages are considered 3537 * If RECLAIM_SWAP is set, then all file pages are considered
3524 * potentially reclaimable. Otherwise, we have to worry about 3538 * potentially reclaimable. Otherwise, we have to worry about
3525 * pages like swapcache and zone_unmapped_file_pages() provides 3539 * pages like swapcache and zone_unmapped_file_pages() provides
3526 * a better estimate 3540 * a better estimate
3527 */ 3541 */
3528 if (zone_reclaim_mode & RECLAIM_SWAP) 3542 if (zone_reclaim_mode & RECLAIM_SWAP)
3529 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); 3543 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
3530 else 3544 else
3531 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); 3545 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
3532 3546
3533 /* If we can't clean pages, remove dirty pages from consideration */ 3547 /* If we can't clean pages, remove dirty pages from consideration */
3534 if (!(zone_reclaim_mode & RECLAIM_WRITE)) 3548 if (!(zone_reclaim_mode & RECLAIM_WRITE))
3535 delta += zone_page_state(zone, NR_FILE_DIRTY); 3549 delta += zone_page_state(zone, NR_FILE_DIRTY);
3536 3550
3537 /* Watch for any possible underflows due to delta */ 3551 /* Watch for any possible underflows due to delta */
3538 if (unlikely(delta > nr_pagecache_reclaimable)) 3552 if (unlikely(delta > nr_pagecache_reclaimable))
3539 delta = nr_pagecache_reclaimable; 3553 delta = nr_pagecache_reclaimable;
3540 3554
3541 return nr_pagecache_reclaimable - delta; 3555 return nr_pagecache_reclaimable - delta;
3542 } 3556 }
3543 3557
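/*
 * Worked example for the estimate above: with RECLAIM_SWAP and
 * RECLAIM_WRITE both clear, a zone whose file LRU exceeds its mapped
 * file pages by 1000 pages, 300 of them dirty, is treated as having
 * 1000 - 300 = 700 reclaimable page cache pages.  Setting RECLAIM_WRITE
 * would keep the full 1000 in the estimate, because dirty pages could
 * then be written out during reclaim.
 */
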
/*
 * Try to free up some pages from this zone through reclaim.
 */
static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
        /* Minimum pages needed in order to stay on node */
        const unsigned long nr_pages = 1 << order;
        struct task_struct *p = current;
        struct reclaim_state reclaim_state;
        struct scan_control sc = {
                .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
                .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
                .may_swap = 1,
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
                .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
                .order = order,
                .priority = ZONE_RECLAIM_PRIORITY,
        };
        struct shrink_control shrink = {
                .gfp_mask = sc.gfp_mask,
        };
        unsigned long nr_slab_pages0, nr_slab_pages1;

        cond_resched();
        /*
         * We need to be able to allocate from the reserves for RECLAIM_SWAP
         * and we also need to be able to write out pages for RECLAIM_WRITE
         * and RECLAIM_SWAP.
         */
        p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
        lockdep_set_current_reclaim_state(gfp_mask);
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;

        if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
                /*
                 * Free memory by calling shrink zone with increasing
                 * priorities until we have enough memory freed.
                 */
                do {
                        shrink_zone(zone, &sc);
                } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
        }

        nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
        if (nr_slab_pages0 > zone->min_slab_pages) {
                /*
                 * shrink_slab() does not currently allow us to determine how
                 * many pages were freed in this zone. So we take the current
                 * number of slab pages and shake the slab until it is reduced
                 * by the same nr_pages that we used for reclaiming unmapped
                 * pages.
                 */
                nodes_clear(shrink.nodes_to_scan);
                node_set(zone_to_nid(zone), shrink.nodes_to_scan);
                for (;;) {
                        unsigned long lru_pages = zone_reclaimable_pages(zone);

                        /* No reclaimable slab or very low memory pressure */
                        if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
                                break;

                        /* Freed enough memory */
                        nr_slab_pages1 = zone_page_state(zone,
                                                        NR_SLAB_RECLAIMABLE);
                        if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
                                break;
                }

                /*
                 * Update nr_reclaimed by the number of slab pages we
                 * reclaimed from this zone.
                 */
                nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
                if (nr_slab_pages1 < nr_slab_pages0)
                        sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
        }

        p->reclaim_state = NULL;
        current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
        lockdep_clear_current_reclaim_state();
        return sc.nr_reclaimed >= nr_pages;
}

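/*
 * On the priority loop above: sc.priority starts at ZONE_RECLAIM_PRIORITY
 * (4) and is decremented towards 0, with each step roughly doubling the
 * fraction of the zone's LRU pages that shrink_zone() will scan
 * (approximately size >> priority).  The function returns non-zero only
 * if at least nr_pages (1 << order) pages were reclaimed.
 */
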
int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
        int node_id;
        int ret;

        /*
         * Zone reclaim reclaims unmapped file backed pages and
         * slab pages if we are over the defined limits.
         *
         * A small portion of unmapped file backed pages is needed for
         * file I/O otherwise pages read by file I/O will be immediately
         * thrown out if the zone is overallocated. So we do not reclaim
         * if less than a specified percentage of the zone is used by
         * unmapped file backed pages.
         */
        if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
            zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
                return ZONE_RECLAIM_FULL;

        if (!zone_reclaimable(zone))
                return ZONE_RECLAIM_FULL;

        /*
         * Do not scan if the allocation should not be delayed.
         */
        if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
                return ZONE_RECLAIM_NOSCAN;

        /*
         * Only run zone reclaim on the local zone or on zones that do not
         * have associated processors. This will favor the local processor
         * over remote processors and spread off node memory allocations
         * as wide as possible.
         */
        node_id = zone_to_nid(zone);
        if (node_state(node_id, N_CPU) && node_id != numa_node_id())
                return ZONE_RECLAIM_NOSCAN;

        if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
                return ZONE_RECLAIM_NOSCAN;

        ret = __zone_reclaim(zone, gfp_mask, order);
        zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);

        if (!ret)
                count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);

        return ret;
}
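
/*
 * The guard checks above mean zone_reclaim() bails out early with
 * ZONE_RECLAIM_FULL when there is nothing worth reclaiming, and with
 * ZONE_RECLAIM_NOSCAN when the caller cannot sleep, the zone belongs to
 * a remote node that has its own CPUs, or another task already holds
 * ZONE_RECLAIM_LOCKED for this zone.  Only one task at a time therefore
 * runs __zone_reclaim() against a given zone.
 */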
#endif

/*
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 *
 * Test whether page is evictable--i.e., should be placed on active/inactive
 * lists vs unevictable list.
 *
 * Reasons page might not be evictable:
 * (1) page's mapping marked unevictable
 * (2) page is part of an mlocked VMA
 *
 */
int page_evictable(struct page *page)
{
        return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
}

#ifdef CONFIG_SHMEM
/**
 * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
 * @pages:    array of pages to check
 * @nr_pages: number of pages to check
 *
 * Checks pages for evictability and moves them to the appropriate lru list.
 *
 * This function is only used for SysV IPC SHM_UNLOCK.
 */
void check_move_unevictable_pages(struct page **pages, int nr_pages)
{
        struct lruvec *lruvec;
        struct zone *zone = NULL;
        int pgscanned = 0;
        int pgrescued = 0;
        int i;

        for (i = 0; i < nr_pages; i++) {
                struct page *page = pages[i];
                struct zone *pagezone;

                pgscanned++;
                pagezone = page_zone(page);
                if (pagezone != zone) {
                        if (zone)
                                spin_unlock_irq(&zone->lru_lock);
                        zone = pagezone;
                        spin_lock_irq(&zone->lru_lock);
                }
                lruvec = mem_cgroup_page_lruvec(page, zone);

                if (!PageLRU(page) || !PageUnevictable(page))
                        continue;

                if (page_evictable(page)) {
                        enum lru_list lru = page_lru_base_type(page);

                        VM_BUG_ON(PageActive(page));
                        ClearPageUnevictable(page);
                        del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
                        add_page_to_lru_list(page, lruvec, lru);
                        pgrescued++;
                }
        }

        if (zone) {
                __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
                __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
                spin_unlock_irq(&zone->lru_lock);
        }
}
#endif /* CONFIG_SHMEM */
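
/*
 * Note on locking in check_move_unevictable_pages(): the loop only drops
 * and re-takes zone->lru_lock when consecutive pages belong to different
 * zones, so a run of pages from the same zone is processed under a single
 * lock acquisition, and the UNEVICTABLE_* vmstat counters are updated
 * once at the end.
 */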

static void warn_scan_unevictable_pages(void)
{
        printk_once(KERN_WARNING
                    "%s: The scan_unevictable_pages sysctl/node-interface has been "
                    "disabled for lack of a legitimate use case. If you have "
                    "one, please send an email to linux-mm@kvack.org.\n",
                    current->comm);
}

/*
 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
 * all nodes' unevictable lists for evictable pages
 */
unsigned long scan_unevictable_pages;

int scan_unevictable_handler(struct ctl_table *table, int write,
                             void __user *buffer,
                             size_t *length, loff_t *ppos)
{
        warn_scan_unevictable_pages();
        proc_doulongvec_minmax(table, write, buffer, length, ppos);
        scan_unevictable_pages = 0;
        return 0;
}

#ifdef CONFIG_NUMA
/*
 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
 * a specified node's per zone unevictable lists for evictable pages.
 */

static ssize_t read_scan_unevictable_node(struct device *dev,
                                          struct device_attribute *attr,
                                          char *buf)
{
        warn_scan_unevictable_pages();
        return sprintf(buf, "0\n");     /* always zero; should fit... */
}

static ssize_t write_scan_unevictable_node(struct device *dev,
                                           struct device_attribute *attr,
                                           const char *buf, size_t count)
{
        warn_scan_unevictable_pages();
        return 1;
}


static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
                   read_scan_unevictable_node,
                   write_scan_unevictable_node);

int scan_unevictable_register_node(struct node *node)
{